Cleaned seriea

In [1]:
import pandas as pd
import os

def clean_data(file_path, output_directory):
    """Clean one raw CSV table and write it to ``output_directory``.

    The cleaned file keeps the base name of ``file_path``. Cleaning steps:

    1. Drop the ``'Trend'`` and ``'Unnamed: 11'`` columns, each one
       independently of whether the other is present.
    2. Replace the placeholder text ``'No match available'`` with a missing
       value and drop rows in which every value is missing.
    3. Convert the statistics columns that are present to numeric; values
       that cannot be parsed become NaN.

    Args:
        file_path: Path of the raw CSV file to read.
        output_directory: Directory the cleaned CSV is written to. It is
            created if it does not exist yet.

    Returns:
        None. The result is written to disk and the target path is printed.
    """
    unwanted_columns = ('Trend', 'Unnamed: 11')
    numeric_columns = ('PTS', 'P', 'W', 'D', 'L', 'GF', 'GA', 'GD')

    raw = pd.read_csv(file_path)

    # Drop whichever helper columns exist. Requiring both to be present would
    # leave a lone 'Trend' (or 'Unnamed: 11') column in the cleaned output.
    table = raw.drop(columns=[col for col in unwanted_columns if col in raw.columns])

    # Placeholder text -> missing value, then remove rows that carry no data.
    table = table.replace('No match available', pd.NA).dropna(how='all')

    # assign() builds a new frame, so nothing is written into a filtered view.
    present = [col for col in numeric_columns if col in table.columns]
    table = table.assign(
        **{col: pd.to_numeric(table[col], errors='coerce') for col in present}
    )

    # An empty directory string means "current directory": nothing to create.
    if output_directory:
        os.makedirs(output_directory, exist_ok=True)

    output_file_path = os.path.join(output_directory, os.path.basename(file_path))
    table.to_csv(output_file_path, index=False)
    print(f"Cleaned data saved to {output_file_path}")

def clean_all_files_in_directory(input_directory, output_directory):
    """Run ``clean_data`` on every ``.csv`` file directly inside ``input_directory``.

    Args:
        input_directory: Directory that is listed (not recursively) for files
            whose name ends with ``.csv``.
        output_directory: Directory passed on to ``clean_data``; created first
            if it does not exist.

    Returns:
        None.
    """
    # exist_ok=True replaces the separate "exists?" check, so there is no gap
    # between checking for the directory and creating it.
    os.makedirs(output_directory, exist_ok=True)

    # os.listdir returns names in arbitrary order; sort for a reproducible run.
    for filename in sorted(os.listdir(input_directory)):
        if filename.endswith(".csv"):
            clean_data(os.path.join(input_directory, filename), output_directory)

# Run the cleaning step for the 'seriea' files: every raw CSV in
# input_directory is cleaned and written under the same file name to
# output_directory. Both paths are relative to the current working directory.
input_directory = 'data/raw/seriea'  # Directory containing raw data files
output_directory = 'data/processed/seriea'  # Directory to store cleaned data files
clean_all_files_in_directory(input_directory, output_directory)

Cleaned data saved to data/processed/seriea\seriea-1986_87.csv
Cleaned data saved to data/processed/seriea\seriea-1987_88.csv
Cleaned data saved to data/processed/seriea\seriea-1988_89.csv
Cleaned data saved to data/processed/seriea\seriea-1989_90.csv
Cleaned data saved to data/processed/seriea\seriea-1990_91.csv
Cleaned data saved to data/processed/seriea\seriea-1991_92.csv
Cleaned data saved to data/processed/seriea\seriea-1992_93.csv
Cleaned data saved to data/processed/seriea\seriea-1993_94.csv
Cleaned data saved to data/processed/seriea\seriea-1994_95.csv
Cleaned data saved to data/processed/seriea\seriea-1995_96.csv
Cleaned data saved to data/processed/seriea\seriea-1996_97.csv
Cleaned data saved to data/processed/seriea\seriea-1997_98.csv
Cleaned data saved to data/processed/seriea\seriea-1998_99.csv
Cleaned data saved to data/processed/seriea\seriea-1999_00.csv
Cleaned data saved to data/processed/seriea\seriea-2000_01.csv
Cleaned data saved to data/processed/seriea\seriea-2001

Cleaned bundesliga

In [2]:
import pandas as pd
import os

def clean_data(file_path, output_directory):
    """Clean one raw CSV table and save it under the same name in ``output_directory``.

    Columns whose header contains ``'Unnamed'`` are removed, and the known
    statistics columns that are present are converted to numeric (values that
    cannot be parsed become NaN).

    Args:
        file_path: Path of the raw CSV file to read.
        output_directory: Existing directory the cleaned CSV is written to.

    Returns:
        None. The cleaned table is written to disk and its path is printed.
    """
    stat_columns = ('MatchesM', 'WonW', 'DrawD', 'LostL', 'GoalsG', '+/-', 'PointsP')

    table = pd.read_csv(file_path)

    # Keep every column whose header does not mention 'Unnamed'.
    is_unnamed = table.columns.str.contains('Unnamed')
    table = table.loc[:, ~is_unnamed]

    # Only convert the statistics columns this particular file actually has.
    available = [name for name in stat_columns if name in table.columns]
    for name in available:
        table[name] = pd.to_numeric(table[name], errors='coerce')

    # Write the result next to the other cleaned files, keeping the file name.
    target_name = os.path.basename(file_path)
    output_file_path = os.path.join(output_directory, target_name)
    table.to_csv(output_file_path, index=False)
    print(f"Cleaned data saved to {output_file_path}")

def clean_all_files_in_directory(input_directory, output_directory):
    """Run ``clean_data`` on every ``.csv`` file directly inside ``input_directory``.

    Args:
        input_directory: Directory that is listed (not recursively) for files
            whose name ends with ``.csv``.
        output_directory: Directory passed on to ``clean_data``; created first
            if it does not exist.

    Returns:
        None.
    """
    # A single call with exist_ok=True avoids the gap between checking for
    # the directory and creating it.
    os.makedirs(output_directory, exist_ok=True)

    # Sort the listing so files are always processed in the same order.
    for filename in sorted(os.listdir(input_directory)):
        if not filename.endswith(".csv"):
            continue
        file_path = os.path.join(input_directory, filename)
        clean_data(file_path, output_directory)

# Run the cleaning step for the 'bundesliga' files: every raw CSV in
# input_directory is cleaned and written under the same file name to
# output_directory. Both paths are relative to the current working directory.
input_directory = 'data/raw/bundesliga'  # Directory containing raw data files
output_directory = 'data/processed/bundesliga' # Directory to store cleaned data files
clean_all_files_in_directory(input_directory, output_directory)

Cleaned data saved to data/processed/bundesliga\bundesliga-1963_1964.csv
Cleaned data saved to data/processed/bundesliga\bundesliga-1964_1965.csv
Cleaned data saved to data/processed/bundesliga\bundesliga-1965_1966.csv
Cleaned data saved to data/processed/bundesliga\bundesliga-1966_1967.csv
Cleaned data saved to data/processed/bundesliga\bundesliga-1967_1968.csv
Cleaned data saved to data/processed/bundesliga\bundesliga-1968_1969.csv
Cleaned data saved to data/processed/bundesliga\bundesliga-1969_1970.csv
Cleaned data saved to data/processed/bundesliga\bundesliga-1970_1971.csv
Cleaned data saved to data/processed/bundesliga\bundesliga-1971_1972.csv
Cleaned data saved to data/processed/bundesliga\bundesliga-1972_1973.csv
Cleaned data saved to data/processed/bundesliga\bundesliga-1973_1974.csv
Cleaned data saved to data/processed/bundesliga\bundesliga-1974_1975.csv
Cleaned data saved to data/processed/bundesliga\bundesliga-1975_1976.csv
Cleaned data saved to data/processed/bundesliga\bun

Cleaned ligue1

In [4]:
import pandas as pd
import os

def clean_data(df):
    """Split the combined ``'Stats'`` column into separate numeric columns.

    ``'Stats'`` is expected to hold eight whitespace-separated values
    (Points, Played, Wins, Draws, Losses, GF, GA, GD), optionally followed by
    a ninth token that is kept as a ``'Sign'`` column.

    The column is split on runs of whitespace. Splitting on a single literal
    space would turn a trailing space into an empty ninth token (a ``'Sign'``
    column of empty strings that is never dropped) and would shift every
    following value when two spaces appear in a row.

    Args:
        df: DataFrame containing a ``'Stats'`` column of strings. It is not
            modified.

    Returns:
        A new DataFrame with ``'Stats'`` removed and the split columns
        appended after the remaining original columns.

    Raises:
        ValueError: If the split yields neither 8 nor 9 columns.
        KeyError: If ``df`` has no ``'Stats'`` column.
    """
    stat_names = ['Points', 'Played', 'Wins', 'Draws', 'Losses', 'GF', 'GA', 'GD']

    # pat=None splits on any run of whitespace and ignores leading/trailing
    # blanks, so no empty tokens are produced.
    tokens = df['Stats'].str.split(expand=True)

    token_count = tokens.shape[1]
    if token_count == len(stat_names) + 1:
        tokens.columns = stat_names + ['Sign']
    elif token_count == len(stat_names):
        tokens.columns = stat_names
    else:
        raise ValueError("Unexpected number of columns after splitting 'Stats'")

    # Strict conversion: an unparseable value raises instead of becoming NaN.
    for name in stat_names:
        tokens[name] = pd.to_numeric(tokens[name])

    # Defensive: discard a 'Sign' column that carries no value at all.
    if 'Sign' in tokens.columns and tokens['Sign'].isnull().all():
        tokens = tokens.drop(columns='Sign')

    return pd.concat([df.drop(columns='Stats'), tokens], axis=1)





def clean_all_files_in_directory(input_directory, output_directory):
    """Clean every ``.csv`` file in ``input_directory`` and save it to ``output_directory``.

    Each file is read, its three columns are renamed to ``Position``,
    ``Team`` and ``Stats``, the frame is passed through ``clean_data`` and
    the result is written under the same file name.

    Args:
        input_directory: Directory listed (not recursively) for ``.csv`` files.
        output_directory: Directory for the cleaned files; created if missing.

    Returns:
        None.
    """
    os.makedirs(output_directory, exist_ok=True)

    # Sorted for a reproducible order. Entries that are not CSV files are
    # skipped instead of being handed to read_csv.
    for file in sorted(os.listdir(input_directory)):
        if not file.endswith('.csv'):
            continue

        file_path = os.path.join(input_directory, file)

        # The first line of the file is consumed as the header row; its
        # labels are then replaced by the fixed names below.
        df = pd.read_csv(file_path)
        df.columns = ['Position', 'Team', 'Stats']

        cleaned_df = clean_data(df)
        cleaned_file_path = os.path.join(output_directory, file)
        cleaned_df.to_csv(cleaned_file_path, index=False)
        print(f"Data cleaned and saved to {cleaned_file_path}")

# Input and output locations for the 'ligue1' files, relative to the
# current working directory.
input_directory = 'data/raw/ligue1'
output_directory = 'data/processed/ligue1'
os.makedirs(output_directory, exist_ok=True)  # Create the output directory up front; no error if it already exists

# Clean every file found in input_directory and write the results to output_directory
clean_all_files_in_directory(input_directory, output_directory)


Data cleaned and saved to data/processed/ligue1\ligue1_1993-1994.csv
Data cleaned and saved to data/processed/ligue1\ligue1_1994-1995.csv
Data cleaned and saved to data/processed/ligue1\ligue1_1995-1996.csv
Data cleaned and saved to data/processed/ligue1\ligue1_1996-1997.csv
Data cleaned and saved to data/processed/ligue1\ligue1_1997-1998.csv
Data cleaned and saved to data/processed/ligue1\ligue1_1998-1999.csv
Data cleaned and saved to data/processed/ligue1\ligue1_1999-2000.csv
Data cleaned and saved to data/processed/ligue1\ligue1_2000-2001.csv
Data cleaned and saved to data/processed/ligue1\ligue1_2001-2002.csv
Data cleaned and saved to data/processed/ligue1\ligue1_2002-2003.csv
Data cleaned and saved to data/processed/ligue1\ligue1_2003-2004.csv
Data cleaned and saved to data/processed/ligue1\ligue1_2004-2005.csv
Data cleaned and saved to data/processed/ligue1\ligue1_2005-2006.csv
Data cleaned and saved to data/processed/ligue1\ligue1_2006-2007.csv
Data cleaned and saved to data/pro

Cleaned laliga

In [9]:
import pandas as pd
import os

def clean_data(df):
    """Normalise the column names of a standings table and make its statistics numeric.

    Three input layouts are recognised, checked in this order:

    * a ``'Pts'`` column is present: the abbreviated headers are renamed;
    * a ``'Scores'`` column is present: it is dropped and the remaining
      columns are relabelled positionally;
    * a ``'Position'`` column is present: the columns are relabelled
      positionally.

    The frame is modified in place and the same object is returned.

    Args:
        df: DataFrame to clean.

    Returns:
        ``df`` itself, with the eight statistics columns converted to numeric
        (values that cannot be parsed become NaN).
    """
    unified_header = ['Position', 'Équipe', 'Points', 'Pld', 'W', 'D', 'L', 'GF', 'GA', 'GD']
    header = df.columns  # layout detection uses the header as it was on entry

    if 'Pts' in header:
        # Label-based rename: each original label is mapped exactly once, so
        # 'N' -> 'D' and 'D' -> 'L' do not interfere with each other.
        abbreviations = {
            'Pts': 'Points',
            'J': 'Pld',
            'G': 'W',
            'N': 'D',
            'D': 'L',
            'BP': 'GF',
            'BC': 'GA',
            'DIF': 'GD',
        }
        df.rename(columns=abbreviations, inplace=True)
    elif 'Scores' in header:
        # Remove the extra column first so the positional relabel lines up.
        df.drop('Scores', axis=1, inplace=True)
        df.columns = unified_header
    elif 'Position' in header:
        df.columns = unified_header

    # Everything after the position and team columns is a statistic.
    for stat in unified_header[2:]:
        df[stat] = pd.to_numeric(df[stat], errors='coerce')

    return df

def clean_all_files_in_directory(input_directory, output_directory):
    """Clean each ``.csv`` file of ``input_directory`` and store it in ``output_directory``.

    A failure while reading, cleaning or writing one file is reported with a
    printed message and does not stop the remaining files from being processed.

    Args:
        input_directory: Directory listed (not recursively) for ``.csv`` files.
        output_directory: Existing directory the cleaned files are written to.

    Returns:
        None.
    """
    for entry in os.listdir(input_directory):
        # Anything that is not a CSV file is ignored.
        if not entry.endswith('.csv'):
            continue

        source_path = os.path.join(input_directory, entry)
        try:
            table = pd.read_csv(source_path)

            # Treat empty strings as missing, then discard fully empty rows.
            table.replace("", float("NaN"), inplace=True)
            table.dropna(how='all', inplace=True)

            cleaned = clean_data(table)

            target_path = os.path.join(output_directory, entry)
            cleaned.to_csv(target_path, index=False)
            print(f"Data cleaned and saved to {target_path}")
        except Exception as e:
            # Best effort: report the problem and carry on with the next file.
            print(f"Error processing file {entry}: {e}")

# Input and output locations for the 'laliga' files, relative to the
# current working directory.
input_directory = 'data/raw/laliga'
output_directory = 'data/processed/laliga'
os.makedirs(output_directory, exist_ok=True)  # Create the output directory up front; no error if it already exists

# Clean every CSV file found in input_directory and write the results to output_directory
clean_all_files_in_directory(input_directory, output_directory)


Data cleaned and saved to data/processed/laliga\laliga-2003_2004.csv
Data cleaned and saved to data/processed/laliga\laliga-2004_2005.csv
Data cleaned and saved to data/processed/laliga\laliga-2005_2006.csv
Data cleaned and saved to data/processed/laliga\laliga-2006_2007.csv
Data cleaned and saved to data/processed/laliga\laliga-2007_2008.csv
Data cleaned and saved to data/processed/laliga\laliga-2008_2009.csv
Data cleaned and saved to data/processed/laliga\laliga-2009_2010.csv
Data cleaned and saved to data/processed/laliga\laliga-2010_2011.csv
Data cleaned and saved to data/processed/laliga\laliga-2011_2012.csv
Data cleaned and saved to data/processed/laliga\laliga-2012_2013.csv
Data cleaned and saved to data/processed/laliga\laliga-2013_2014.csv
Data cleaned and saved to data/processed/laliga\laliga-2014_2015.csv
Data cleaned and saved to data/processed/laliga\laliga-2015_2016.csv
Data cleaned and saved to data/processed/laliga\laliga-2016_2017.csv
Data cleaned and saved to data/pro

Cleaned premierleague


In [1]:
import pandas as pd
import os

# Input and output locations for the 'premierleague' files, relative to the
# current working directory.
input_directory = 'data/raw/premierleague'
output_directory = 'data/processed/premierleague'
os.makedirs(output_directory, exist_ok=True)  # Create the output directory up front; no error if it already exists

# Helper and entry point used to clean all files in the directory
def _clean_premierleague_table(raw):
    """Return the cleaned table for one raw frame, leaving ``raw`` untouched."""
    source_columns = ['Position', 'Club', 'Played Pl', 'Won W', 'Drawn D', 'Lost L',
                      'GF', 'GA', 'GD', 'Points Pts']
    target_columns = ['Position', 'Club', 'Played', 'Won', 'Drawn', 'Lost',
                      'GF', 'GA', 'GD', 'Points']

    # Selecting the wanted columns also leaves out 'Form' and any other extra
    # column. copy() gives an independent frame, so the assignments below do
    # not write into a slice of ``raw``.
    table = raw[source_columns].copy()
    table.columns = target_columns

    # regex=False: both patterns are literal text. The second one contains a
    # '.', which would match any character if it were treated as a regex.
    table['Position'] = table['Position'].str.replace('Previous Position ', '', regex=False)

    # Drop the first space-separated token of 'Club', then strip the label text.
    club = table['Club'].str.split(' ').str[1:].str.join(' ')
    table['Club'] = club.str.replace('label.latestresult: ', '', regex=False)

    # Keep rows 0, 2, 4, ...; every second row is discarded.
    return table.iloc[::2]


def clean_all_files_in_directory(input_dir, output_dir):
    """Clean every ``.csv`` file under ``input_dir`` and write it to ``output_dir``.

    ``input_dir`` is walked recursively. Each cleaned file is written directly
    into ``output_dir`` under its own file name, so files with the same name
    in different sub-directories overwrite one another.

    Args:
        input_dir: Root directory that is searched for ``.csv`` files.
        output_dir: Directory for the cleaned files; created if missing.

    Returns:
        None.

    Raises:
        KeyError: If a CSV file lacks one of the expected columns.
    """
    os.makedirs(output_dir, exist_ok=True)

    for subdir, _dirs, files in os.walk(input_dir):
        # Sorted for a reproducible order within each directory.
        for file in sorted(files):
            # Only CSV files are tables; skip anything else found on the way.
            if not file.endswith('.csv'):
                continue

            file_path = os.path.join(subdir, file)
            raw = pd.read_csv(file_path, sep=',', header=0)
            cleaned = _clean_premierleague_table(raw)

            output_filepath = os.path.join(output_dir, file)
            cleaned.to_csv(output_filepath, index=False)
            print(f"Data cleaned and saved to {output_filepath}")

# Clean every file found under input_directory (sub-directories included)
# and write the results to output_directory
clean_all_files_in_directory(input_directory, output_directory)


Data cleaned and saved to data/processed/premierleague\premierleague-1992_93.csv
Data cleaned and saved to data/processed/premierleague\premierleague-1993_94.csv
Data cleaned and saved to data/processed/premierleague\premierleague-1994_95.csv
Data cleaned and saved to data/processed/premierleague\premierleague-1995_96.csv
Data cleaned and saved to data/processed/premierleague\premierleague-1996_97.csv
Data cleaned and saved to data/processed/premierleague\premierleague-1997_98.csv
Data cleaned and saved to data/processed/premierleague\premierleague-1998_99.csv
Data cleaned and saved to data/processed/premierleague\premierleague-1999_00.csv
Data cleaned and saved to data/processed/premierleague\premierleague-2000_01.csv
Data cleaned and saved to data/processed/premierleague\premierleague-2001_02.csv
Data cleaned and saved to data/processed/premierleague\premierleague-2002_03.csv
Data cleaned and saved to data/processed/premierleague\premierleague-2003_04.csv
Data cleaned and saved to da