In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
def transform_age(age):
    """Return the part of an age string that precedes the first '-'.

    String values such as ``'22-150'`` are cut at the first hyphen and the
    leading text (``'22'``) is returned, still as a string. Any non-string
    value (e.g. a float or NaN) is handed back unchanged.
    """
    if not isinstance(age, str):
        return age
    leading, _, _ = age.partition('-')
    return leading
def Cleaning_def(df):
    """Return a cleaned copy of a raw stats DataFrame.

    Two steps are applied:
      * rows whose 'nationality' equals 'ISR' are excluded (rows with a
        missing nationality are kept, since NaN != 'ISR');
      * string ages are cut at the first '-' via ``transform_age`` (the
        original note calls this "removing days").

    The input frame is not modified; the result is an independent copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'age' and 'nationality' columns.

    Returns
    -------
    pandas.DataFrame
        Filtered copy with the normalised 'age' column. The original index
        labels of the kept rows are preserved.
    """
    # Filter and copy first so the age rewrite never writes into the caller's frame.
    cleaned = df[df['nationality'] != 'ISR'].copy()
    cleaned['age'] = cleaned['age'].apply(transform_age)
    return cleaned

In [3]:
def fill_age_nat(pass_df):
    """Fill in known missing 'nationality' / 'age' values by row label.

    The corrections are keyed on hard-coded index labels, so the frame must
    already be sorted by player, stripped of the stat-less rows and
    re-indexed from 0, exactly as the notebook cells do before calling this.

    Parameters
    ----------
    pass_df : pandas.DataFrame
        Frame with 'nationality' and 'age' columns whose index contains
        every label listed below.

    Returns
    -------
    pandas.DataFrame
        A patched copy; ``pass_df`` itself is left unchanged.

    Raises
    ------
    KeyError
        If any expected row label is absent. Without this check ``.at``
        would silently append a new, mostly empty row for that label.
    """
    # (row label, nationality or None, age or None)
    # Spain is written 'ESP' to match the codes already present in the data.
    corrections = [
        (32, 'MEX', None),    # Abraham Villegas
        (144, 'ESP', 19),     # Alejandro Lozano
        (179, 'CPV', None),   # Alessio da Cruz
        (529, 'ENG', 18),     # Ben Hatton
        (576, None, 23),      # Bobby Thomas
        (848, None, 20),      # Connor O'Riordan
        (1305, 'TUR', None),  # Eren Öztürk
        (1618, 'ENG', 19),    # Gerard Buabo
        (1987, 'ENG', 17),    # James Wilson
        (2303, 'ENG', 17),    # Joseph James
        (2366, 'ESP', 21),    # José Menor
        (2409, 'ESP', 17),    # Juan Sabater
        (3559, 'ESP', 23),    # Pablo Hernández
        (3641, 'IRL', None),  # Peter Kioso
        (4166, 'FRA', None),  # Stone Mambo
        (4333, 'FRA', None),  # Tristan Muyumba
    ]

    missing_rows = [row for row, _, _ in corrections if row not in pass_df.index]
    if missing_rows:
        raise KeyError(f"fill_age_nat: row labels not found in index: {missing_rows}")

    # Patch a copy so the caller's frame is never modified behind its back.
    patched = pass_df.copy()
    for row, nationality, age in corrections:
        if nationality is not None:
            patched.at[row, 'nationality'] = nationality
        if age is not None:
            patched.at[row, 'age'] = age
    return patched

In [4]:
def combine_csv_files(directory):
    """Load every ``.csv`` file in ``directory`` into one cleaned DataFrame.

    The files are read in sorted name order, concatenated, passed through
    ``Cleaning_def``, sorted by the 'player' column and re-indexed from 0.

    Parameters
    ----------
    directory : str
        Folder containing the CSV files to combine.

    Returns
    -------
    pandas.DataFrame
        Combined, cleaned frame ordered by 'player' with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If the directory contains no ``.csv`` files.
    """
    # os.listdir returns names in arbitrary order; sorting makes the
    # concatenation order (and therefore the final row order) reproducible.
    csv_names = sorted(
        name for name in os.listdir(directory) if name.endswith(".csv")
    )
    if not csv_names:
        raise ValueError(f"No CSV files found in {directory!r}")

    frames = [pd.read_csv(os.path.join(directory, name)) for name in csv_names]
    combined_df = pd.concat(frames, ignore_index=True)

    combined_df = Cleaning_def(combined_df)

    # A stable sort keeps rows that share a player name in concatenation
    # order, so row positions stay the same from run to run. Later cells
    # patch rows by position, which relies on this.
    combined_df = combined_df.sort_values(by='player', kind='stable')
    combined_df = combined_df.reset_index(drop=True)

    return combined_df

In [5]:
# Show every column when a DataFrame is displayed (None = no column limit).
pd.set_option('display.max_columns', None)

In [6]:
input_directory = "/kaggle/input/to-clean/Defense"
# Combine and clean the Defense CSV files.
# challenge_tackles_pct is NaN where there were no tackles to count (0/0),
# so those gaps are filled with 0.
def_df = combine_csv_files(input_directory).fillna({'challenge_tackles_pct': 0})
def_df.isna().sum()

player                    0
nationality              14
position                  0
age                      10
club                      0
league                    0
minutes_90s               0
tackles                   8
tackles_won               0
tackles_def_3rd           8
tackles_mid_3rd           8
tackles_att_3rd           8
challenge_tackles         8
challenges                8
challenge_tackles_pct     0
challenges_lost           8
blocks                    8
blocked_shots             8
blocked_passes            8
interceptions             0
tackles_interceptions     8
clearances                8
errors                    8
dtype: int64

- The `isna()` counts above show what needs to be cleaned.
- There are 14 missing nationalities and 10 missing ages to fill in, and 8 players with no stats to remove.
- Convert the age column to a single integer dtype.

In [7]:
# Players that will be dropped because their stats are missing
def_df.loc[def_df['tackles'].isna()]

Unnamed: 0,player,nationality,position,age,club,league,minutes_90s,tackles,tackles_won,tackles_def_3rd,tackles_mid_3rd,tackles_att_3rd,challenge_tackles,challenges,challenge_tackles_pct,challenges_lost,blocks,blocked_shots,blocked_passes,interceptions,tackles_interceptions,clearances,errors
303,Andrea Cecchetto,ITA,DF,20.0,Cittadella,Serie B,0.5,,1,,,,,,0.0,,,,,1,,,
495,Azeem Abdulai,SCO,MF,20.0,Swansea City,EFL,1.5,,4,,,,,,0.0,,,,,5,,,
1545,Frederick Issaka,WAL,FW,17.0,Plymouth Argyle,EFL,0.4,,2,,,,,,0.0,,,,,0,,,
1824,Iker Gil,ESP,FW,19.0,Huesca,Segunda Division,0.6,,0,,,,,,0.0,,,,,0,,,
2438,Julien Benhaim,FRA,MF,26.0,Ajaccio,Ligue 2,5.8,,10,,,,,,0.0,,,,,15,,,
2482,Jérémy De Bessa,FRA,DF,17.0,Ajaccio,Ligue 2,0.1,,0,,,,,,0.0,,,,,0,,,
2621,Kristoffer Lund,USA,DF,21.0,Palermo,Serie B,27.8,,43,,,,,,0.0,,,,,28,,,
3278,Miquel Nelom,NED,DF,32.0,León,Liga MX,1.7,,1,,,,,,0.0,,,,,3,,,


In [8]:
# Keep only rows with a 'tackles' value, then renumber the rows from 0
def_df = def_df[def_df['tackles'].notna()].reset_index(drop=True)
def_df.isna().sum()

player                    0
nationality              14
position                  0
age                      10
club                      0
league                    0
minutes_90s               0
tackles                   0
tackles_won               0
tackles_def_3rd           0
tackles_mid_3rd           0
tackles_att_3rd           0
challenge_tackles         0
challenges                0
challenge_tackles_pct     0
challenges_lost           0
blocks                    0
blocked_shots             0
blocked_passes            0
interceptions             0
tackles_interceptions     0
clearances                0
errors                    0
dtype: int64

In [9]:
# Rows still missing an age or a nationality (patched in the next cell)
def_df[def_df[['age', 'nationality']].isna().any(axis=1)]

Unnamed: 0,player,nationality,position,age,club,league,minutes_90s,tackles,tackles_won,tackles_def_3rd,tackles_mid_3rd,tackles_att_3rd,challenge_tackles,challenges,challenge_tackles_pct,challenges_lost,blocks,blocked_shots,blocked_passes,interceptions,tackles_interceptions,clearances,errors
32,Abraham Villegas,,DF,20.0,Toluca,Liga MX,3.7,19.0,13,15.0,4.0,0.0,8.0,16.0,50.0,8.0,5.0,2.0,3.0,1,20.0,5.0,0.0
144,Alejandro Lozano,,MF,,Sporting Gijón,Segunda Division,0.8,3.0,2,2.0,1.0,0.0,2.0,6.0,33.3,4.0,1.0,0.0,1.0,1,4.0,0.0,0.0
179,Alessio da Cruz,,FW,26.0,FeralpiSalò,Serie B,0.2,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0
529,Ben Hatton,,MF,,Rotherham Utd,EFL,0.1,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0
576,Bobby Thomas,ENG,DF,,Coventry City,EFL,41.8,63.0,41,44.0,17.0,2.0,40.0,63.0,63.5,23.0,80.0,54.0,26.0,54,117.0,190.0,3.0
848,Connor O'Riordan,IRL,DF,,Blackburn,EFL,0.5,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1.0,1.0,0.0
1305,Eren Öztürk,,MF,19.0,Karlsruher,2. Bundesliga,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0
1618,Gerard Buabo,,FW,,Ipswich Town,EFL,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0
1987,James Wilson,,DF,,Middlesbrough,EFL,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0
2303,Joseph James,,DF,,Bristol City,EFL,0.9,2.0,0,0.0,2.0,0.0,2.0,3.0,66.7,1.0,2.0,0.0,2.0,1,3.0,1.0,0.0


In [10]:
# Patch the known gaps, then store age as an integer column
def_df = fill_age_nat(def_df).astype({'age': int})
len(def_df), def_df.isna().sum().sum()

(4625, 0)

In [11]:
# Save the cleaned Defense table; index=False leaves the row index out of the file.
def_df.to_csv('DefenceLeagues2_23_24.csv', index=False)

In [12]:
input_directory = "/kaggle/input/to-clean/GCA"
# Combine and clean the GCA CSV files, then count missing values per column
gca_df = combine_csv_files(input_directory)
gca_df.isna().sum()

player              0
nationality        14
position            0
age                10
club                0
league              0
minutes_90s         0
sca                 8
sca_per90           8
sca_passes_live     8
sca_passes_dead     8
sca_take_ons        8
sca_shots           8
sca_fouled          8
sca_defense         8
gca                 8
gca_per90           8
gca_passes_live     8
gca_passes_dead     8
gca_take_ons        8
gca_shots           8
gca_fouled          8
gca_defense         8
dtype: int64

In [13]:
# Players that will be dropped because their stats are missing
gca_df.loc[gca_df['sca'].isna()]

Unnamed: 0,player,nationality,position,age,club,league,minutes_90s,sca,sca_per90,sca_passes_live,sca_passes_dead,sca_take_ons,sca_shots,sca_fouled,sca_defense,gca,gca_per90,gca_passes_live,gca_passes_dead,gca_take_ons,gca_shots,gca_fouled,gca_defense
303,Andrea Cecchetto,ITA,DF,20.0,Cittadella,Serie B,0.5,,,,,,,,,,,,,,,,
495,Azeem Abdulai,SCO,MF,20.0,Swansea City,EFL,1.5,,,,,,,,,,,,,,,,
1545,Frederick Issaka,WAL,FW,17.0,Plymouth Argyle,EFL,0.4,,,,,,,,,,,,,,,,
1824,Iker Gil,ESP,FW,19.0,Huesca,Segunda Division,0.6,,,,,,,,,,,,,,,,
2438,Julien Benhaim,FRA,MF,26.0,Ajaccio,Ligue 2,5.8,,,,,,,,,,,,,,,,
2482,Jérémy De Bessa,FRA,DF,17.0,Ajaccio,Ligue 2,0.1,,,,,,,,,,,,,,,,
2621,Kristoffer Lund,USA,DF,21.0,Palermo,Serie B,27.8,,,,,,,,,,,,,,,,
3278,Miquel Nelom,NED,DF,32.0,León,Liga MX,1.7,,,,,,,,,,,,,,,,


In [14]:
# Keep only rows with an 'sca' value, then renumber the rows from 0
gca_df = gca_df[gca_df['sca'].notna()].reset_index(drop=True)
gca_df.isna().sum()

player              0
nationality        14
position            0
age                10
club                0
league              0
minutes_90s         0
sca                 0
sca_per90           0
sca_passes_live     0
sca_passes_dead     0
sca_take_ons        0
sca_shots           0
sca_fouled          0
sca_defense         0
gca                 0
gca_per90           0
gca_passes_live     0
gca_passes_dead     0
gca_take_ons        0
gca_shots           0
gca_fouled          0
gca_defense         0
dtype: int64

In [15]:
# Rows still missing an age or a nationality (patched in the next cell)
gca_df[gca_df[['age', 'nationality']].isna().any(axis=1)]

Unnamed: 0,player,nationality,position,age,club,league,minutes_90s,sca,sca_per90,sca_passes_live,sca_passes_dead,sca_take_ons,sca_shots,sca_fouled,sca_defense,gca,gca_per90,gca_passes_live,gca_passes_dead,gca_take_ons,gca_shots,gca_fouled,gca_defense
32,Abraham Villegas,,DF,20.0,Toluca,Liga MX,3.7,5.0,1.37,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
144,Alejandro Lozano,,MF,,Sporting Gijón,Segunda Division,0.8,4.0,5.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
179,Alessio da Cruz,,FW,26.0,FeralpiSalò,Serie B,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
529,Ben Hatton,,MF,,Rotherham Utd,EFL,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
576,Bobby Thomas,ENG,DF,,Coventry City,EFL,41.8,30.0,0.72,23.0,1.0,0.0,5.0,0.0,1.0,1.0,0.02,0.0,0.0,0.0,1.0,0.0,0.0
848,Connor O'Riordan,IRL,DF,,Blackburn,EFL,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1305,Eren Öztürk,,MF,19.0,Karlsruher,2. Bundesliga,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1618,Gerard Buabo,,FW,,Ipswich Town,EFL,0.0,1.0,22.5,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1987,James Wilson,,DF,,Middlesbrough,EFL,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2303,Joseph James,,DF,,Bristol City,EFL,0.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
# Patch the known gaps, then store age as an integer column
gca_df = fill_age_nat(gca_df).astype({'age': int})
len(gca_df), gca_df.isna().sum().sum()

(4625, 0)

In [17]:
# Save the cleaned GCA table; index=False leaves the row index out of the file.
gca_df.to_csv('GCALeagues2_23_24.csv', index=False)

In [18]:
input_directory = "/kaggle/input/to-clean/Misc"
# Combine and clean the Misc CSV files; a missing aerials_won_pct is set to 0
misc_df = combine_csv_files(input_directory).fillna({'aerials_won_pct': 0})
misc_df.isna().sum()

player               0
nationality         14
position             0
age                 10
club                 0
league               0
minutes_90s          0
cards_yellow         0
cards_red            0
cards_yellow_red     0
fouls                0
fouled               0
offsides             0
crosses              0
interceptions        0
tackles_won          0
pens_won             8
pens_conceded        8
own_goals            0
ball_recoveries      8
aerials_won          8
aerials_lost         8
aerials_won_pct      0
dtype: int64

In [19]:
# Keep only rows with a 'pens_won' value, then renumber the rows from 0
misc_df = misc_df[misc_df['pens_won'].notna()].reset_index(drop=True)
misc_df.isna().sum()

player               0
nationality         14
position             0
age                 10
club                 0
league               0
minutes_90s          0
cards_yellow         0
cards_red            0
cards_yellow_red     0
fouls                0
fouled               0
offsides             0
crosses              0
interceptions        0
tackles_won          0
pens_won             0
pens_conceded        0
own_goals            0
ball_recoveries      0
aerials_won          0
aerials_lost         0
aerials_won_pct      0
dtype: int64

In [20]:
# Rows still missing an age or a nationality (patched in the next cell)
misc_df[misc_df[['age', 'nationality']].isna().any(axis=1)]

Unnamed: 0,player,nationality,position,age,club,league,minutes_90s,cards_yellow,cards_red,cards_yellow_red,fouls,fouled,offsides,crosses,interceptions,tackles_won,pens_won,pens_conceded,own_goals,ball_recoveries,aerials_won,aerials_lost,aerials_won_pct
32,Abraham Villegas,,DF,20.0,Toluca,Liga MX,3.7,1,0,0,4,1,0,5,1,13,0.0,0.0,0,10.0,3.0,4.0,42.9
144,Alejandro Lozano,,MF,,Sporting Gijón,Segunda Division,0.8,1,0,0,2,0,1,2,1,2,0.0,0.0,0,6.0,0.0,1.0,0.0
179,Alessio da Cruz,,FW,26.0,FeralpiSalò,Serie B,0.2,0,0,0,2,0,0,1,0,0,0.0,0.0,0,0.0,0.0,1.0,0.0
529,Ben Hatton,,MF,,Rotherham Utd,EFL,0.1,0,0,0,0,0,0,0,0,0,0.0,0.0,0,1.0,0.0,0.0,0.0
576,Bobby Thomas,ENG,DF,,Coventry City,EFL,41.8,9,0,0,39,40,2,9,54,41,0.0,1.0,2,223.0,102.0,71.0,59.0
848,Connor O'Riordan,IRL,DF,,Blackburn,EFL,0.5,0,0,0,0,0,0,0,1,0,0.0,0.0,0,1.0,2.0,1.0,66.7
1305,Eren Öztürk,,MF,19.0,Karlsruher,2. Bundesliga,0.0,0,0,0,0,0,0,0,0,0,0.0,0.0,0,0.0,2.0,0.0,100.0
1618,Gerard Buabo,,FW,,Ipswich Town,EFL,0.0,0,0,0,0,1,0,0,0,0,0.0,0.0,0,1.0,0.0,1.0,0.0
1987,James Wilson,,DF,,Middlesbrough,EFL,0.0,0,0,0,0,0,0,0,0,0,0.0,0.0,0,1.0,0.0,0.0,0.0
2303,Joseph James,,DF,,Bristol City,EFL,0.9,0,0,0,0,0,0,1,1,0,0.0,0.0,0,6.0,2.0,4.0,33.3


In [21]:
# Patch the known gaps, then store age as an integer column
misc_df = fill_age_nat(misc_df).astype({'age': int})
len(misc_df), misc_df.isna().sum().sum()

(4625, 0)

In [22]:
# Save the cleaned Misc table; index=False leaves the row index out of the file.
misc_df.to_csv('Misc_Leagues2_23_24.csv', index=False)

In [23]:
input_directory = "/kaggle/input/to-clean/Pass Types"
# Combine and clean the Pass Types CSV files, then count missing values per column
pass_t_df = combine_csv_files(input_directory)
pass_t_df.isna().sum()

player                    0
nationality              14
position                  0
age                      10
club                      0
league                    0
minutes_90s               0
passes                    8
passes_live               8
passes_dead               8
passes_free_kicks         8
through_balls             8
passes_switches           8
crosses                   0
throw_ins                 8
corner_kicks              8
corner_kicks_in           8
corner_kicks_out          8
corner_kicks_straight     8
passes_completed          8
passes_offsides           8
passes_blocked            8
dtype: int64

In [24]:
# Keep only rows with a 'passes' value, then renumber the rows from 0
pass_t_df = pass_t_df[pass_t_df['passes'].notna()].reset_index(drop=True)
pass_t_df.isna().sum()

player                    0
nationality              14
position                  0
age                      10
club                      0
league                    0
minutes_90s               0
passes                    0
passes_live               0
passes_dead               0
passes_free_kicks         0
through_balls             0
passes_switches           0
crosses                   0
throw_ins                 0
corner_kicks              0
corner_kicks_in           0
corner_kicks_out          0
corner_kicks_straight     0
passes_completed          0
passes_offsides           0
passes_blocked            0
dtype: int64

In [25]:
# Rows still missing an age or a nationality (patched in the next cell)
pass_t_df[pass_t_df[['age', 'nationality']].isna().any(axis=1)]

Unnamed: 0,player,nationality,position,age,club,league,minutes_90s,passes,passes_live,passes_dead,passes_free_kicks,through_balls,passes_switches,crosses,throw_ins,corner_kicks,corner_kicks_in,corner_kicks_out,corner_kicks_straight,passes_completed,passes_offsides,passes_blocked
32,Abraham Villegas,,DF,20.0,Toluca,Liga MX,3.7,185.0,156.0,29.0,3.0,0.0,1.0,5,26.0,0.0,0.0,0.0,0.0,149.0,0.0,5.0
144,Alejandro Lozano,,MF,,Sporting Gijón,Segunda Division,0.8,37.0,36.0,1.0,0.0,0.0,0.0,2,1.0,0.0,0.0,0.0,0.0,29.0,0.0,0.0
179,Alessio da Cruz,,FW,26.0,FeralpiSalò,Serie B,0.2,4.0,4.0,0.0,0.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0
529,Ben Hatton,,MF,,Rotherham Utd,EFL,0.1,5.0,5.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0
576,Bobby Thomas,ENG,DF,,Coventry City,EFL,41.8,2495.0,2384.0,107.0,81.0,5.0,11.0,9,26.0,0.0,0.0,0.0,0.0,2004.0,4.0,24.0
848,Connor O'Riordan,IRL,DF,,Blackburn,EFL,0.5,16.0,16.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,15.0,0.0,0.0
1305,Eren Öztürk,,MF,19.0,Karlsruher,2. Bundesliga,0.0,6.0,6.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,1.0
1618,Gerard Buabo,,FW,,Ipswich Town,EFL,0.0,2.0,2.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1987,James Wilson,,DF,,Middlesbrough,EFL,0.0,6.0,6.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0
2303,Joseph James,,DF,,Bristol City,EFL,0.9,39.0,27.0,12.0,0.0,1.0,0.0,1,12.0,0.0,0.0,0.0,0.0,21.0,0.0,3.0


In [26]:
# Patch the known gaps, then store age as an integer column
pass_t_df = fill_age_nat(pass_t_df).astype({'age': int})
len(pass_t_df), pass_t_df.isna().sum().sum()

(4625, 0)

In [27]:
# Save the cleaned Pass Types table; index=False leaves the row index out of the file.
pass_t_df.to_csv('Pass_Types_Leagues2_23_24.csv', index=False)

In [28]:
input_directory = "/kaggle/input/to-clean/Passing"
# Combine and clean the Passing CSV files; missing completion percentages are set to 0
pass_df = combine_csv_files(input_directory).fillna({
    'passes_pct': 0,
    'passes_pct_short': 0,
    'passes_pct_medium': 0,
    'passes_pct_long': 0,
})
pass_df.isna().sum()

player                          0
nationality                    14
position                        0
age                            10
club                            0
league                          0
minutes_90s                     0
passes_completed                8
passes                          8
passes_pct                      0
passes_total_distance           8
passes_progressive_distance     8
passes_completed_short          8
passes_short                    8
passes_pct_short                0
passes_completed_medium         8
passes_medium                   8
passes_pct_medium               0
passes_completed_long           8
passes_long                     8
passes_pct_long                 0
assists                         0
xg_assist                       8
pass_xa                         8
xg_assist_net                   8
assisted_shots                  8
passes_into_final_third         8
passes_into_penalty_area        8
crosses_into_penalty_area       8
progressive_pa

In [30]:
# Keep only rows with a 'passes' value, then renumber the rows from 0
pass_df = pass_df[pass_df['passes'].notna()].reset_index(drop=True)

In [31]:
# Rows still missing an age or a nationality (patched in the next cell)
pass_df[pass_df[['age', 'nationality']].isna().any(axis=1)]

Unnamed: 0,player,nationality,position,age,club,league,minutes_90s,passes_completed,passes,passes_pct,passes_total_distance,passes_progressive_distance,passes_completed_short,passes_short,passes_pct_short,passes_completed_medium,passes_medium,passes_pct_medium,passes_completed_long,passes_long,passes_pct_long,assists,xg_assist,pass_xa,xg_assist_net,assisted_shots,passes_into_final_third,passes_into_penalty_area,crosses_into_penalty_area,progressive_passes
32,Abraham Villegas,,DF,20.0,Toluca,Liga MX,3.7,149.0,185.0,80.5,2314.0,582.0,79.0,88.0,89.8,62.0,76.0,81.6,6.0,12.0,50.0,0,0.1,0.1,-0.1,2.0,9.0,1.0,0.0,11.0
144,Alejandro Lozano,,MF,,Sporting Gijón,Segunda Division,0.8,29.0,37.0,78.4,453.0,132.0,20.0,24.0,83.3,8.0,10.0,80.0,1.0,3.0,33.3,0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,6.0
179,Alessio da Cruz,,FW,26.0,FeralpiSalò,Serie B,0.2,3.0,4.0,75.0,52.0,12.0,1.0,1.0,100.0,2.0,3.0,66.7,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0
529,Ben Hatton,,MF,,Rotherham Utd,EFL,0.1,5.0,5.0,100.0,69.0,6.0,3.0,3.0,100.0,2.0,2.0,100.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
576,Bobby Thomas,ENG,DF,,Coventry City,EFL,41.8,2004.0,2495.0,80.3,39451.0,16465.0,621.0,717.0,86.6,1154.0,1292.0,89.3,209.0,425.0,49.2,0,0.7,1.6,-0.7,9.0,160.0,15.0,2.0,162.0
848,Connor O'Riordan,IRL,DF,,Blackburn,EFL,0.5,15.0,16.0,93.8,287.0,135.0,6.0,6.0,100.0,7.0,7.0,100.0,2.0,2.0,100.0,0,0.0,0.1,0.0,0.0,0.0,1.0,0.0,2.0
1305,Eren Öztürk,,MF,19.0,Karlsruher,2. Bundesliga,0.0,4.0,6.0,66.7,74.0,26.0,0.0,1.0,0.0,2.0,2.0,100.0,1.0,1.0,100.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1618,Gerard Buabo,,FW,,Ipswich Town,EFL,0.0,1.0,2.0,50.0,10.0,0.0,1.0,2.0,50.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1987,James Wilson,,DF,,Middlesbrough,EFL,0.0,6.0,6.0,100.0,102.0,50.0,3.0,3.0,100.0,3.0,3.0,100.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2303,Joseph James,,DF,,Bristol City,EFL,0.9,21.0,39.0,53.8,356.0,63.0,9.0,13.0,69.2,12.0,20.0,60.0,0.0,2.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0


In [32]:
# Patch the known gaps, then store age as an integer column
pass_df = fill_age_nat(pass_df).astype({'age': int})
len(pass_df), pass_df.isna().sum().sum()

(4625, 0)

In [33]:
# Save the cleaned Passing table; index=False leaves the row index out of the file.
pass_df.to_csv('Passing_Leagues2_23_24.csv', index=False)

In [34]:
input_directory = "/kaggle/input/to-clean/Possession"
# Combine and clean the Possession CSV files; missing take-on percentages are set to 0
poss_df = combine_csv_files(input_directory).fillna({
    'take_ons_won_pct': 0,
    'take_ons_tackled_pct': 0,
})
poss_df.isna().sum()

player                           0
nationality                     14
position                         0
age                             10
club                             0
league                           0
minutes_90s                      0
touches                          8
touches_def_pen_area             8
touches_def_3rd                  8
touches_mid_3rd                  8
touches_att_3rd                  8
touches_att_pen_area             8
touches_live_ball                8
take_ons                         8
take_ons_won                     8
take_ons_won_pct                 0
take_ons_tackled                 8
take_ons_tackled_pct             0
carries                          8
carries_distance                 8
carries_progressive_distance     8
progressive_carries              8
carries_into_final_third         8
carries_into_penalty_area        8
miscontrols                      8
dispossessed                     8
passes_received                  8
progressive_passes_r

In [35]:
# Keep only rows with a 'take_ons' value, then renumber the rows from 0
poss_df = poss_df[poss_df['take_ons'].notna()].reset_index(drop=True)

In [36]:
# Rows still missing an age or a nationality (patched in the next cell)
poss_df[poss_df[['age', 'nationality']].isna().any(axis=1)]

Unnamed: 0,player,nationality,position,age,club,league,minutes_90s,touches,touches_def_pen_area,touches_def_3rd,touches_mid_3rd,touches_att_3rd,touches_att_pen_area,touches_live_ball,take_ons,take_ons_won,take_ons_won_pct,take_ons_tackled,take_ons_tackled_pct,carries,carries_distance,carries_progressive_distance,progressive_carries,carries_into_final_third,carries_into_penalty_area,miscontrols,dispossessed,passes_received,progressive_passes_received
32,Abraham Villegas,,DF,20.0,Toluca,Liga MX,3.7,238.0,10.0,73.0,116.0,49.0,7.0,238.0,3.0,0.0,0.0,3.0,100.0,122.0,605.0,284.0,6.0,3.0,2.0,4.0,1.0,139.0,19.0
144,Alejandro Lozano,,MF,,Sporting Gijón,Segunda Division,0.8,55.0,0.0,7.0,28.0,20.0,5.0,55.0,4.0,1.0,25.0,3.0,75.0,39.0,177.0,76.0,1.0,6.0,0.0,3.0,2.0,40.0,12.0
179,Alessio da Cruz,,FW,26.0,FeralpiSalò,Serie B,0.2,6.0,0.0,0.0,3.0,3.0,1.0,6.0,1.0,0.0,0.0,1.0,100.0,5.0,71.0,23.0,2.0,1.0,1.0,0.0,0.0,5.0,3.0
529,Ben Hatton,,MF,,Rotherham Utd,EFL,0.1,7.0,0.0,1.0,4.0,2.0,0.0,7.0,1.0,0.0,0.0,1.0,100.0,3.0,2.0,0.0,0.0,0.0,0.0,1.0,0.0,4.0,0.0
576,Bobby Thomas,ENG,DF,,Coventry City,EFL,41.8,2968.0,396.0,1606.0,1223.0,157.0,52.0,2968.0,15.0,9.0,60.0,6.0,40.0,1656.0,8169.0,4282.0,29.0,23.0,0.0,39.0,6.0,1903.0,15.0
848,Connor O'Riordan,IRL,DF,,Blackburn,EFL,0.5,18.0,2.0,13.0,3.0,2.0,0.0,18.0,1.0,1.0,100.0,0.0,0.0,7.0,27.0,9.0,0.0,0.0,0.0,0.0,0.0,14.0,1.0
1305,Eren Öztürk,,MF,19.0,Karlsruher,2. Bundesliga,0.0,6.0,0.0,1.0,5.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,2.0,13.0,3.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0
1618,Gerard Buabo,,FW,,Ipswich Town,EFL,0.0,2.0,0.0,0.0,1.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,14.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0
1987,James Wilson,,DF,,Middlesbrough,EFL,0.0,7.0,3.0,6.0,1.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,4.0,30.0,10.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0
2303,Joseph James,,DF,,Bristol City,EFL,0.9,51.0,2.0,21.0,22.0,8.0,0.0,51.0,0.0,0.0,0.0,0.0,0.0,19.0,44.0,14.0,0.0,2.0,0.0,1.0,0.0,19.0,1.0


In [37]:
# Patch the known gaps, then store age as an integer column.
# astype returns a new frame, so the result is independent of the filtered one.
poss_df = fill_age_nat(poss_df).astype({'age': int})
len(poss_df), poss_df.isna().sum().sum()

(4625, 0)

In [38]:
# Save the cleaned Possession table; index=False leaves the row index out of the file.
poss_df.to_csv('Possession_Leagues2_23_24.csv', index=False)

In [39]:
input_directory = "/kaggle/input/to-clean/Shooting"
# Combine and clean the Shooting CSV files, then count missing values per column
shot_df = combine_csv_files(input_directory)
shot_df.isna().sum()

player                         0
nationality                   14
position                       0
age                           10
club                           0
league                         0
minutes_90s                    0
goals                          0
shots                          0
shots_on_target                0
shots_on_target_pct          804
shots_per90                    0
shots_on_target_per90          0
goals_per_shot               804
goals_per_shot_on_target    1403
average_shot_distance        808
shots_free_kicks               8
pens_made                      0
pens_att                       0
xg                             8
npxg                           8
npxg_per_shot                808
xg_net                         8
npxg_net                       8
dtype: int64

In [40]:
# Players without a goals_per_shot value, highest scorers first
shot_df.loc[shot_df['goals_per_shot'].isna()].sort_values('goals', ascending=False)

Unnamed: 0,player,nationality,position,age,club,league,minutes_90s,goals,shots,shots_on_target,shots_on_target_pct,shots_per90,shots_on_target_per90,goals_per_shot,goals_per_shot_on_target,average_shot_distance,shots_free_kicks,pens_made,pens_att,xg,npxg,npxg_per_shot,xg_net,npxg_net
4251,Tiago Volpi,BRA,GK,32,Toluca,Liga MX,34.0,9,0,0,,0.0,0.0,,,,0.0,9,11,8.7,0.0,,0.3,0.0
2803,Luca Vido,ITA,"FW,MF",26,Reggiana,Serie B,0.4,1,0,0,,0.0,0.0,,,,0.0,1,1,0.8,0.0,,0.2,0.0
4297,Tom Lebeau,FRA,MF,25,US Concarneau,Ligue 2,1.0,1,0,0,,0.0,0.0,,,,0.0,1,1,0.8,0.0,,0.2,0.0
3,Aaron Drewe,ENG,DF,22.0,QPR,EFL,1.0,0,0,0,,0.0,0.0,,,,0.0,0,0,0.0,0.0,,0.0,0.0
3127,Matteo Stoppa,ITA,FW,22,Sampdoria,Serie B,0.1,0,0,0,,0.0,0.0,,,,0.0,0,0,0.0,0.0,,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1655,Gino Vivi,CRC,"MF,DF",22,LA Galaxy,MLS,0.9,0,0,0,,0.0,0.0,,,,0.0,0,0,0.0,0.0,,0.0,0.0
1662,Giovanni Aguilar,USA,MF,24,Vancouver W'caps,MLS,0.1,0,0,0,,0.0,0.0,,,,0.0,0,0,0.0,0.0,,0.0,0.0
1687,Greg Leigh,JAM,DF,28.0,Ipswich Town,EFL,0.0,0,0,0,,0.0,0.0,,,,0.0,0,0,0.0,0.0,,0.0,0.0
1688,Greg Ranjitsingh,TRI,GK,29,Toronto FC,MLS,4.0,0,0,0,,0.0,0.0,,,,0.0,0,0,0.0,0.0,,0.0,0.0


In [70]:
# Tiago Volpi is a GK however he's the penalty taker for Toluca
# 9 goals from 11 penalties taken, penalties aren't counted in shots or shots on target

In [41]:
# Keep only rows with an 'xg' value, then renumber the rows from 0
shot_df = shot_df[shot_df['xg'].notna()].reset_index(drop=True)

In [42]:
# Patch the known gaps and store age as an integer column; every NaN that
# is left (the per-shot ratio columns in the counts above) becomes 0.
shot_df = fill_age_nat(shot_df).astype({'age': int}).fillna(0)
len(shot_df), shot_df.isna().sum().sum()

(4625, 0)

In [43]:
# Save the cleaned Shooting table; index=False leaves the row index out of the file.
shot_df.to_csv('Shooting_Leagues2_23_24.csv', index=False)

In [44]:
input_directory = "/kaggle/input/to-clean/GK"
# Combine and clean the GK CSV files, zero-fill every NaN and store age as an integer.
# NOTE(review): fillna(0) would also turn a missing age or nationality into 0 —
# confirm the GK files have no such gaps.
gk_df = combine_csv_files(input_directory).fillna(0).astype({'age': int})
len(gk_df), gk_df.isna().sum().sum()

(327, 0)

In [45]:
# Save the cleaned GK table; index=False leaves the row index out of the file.
gk_df.to_csv('GK_Leagues2_23_24.csv', index=False)

In [46]:
input_directory = "/kaggle/input/to-clean/GK_adv"
# Combine and clean the GK_adv CSV files, zero-fill every NaN and store age as an integer.
# NOTE(review): fillna(0) would also turn a missing age or nationality into 0 —
# confirm the GK_adv files have no such gaps.
gk_adv_df = combine_csv_files(input_directory).fillna(0).astype({'age': int})
len(gk_adv_df), gk_adv_df.isna().sum().sum()

(327, 0)

In [47]:
# Save the cleaned GK_adv table; index=False leaves the row index out of the file.
gk_adv_df.to_csv('GK_adv_Leagues2_23_24.csv', index=False)

In [48]:
input_directory = "/kaggle/input/to-clean/Playing Time"
# Combine and clean the Playing Time CSV files, then show the row count and missing values per column
pt_df = combine_csv_files(input_directory)
len(pt_df), pt_df.isna().sum()

(5378,
 player                    0
 nationality              34
 position                  2
 age                      31
 club                      0
 league                    0
 games                     0
 minutes                 745
 minutes_per_game        745
 minutes_pct             740
 minutes_90s             740
 games_starts              0
 minutes_per_start      1317
 games_complete            0
 games_subs                0
 minutes_per_sub        1419
 unused_subs               0
 points_per_game         736
 on_goals_for            740
 on_goals_against        740
 plus_minus              740
 plus_minus_per90        745
 plus_minus_wowy         770
 on_xg_for               753
 on_xg_against           753
 xg_plus_minus           753
 xg_plus_minus_per90     753
 xg_plus_minus_wowy      778
 dtype: int64)

In [49]:
# Keep only rows with a 'minutes' value, then renumber the rows from 0
pt_df = pt_df[pt_df['minutes'].notna()].reset_index(drop=True)
len(pt_df), pt_df.isna().sum()

(4633,
 player                   0
 nationality             14
 position                 0
 age                     10
 club                     0
 league                   0
 games                    0
 minutes                  0
 minutes_per_game         0
 minutes_pct              0
 minutes_90s              0
 games_starts             0
 minutes_per_start      572
 games_complete           0
 games_subs               0
 minutes_per_sub        674
 unused_subs              0
 points_per_game          0
 on_goals_for             0
 on_goals_against         0
 plus_minus               0
 plus_minus_per90         0
 plus_minus_wowy         25
 on_xg_for                8
 on_xg_against            8
 xg_plus_minus            8
 xg_plus_minus_per90      8
 xg_plus_minus_wowy      33
 dtype: int64)

In [50]:
# Keep only rows with an 'on_xg_for' value, then renumber the rows from 0
pt_df = pt_df[pt_df['on_xg_for'].notna()].reset_index(drop=True)
len(pt_df), pt_df.isna().sum()

(4625,
 player                   0
 nationality             14
 position                 0
 age                     10
 club                     0
 league                   0
 games                    0
 minutes                  0
 minutes_per_game         0
 minutes_pct              0
 minutes_90s              0
 games_starts             0
 minutes_per_start      569
 games_complete           0
 games_subs               0
 minutes_per_sub        672
 unused_subs              0
 points_per_game          0
 on_goals_for             0
 on_goals_against         0
 plus_minus               0
 plus_minus_per90         0
 plus_minus_wowy         25
 on_xg_for                0
 on_xg_against            0
 xg_plus_minus            0
 xg_plus_minus_per90      0
 xg_plus_minus_wowy      25
 dtype: int64)

In [51]:
# Rows still missing an age or a nationality (patched in the next cell)
pt_df[pt_df[['age', 'nationality']].isna().any(axis=1)]

Unnamed: 0,player,nationality,position,age,club,league,games,minutes,minutes_per_game,minutes_pct,minutes_90s,games_starts,minutes_per_start,games_complete,games_subs,minutes_per_sub,unused_subs,points_per_game,on_goals_for,on_goals_against,plus_minus,plus_minus_per90,plus_minus_wowy,on_xg_for,on_xg_against,xg_plus_minus,xg_plus_minus_per90,xg_plus_minus_wowy
32,Abraham Villegas,,DF,20.0,Toluca,Liga MX,6,329,55.0,10.8,3.7,4,74.0,1,2,17.0,8,1.17,3.0,6.0,-3.0,-0.82,-1.55,4.8,5.3,-0.5,-0.13,-0.53
144,Alejandro Lozano,,MF,,Sporting Gijón,Segunda Division,6,73,12.0,1.9,0.8,0,,0,6,12.0,12,0.67,0.0,1.0,-1.0,-1.23,-1.48,0.5,0.6,0.0,-0.03,0.0
179,Alessio da Cruz,,FW,26.0,FeralpiSalò,Serie B,1,14,14.0,0.4,0.2,0,,0,1,14.0,2,0.0,0.0,0.0,0.0,0.0,0.55,0.0,0.2,-0.2,-1.14,-0.78
529,Ben Hatton,,MF,,Rotherham Utd,EFL,3,5,2.0,0.1,0.1,0,,0,3,2.0,5,1.0,0.0,0.0,0.0,0.0,1.13,0.4,0.0,0.4,6.48,7.53
576,Bobby Thomas,ENG,DF,,Coventry City,EFL,44,3759,85.0,90.8,41.8,42,88.0,40,2,25.0,2,1.43,67.0,51.0,16.0,0.38,1.56,59.4,50.6,8.8,0.21,0.42
848,Connor O'Riordan,IRL,DF,,Blackburn,EFL,2,42,21.0,1.0,0.5,0,,0,2,21.0,7,1.5,0.0,1.0,-1.0,-2.14,-1.86,0.4,0.5,-0.1,-0.14,-0.06
1305,Eren Öztürk,,MF,19.0,Karlsruher,2. Bundesliga,2,4,2.0,0.1,0.0,0,,0,2,2.0,14,1.5,1.0,0.0,1.0,22.5,21.94,0.3,0.2,0.1,1.8,1.57
1618,Gerard Buabo,,FW,,Ipswich Town,EFL,1,4,4.0,0.1,0.0,0,,0,1,4.0,0,1.0,0.0,0.0,0.0,0.0,-0.76,0.0,0.0,0.0,0.47,-0.12
1987,James Wilson,,DF,,Middlesbrough,EFL,1,1,1.0,0.0,0.0,0,,0,1,1.0,0,3.0,0.0,0.0,0.0,0.0,-0.2,0.0,0.0,0.0,0.0,-0.32
2303,Joseph James,,DF,,Bristol City,EFL,2,77,39.0,1.9,0.9,1,72.0,0,1,5.0,3,0.0,0.0,1.0,-1.0,-1.17,-1.24,0.9,0.7,0.2,0.19,0.26


In [52]:
# Patch the known gaps and store age as an integer column; every NaN that
# is left (per-start / per-sub minutes and the wowy columns in the counts
# above) becomes 0.
pt_df = fill_age_nat(pt_df).astype({'age': int}).fillna(0)
len(pt_df), pt_df.isna().sum().sum()

(4625, 0)

In [53]:
# Save the cleaned Playing Time table; index=False leaves the row index out of the file.
pt_df.to_csv('Playing_Time_Leagues2_23_24.csv', index=False)