### Concatenate all player data from all leagues

In [1]:
import pandas as pd
import glob
import os

# Folder containing the player CSV files
folder_path = 'players_data_clean/'

# Collect every CSV in the folder. glob gives no ordering guarantee, so the
# list is sorted to keep the concatenated row order (and the rebuilt index)
# reproducible from one run or machine to the next.
csv_files = sorted(glob.glob(os.path.join(folder_path, '*.csv')))

# Fail with an explicit message rather than pandas' generic
# "No objects to concatenate" when the folder is missing or holds no CSV.
if not csv_files:
    raise FileNotFoundError(f"No CSV files found in {folder_path!r}")

# Read each file and stack them into one DataFrame with a fresh 0..n-1 index
df_combined = pd.concat((pd.read_csv(f) for f in csv_files), ignore_index=True)

# Display the combined table
df_combined

Unnamed: 0,Player,Birthdate,League,Club,Footed,Nationality,Position,Minutes,Goals,Assists,...,Fouls Committed,Fouls Drawn,Offsides,Penalty Kicks Won,Penalty Kicks Conceded,Own Goals,Ball Recoveries,Aerials Won,Aerials Lost,% of Aerials Won
0,Amir Rrahmani,1994-02-24,SerieA,Napoli,Right,Kosovo,Center Backs,3406,0.03,0.08,...,1.11,0.42,0.00,0.00,0.00,0.03,3.57,3.28,1.45,69.3
1,Giovanni Di Lorenzo,1993-08-04,SerieA,Napoli,Right,Italy,Fullbacks,3330,0.08,0.05,...,1.24,1.38,0.14,0.00,0.00,0.00,2.54,1.22,1.24,49.5
2,Romelu Lukaku,1993-05-13,SerieA,Napoli,Left,Belgium,Forwards,2843,0.44,0.32,...,1.49,1.14,0.76,0.00,0.00,0.00,1.08,1.33,1.77,42.9
3,Matteo Politano,1993-08-03,SerieA,Napoli,Left,Italy,Att Mid / Wingers,2804,0.10,0.13,...,0.45,1.00,0.19,0.03,0.00,0.00,3.05,0.19,0.64,23.1
4,Scott McTominay,1996-12-08,SerieA,Napoli,Right,Scotland,Midfielders,2938,0.37,0.12,...,1.38,1.84,0.15,0.03,0.00,0.00,4.38,2.02,1.53,56.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2432,Yukinari Sugawara,2000-06-28,PremierLeague,Southampton,Right,Japan,Fullbacks,1557,0.06,0.06,...,0.87,0.58,0.23,0.00,0.12,0.00,2.54,0.29,0.52,35.7
2433,Adam Armstrong,1997-02-10,PremierLeague,West Bromwich Albion,Right,England,Forwards,1248,0.14,0.14,...,0.87,0.36,0.65,0.00,0.00,0.00,1.66,0.36,0.65,35.7
2434,James Bree,1997-10-11,PremierLeague,Southampton,Right,England,Center Backs,1075,0.00,0.08,...,0.33,0.42,0.00,0.00,0.00,0.00,5.44,1.00,0.59,63.2
2435,Paul Onuachu,1994-05-28,PremierLeague,Southampton,Right,Nigeria,Forwards,1050,0.34,0.09,...,2.40,1.37,0.86,0.00,0.00,0.00,1.63,7.97,5.06,61.2


# Dim Reduction (2/3) - 95% variance of the dataset
# K-means
# Elbow method - Within-Cluster Sum of Squares - Silhouette


# Distance / Cosine Sim / Gamma Mixture 

In [None]:
# Count the missing entries in each column of the combined player table
nan_counts = df_combined.isna().sum()

# Keep only the columns that contain at least one missing entry
nan_columns = nan_counts[nan_counts.gt(0)]

# Print the affected columns with their counts, or confirm there are none
if not nan_columns.empty:
    print("NaN values detected:")
    print(nan_columns)
else:
    print("No NaN values found!")

No NaN values found!


### Merge club Elo ratings for modelling

In [10]:
# Load the club Elo file; its fields are separated by ';' rather than ','
elo_df = pd.read_csv(
    "elo_clubs/Elo_Club_VF.csv",
    sep=";",
)

In [13]:
from fuzzywuzzy import process

def get_best_match(name, choices, score_cutoff=0):
    """Return the entry of ``choices`` that fuzzy-matches ``name`` best.

    Args:
        name: String to look up.
        choices: Candidate strings handed to ``process.extractOne``.
        score_cutoff: Minimum score forwarded to ``process.extractOne``.
            The default of 0 matches that function's own default, so
            existing two-argument calls behave as before.

    Returns:
        The best-matching choice, or ``None`` when ``process.extractOne``
        yields no result (for example no candidates, or none reaching
        ``score_cutoff``).
    """
    result = process.extractOne(name, choices, score_cutoff=score_cutoff)
    # extractOne gives back None instead of a tuple when nothing qualifies;
    # indexing it unconditionally would raise a TypeError.
    if result is None:
        return None
    return result[0]

# Fuzzy-match club names against the club names found in the Elo table.
# Each distinct club is matched once and the result is mapped back onto the
# rows, instead of re-running the matcher for every player row.
elo_club_names = elo_df["Club"].unique()
club_to_elo_club = {
    club: get_best_match(club, elo_club_names)
    for club in df_combined["Club"].unique()
}
# NOTE(review): get_best_match is called without a minimum score, so a club
# that is absent from the Elo table presumably still receives the closest
# other name (and that club's Elo) -- worth spot-checking club_to_elo_club.
df_combined["Club_fuzzy"] = df_combined["Club"].map(club_to_elo_club)

# Left-join the Elo columns on the matched name, then remove the helper and
# duplicate club columns and restore the original "Club" column name.
final_df = (
    pd.merge(df_combined, elo_df, left_on='Club_fuzzy', right_on='Club', how='left')
    .drop(columns=['Club_fuzzy', 'Club_y'])
    .rename(columns={'Club_x': 'Club'})
)

# A left join adds rows only when a matched club appears more than once in
# the Elo table; catch that here rather than silently duplicating players.
assert len(final_df) == len(df_combined), "Elo merge changed the number of player rows"

In [14]:
# Display the merged table (player columns plus the columns joined from elo_df)
final_df

Unnamed: 0,Player,Birthdate,League,Club,Footed,Nationality,Position,Minutes,Goals,Assists,...,Fouls Drawn,Offsides,Penalty Kicks Won,Penalty Kicks Conceded,Own Goals,Ball Recoveries,Aerials Won,Aerials Lost,% of Aerials Won,Elo
0,Amir Rrahmani,1994-02-24,SerieA,Napoli,Right,Kosovo,Center Backs,3406,0.03,0.08,...,0.42,0.00,0.00,0.00,0.03,3.57,3.28,1.45,69.3,1838
1,Giovanni Di Lorenzo,1993-08-04,SerieA,Napoli,Right,Italy,Fullbacks,3330,0.08,0.05,...,1.38,0.14,0.00,0.00,0.00,2.54,1.22,1.24,49.5,1838
2,Romelu Lukaku,1993-05-13,SerieA,Napoli,Left,Belgium,Forwards,2843,0.44,0.32,...,1.14,0.76,0.00,0.00,0.00,1.08,1.33,1.77,42.9,1838
3,Matteo Politano,1993-08-03,SerieA,Napoli,Left,Italy,Att Mid / Wingers,2804,0.10,0.13,...,1.00,0.19,0.03,0.00,0.00,3.05,0.19,0.64,23.1,1838
4,Scott McTominay,1996-12-08,SerieA,Napoli,Right,Scotland,Midfielders,2938,0.37,0.12,...,1.84,0.15,0.03,0.00,0.00,4.38,2.02,1.53,56.9,1838
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2432,Yukinari Sugawara,2000-06-28,PremierLeague,Southampton,Right,Japan,Fullbacks,1557,0.06,0.06,...,0.58,0.23,0.00,0.12,0.00,2.54,0.29,0.52,35.7,1782
2433,Adam Armstrong,1997-02-10,PremierLeague,West Bromwich Albion,Right,England,Forwards,1248,0.14,0.14,...,0.36,0.65,0.00,0.00,0.00,1.66,0.36,0.65,35.7,1538
2434,James Bree,1997-10-11,PremierLeague,Southampton,Right,England,Center Backs,1075,0.00,0.08,...,0.42,0.00,0.00,0.00,0.00,5.44,1.00,0.59,63.2,1782
2435,Paul Onuachu,1994-05-28,PremierLeague,Southampton,Right,Nigeria,Forwards,1050,0.34,0.09,...,1.37,0.86,0.00,0.00,0.00,1.63,7.97,5.06,61.2,1782


In [16]:
# Tally the missing values in every column of the merged table
nan_counts = final_df.isna().sum()

# Narrow the tally down to columns where something is actually missing
nan_columns = nan_counts.loc[lambda counts: counts > 0]

# Either confirm the table is complete or list the affected columns
if len(nan_columns) == 0:
    print("No NaN values found!")
else:
    print("NaN values detected:")
    print(nan_columns)

No NaN values found!


In [None]:
# Select the rows whose League value is an empty string
final_df.loc[final_df["League"].eq("")]