In [1]:
#Importing libraries
import numpy as np
import polars as pl
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [2]:
#Loading the dataset
df    = pd.read_csv('/kaggle/input/fbref-player-dataset-2024/statistics.csv')
gk_df = pd.read_csv('/kaggle/input/fbref-player-dataset-2024/gk_statistics.csv')

In [3]:
df.head()

Unnamed: 0,Season,Age,Squad,Country,Comp,LgRank,MP,Playing Time_Starts,Playing Time_Min,Playing Time_90s,...,Aerial Duels_Won%,Name,position,footed,height,weight,age,national_team,current_club,Playing Time_MP
0,2010-2011,17,Basel,ch SUI,1. Super Lg,1st,20.0,11.0,1136.0,12.6,...,,Granit Xhaka,MF,Left,185.0,82.0,31.0,,Leverkusen\nWages,
1,2011-2012,18,Basel,ch SUI,1. Super Lg,1st,24.0,18.0,1595.0,17.7,...,,Granit Xhaka,MF,Left,185.0,82.0,31.0,,Leverkusen\nWages,
2,2012-2013,19,Gladbach,de GER,1. Bundesliga,8th,22.0,15.0,1477.0,16.4,...,,Granit Xhaka,MF,Left,185.0,82.0,31.0,,Leverkusen\nWages,
3,2013-2014,20,Gladbach,de GER,1. Bundesliga,6th,28.0,23.0,2023.0,22.5,...,,Granit Xhaka,MF,Left,185.0,82.0,31.0,,Leverkusen\nWages,
4,2014-2015,21,Gladbach,de GER,1. Bundesliga,3rd,30.0,30.0,2637.0,29.3,...,,Granit Xhaka,MF,Left,185.0,82.0,31.0,,Leverkusen\nWages,


In [4]:
df = df[df['MP'] > 0]

In [5]:
df[['MP', 'Playing Time_Starts', 'Playing Time_Min', 'Subs_Subs']]

Unnamed: 0,MP,Playing Time_Starts,Playing Time_Min,Subs_Subs
0,20.0,11.0,1136.0,9.0
1,24.0,18.0,1595.0,6.0
2,22.0,15.0,1477.0,7.0
3,28.0,23.0,2023.0,5.0
4,30.0,30.0,2637.0,0.0
...,...,...,...,...
25624,1.0,1.0,66.0,0.0
25625,1.0,0.0,8.0,1.0
25626,29.0,20.0,1821.0,9.0
25627,10.0,2.0,249.0,8.0


In [6]:
#Columns to remove : Matches, national_team, Playing Time_MP, Subs_Mn/Sub, LgRank

In [7]:
df.drop(columns=['Matches', 'national_team', 'Playing Time_MP', 'Subs_Mn/Sub', 'LgRank', '90s', 'Starts_Starts'], inplace=True)

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 23344 entries, 0 to 25629
Columns: 179 entries, Season to current_club
dtypes: float64(170), object(9)
memory usage: 32.1+ MB


In [9]:
print(df.columns.tolist())

['Season', 'Age', 'Squad', 'Country', 'Comp', 'MP', 'Playing Time_Starts', 'Playing Time_Min', 'Playing Time_90s', 'Performance_Gls', 'Performance_Ast', 'Performance_G+A', 'Performance_G-PK', 'Performance_PK', 'Performance_PKatt', 'Performance_CrdY', 'Performance_CrdR', 'Expected_xG', 'Expected_npxG', 'Expected_xAG', 'Expected_npxG+xAG', 'Progression_PrgC', 'Progression_PrgP', 'Progression_PrgR', 'Per 90 Minutes_Gls', 'Per 90 Minutes_Ast', 'Per 90 Minutes_G+A', 'Per 90 Minutes_G-PK', 'Per 90 Minutes_G+A-PK', 'Per 90 Minutes_xG', 'Per 90 Minutes_xAG', 'Per 90 Minutes_xG+xAG', 'Per 90 Minutes_npxG', 'Per 90 Minutes_npxG+xAG', 'Standard_Gls', 'Standard_Sh', 'Standard_SoT', 'Standard_SoT%', 'Standard_Sh/90', 'Standard_SoT/90', 'Standard_G/Sh', 'Standard_G/SoT', 'Standard_Dist', 'Standard_FK', 'Standard_PK', 'Standard_PKatt', 'Expected_npxG/Sh', 'Expected_G-xG', 'Expected_np:G-xG', 'Total_Cmp', 'Total_Att', 'Total_Cmp%', 'Total_TotDist', 'Total_PrgDist', 'Short_Cmp', 'Short_Att', 'Short_Cmp

In [10]:
df['Season'].value_counts()

Season
2023-2024                    3114
2022-2023                    2731
2021-2022                    2467
2020-2021                    2128
2019-2020                    1942
                             ... 
2002-2003                       1
2008                            1
Premier League (1 Season)       1
RB Leipzig U17 (1 Season)       1
Darmstadt 98 (1 Season)         1
Name: count, Length: 69, dtype: int64

In [11]:
seasons = df['Season'].unique().tolist()

In [12]:
seasons_to_remove = []
for season in seasons:
    split = season.split('-')
    if((len(split) == 2) or (split[0].isnumeric())):
        continue
    else:
        seasons_to_remove.append(season)

In [13]:
seasons_to_remove

['1 Season',
 'Liverpool (1 Season)',
 'PL2 (1 Season)',
 'Premier League (1 Season)',
 'FC Bayern U17 (1 Season)',
 'Bayern U19 (1 Season)',
 'Bayern Munich (1 Season)',
 'U17 Bundesliga (1 Season)',
 'U19 Bundesliga (1 Season)',
 'Bundesliga (1 Season)',
 'Leipzig U19 (1 Season)',
 'RB Leipzig U17 (1 Season)',
 'RB Leipzig (1 Season)',
 'Bremen U19 (1 Season)',
 'Viktoria Berlin U19 (1 Season)',
 'Werder Bremen (1 Season)',
 'Hertha U17 (1 Season)',
 'Leverkusen U17 (1 Season)',
 'Mainz 05 (1 Season)',
 'Gladbach (1 Season)',
 'Gladbach U19 (1 Season)',
 'Udinese (1 Season)',
 'Serie A (1 Season)',
 'St. Pauli U17 (1 Season)',
 'Union U19 (1 Season)',
 'Reading FC U21 (1 Season)',
 'Darmstadt (1 Season)',
 'Darmstadt 98 U19 (1 Season)',
 'Darmstadt 98 (1 Season)']

In [14]:
df['Season'].value_counts()[seasons_to_remove]

Season
1 Season                          70
Liverpool (1 Season)               2
PL2 (1 Season)                     2
Premier League (1 Season)          1
FC Bayern U17 (1 Season)           2
Bayern U19 (1 Season)              2
Bayern Munich (1 Season)           1
U17 Bundesliga (1 Season)          6
U19 Bundesliga (1 Season)          7
Bundesliga (1 Season)              6
Leipzig U19 (1 Season)             2
RB Leipzig U17 (1 Season)          1
RB Leipzig (1 Season)              2
Bremen U19 (1 Season)              1
Viktoria Berlin U19 (1 Season)     1
Werder Bremen (1 Season)           1
Hertha U17 (1 Season)              1
Leverkusen U17 (1 Season)          1
Mainz 05 (1 Season)                1
Gladbach (1 Season)                1
Gladbach U19 (1 Season)            1
Udinese (1 Season)                 1
Serie A (1 Season)                 1
St. Pauli U17 (1 Season)           1
Union U19 (1 Season)               1
Reading FC U21 (1 Season)          1
Darmstadt (1 Season)           

In [15]:
df = df[~df['Season'].isin(seasons_to_remove)]

In [16]:
names = df[df['Age'].isna()]['Name'].tolist()

In [17]:
fill_age = df[df['Name'].isin(names)][['Name', 'Season', 'Age']]

In [18]:
def fill_missing_ages(group):
    reference_row = group.dropna(subset=['Age', 'Season']).iloc[0] if not group.dropna(subset=['Age', 'Season']).empty else None
    
    if reference_row is not None:
        ref_age = reference_row['Age']
        ref_season = int(reference_row['Season'].split('-')[0])
        
        for idx, row in group.iterrows():
            if pd.isna(row['Age']):
                season = int(row['Season'].split('-')[0])
                group.at[idx, 'Age'] = int(ref_age) + (season - ref_season)
    
    return group

In [19]:
result = fill_age.groupby('Name').apply(fill_missing_ages).reset_index(drop=True)

  result = fill_age.groupby('Name').apply(fill_missing_ages).reset_index(drop=True)


In [20]:
result.dropna(subset=['Age'], inplace=True)

In [21]:
for idx, row in result.iterrows():
    name   = row['Name']
    season = row['Season']
    age    = row['Age']
    df.loc[(df['Name'] == name) & (df['Season'] == season), 'Age'] = age

In [22]:
names = df[df['Age'].isna()]['Name'].tolist()

In [23]:
df = df[~df['Name'].isin(names)]

In [24]:
df.isna().sum().sort_values(ascending=False)

Standard_Dist         10484
Expected_npxG/Sh      10484
Take-Ons_Succ%        10335
Take-Ons_Tkld%        10335
Challenges_Tkl%       10212
                      ...  
Performance_CrdR          0
Per 90 Minutes_Gls        0
Age                       0
Playing Time_Mn/MP        0
Season                    0
Length: 179, dtype: int64

In [25]:
df.isna().sum().sum() / (df.shape[0] * df.shape[1])

0.28903989608086517

In [26]:
df.shape

(23211, 179)

In [27]:
minutes_df = df[['Name', 'Playing Time_Min']].groupby('Name').sum()

In [28]:
remove_players = minutes_df[minutes_df['Playing Time_Min'] < 800].index.tolist()

In [29]:
df = df[~df['Name'].isin(remove_players)]

In [30]:
df[['Name', 'position', 'footed', 'height', 'weight', 'age', 'current_club']].isna().sum()

Name               0
position           0
footed          2062
height          1242
weight          1529
age             1099
current_club    1328
dtype: int64

In [31]:
df['Season'].unique()

array(['2010-2011', '2011-2012', '2012-2013', '2013-2014', '2014-2015',
       '2015-2016', '2016-2017', '2017-2018', '2018-2019', '2019-2020',
       '2020-2021', '2021-2022', '2022-2023', '2023-2024', '2024-2025',
       '2015', '2016', '2019', '2020', '2021', '2022', '2017', '2018',
       '2007-2008', '2008-2009', '2009-2010', '2006-2007', '2014', '2023',
       '2024', '2009', '2010', '2011', '2012', '2013', '2004-2005',
       '2005-2006', '2003-2004', '2008', '2002-2003'], dtype=object)

In [32]:
def split_season(col):
    if(len(col.split('-')) == 2):
        return int(col.split('-')[1])
    else:
        return int(col.split('-')[0])

In [33]:
def clean_position(col):
    splits = col.split('-')
    for i in range(len(splits)):
        if(len(splits[i]) == 3):
            splits[i] = splits[i][:2]
    return '-'.join(splits)

In [34]:
df['season-end'] = df['Season'].apply(split_season)
df['Country'] = df['Country'].apply(lambda country : country.split()[1])
df['Tier'] = df['Comp'].apply(lambda comp : comp.split('.')[0])
name_squad = df.groupby('Name')['Squad'].last().to_dict()
df['current_club'] = df['Name'].map(name_squad)
df.loc[df['current_club'] == 'Tottenham', 'current_club'] = 'Tottenham Hotspur'
df.loc[df['Tier'] == 'Jr', 'Tier'] = 6
df['Tier'] = df['Tier'].apply(lambda x : int(x))
name_comp = df.groupby('Name')['Comp'].last().to_dict()
for key in name_comp.keys():
    name_comp[key] = name_comp[key].split('.')[1].strip()
df['League'] = df['Name'].map(name_comp)
df['Age'] = df['Age'].apply(lambda x : int(x))
df['Position'] = df['position'].apply(clean_position)
df['FW'] = df['Position'].apply(lambda x: 1 if 'FW' in x else 0)
df['MF'] = df['Position'].apply(lambda x: 1 if 'MF' in x else 0)
df['DF'] = df['Position'].apply(lambda x: 1 if 'DF' in x else 0)

In [35]:
personal_df = df[['Name', 'Position', 'age', 'footed', 'height', 'weight', 'current_club', 'League', 'Tier', 'FW', 'MF', 'DF']].groupby('Name').last().reset_index()

In [36]:
playing_time_df = df[['Name', 'MP', 'Playing Time_Starts', 'Playing Time_Min','Playing Time_90s','Playing Time_Mn/MP', 
                      'Playing Time_Min%', 'Starts_Mn/Start', 'Starts_Compl', 'Subs_Subs', 'Subs_unSub']]

In [37]:
team_perf_df = df[['Name', 'Team Success_PPM', 'Team Success_onG', 'Team Success_onGA', 'Team Success_+/-',
 'Team Success_+/-90', 'Team Success_On-Off', 'Team Success (xG)_onxG', 'Team Success (xG)_onxGA', 'Team Success (xG)_xG+/-',
 'Team Success (xG)_xG+/-90', 'Team Success (xG)_On-Off']]

In [38]:
cautions_df = df[['Name', 'Performance_CrdY', 'Performance_CrdR', 'Performance_2CrdY', 'Performance_Fls', 
                  'Performance_Off', 'Performance_PKcon', 'Performance_OG', 'Err']]

In [39]:
cautions_df.groupby('Name').sum().sort_values(by='Performance_Off', ascending=False)['Performance_Off']

Name
Timo Werner          242.0
Ciro Immobile        234.0
Álvaro Morata        220.0
Kylian Mbappé        208.0
Gianluca Lapadula    207.0
                     ...  
Beñat Prados           0.0
Luca Pellegrini        0.0
Luca Raimund           0.0
Lucas Beraldo          0.0
Lilian Egloff          0.0
Name: Performance_Off, Length: 2407, dtype: float64

In [40]:
remove_cols = list(set(personal_df.columns.tolist() + playing_time_df.columns.tolist() + team_perf_df.columns.tolist() + cautions_df.columns.tolist()) - set(['Name', 'Playing Time_Min']))

In [41]:
df.drop(columns=['Season', 'Age', 'Squad', 'Comp', 'position', 'Country'], inplace=True)

In [42]:
stats_df = df.drop(columns=remove_cols)

In [43]:
stats_df.head()

Unnamed: 0,Playing Time_Min,Performance_Gls,Performance_Ast,Performance_G+A,Performance_G-PK,Performance_PK,Performance_PKatt,Expected_xG,Expected_npxG,Expected_xAG,...,Performance_Crs,Performance_Int,Performance_TklW,Performance_PKwon,Performance_Recov,Aerial Duels_Won,Aerial Duels_Lost,Aerial Duels_Won%,Name,season-end
0,1136.0,1.0,3.0,4.0,1.0,0.0,0.0,,,,...,,,,,,,,,Granit Xhaka,2011
1,1595.0,1.0,0.0,1.0,1.0,0.0,0.0,,,,...,,,,,,,,,Granit Xhaka,2012
2,1477.0,1.0,0.0,1.0,1.0,0.0,0.0,,,,...,,,,,,,,,Granit Xhaka,2013
3,2023.0,0.0,1.0,1.0,0.0,0.0,0.0,,,,...,,,,,,,,,Granit Xhaka,2014
4,2637.0,2.0,1.0,3.0,2.0,0.0,0.0,,,,...,,,,,,,,,Granit Xhaka,2015


In [44]:
stats_df.drop(columns=['Name']).min().sort_values()

Expected_G-xG         -8.7
Expected_A-xAG        -8.7
Expected_np:G-xG      -8.5
Pass Types_TI          0.0
GCA Types_TO           0.0
                     ...  
Long_Att               0.0
Aerial Duels_Won       0.0
Playing Time_Min       1.0
Standard_Dist          1.2
season-end          2003.0
Length: 140, dtype: float64

In [45]:
stats_df.isna().sum().sort_values(ascending=False)

Expected_npxG/Sh      10161
Standard_Dist         10161
Take-Ons_Tkld%        10047
Take-Ons_Succ%        10047
Challenges_Tkl%        9927
                      ...  
Name                      0
Playing Time_Min          0
Performance_Gls           0
Per 90 Minutes_Gls        0
season-end                0
Length: 141, dtype: int64

In [46]:
overall_playing = playing_time_df.groupby('Name').sum()

In [47]:
overall_playing.head()

Unnamed: 0_level_0,MP,Playing Time_Starts,Playing Time_Min,Playing Time_90s,Playing Time_Mn/MP,Playing Time_Min%,Starts_Mn/Start,Starts_Compl,Subs_Subs,Subs_unSub
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Aaron Cresswell,497.0,473.0,42376.0,470.8,1315.0,1150.3,514.0,253.0,24.0,39.0
Aaron Hickey,106.0,99.0,8234.0,91.4,447.0,255.7,494.0,51.0,7.0,14.0
Aaron Ramsey,86.0,62.0,5153.0,57.3,441.0,176.9,525.0,21.0,24.0,7.0
Aaron Seydel,162.0,49.0,5928.0,66.0,319.0,184.4,322.0,26.0,113.0,29.0
Aaron Wan-Bissaka,174.0,167.0,15018.0,166.8,665.0,479.7,624.0,154.0,7.0,32.0


In [48]:
overall_playing.describe()

Unnamed: 0,MP,Playing Time_Starts,Playing Time_Min,Playing Time_90s,Playing Time_Mn/MP,Playing Time_Min%,Starts_Mn/Start,Starts_Compl,Subs_Subs,Subs_unSub
count,2407.0,2407.0,2407.0,2407.0,2407.0,2407.0,2407.0,2407.0,2407.0,2407.0
mean,179.019526,139.238056,12447.449107,138.306315,596.340258,421.778106,517.878687,87.788533,39.781471,29.852514
std,114.099955,100.053217,8845.074453,98.277484,327.709122,263.359049,189.618349,76.054114,29.882501,21.73513
min,11.0,5.0,805.0,9.0,53.0,22.0,0.0,0.0,0.0,0.0
25%,86.5,57.0,5174.0,57.4,334.5,212.05,400.0,27.0,17.0,14.0
50%,162.0,120.0,10744.0,119.4,560.0,382.2,538.0,66.0,33.0,26.0
75%,249.0,198.0,17696.5,196.65,802.0,584.2,638.0,127.0,55.0,42.0
max,643.0,569.0,49872.0,554.1,2175.0,1576.2,1740.0,511.0,204.0,175.0


In [49]:
overall_playing.head()

Unnamed: 0_level_0,MP,Playing Time_Starts,Playing Time_Min,Playing Time_90s,Playing Time_Mn/MP,Playing Time_Min%,Starts_Mn/Start,Starts_Compl,Subs_Subs,Subs_unSub
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Aaron Cresswell,497.0,473.0,42376.0,470.8,1315.0,1150.3,514.0,253.0,24.0,39.0
Aaron Hickey,106.0,99.0,8234.0,91.4,447.0,255.7,494.0,51.0,7.0,14.0
Aaron Ramsey,86.0,62.0,5153.0,57.3,441.0,176.9,525.0,21.0,24.0,7.0
Aaron Seydel,162.0,49.0,5928.0,66.0,319.0,184.4,322.0,26.0,113.0,29.0
Aaron Wan-Bissaka,174.0,167.0,15018.0,166.8,665.0,479.7,624.0,154.0,7.0,32.0


In [50]:
(overall_playing['Subs_Subs'] / overall_playing['MP']).sort_values(ascending=False)

Name
Mathys Tel             0.866667
Samuel Iling-Junior    0.861111
Pablo Torre            0.857143
Luka Romero            0.829268
Pietro Pellegri        0.808989
                         ...   
Mohamed Jaouab         0.000000
Joshua King            0.000000
Ben Greenwood          0.000000
Tyrese Hall            0.000000
Kim Jisoo              0.000000
Length: 2407, dtype: float64

In [51]:
less_imp_stats = ['Performance_PKatt', 'Standard_Gls', 'Standard_PK', 'Standard_PKatt', 'Ast', 'xAG', 'Progression_PrgC',
                  'Progression_PrgP', 'Progression_PrgR', 'Att', 'Performance_Crs', 'Performance_Int', 
                  'Performance_TklW', 'Outcomes_Cmp', 'Outcomes_Off', 'Outcomes_Blocks', 'Challenges_Lost',
                 'Carries_Mis', 'Carries_Dis', 'Aerial Duels_Lost', 'Take-Ons_Tkld', 'Take-Ons_Tkld%']

In [52]:
stats_df.drop(columns=less_imp_stats,inplace=True)

In [53]:
stats_df.head()

Unnamed: 0,Playing Time_Min,Performance_Gls,Performance_Ast,Performance_G+A,Performance_G-PK,Performance_PK,Expected_xG,Expected_npxG,Expected_xAG,Expected_npxG+xAG,...,Carries_CPA,Receiving_Rec,Receiving_PrgR,Performance_Fld,Performance_PKwon,Performance_Recov,Aerial Duels_Won,Aerial Duels_Won%,Name,season-end
0,1136.0,1.0,3.0,4.0,1.0,0.0,,,,,...,,,,,,,,,Granit Xhaka,2011
1,1595.0,1.0,0.0,1.0,1.0,0.0,,,,,...,,,,,,,,,Granit Xhaka,2012
2,1477.0,1.0,0.0,1.0,1.0,0.0,,,,,...,,,,,,,,,Granit Xhaka,2013
3,2023.0,0.0,1.0,1.0,0.0,0.0,,,,,...,,,,,,,,,Granit Xhaka,2014
4,2637.0,2.0,1.0,3.0,2.0,0.0,,,,,...,,,,,,,,,Granit Xhaka,2015


In [61]:
final_stats = ['Per 90 Minutes_Gls', 'Per 90 Minutes_Ast', 'Per 90 Minutes_G+A', 'Per 90 Minutes_G-PK', 
               'Per 90 Minutes_G+A-PK', 'Per 90 Minutes_xG', 'Per 90 Minutes_xAG', 'Per 90 Minutes_xG+xAG', 
               'Per 90 Minutes_npxG', 'Per 90 Minutes_npxG+xAG', 'Standard_SoT%', 'Standard_Sh/90', 
               'Standard_SoT/90', 'Standard_G/Sh', 'Standard_G/SoT', 'Playing Time_Min', 'Standard_G/Sh', 
               'Standard_G/SoT', 'Standard_Dist', 'Standard_FK', 'Expected_npxG/Sh', 'Expected_G-xG', 
               'Expected_np:G-xG', 'Total_Cmp', 'Total_Cmp%', 'Total_PrgDist', 'Short_Cmp', 'Short_Cmp%', 
               'Medium_Cmp', 'Medium_Cmp%', 'Long_Cmp', 'Long_Cmp%', 'Blocks_Blocks',
              'Expected_xA', 'Expected_A-xAG', 'KP', '1/3', 'PPA', 'CrsPA', 'PrgP', 'Pass Types_Live', 
               'Pass Types_Dead', 'Pass Types_FK', 'Pass Types_TB', 'Pass Types_Sw', 'Pass Types_Crs', 
               'Pass Types_TI', 'Pass Types_CK', 'Corner Kicks_In', 'Corner Kicks_Out', 'Corner Kicks_Str',
              'SCA_SCA90', 'SCA Types_PassLive', 'SCA Types_PassDead', 'SCA Types_TO', 'SCA Types_Sh', 
               'SCA Types_Fld', 'SCA Types_Def', 'GCA_GCA90', 'GCA Types_PassLive', 'GCA Types_PassDead', 
               'GCA Types_TO', 'GCA Types_Sh', 'GCA Types_Fld', 'GCA Types_Def', 'Tackles_Tkl', 'Tackles_TklW',
              'Tackles_Def 3rd', 'Tackles_Mid 3rd', 'Tackles_Att 3rd', 'Challenges_Tkl', 'Challenges_Tkl%',
               'Blocks_Pass', 'Int', 'Tkl+Int', 'Clr', 'Touches_Touches', 'Touches_Def Pen', 'Touches_Def 3rd', 
               'Touches_Mid 3rd', 'Touches_Att 3rd', 'Touches_Att Pen', 'Touches_Live', 'Take-Ons_Succ', 'Take-Ons_Succ%', 
               'Carries_Carries', 'Carries_TotDist', 'Carries_PrgDist', 'Carries_PrgC', 'Carries_1/3', 'Carries_CPA', 
               'Receiving_Rec', 'Receiving_PrgR', 'Performance_Fld', 'Performance_PKwon', 'Performance_Recov', 
               'Aerial Duels_Won', 'Aerial Duels_Won%', 'Name', 'season-end']

In [73]:
final_df = stats_df[final_stats]

In [74]:
final_df.shape

(22725, 100)

In [78]:
final_df.isna().sum().sort_values(ascending=False).head(20)

Expected_npxG/Sh     10161
Standard_Dist        10161
Take-Ons_Succ%       10047
Challenges_Tkl%       9927
Aerial Duels_Won%     9567
Long_Cmp%             9530
Medium_Cmp%           9118
Short_Cmp%            9064
Total_Cmp%            8994
Expected_np:G-xG      8977
Expected_G-xG         8977
Standard_FK           8977
Tackles_Def 3rd       8946
Challenges_Tkl        8946
Tackles_Att 3rd       8946
Tackles_Mid 3rd       8946
Corner Kicks_Str      8946
Blocks_Pass           8946
GCA Types_Def         8946
GCA Types_Fld         8946
dtype: int64

In [63]:
final_df.head()

Unnamed: 0,Per 90 Minutes_Gls,Per 90 Minutes_Ast,Per 90 Minutes_G+A,Per 90 Minutes_G-PK,Per 90 Minutes_G+A-PK,Per 90 Minutes_xG,Per 90 Minutes_xAG,Per 90 Minutes_xG+xAG,Per 90 Minutes_npxG,Per 90 Minutes_npxG+xAG,...,Carries_CPA,Receiving_Rec,Receiving_PrgR,Performance_Fld,Performance_PKwon,Performance_Recov,Aerial Duels_Won,Aerial Duels_Won%,Name,season-end
0,0.08,0.24,0.32,0.08,0.32,,,,,,...,,,,,,,,,Granit Xhaka,2011
1,0.06,0.0,0.06,0.06,0.06,,,,,,...,,,,,,,,,Granit Xhaka,2012
2,0.06,0.0,0.06,0.06,0.06,,,,,,...,,,,,,,,,Granit Xhaka,2013
3,0.0,0.04,0.04,0.0,0.04,,,,,,...,,,,,,,,,Granit Xhaka,2014
4,0.07,0.03,0.1,0.07,0.1,,,,,,...,,,,,,,,,Granit Xhaka,2015


In [64]:
final_df.drop(columns=['Name']).min().sort_values()

Expected_G-xG           -8.7
Expected_A-xAG          -8.7
Expected_np:G-xG        -8.5
Per 90 Minutes_Gls       0.0
Challenges_Tkl%          0.0
                       ...  
Medium_Cmp               0.0
PPA                      0.0
Playing Time_Min         1.0
Standard_Dist            1.2
season-end            2003.0
Length: 99, dtype: float64

In [None]:
# To Make : PK Success %,  TacklesW%

In [None]:
#Divide each metric in types:
    #-Personal metric to filter player
    #-Statistical metric to calculate stats
    #-Team Performance(Positive stats)
    #-Possible cautions

In [None]:
df['age'].isna().sum()

In [None]:
first_seasons = df.groupby('Name').first().reset_index()

In [None]:
first_seasons.drop(columns=['Season', 'Squad', 'Country', 'Comp', 'position', 'Position', 'footed', 'current_club'], inplace=True)

In [None]:
first_seasons.head()

In [None]:
fw_first = first_seasons[first_seasons['FW'] == 1]
mf_first = first_seasons[first_seasons['MF'] == 1]
df_first = first_seasons[first_seasons['DF'] == 1]

In [None]:
fw_first.head()

In [None]:
fw_idx = fw_first[['Name', 'Age']]
mf_idx = mf_first[['Name', 'Age']]
df_idx = df_first[['Name', 'Age']]

In [None]:
fw_first.drop(columns=['Name'], inplace=True)
mf_first.drop(columns=['Name'], inplace=True)
df_first.drop(columns=['Name'], inplace=True)

In [None]:
fw_idx.head()

In [None]:
fw_first.head()

In [None]:
fw_first.shape

In [None]:
min_values = first_seasons.min().tolist()[1:]
max_values = first_seasons.max().tolist()[1:]

In [None]:
len(min_values)

In [None]:
ii = IterativeImputer(n_nearest_features=2*int(np.ceil(np.sqrt(first_seasons.shape[1]))), initial_strategy='median', random_state=10, min_value = min_values, max_value = max_values, max_iter=15)

In [None]:
features = fw_first.columns.tolist()

In [None]:
fw_first = pd.DataFrame(ii.fit_transform(fw_first), columns=features)
mf_first = pd.DataFrame(ii.fit_transform(mf_first), columns=features)
df_first = pd.DataFrame(ii.fit_transform(df_first), columns=features)

In [None]:
mf_first.shape

In [None]:
mf_idx.shape

In [None]:
temp = pd.concat([mf_first, mf_idx], axis=1).drop(columns=['Age'])

In [None]:
temp.head(10)

In [None]:
mf_first.head()

In [None]:
mf_idx.head()

In [None]:
temp.shape

In [None]:
temp.isna().sum()

In [None]:
fw_first = pd.concat([fw_first,fw_idx], axis=1).drop(columns=['Age'])
mf_first = pd.concat([mf_first,mf_idx], axis=1).drop(columns=['Age'])
df_first = pd.concat([df_first,df_idx], axis=1).drop(columns=['Age'])

In [None]:
fw_first['Age'] = fw_idx['Age']
mf_first['Age'] = mf_idx['Age']
df_first['Age'] = df_idx['Age']

In [None]:
df_idx.isna().sum()

In [None]:
mf_first.head()

In [None]:
mf_first.tail()

In [None]:
df_first.shape

In [None]:
df_first.isna().sum()

In [None]:
first_seasons

In [None]:
dtypes = df.dtypes
object_columns = dtypes[dtypes == 'object'].keys().tolist()
print(object_columns)

In [None]:
for col in object_columns:
    print(f"The null values in {col} are : {df[col].isna().sum()}")