In [2]:
# imports
import pandas as pd
import numpy as np
import sklearn

In [3]:
# load in the features of each base position
file_paths  = ['../../data/processed/players_data_processed.parquet', 
               '../../data/processed/players_data_GK.parquet', 
               '../../data/processed/players_data_DF.parquet', 
               '../../data/processed/players_data_MF.parquet', 
               '../../data/processed/players_data_FW.parquet']

df, df_gk, df_df, df_mf, df_fw = [pd.read_parquet(file) for file in file_paths]
base_positions = {'GK':df_gk, 'DF':df_df, 'MF':df_mf, 'FW':df_fw}
df_df.head()

Unnamed: 0,Rk,Player,Nation,Pos,Squad,Comp,Age,Born,MP,Starts,...,Fls,Fld_stats_misc,Off_stats_misc,Recov,Won,Lost_stats_misc,Won%,foot,numeric_wage,exposure_score
5,6,Yunis Abdelhamid,ma MAR,DF,Saint-Étienne,fr Ligue 1,36.0,1987.0,16,11,...,16,6.0,,42,19.0,10,65.5,Left,,0.3
8,9,Ali Abdi,tn TUN,"DF,MF",Nice,fr Ligue 1,30.0,1993.0,25,17,...,27,14.0,5.0,72,22.0,14,61.1,Left,41923.0,0.41
10,11,Abel,es ESP,DF,Osasuna,es La Liga,23.0,2000.0,35,20,...,25,32.0,3.0,106,20.0,17,54.1,,,0.61
14,15,Abner,br BRA,DF,Lyon,fr Ligue 1,24.0,2000.0,19,12,...,16,4.0,,45,7.0,9,43.8,Left,49038.0,0.32
16,17,Abdel Abqar,ma MAR,DF,Alavés,es La Liga,25.0,1999.0,29,29,...,47,31.0,2.0,87,40.0,35,53.3,Right,48077.0,0.72


In [5]:
# normalise numeric features for each position
from sklearn.preprocessing import StandardScaler
categorical_cols = ['Rk','Player', 'Nation', 'Pos','Squad', 'Comp', 'Age','Born','MP','Starts','Min','90s',
                    'numeric_wage', 'foot', 'W', 'D', 'L']

for pos, df_pos in base_positions.items():
    # copy all the categorical columns that exist in the dataframe
    temp_df = df_pos[[col for col in df_pos.columns if col in categorical_cols]].copy()
    df_pos = df_pos[[col for col in df_pos.columns if col not in categorical_cols]]
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(df_pos)
    base_positions[pos] = pd.concat([temp_df, pd.DataFrame(scaled_data, columns=df_pos.columns)], axis=1)
    display(base_positions[pos].head())
    
# save the normalised dataframes
for pos, df_pos in base_positions.items():
    df_pos.to_parquet(f'../../data/processed/players_data_{pos}_normalized.parquet', index=False)

Unnamed: 0,Rk,Player,Nation,Pos,Squad,Comp,Age,Born,MP,Starts,...,Thr,Launch%,AvgLen,Opp,Stp,Stp%,#OPA,#OPA/90,AvgDist,exposure_score
33,34.0,Adrián,es ESP,GK,Betis,es La Liga,37.0,1987.0,19.0,19.0,...,,,,,,,,,,
42,43.0,Julen Agirrezabala,es ESP,GK,Athletic Club,es La Liga,23.0,2000.0,14.0,14.0,...,,,,,,,,,,
87,88.0,Alisson,br BRA,GK,Liverpool,eng Premier League,31.0,1992.0,28.0,28.0,...,,,,,,,,,,
147,148.0,Alphonse Areola,fr FRA,GK,West Ham,eng Premier League,31.0,1993.0,26.0,25.0,...,0.580855,-1.022317,-0.782253,0.386341,1.011304,0.866696,0.147585,-0.349533,-0.609438,0.940161
160,161.0,Kepa Arrizabalaga,es ESP,GK,Bournemouth,eng Premier League,29.0,1994.0,31.0,31.0,...,0.319926,-0.846024,-0.951038,0.393687,-0.099706,-0.436296,0.992843,0.469284,0.717351,0.98439


Unnamed: 0,Rk,Player,Nation,Pos,Squad,Comp,Age,Born,MP,Starts,...,xG+/-,xG+/-90,Fls,Fld_stats_misc,Off_stats_misc,Recov,Won,Lost_stats_misc,Won%,exposure_score
5,6.0,Yunis Abdelhamid,ma MAR,DF,Saint-Étienne,fr Ligue 1,36.0,1987.0,16.0,11.0,...,-0.51324,-0.320591,0.97402,-0.491142,2.442044,0.717719,2.716708,1.663792,1.101424,1.372483
8,9.0,Ali Abdi,tn TUN,"DF,MF",Nice,fr Ligue 1,30.0,1993.0,25.0,17.0,...,-0.796527,-0.561128,-0.680952,-0.578241,0.186114,1.27241,0.600656,-0.315821,1.320869,0.958953
10,11.0,Abel,es ESP,DF,Osasuna,es La Liga,23.0,2000.0,35.0,20.0,...,1.651356,1.459384,0.514305,0.205645,,1.993509,0.193723,-0.384083,0.983262,0.286967
14,15.0,Abner,br BRA,DF,Lyon,fr Ligue 1,24.0,2000.0,19.0,12.0,...,-1.297725,-1.042202,-1.232609,-0.752437,,0.246231,-0.131823,0.77638,-0.713215,1.010645
16,17.0,Abdel Abqar,ma MAR,DF,Alavés,es La Liga,25.0,1999.0,29.0,29.0,...,0.489157,0.497235,-0.405123,0.118547,0.6373,0.884126,-0.253903,-0.452346,0.417769,-0.02318


Unnamed: 0,Rk,Player,Nation,Pos,Squad,Comp,Age,Born,MP,Starts,...,xG+/-,xG+/-90,Fls,Fld_stats_misc,Off_stats_misc,Recov,Won,Lost_stats_misc,Won%,exposure_score
6,7.0,Himad Abdelli,dz ALG,"MF,FW",Angers,fr Ligue 1,24.0,1999.0,32.0,32.0,...,0.266638,0.305595,-0.566775,-0.061961,-0.355013,-0.573792,-1.036085,-0.619785,-2.246503,-0.319419
8,9.0,Ali Abdi,tn TUN,"DF,MF",Nice,fr Ligue 1,30.0,1993.0,25.0,17.0,...,0.487495,0.823859,-0.794868,-0.438139,0.69727,-0.999609,-0.748872,-0.435149,-1.006515,-1.213594
15,16.0,Zakaria Aboukhlal,ma MAR,"MF,FW",Toulouse,fr Ligue 1,24.0,2000.0,26.0,22.0,...,-1.161043,-1.349506,0.87781,-0.93971,-0.091942,-0.824273,-0.002118,0.857303,-0.612985,-0.542963
25,26.0,Tyler Adams,us USA,MF,Bournemouth,eng Premier League,25.0,1999.0,28.0,21.0,...,-0.537911,-0.463442,1.410026,-0.187354,,1.35491,1.606275,0.36494,1.391786,0.351213
29,30.0,Karim Adeyemi,de GER,"FW,MF",Dortmund,de Bundesliga,22.0,2002.0,25.0,17.0,...,-1.492328,-1.065297,0.953841,1.003878,,1.304814,0.629751,1.041938,0.025572,1.189503


Unnamed: 0,Rk,Player,Nation,Pos,Squad,Comp,Age,Born,MP,Starts,...,xG+/-,xG+/-90,Fls,Fld_stats_misc,Off_stats_misc,Recov,Won,Lost_stats_misc,Won%,exposure_score
6,7.0,Himad Abdelli,dz ALG,"MF,FW",Angers,fr Ligue 1,24.0,1999.0,32.0,32.0,...,-1.644973,-2.344744,-0.991113,-1.547524,0.173423,-1.468671,-0.777261,-0.941176,-0.255646,-1.073391
13,14.0,Matthis Abline,fr FRA,FW,Nantes,fr Ligue 1,21.0,2003.0,34.0,33.0,...,0.448856,0.621071,-0.280208,-0.350667,-1.056798,-0.728335,-0.587675,-0.720153,0.095041,-0.729906
15,16.0,Zakaria Aboukhlal,ma MAR,"MF,FW",Toulouse,fr Ligue 1,24.0,2000.0,26.0,22.0,...,0.316039,0.58918,-1.149092,-1.295554,-0.441687,-1.307729,0.132752,0.016593,0.527284,-1.245133
18,19.0,Tammy Abraham,eng ENG,FW,Milan,it Serie A,26.0,1997.0,28.0,12.0,...,-2.629386,-1.818551,-0.359198,0.657213,-0.441687,1.557051,-0.436007,0.45864,-1.291397,1.502747
23,24.0,Akor Adams,ng NGA,FW,Montpellier,fr Ligue 1,24.0,2000.0,15.0,13.0,...,-1.035575,-0.814001,1.773517,1.665092,0.327201,1.202977,0.019,0.790175,-0.581867,0.758529
