In [15]:
# imports
import pandas as pd
import numpy as np
import sklearn
import pyarrow as pa
import pyarrow.parquet as pq

In [16]:
# Parquet inputs: the full processed table first, then one file per base
# position (goalkeepers, defenders, midfielders, forwards).
file_paths = [
    '../../data/processed/players_data_processed.parquet',
    '../../data/processed/players_data_GK.parquet',
    '../../data/processed/players_data_DF.parquet',
    '../../data/processed/players_data_MF.parquet',
    '../../data/processed/players_data_FW.parquet',
]

# Read the files in list order and unpack them: the combined frame, then the
# four per-position frames.
df, df_gk, df_df, df_mf, df_fw = map(pd.read_parquet, file_paths)

# Position code -> frame lookup used by the cells below.
base_positions = dict(zip(('GK', 'DF', 'MF', 'FW'), (df_gk, df_df, df_mf, df_fw)))

# Preview the defenders frame.
df_df.head()

Unnamed: 0,Rk,Player,Nation,Pos,Squad,Comp,Age,Born,MP,Starts,...,Fls,Fld_stats_misc,Off_stats_misc,Recov,Won,Lost_stats_misc,Won%,foot,numeric_wage,exposure_score
5,6,Yunis Abdelhamid,ma MAR,DF,Saint-Étienne,fr Ligue 1,36.0,1987.0,16,11,...,16,6.0,,42,19.0,10,65.5,Left,,0.3
8,9,Ali Abdi,tn TUN,"DF,MF",Nice,fr Ligue 1,30.0,1993.0,25,17,...,27,14.0,5.0,72,22.0,14,61.1,Left,41923.0,0.41
10,11,Abel,es ESP,DF,Osasuna,es La Liga,23.0,2000.0,35,20,...,25,32.0,3.0,106,20.0,17,54.1,,,0.61
14,15,Abner,br BRA,DF,Lyon,fr Ligue 1,24.0,2000.0,19,12,...,16,4.0,,45,7.0,9,43.8,Left,49038.0,0.32
16,17,Abdel Abqar,ma MAR,DF,Alavés,es La Liga,25.0,1999.0,29,29,...,47,31.0,2.0,87,40.0,35,53.3,Right,48077.0,0.72


In [17]:
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

# Columns that are carried through untouched (never scaled) by the loop below.
# Despite the name, several of them are numeric (e.g. Age, MP, numeric_wage);
# the list really means "identifier / context columns to keep as-is".
categorical_cols = [
    'Rk', 'Player', 'Nation', 'Pos', 'Squad', 'Comp',
    'Age', 'Born',
    'MP', 'Starts', 'Min', '90s',
    'numeric_wage', 'foot',
    'W', 'D', 'L',
]

# Standardize the feature columns of every base position.
# For each frame: keep the columns listed in `categorical_cols` as-is, coerce
# everything else to numeric, z-score it with StandardScaler, and store the
# recombined frame back into `base_positions` under the same key.
for pos, df_pos in base_positions.items():
    # 1) Ensure Rk is a COLUMN. Only pull the index out when the column is
    #    actually missing: calling reset_index() while an index named 'Rk'
    #    coexists with an 'Rk' column raises
    #    "ValueError: cannot insert Rk, already exists".
    if 'Rk' not in df_pos.columns:
        df_pos = df_pos.reset_index()  # a named index keeps its name; an unnamed one becomes 'index'
        # If it came out as 'index', rename it:
        if 'index' in df_pos.columns and 'Rk' not in df_pos.columns:
            df_pos = df_pos.rename(columns={'index': 'Rk'})

    # 2) Use a clean, contiguous index ONCE for both sides of the split
    #    (this also discards a redundant index when Rk is already a column).
    df_pos = df_pos.reset_index(drop=True)

    # 3) Split columns: passthrough columns that exist in this frame...
    keep_cats = [c for c in categorical_cols if c in df_pos.columns]
    temp_df = df_pos[keep_cats].copy()

    # ...and everything else, coerced to numeric (unparseable values become NaN).
    num_cols = [c for c in df_pos.columns if c not in keep_cats]
    num_df = df_pos[num_cols].apply(pd.to_numeric, errors='coerce')

    # Columns that are entirely NaN after coercion carry no information; drop them.
    all_nan_cols = num_df.columns[num_df.isna().all()]
    if len(all_nan_cols) > 0:
        num_df = num_df.drop(columns=all_nan_cols)

    # 4) Scale. A fresh scaler is fitted per position, so statistics are
    #    computed within each position group. The row count is preserved.
    scaler = StandardScaler()
    scaled_vals = scaler.fit_transform(num_df.values)
    scaled_df = pd.DataFrame(scaled_vals, columns=num_df.columns, index=df_pos.index)

    # 5) Assert perfect alignment before gluing the halves back together
    assert len(temp_df) == len(scaled_df), f"Row mismatch: cats={len(temp_df)} vs num={len(scaled_df)}"
    assert temp_df.index.equals(scaled_df.index), "Index mismatch before concat"

    # 6) Concat without re-resetting indexes: passthrough columns first, scaled after
    out_df = pd.concat([temp_df, scaled_df], axis=1)

    # NOTE: Rk completeness/uniqueness is not enforced here; add
    # out_df['Rk'].notna().all() / out_df['Rk'].is_unique asserts if required.

    # Replace the raw frame with its normalized version (existing key, so the
    # dict size does not change while iterating).
    base_positions[pos] = out_df

    print(pos, "rows:", len(out_df))
    display(out_df.head())

# Persist each normalized frame to its own parquet file.
# preserve_index=False: the pandas index is not stored in the Arrow table.
for pos, df_pos in base_positions.items():
    table = pa.Table.from_pandas(
        df_pos,
        preserve_index=False,
    )
    pq.write_table(
        table,
        f'../../data/processed/players_data_{pos}_normalized.parquet',
    )


GK rows: 117


Unnamed: 0,Rk,Player,Nation,Pos,Squad,Comp,Age,Born,MP,Starts,...,Thr,Launch%,AvgLen,Opp,Stp,Stp%,#OPA,#OPA/90,AvgDist,exposure_score
0,34,Adrián,es ESP,GK,Betis,es La Liga,37.0,1987.0,19,19,...,-0.510301,-0.485041,-0.402487,-0.80357,-0.734569,-0.359649,-0.8788,-0.486002,-0.994634,-0.961708
1,43,Julen Agirrezabala,es ESP,GK,Athletic Club,es La Liga,23.0,2000.0,14,14,...,-1.459132,0.312478,0.167162,-1.640915,-0.258422,2.936153,-0.818425,0.166018,0.075357,-1.625151
2,88,Alisson,br BRA,GK,Liverpool,eng Premier League,31.0,1992.0,28,28,...,0.367368,-1.181821,-1.373,-0.81826,-0.972642,-0.781205,1.113594,0.909019,0.84575,0.05557
3,148,Alphonse Areola,fr FRA,GK,West Ham,eng Premier League,31.0,1993.0,26,25,...,0.153881,0.052235,-0.212604,0.202713,-0.655211,-0.972822,-0.033542,0.044712,0.417754,-0.254036
4,161,Kepa Arrizabalaga,es ESP,GK,Bournemouth,eng Premier League,29.0,1994.0,31,31,...,0.225043,-0.048504,-0.043819,0.004395,-0.099706,-0.12971,1.415472,0.878692,0.84575,0.453636


DF rows: 640


Unnamed: 0,Rk,Player,Nation,Pos,Squad,Comp,Age,Born,MP,Starts,...,xG+/-,xG+/-90,Fls,Fld_stats_misc,Off_stats_misc,Recov,Won,Lost_stats_misc,Won%,exposure_score
0,6,Yunis Abdelhamid,ma MAR,DF,Saint-Étienne,fr Ligue 1,36.0,1987.0,16,11,...,-1.341308,-2.661818,-0.589009,-1.013733,,-1.306905,-0.620143,-1.134971,1.025463,-1.573918
1,9,Ali Abdi,tn TUN,"DF,MF",Nice,fr Ligue 1,30.0,1993.0,25,17,...,0.598113,0.882095,0.422363,-0.316945,1.088486,-0.474868,-0.498063,-0.861921,0.654094,-1.005314
2,11,Abel,es ESP,DF,Osasuna,es La Liga,23.0,2000.0,35,20,...,-0.636724,-0.593199,0.238477,1.250826,0.186114,0.468108,-0.57945,-0.657134,0.063282,0.028511
3,15,Abner,br BRA,DF,Lyon,fr Ligue 1,24.0,2000.0,19,12,...,0.416519,0.78588,-0.589009,-1.18793,,-1.223701,-1.108463,-1.203234,-0.806057,-1.470535
4,17,Abdel Abqar,ma MAR,DF,Alavés,es La Liga,25.0,1999.0,29,29,...,-0.106471,-0.064018,2.26122,1.163728,-0.265072,-0.058849,0.234416,0.571592,-0.00424,0.597115


MF rows: 682


Unnamed: 0,Rk,Player,Nation,Pos,Squad,Comp,Age,Born,MP,Starts,...,xG+/-,xG+/-90,Fls,Fld_stats_misc,Off_stats_misc,Recov,Won,Lost_stats_misc,Won%,exposure_score
0,7,Himad Abdelli,dz ALG,"MF,FW",Angers,fr Ligue 1,24.0,1999.0,32,32,...,-1.886715,-1.265915,1.638119,0.251521,-0.355013,2.482073,-0.404216,-0.250513,-0.249156,1.468933
1,9,Ali Abdi,tn TUN,"DF,MF",Nice,fr Ligue 1,30.0,1993.0,25,17,...,0.613699,0.874013,0.041471,-0.877014,0.171129,-0.548744,0.112767,-0.619785,1.362086,-0.878278
2,16,Zakaria Aboukhlal,ma MAR,"MF,FW",Toulouse,fr Ligue 1,24.0,2000.0,26,22,...,0.440169,0.456058,0.649718,-0.187354,2.275695,-0.824273,0.112767,0.857303,-0.449634,-0.039989
3,26,Tyler Adams,us USA,MF,Bournemouth,eng Premier League,25.0,1999.0,28,21,...,0.479607,0.472777,1.410026,-0.438139,,0.503275,0.629751,-0.373603,1.525438,0.015897
4,30,Karim Adeyemi,de GER,"FW,MF",Dortmund,de Bundesliga,22.0,2002.0,25,17,...,0.944984,1.27525,0.041471,-0.626229,-0.091942,-1.149898,-0.633987,-0.68133,-0.13778,-0.542963


FW rows: 464


Unnamed: 0,Rk,Player,Nation,Pos,Squad,Comp,Age,Born,MP,Starts,...,xG+/-,xG+/-90,Fls,Fld_stats_misc,Off_stats_misc,Recov,Won,Lost_stats_misc,Won%,exposure_score
0,7,Himad Abdelli,dz ALG,"MF,FW",Angers,fr Ligue 1,24.0,1999.0,32,32,...,-1.94186,-1.276413,1.852506,0.153273,-0.749243,4.003379,-0.473924,-0.535966,0.046108,1.617242
1,14,Matthis Abline,fr FRA,FW,Nantes,fr Ligue 1,21.0,2003.0,34,33,...,-1.098078,-0.75022,0.746654,1.476115,-0.441687,0.462641,1.459854,1.52692,0.559906,1.502747
2,16,Zakaria Aboukhlal,ma MAR,"MF,FW",Toulouse,fr Ligue 1,24.0,2000.0,26,22,...,0.362915,0.365947,0.825644,-0.287675,0.788534,-0.245507,-0.132669,0.127104,-0.174091,0.071559
3,19,Tammy Abraham,eng ENG,FW,Milan,it Serie A,26.0,1997.0,28,12,...,0.769181,1.274826,-0.122229,-1.106577,-0.134132,-1.822745,0.398172,-0.057082,1.090015,-1.130638
4,24,Akor Adams,ng NGA,FW,Montpellier,fr Ligue 1,24.0,2000.0,15,13,...,-0.957447,-1.531537,-0.596166,-1.358547,1.249867,-1.436483,0.094834,-0.278105,0.983993,-1.359628
