In [110]:
!pip install fuzzywuzzy
!pip install python-Levenshtein



# Map UEFA score

map score to each fbref data by chooing best either team score or country part

In [111]:
import pandas as pd
import numpy as np
from fuzzywuzzy import process
from glob import glob
import os

# --- Display setup ---
pd.set_option('display.width', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

# --- Paths ---
data_path = "../data/fbref/"
modified_path = "../data/fbref/modified"
uefa_data_path = "../data/uefa/"
os.makedirs(modified_path, exist_ok=True)

# --- Preload ranking data once ---
def load_uefa_ranking(year):
    rankingdf = pd.read_csv(os.path.join(uefa_data_path, f'UEFA_Ranking_{year}.csv'))
    rankingdf['used_point'] = np.where(
        rankingdf['Total Points'] >= rankingdf['Country Part'],
        rankingdf['Total Points'],
        rankingdf['Country Part']
    )

    choices = rankingdf['team'].unique()
    return rankingdf,choices

# --- Matching function ---
def match_team(name,rankingdf,choices):
    if pd.isna(name): 
        return np.nan
    match, score = process.extractOne(name, choices)
    if score > 70:
        return rankingdf.loc[rankingdf['team'] == match, 'used_point'].iloc[0]
    return 0

# --- Outfield Players ---
fbref_files = glob(f"{data_path}/**/*.csv", recursive=True)

for file in fbref_files:
    subfolder,filename = file.split('/')[-2:]
    year = '20'+file.split('/')[-1][-6:-4]
    print(f"⚙️ Processing: {file}", f'\nyear is {year}')
    rankingdf,choices = load_uefa_ranking(year)

    df = pd.read_csv(os.path.join(file))
    
    # Map UEFA coefficient
    df['team_score'] = df['team'].apply(lambda x: match_team(x, rankingdf, choices))
    #df_encoded.drop(columns=['team'], inplace=True)

    # Save cleaned file
    os.makedirs(os.path.join(modified_path,subfolder), exist_ok=True)
    output_file = os.path.join(modified_path, subfolder, filename)
    df.to_csv(output_file, index=False)
    print(f"Saved to: {output_file}")

⚙️ Processing: ../data/fbref/PL_team/PL_team_22_23.csv 
year is 2023
Saved to: ../data/fbref/modified/PL_team/PL_team_22_23.csv
⚙️ Processing: ../data/fbref/PL_team/PL_team_24_25.csv 
year is 2025
Saved to: ../data/fbref/modified/PL_team/PL_team_24_25.csv
⚙️ Processing: ../data/fbref/PL_team/PL_team_20_21.csv 
year is 2021
Saved to: ../data/fbref/modified/PL_team/PL_team_20_21.csv
⚙️ Processing: ../data/fbref/PL_team/PL_team_21_22.csv 
year is 2022
Saved to: ../data/fbref/modified/PL_team/PL_team_21_22.csv
⚙️ Processing: ../data/fbref/PL_team/PL_team_23_24.csv 
year is 2024
Saved to: ../data/fbref/modified/PL_team/PL_team_23_24.csv
⚙️ Processing: ../data/fbref/PL_team/PL_team_25_26.csv 
year is 2026
Saved to: ../data/fbref/modified/PL_team/PL_team_25_26.csv
⚙️ Processing: ../data/fbref/modified/PL_team/PL_team_22_23.csv 
year is 2023
Saved to: ../data/fbref/modified/PL_team/PL_team_22_23.csv
⚙️ Processing: ../data/fbref/modified/PL_team/PL_team_24_25.csv 
year is 2025
Saved to: ../data

# Feature engineering : outfield

trim age, encode position and make stats to per90

In [112]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [113]:
modified_fbref_files = glob(f"{modified_path}/PL_outfield/*.csv", recursive=True)

for file in modified_fbref_files:
    subfolder,filename = file.split('/')[-2:]
    year = '20'+file.split('/')[-1][-6:-4]
    print(f"⚙️ Processing: {file}", f'\nyear is {year}')

    original_df = pd.read_csv(file)
        #drop cols

    df = original_df.copy()
    if 'ranker' in df.columns:
        df.drop(columns=['ranker'], inplace=True)
    else: print(f'ranker doesnt exist in {file}')

    #drop non per90 cols if it already have per90 cols
    per90_cols = [c for c in df.columns if c.endswith('_per90')]
    base = {c.replace('_per90', '') for c in per90_cols}
    cols_to_drop = [c for c in df.columns if c in base]
    df = df.drop(columns=cols_to_drop)

    #Trim age
    if df['age'].dtype == 'object':
        df['age'] = df['age'].str.split('-').str[0].str.strip()


    # Encode positions
    if 'position' in df.columns:
        df_encoded = (
            df
            .assign(
                position_fw=df['position'].str.contains('FW', na=False).astype(int),
                position_mf=df['position'].str.contains('MF', na=False).astype(int),
                position_df=df['position'].str.contains('DF', na=False).astype(int),
            )
            .drop(columns=['position'])
        )
    else: print(f'position doesnt exist in {file}')


    print('per90 cols :',sorted(per90_cols))

    print('removed cols :',sorted(set(original_df.columns) - set(df_encoded.columns)))

    print('len is equal' if len(per90_cols) == len(set(original_df.columns) - set(df_encoded.columns)) else 'len is not equal')

    per90_cols = [c for c in df.columns if c.endswith('_per90')]
    exclude_cols = [
        'ranker',
        'age',
        'birth_year',
        'minutes',
        'minutes_90s',
        'games',
        'games_starts',
        'position_fw',
        'position_mf',
        'position_df',
        'team',
        'team_score'
    ] + per90_cols


    to_scale = [
        c for c in df.columns
        if c not in exclude_cols 
        and c not in ['minutes_90s']
        and df[c].dtype != 'object'
    ]

    df_scaled = df_encoded.copy()
    df_scaled[to_scale] = df_encoded[to_scale].div(df['minutes_90s'], axis=0)
    df_scaled.replace([np.inf, -np.inf], 0, inplace=True)
    df_scaled.rename(columns={c: f"{c}_per90" for c in to_scale}, inplace=True)
    df_scaled.to_csv(file,index = True)
    print(f'saved modified version to {file}')
        

⚙️ Processing: ../data/fbref/modified/PL_outfield/PL_outfield_21_22.csv 
year is 2022
per90 cols : ['assists_per90', 'gca_per90', 'goals_assists_pens_per90', 'goals_assists_per90', 'goals_pens_per90', 'goals_per90', 'npxg_per90', 'npxg_xg_assist_per90', 'sca_per90', 'shots_on_target_per90', 'shots_per90', 'xg_assist_per90', 'xg_per90', 'xg_xg_assist_per90']
removed cols : ['assists', 'gca', 'goals', 'goals_assists', 'goals_pens', 'npxg', 'npxg_xg_assist', 'position', 'ranker', 'sca', 'shots', 'shots_on_target', 'xg', 'xg_assist']
len is equal
saved modified version to ../data/fbref/modified/PL_outfield/PL_outfield_21_22.csv
⚙️ Processing: ../data/fbref/modified/PL_outfield/PL_outfield_23_24.csv 
year is 2024
per90 cols : ['assists_per90', 'gca_per90', 'goals_assists_pens_per90', 'goals_assists_per90', 'goals_pens_per90', 'goals_per90', 'npxg_per90', 'npxg_xg_assist_per90', 'sca_per90', 'shots_on_target_per90', 'shots_per90', 'xg_assist_per90', 'xg_per90', 'xg_xg_assist_per90']
removed 

# Feature engineering : outfield

trim age and make stats to per90

In [114]:
modified_fbref_files = glob(f"{modified_path}/PL_keeper/*.csv", recursive=True)

for file in modified_fbref_files:
    subfolder,filename = file.split('/')[-2:]
    year = '20'+file.split('/')[-1][-6:-4]
    print(f"⚙️ Processing: {file}", f'\nyear is {year}')

    original_df_keeper = pd.read_csv(file)
    df_keeper = original_df_keeper.copy()

    # drop cols
    if 'matches' in df_keeper.columns:
        df_keeper.drop(columns=['matches'], inplace=True)

    if 'ranker' in df_keeper.columns:
        df_keeper.drop(columns=['ranker'], inplace=True)

    # drop base cols if per90 exists
    per90_cols = [c for c in df_keeper.columns if c.endswith('_per90')]
    base = {c.replace('_per90', '') for c in per90_cols}
    df_keeper.drop(columns=[c for c in df_keeper.columns if c in base], inplace=True)

    # age cleanup
    if df_keeper['age'].dtype == 'object':
        df_keeper['age'] = df_keeper['age'].str.split('-').str[0].astype(int)

    # exclude list
    exclude_cols = [
        'player','nationality','position','team','age','birth_year',
        'gk_games','gk_games_starts','gk_minutes','team_score',
        'minutes_90s'
    ] + per90_cols

    to_scale = [
        c for c in df_keeper.columns
        if c not in exclude_cols
        and df_keeper[c].dtype != 'object'
    ]

    # scale
    df_keeper_scaled = df_keeper.copy()
    df_keeper_scaled[to_scale] = df_keeper_scaled[to_scale].div(
        df_keeper_scaled['minutes_90s'].replace(0, np.nan), axis=0
    )

    df_keeper_scaled.rename(
        columns={c: f"{c}_per90" for c in to_scale},
        inplace=True
    )

    df_keeper_scaled.replace([np.inf, -np.inf], 0, inplace=True)

    df_keeper_scaled.to_csv(file, index=True)

⚙️ Processing: ../data/fbref/modified/PL_keeper/PL_keeper_21_22.csv 
year is 2022
⚙️ Processing: ../data/fbref/modified/PL_keeper/PL_keeper_20_21.csv 
year is 2021
⚙️ Processing: ../data/fbref/modified/PL_keeper/PL_keeper_22_23.csv 
year is 2023
⚙️ Processing: ../data/fbref/modified/PL_keeper/PL_keeper_25_26.csv 
year is 2026
⚙️ Processing: ../data/fbref/modified/PL_keeper/PL_keeper_24_25.csv 
year is 2025
⚙️ Processing: ../data/fbref/modified/PL_keeper/PL_keeper_23_24.csv 
year is 2024


In [117]:
df_keeper_scaled.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,40.0,28.625,3.705073,20.0,26.0,29.5,30.25,38.0
birth_year,40.0,1994.025,3.519087,1985.0,1992.0,1993.0,1996.25,2002.0
gk_games,40.0,19.4,13.028175,1.0,8.25,17.5,32.0,38.0
gk_games_starts,40.0,19.0,13.347428,0.0,5.75,17.5,32.0,38.0
minutes_90s,40.0,19.0,13.060255,0.0,6.825,17.5,31.175,38.0
gk_goals_against_per90,40.0,2.24525,3.370139,0.0,1.33,1.645,2.0525,22.5
gk_shots_on_target_against_per90,39.0,5.035057,1.569566,2.15625,4.021055,4.821918,5.864457,10.0
gk_saves_per90,39.0,3.373695,1.455863,1.40625,2.808451,3.166667,3.911946,10.0
gk_save_pct_per90,39.0,20.156673,79.522292,1.674603,2.178083,3.452381,9.125,500.0
gk_wins_per90,39.0,0.362642,0.22259,0.0,0.22739,0.342105,0.513158,0.833333
