In [88]:
!pip install fuzzywuzzy
!pip install python-Levenshtein



# Map UEFA score

map score to each fbref data by chooing best either team score or country part

In [89]:
import pandas as pd
import numpy as np
from fuzzywuzzy import process
from glob import glob
import os

# --- Display setup ---
pd.set_option('display.width', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

# --- Paths ---
data_path = "../data/fbref/"
modified_path = "../data/fbref/modified"
uefa_data_path = "../data/uefa/"
os.makedirs(modified_path, exist_ok=True)

# --- Preload ranking data once ---
def load_uefa_ranking(year):
    rankingdf = pd.read_csv(os.path.join(uefa_data_path, f'UEFA_Ranking_{year}.csv'))
    rankingdf['used_point'] = np.where(
        rankingdf['Total Points'] >= rankingdf['Country Part'],
        rankingdf['Total Points'],
        rankingdf['Country Part']
    )

    choices = rankingdf['team'].unique()
    return rankingdf,choices

# --- Matching function ---
def match_team(name,rankingdf,choices):
    if pd.isna(name): 
        return np.nan
    match, score = process.extractOne(name, choices)
    if score > 70:
        return rankingdf.loc[rankingdf['team'] == match, 'used_point'].iloc[0]
    return 0

# --- Outfield Players ---
fbref_files = glob(f"{data_path}/**/*.csv", recursive=True)

for file in fbref_files:
    subfolder,filename = file.split('/')[-2:]
    year = '20'+file.split('/')[-1][-6:-4]
    print(f"⚙️ Processing: {file}", f'\nyear is {year}')
    rankingdf,choices = load_uefa_ranking(year)

    df = pd.read_csv(os.path.join(file))
    
    # Map UEFA coefficient
    df['team_score'] = df['team'].apply(lambda x: match_team(x, rankingdf, choices))
    #df_encoded.drop(columns=['team'], inplace=True)

    # Save cleaned file
    os.makedirs(os.path.join(modified_path,subfolder), exist_ok=True)
    output_file = os.path.join(modified_path, subfolder, filename)
    df.to_csv(output_file, index=False)
    print(f"Saved to: {output_file}")

⚙️ Processing: ../data/fbref/PL_team/PL_team_22_23.csv 
year is 2023
Saved to: ../data/fbref/modified/PL_team/PL_team_22_23.csv
⚙️ Processing: ../data/fbref/PL_team/PL_team_24_25.csv 
year is 2025
Saved to: ../data/fbref/modified/PL_team/PL_team_24_25.csv
⚙️ Processing: ../data/fbref/PL_team/PL_team_20_21.csv 
year is 2021
Saved to: ../data/fbref/modified/PL_team/PL_team_20_21.csv
⚙️ Processing: ../data/fbref/PL_team/PL_team_21_22.csv 
year is 2022
Saved to: ../data/fbref/modified/PL_team/PL_team_21_22.csv
⚙️ Processing: ../data/fbref/PL_team/PL_team_23_24.csv 
year is 2024
Saved to: ../data/fbref/modified/PL_team/PL_team_23_24.csv
⚙️ Processing: ../data/fbref/PL_team/PL_team_25_26.csv 
year is 2026
Saved to: ../data/fbref/modified/PL_team/PL_team_25_26.csv
⚙️ Processing: ../data/fbref/modified/PL_team/PL_team_22_23.csv 
year is 2023
Saved to: ../data/fbref/modified/PL_team/PL_team_22_23.csv
⚙️ Processing: ../data/fbref/modified/PL_team/PL_team_24_25.csv 
year is 2025
Saved to: ../data

# Feature engineering : outfield

In [90]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [91]:
original_df = pd.read_csv('../data/fbref/modified/PL_outfield/PL_outfield_25_26.csv')
original_df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
ranker,450.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
birth_year,450.0,1999.302222,4.114967,1986.0,1997.0,1999.5,2002.0,2009.0
games,450.0,6.815556,2.98761,1.0,4.0,8.0,10.0,10.0
games_starts,450.0,4.888889,3.576228,0.0,2.0,5.0,8.0,10.0
minutes,450.0,439.013333,298.906649,1.0,174.25,414.0,703.25,900.0
minutes_90s,450.0,4.875556,3.320568,0.0,1.9,4.6,7.8,10.0
goals,450.0,0.568889,1.173543,0.0,0.0,0.0,1.0,13.0
assists,450.0,0.393333,0.720956,0.0,0.0,0.0,1.0,4.0
goals_assists,450.0,0.962222,1.525111,0.0,0.0,0.0,1.0,14.0
goals_pens,450.0,0.528889,1.094758,0.0,0.0,0.0,1.0,13.0


In [None]:
#drop cols
df = original_df.copy()
if 'ranker' in df.columns:
    df.drop(columns=['ranker'], inplace=True)
else: print(f'ranker doesnt exist in {file}')

#drop non per90 cols if it already have per90 cols
per90_cols = [c for c in df.columns if c.endswith('_per90')]
base = {c.replace('_per90', '') for c in per90_cols}
cols_to_drop = [c for c in df.columns if c in base]
df = df.drop(columns=cols_to_drop)

#Trim age
if df['age'].dtype == 'object':
    df['age'] = df['age'].str.split('-').str[0].str.strip()


# Encode positions
if 'position' in df.columns:
    df_encoded = (
        df
        .assign(
            position_fw=df['position'].str.contains('FW', na=False).astype(int),
            position_mf=df['position'].str.contains('MF', na=False).astype(int),
            position_df=df['position'].str.contains('DF', na=False).astype(int),
        )
        .drop(columns=['position'])
    )
else: print(f'position doesnt exist in {file}')


print('per90 cols :',sorted(per90_cols))

print('removed cols :',sorted(set(original_df.columns) - set(df_encoded.columns)))

print('len is equal' if len(per90_cols) == len(set(original_df.columns) - set(df_encoded.columns)) else 'len is not equal')

per90_cols = [c for c in df.columns if c.endswith('_per90')]
exclude_cols = [
    'ranker',
    'age',
    'birth_year',
    'minutes',
    'minutes_90s',
    'games',
    'games_starts',
    'position_fw',
    'position_mf',
    'position_df',
    'team',
    'team_score'
] + per90_cols


to_scale = [
    c for c in df.columns
    if c not in exclude_cols 
    and c not in ['minutes_90s']
    and df[c].dtype != 'object'
]

df_scaled = df_encoded.copy()
df_scaled[to_scale] = df_encoded[to_scale].div(df['minutes_90s'], axis=0)
df_scaled.replace([np.inf, -np.inf], 0, inplace=True)
df_scaled.rename(columns={c: f"{c}_per90" for c in to_scale}, inplace=True)

SyntaxError: invalid syntax. Perhaps you forgot a comma? (3185788830.py, line 34)

In [None]:
per90_cols = [c for c in df.columns if c.endswith('_per90')]
exclude_cols = [
    'ranker',
    'age',
    'birth_year',
    'minutes',
    'minutes_90s',
    'games',
    'games_starts',
    'position_fw',
    'position_mf',
    'position_df',
    'team',
    'team_score'
] + per90_cols


to_scale = [
    c for c in df.columns
    if c not in exclude_cols 
    and c not in ['minutes_90s']
    and df[c].dtype != 'object'
]

df_scaled = df_encoded.copy()
df_scaled[to_scale] = df_encoded[to_scale].div(df['minutes_90s'], axis=0)
df_scaled.replace([np.inf, -np.inf], 0, inplace=True)
df_scaled.rename(columns={c: f"{c}_per90" for c in to_scale}, inplace=True)



In [None]:
df_scaled.describe().T


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
birth_year,450.0,1999.302222,4.114967,1986.0,1997.0,1999.5,2002.0,2009.0
games,450.0,6.815556,2.98761,1.0,4.0,8.0,10.0,10.0
games_starts,450.0,4.888889,3.576228,0.0,2.0,5.0,8.0,10.0
minutes,450.0,439.013333,298.906649,1.0,174.25,414.0,703.25,900.0
minutes_90s,450.0,4.875556,3.320568,0.0,1.9,4.6,7.8,10.0
pens_made_per90,437.0,0.007291,0.045554,0.0,0.0,0.0,0.0,0.5
pens_att_per90,437.0,0.008329,0.048167,0.0,0.0,0.0,0.0,0.5
cards_yellow_per90,437.0,0.168874,0.269152,0.0,0.0,0.0,0.27027,2.5
cards_red_per90,437.0,0.004173,0.026643,0.0,0.0,0.0,0.0,0.227273
progressive_carries_per90,437.0,1.711589,1.877332,0.0,0.363636,1.22807,2.4,13.333333


In [None]:
modified_fbref_files = glob(f"{modified_path}/PL_outfield/*.csv", recursive=True)

for file in modified_fbref_files:
    subfolder,filename = file.split('/')[-2:]
    year = '20'+file.split('/')[-1][-6:-4]
    print(f"⚙️ Processing: {file}", f'\nyear is {year}')

    original_df = pd.read_csv(file)
        #drop cols

    df = original_df.copy()
    if 'ranker' in df.columns:
        df.drop(columns=['ranker'], inplace=True)
    else: print(f'ranker doesnt exist in {file}')

    #drop non per90 cols if it already have per90 cols
    per90_cols = [c for c in df.columns if c.endswith('_per90')]
    base = {c.replace('_per90', '') for c in per90_cols}
    cols_to_drop = [c for c in df.columns if c in base]
    df = df.drop(columns=cols_to_drop)

    #Trim age
    if df['age'].dtype == 'object':
        df['age'] = df['age'].str.split('-').str[0].str.strip()


    # Encode positions
    if 'position' in df.columns:
        df_encoded = (
            df
            .assign(
                position_fw=df['position'].str.contains('FW', na=False).astype(int),
                position_mf=df['position'].str.contains('MF', na=False).astype(int),
                position_df=df['position'].str.contains('DF', na=False).astype(int),
            )
            .drop(columns=['position'])
        )
    else: print(f'position doesnt exist in {file}')


    print('per90 cols :',sorted(per90_cols))

    print('removed cols :',sorted(set(original_df.columns) - set(df_encoded.columns)))

    print('len is equal' if len(per90_cols) == len(set(original_df.columns) - set(df_encoded.columns)) else 'len is not equal')

    per90_cols = [c for c in df.columns if c.endswith('_per90')]
    exclude_cols = [
        'ranker',
        'age',
        'birth_year',
        'minutes',
        'minutes_90s',
        'games',
        'games_starts',
        'position_fw',
        'position_mf',
        'position_df',
        'team',
        'team_score'
    ] + per90_cols


    to_scale = [
        c for c in df.columns
        if c not in exclude_cols 
        and c not in ['minutes_90s']
        and df[c].dtype != 'object'
    ]

    df_scaled = df_encoded.copy()
    df_scaled[to_scale] = df_encoded[to_scale].div(df['minutes_90s'], axis=0)
    df_scaled.replace([np.inf, -np.inf], 0, inplace=True)
    df_scaled.rename(columns={c: f"{c}_per90" for c in to_scale}, inplace=True)
    df_scaled.to_csv(file,index = True)
    print(f'saved modified version to {file}')
        

⚙️ Processing: ../data/fbref/modified/PL_outfield/PL_outfield_21_22.csv 
year is 2022
ranker doesnt exist in ../data/fbref/modified/PL_outfield/PL_outfield_21_22.csv
position doesnt exist in ../data/fbref/modified/PL_outfield/PL_outfield_21_22.csv
per90 cols : ['aerials_lost_per90', 'aerials_won_pct_per90', 'aerials_won_per90', 'assisted_shots_per90', 'assists_per90', 'average_shot_distance_per90', 'ball_recoveries_per90', 'blocked_passes_per90', 'blocked_shots_per90', 'blocks_per90', 'cards_red_per90', 'cards_yellow_per90', 'cards_yellow_red_per90', 'carries_distance_per90', 'carries_into_final_third_per90', 'carries_into_penalty_area_per90', 'carries_per90', 'carries_progressive_distance_per90', 'challenge_tackles_pct_per90', 'challenge_tackles_per90', 'challenges_lost_per90', 'challenges_per90', 'clearances_per90', 'corner_kicks_in_per90', 'corner_kicks_out_per90', 'corner_kicks_per90', 'corner_kicks_straight_per90', 'crosses_into_penalty_area_per90', 'crosses_per90', 'dispossessed_

KeyError: "None of [Index(['Unnamed: 0'], dtype='object')] are in the [columns]"