In [1]:
!pip install fuzzywuzzy
!pip install python-Levenshtein

Collecting fuzzywuzzy
  Using cached fuzzywuzzy-0.18.0-py2.py3-none-any.whl.metadata (4.9 kB)
Using cached fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0
Collecting python-Levenshtein
  Using cached python_levenshtein-0.27.3-py3-none-any.whl.metadata (3.9 kB)
Collecting Levenshtein==0.27.3 (from python-Levenshtein)
  Downloading levenshtein-0.27.3-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (3.7 kB)
Collecting rapidfuzz<4.0.0,>=3.9.0 (from Levenshtein==0.27.3->python-Levenshtein)
  Downloading rapidfuzz-3.14.3-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (12 kB)
Using cached python_levenshtein-0.27.3-py3-none-any.whl (9.5 kB)
Downloading levenshtein-0.27.3-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (153 kB)
Downloading rapidfuzz-3.14.3-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (3.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━

# Map UEFA score

map score to each fbref data by chooing best either team score or country part

In [27]:
import pandas as pd
import numpy as np
from fuzzywuzzy import process
from glob import glob
import os

# --- Display setup ---
pd.set_option('display.width', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

# --- Paths ---
data_path = "../data/fbref/"
save_path = "../data/fbref/modified"
uefa_data_path = "../data/uefa/"
os.makedirs(save_path, exist_ok=True)

# --- Preload ranking data once ---
def load_uefa_ranking(year):
    rankingdf = pd.read_csv(os.path.join(uefa_data_path, f'UEFA_Ranking_{year}.csv'))
    rankingdf['used_point'] = np.where(
        rankingdf['Total Points'] >= rankingdf['Country Part'],
        rankingdf['Total Points'],
        rankingdf['Country Part']
    )

    choices = rankingdf['team'].unique()
    return rankingdf,choices

# --- Matching function ---
def match_team(name,rankingdf,choices):
    if pd.isna(name): 
        return np.nan
    match, score = process.extractOne(name, choices)
    if score > 70:
        return rankingdf.loc[rankingdf['team'] == match, 'used_point'].iloc[0]
    return 0

# --- Outfield Players ---
fbref_files = glob(f"{data_path}/**/*.csv", recursive=True)

for file in fbref_files:
    subfolder,filename = file.split('/')[-2:]
    year = '20'+file.split('/')[-1][-6:-4]
    print(f"⚙️ Processing: {file}", f'\nyear is {year}')
    rankingdf,choices = load_uefa_ranking(year)

    df = pd.read_csv(os.path.join(file))
    
    # Map UEFA coefficient
    df['team_score'] = df['team'].apply(lambda x: match_team(x, rankingdf, choices))
    #df_encoded.drop(columns=['team'], inplace=True)

    # Save cleaned file
    os.makedirs(os.path.join(save_path,subfolder), exist_ok=True)
    output_file = os.path.join(save_path, subfolder, filename)
    df.to_csv(output_file, index=False)
    print(f"Saved to: {output_file}")

⚙️ Processing: ../data/fbref/PL_team/PL_team_22_23.csv 
year is 2023
Saved to: ../data/fbref/modified/PL_team/PL_team_22_23.csv
⚙️ Processing: ../data/fbref/PL_team/PL_team_24_25.csv 
year is 2025
Saved to: ../data/fbref/modified/PL_team/PL_team_24_25.csv
⚙️ Processing: ../data/fbref/PL_team/PL_team_20_21.csv 
year is 2021
Saved to: ../data/fbref/modified/PL_team/PL_team_20_21.csv
⚙️ Processing: ../data/fbref/PL_team/PL_team_21_22.csv 
year is 2022
Saved to: ../data/fbref/modified/PL_team/PL_team_21_22.csv
⚙️ Processing: ../data/fbref/PL_team/PL_team_23_24.csv 
year is 2024
Saved to: ../data/fbref/modified/PL_team/PL_team_23_24.csv
⚙️ Processing: ../data/fbref/PL_team/PL_team_25_26.csv 
year is 2026
Saved to: ../data/fbref/modified/PL_team/PL_team_25_26.csv
⚙️ Processing: ../data/fbref/modified/PL_team/PL_team_22_23.csv 
year is 2023
Saved to: ../data/fbref/modified/PL_team/PL_team_22_23.csv
⚙️ Processing: ../data/fbref/modified/PL_team/PL_team_24_25.csv 
year is 2025
Saved to: ../data

KeyboardInterrupt: 

# Encode position

one player can play various position in the field, one hot encoder helps

In [None]:
#drop cols
if 'ranker' in df.columns:
    df.drop(columns=['ranker'], inplace=True)
else: print(f'ranker doesnt exist in {file}')


# Encode positions
df_encoded = df.copy()
if 'position' in df.columns:
    df_encoded = (
        df_encoded
        .assign(
            position_fw=df['position'].str.contains('FW', na=False).astype(int),
            position_mf=df['position'].str.contains('MF', na=False).astype(int),
            position_df=df['position'].str.contains('DF', na=False).astype(int),
        )
        .drop(columns=['position'])
    )
else: print(f'position doesnt exist in {file}')

#Trim age
if df['age'].dtype == 'object':
    df_encoded['age'] = df_encoded['age'].str.split('-').str[0].str.strip()