In [118]:
import pandas as pd
# Directory holding the raw CSV inputs (relative path, resolved against the working directory)
RAW_DATA_DIR = '../data/raw'

In [119]:
# Load the raw FIFA export; every column is read as text
df_fifa = pd.read_csv(f'{RAW_DATA_DIR}/fifa_data.csv', dtype = str)
# The version is parsed as float first and then truncated to int
df_fifa['fifa_version'] = df_fifa['fifa_version'].astype(float).astype(int)
# Season label = (fifa version - 1) followed by the fifa version, e.g. 24 -> "2324"
df_fifa['season'] = (
    (df_fifa['fifa_version'] - 1).astype(str)
    + df_fifa['fifa_version'].astype(str)
)
#df_fifa.head(10)

In [120]:
def remove_early_seasons(df, seasons_to_remove=('1415', '1516', '1617')):
    """Return a copy of ``df`` without the rows belonging to excluded seasons.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``season`` column holding labels such as ``'1718'``.
    seasons_to_remove : iterable of str, optional
        Season labels to drop. Defaults to ``'1415'``, ``'1516'`` and ``'1617'``.

    Returns
    -------
    pandas.DataFrame
        The remaining rows with their original index labels. The result is an
        independent copy, so adding columns to it neither touches ``df`` nor
        triggers pandas' chained-assignment warning.
    """
    keep_mask = ~df['season'].isin(list(seasons_to_remove))
    return df.loc[keep_mask].copy()
# Drop the seasons excluded by remove_early_seasons from the FIFA frame
df_fifa = remove_early_seasons(df_fifa)
#df_fifa.head(10)

In [121]:
# Dob -> year of birth
# Unparseable dates become NaT because of errors='coerce'.
df_fifa['DOB'] = pd.to_datetime(df_fifa['dob'], errors='coerce')
# When any NaT is present, .dt.year is a float column and its string form is
# "1986.0", which can never equal a plain "1986" key. Casting through the
# nullable Int64 dtype keeps whole-number strings; missing years become "<NA>".
df_fifa['born'] = df_fifa['DOB'].dt.year.astype('Int64').astype('str')


In [122]:
# Keep only FIFA leagues that correspond to FBref leagues
# (the trailing comment on each entry is the FBref league it corresponds to)
fifa_leagues_to_keep = [
    'Premier League',      # ENG-Premier League
    'La Liga',             # ESP-La Liga
    'Ligue 1',             # FRA-Ligue 1
    'Bundesliga',          # GER-Bundesliga
    'Serie A',             # ITA-Serie A
    'Eredivisie',          # NED-Eredivisie
    'Liga Portugal',       # POR-Primeira Liga
    'Super Lig',           # TUR-Super Lig
    'Major League Soccer', # USA-MLS
    'Jupiler Pro League',  # BEL-Jupiler Pro League
]

# Take an explicit copy of the filtered rows: later cells add columns to
# df_fifa, and assigning into a filtered slice triggers pandas'
# chained-assignment warning.
df_fifa = df_fifa.loc[df_fifa['league_name'].isin(fifa_leagues_to_keep)].copy()
print(f"Filtered FIFA to {len(df_fifa)} rows in relevant leagues")
#df_fifa.head(10)

Filtered FIFA to 44574 rows in relevant leagues


In [123]:
import unicodedata

def normalize_name(text):
    """Return ``text`` lowercased with combining accent marks stripped.

    Missing values (anything ``pd.isna`` reports as missing) map to ``""``.
    """
    if pd.isna(text):
        return ""
    # NFD splits an accented character into its base character followed by
    # combining marks (Unicode category "Mn"); only the marks are dropped.
    decomposed = unicodedata.normalize('NFD', text)
    kept_chars = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept_chars).lower()

df_names = pd.read_csv(f'{RAW_DATA_DIR}/fbref_merged_stats.csv', dtype = str)

# 1. Normalize and split names
#    FBref names are kept as token lists; FIFA long/short names become token sets.
df_names['name_parts'] = df_names['player'].apply(lambda name: normalize_name(name).split())
# assign() builds a new frame instead of writing columns into df_fifa in place.
df_fifa = df_fifa.assign(
    long_name_lower_parts=df_fifa['long_name'].apply(lambda name: set(normalize_name(name).split())),
    short_name_lower_parts=df_fifa['short_name'].apply(lambda name: set(normalize_name(name).split())),
)

# 2. Create a lookup dictionary from FBref
#    (season, born) -> list of (player, set of name tokens)
#    Zipping the columns avoids building a Series per row as iterrows() does.
fbref_lookup = {}
for season, born, player, parts in zip(
    df_names['season'], df_names['born'], df_names['player'], df_names['name_parts']
):
    fbref_lookup.setdefault((season, born), []).append((player, set(parts)))

# 3. Fast matching function
def match_name(row):
    """Find the FBref player name that corresponds to one FIFA row.

    Candidates are the FBref entries stored in ``fbref_lookup`` under the
    row's ``(season, born)`` key. Two strategies are used:

    1. Strict: every FBref name token longer than one character appears among
       the FIFA long/short name tokens.
    2. Lenient: at least one FBref token longer than two characters appears
       among the FIFA tokens (for names whose parts are ordered or split
       differently between the sources).

    A strict match on any candidate takes priority over a lenient match on
    an earlier candidate. Candidates without any usable token are skipped,
    because an empty token set would otherwise satisfy the strict subset
    test for every FIFA row.

    Parameters
    ----------
    row : mapping
        Must provide ``season``, ``born``, ``long_name_lower_parts`` and
        ``short_name_lower_parts`` (the latter two as sets of tokens).

    Returns
    -------
    The matched FBref player name, or ``None`` when nothing matches.
    """
    key = (row['season'], row['born'])
    candidates = fbref_lookup.get(key)

    if not candidates:
        return None

    # Combine long and short FIFA names for broader matching
    fifa_all_parts = row['long_name_lower_parts'] | row['short_name_lower_parts']

    # Filter out single-letter initials from the FIFA name
    fifa_all_parts_filtered = {p for p in fifa_all_parts if len(p) > 1}

    lenient_match = None
    for player_name, fb_parts in candidates:
        # Filter out single-letter initials from the FBref name
        fb_parts_filtered = {p for p in fb_parts if len(p) > 1}
        if not fb_parts_filtered:
            # Nothing to compare against; an empty set is a subset of everything
            continue

        # Strategy 1: all FBref parts in FIFA (strict) -> accept immediately
        if fb_parts_filtered <= fifa_all_parts_filtered:
            return player_name

        # Strategy 2: remember the first candidate sharing a substantial
        # (>2 chars) part, but keep scanning in case a strict match follows
        if lenient_match is None:
            substantial_fb_parts = {p for p in fb_parts_filtered if len(p) > 2}
            if substantial_fb_parts & fifa_all_parts_filtered:
                lenient_match = player_name

    return lenient_match

# 4. Apply matching
# One chain, no in-place mutation: add the matched FBref name for every FIFA
# row, then drop the temporary token-set columns used only by match_name.
df_fifa = (
    df_fifa
    .assign(fuzzy_name=df_fifa.apply(match_name, axis=1))
    .drop(columns=['long_name_lower_parts', 'short_name_lower_parts'])
)
#df_fifa.head(10)

In [124]:
#df_names.head(10)

In [125]:
# Number of rows in the FBref table
len(df_names)

44908

In [126]:
# Count the FIFA rows that received no FBref name, next to the total row count
unmatched_count = df_fifa['fuzzy_name'].isna().sum()
print(f"Number of unmatched players: {unmatched_count}")
print(len(df_fifa))

Number of unmatched players: 9898
44574


In [127]:
# Keep only the FIFA rows that were matched to an FBref name
df_fifa = df_fifa.dropna(subset=['fuzzy_name'])
print(f"After removing unmatched, FIFA has {len(df_fifa)} rows")

After removing unmatched, FIFA has 34676 rows


In [128]:
# Debug: check how one specific player appears in both sources
test_player = "Hugo Lloris"
# Search on the surname (last token of test_player); regex=False makes it a
# literal, case-insensitive substring match. Players sharing the surname are
# listed as well.
test_surname = test_player.split()[-1]

# Check in FBref data
fbref_hugo = df_names[df_names['player'].str.contains(test_surname, case=False, na=False, regex=False)]
print(f"FBref {test_player} entries:")
print(fbref_hugo[['player', 'season', 'born']].head())

# Check in FIFA data
fifa_hugo = df_fifa[df_fifa['long_name'].str.contains(test_surname, case=False, na=False, regex=False)]
print(f"\nFIFA {test_player} entries:")
print(fifa_hugo[['long_name', 'short_name', 'season', 'born', 'fuzzy_name']].head())

# Compare the (season, born) values that form the lookup key on each side
if not fbref_hugo.empty and not fifa_hugo.empty:
    print("\nFBref seasons:", fbref_hugo['season'].unique())
    print("FIFA seasons:", fifa_hugo['season'].unique())
    print("\nFBref born:", fbref_hugo['born'].unique())
    print("FIFA born:", fifa_hugo['born'].unique())

FBref Hugo Lloris entries:
           player season  born
4482  Hugo Lloris   1718  1986
4995  Hugo Lloris   1819  1986
5513  Hugo Lloris   1920  1986
6044  Hugo Lloris   2021  1986
6591  Hugo Lloris   2122  1986

FIFA Hugo Lloris entries:
            long_name short_name season  born      fuzzy_name
3848   Gautier Lloris  G. Lloris   2324  1995  Gautier Lloris
18387     Hugo Lloris  H. Lloris   2223  1986     Hugo Lloris
36922     Hugo Lloris  H. Lloris   2122  1986     Hugo Lloris
56158     Hugo Lloris  H. Lloris   2021  1986     Hugo Lloris
75046     Hugo Lloris  H. Lloris   1920  1986     Hugo Lloris

FBref seasons: ['1718' '1819' '1920' '2021' '2122' '2223' '2324' '2425']
FIFA seasons: ['2324' '2223' '2122' '2021' '1920' '1819' '1718']

FBref born: ['1986' '1995']
FIFA born: ['1995' '1986']
