In [136]:
import pandas as pd
RAW_DATA_DIR = '../data/raw'

In [137]:
df_25 = pd.read_csv(f'{RAW_DATA_DIR}/fifa_25.csv', dtype = str)
df_25_2 = pd.read_csv(f'{RAW_DATA_DIR}/fifa_25_2.csv', dtype = str)
# Remove 'gender' == 'F' for df_25_2
df_25_2 = df_25_2[df_25_2['gender'] != 'F']
df_25_2 = df_25_2.drop(columns=['gender', 'rank'])
df_25_2.rename(columns={'player_name': 'name'}, inplace=True)
#df_25_2.head(10)

In [138]:
cols_to_keep = ['name','full_name', 'description', 'image', 'height_cm', 'weight_kg', 'dob', 'positions', 'overall_rating', 'potential','preferred_foot', 'weak_foot', 'skill_moves', 'international_reputation', 'body_type', 'real_face', 'specialities', 'release_clause', 'club_id', 'club_name', 'club_league_id', 'club_league_name', 'club_logo', 'club_rating', 'club_position', 'club_kit_number', 'club_joined', 'club_contract_valid_until', 'country_id', 'country_name', 'country_league_id', 'country_league_name', 'country_flag', 'country_rating', 'country_position', 'country_kit_number']
# Keep only relevant columns
df_25 = df_25[cols_to_keep]

In [139]:
# Use description of df_25 to create 'name' column (short name before parenthesis)
df_25['name'] = df_25['description'].str.split('(').str[0].str.strip()
#df_25.head(10)

In [140]:
import re
import unicodedata

def extract_clean_name(row):
    """
    Extract clean name from description or full_name.
    If description has format: "Name (Full Name, born ...)", use Full Name
    Otherwise, use full_name field
    Remove accents and special characters
    """
    description = row['description']
    full_name = row['full_name']
    
    # Check if description has pattern: "Name (Full Name, born"
    # Match: (Full Name, born OR (Full Name Surname, born
    match = re.search(r'\(([^,]+),\s*born', description)
    
    if match:
        # Use the name from parentheses
        name = match.group(1).strip()
    else:
        # No name in parentheses (like "Rodri (born"), use full_name
        name = full_name
    
    # Remove accents and normalize
    # Decompose unicode characters and remove combining characters
    name = unicodedata.normalize('NFD', name)
    name = ''.join(char for char in name if unicodedata.category(char) != 'Mn')
    
    return name.strip()

# Test with examples
test_data = pd.DataFrame({
    'description': [
        'Mohamed Salah (Mohamed Salah Hamed Ghaly, born 15 June 1992) is an Egyptian footballer...',
        'Rodri (born 22 June 1996) is a Spanish footballer...'
    ],
    'full_name': [
        'Mohamed Salah Hamed Ghalyمحمد صلاح',
        'Rodrigo Hernández Cascante'
    ]
})

print("Test results:")
print(test_data.apply(extract_clean_name, axis=1))
print("\nExpected: Mohamed Salah Hamed Ghaly, Rodrigo Hernandez Cascante")

Test results:
0     Mohamed Salah Hamed Ghaly
1    Rodrigo Hernandez Cascante
dtype: object

Expected: Mohamed Salah Hamed Ghaly, Rodrigo Hernandez Cascante


In [141]:
# Apply to df_25
df_25['full_name'] = df_25.apply(extract_clean_name, axis=1)
#df_25.head(10)

In [142]:
# Use nationality of df_25_2 to fill in missing country_name in df_25
df_25_2_subset = df_25_2[['name', 'nationality']].rename(columns={'nationality': 'country_name'})
df_25 = df_25.merge(df_25_2_subset, on='name', how='left', suffixes=('', '_from_25_2'))
df_25['country_name'] = df_25['country_name'].combine_first(df_25['country_name_from_25_2'])
df_25 = df_25.drop(columns=['country_name_from_25_2'])
# Change dob to year 'born'
df_25['born'] = pd.to_datetime(df_25['dob'], errors='coerce').dt.year.astype('str')
df_25 = df_25.drop(columns=['dob'])
#df_25.head(10)

In [143]:
df_25 = df_25.drop_duplicates()
df_25 = df_25.drop_duplicates(subset=['name', 'club_name', 'born'], keep='first')
df_25['fifa_version'] = '25.0'
#df_25.head(10)

In [144]:
# Columns in df_25_2 not in df_25
cols = set(df_25_2.columns) - set(df_25.columns)
print(cols)
#df_25_2.head(10)

{'pac+-', 'def', 'pas', 'ovr', 'sho+-', 'alt_position', 'sho', 'phy', 'dri+-', 'dri', 'pas+-', 'club', 'age', 'phy+-', 'image_url', 'pac', 'def+-', 'preffered_foot', 'league', 'weight_(in kg)', 'nationality', 'height_(in cm)', 'position'}


In [145]:
# Columns in df_25_2 to add to df_25
cols_to_add = ['pac', 'def', 'pas', 'dri', 'sho', 'phy', 'age', 'weight_(in kg)', 'height_(in cm)', 'position']
df_25_2_subset = df_25_2[['name'] + cols_to_add]
df_25 = df_25.merge(df_25_2_subset, on='name', how='left')
#df_25 is now updated with additional columns from df_25_2
#df_25.head(10)

In [146]:
# Check column names to see what we're working with
print("FIFA columns:", df_fifa.columns.tolist())
print("\nFIFA 25 columns:", df_25.columns.tolist())

# Check a few rows to confirm full_name matches long_name
print("\nSample FIFA long_name:", df_fifa['long_name'].head(3).tolist())
print("Sample FIFA 25 full_name:", df_25['full_name'].head(3).tolist())

FIFA columns: ['player_id', 'player_url', 'fifa_version', 'fifa_update', 'update_as_of', 'short_name', 'long_name', 'player_positions', 'overall', 'potential', 'value_eur', 'wage_eur', 'age', 'dob', 'height_cm', 'weight_kg', 'club_team_id', 'club_name', 'league_id', 'league_name', 'league_level', 'club_position', 'club_jersey_number', 'club_loaned_from', 'club_joined_date', 'club_contract_valid_until_year', 'nationality_id', 'nationality_name', 'nation_team_id', 'nation_position', 'nation_jersey_number', 'preferred_foot', 'weak_foot', 'skill_moves', 'international_reputation', 'work_rate', 'body_type', 'real_face', 'release_clause_eur', 'player_tags', 'player_traits', 'pace', 'shooting', 'passing', 'dribbling', 'defending', 'physic', 'goalkeeping_diving', 'goalkeeping_handling', 'goalkeeping_kicking', 'goalkeeping_positioning', 'goalkeeping_reflexes', 'goalkeeping_speed']

FIFA 25 columns: ['name', 'full_name', 'description', 'image', 'height_cm', 'weight_kg', 'positions', 'overall_rat

In [147]:
# Standardize FIFA 25 columns to match FIFA schema
df_25 = df_25.rename(columns={
    'name': 'short_name',
    'full_name': 'long_name'
})

# Add fifa_version column
df_25['fifa_version'] = '25'

# Check that columns align
print("FIFA columns:", sorted(df_fifa.columns.tolist()))
print("\nFIFA 25 columns:", sorted(df_25.columns.tolist()))

# Find columns that exist in one but not the other
fifa_only = set(df_fifa.columns) - set(df_25.columns)
fifa25_only = set(df_25.columns) - set(df_fifa.columns)

if fifa_only:
    print(f"\nColumns only in FIFA: {fifa_only}")
if fifa25_only:
    print(f"\nColumns only in FIFA 25: {fifa25_only}")

FIFA columns: ['age', 'body_type', 'club_contract_valid_until_year', 'club_jersey_number', 'club_joined_date', 'club_loaned_from', 'club_name', 'club_position', 'club_team_id', 'defending', 'dob', 'dribbling', 'fifa_update', 'fifa_version', 'goalkeeping_diving', 'goalkeeping_handling', 'goalkeeping_kicking', 'goalkeeping_positioning', 'goalkeeping_reflexes', 'goalkeeping_speed', 'height_cm', 'international_reputation', 'league_id', 'league_level', 'league_name', 'long_name', 'nation_jersey_number', 'nation_position', 'nation_team_id', 'nationality_id', 'nationality_name', 'overall', 'pace', 'passing', 'physic', 'player_id', 'player_positions', 'player_tags', 'player_traits', 'player_url', 'potential', 'preferred_foot', 'real_face', 'release_clause_eur', 'shooting', 'short_name', 'skill_moves', 'update_as_of', 'value_eur', 'wage_eur', 'weak_foot', 'weight_kg', 'work_rate']

FIFA 25 columns: ['age', 'body_type', 'born', 'club_contract_valid_until', 'club_id', 'club_joined', 'club_kit_num

In [148]:
df_fifa = pd.read_csv(f'{RAW_DATA_DIR}/fifa_data.csv', dtype = str)
# Remove over-detailed columns from df_fifa
# Keep goal-keeper stats
columns_to_remove = ['attacking_crossing', 'attacking_finishing', 'attacking_heading_accuracy', 'attacking_short_passing', 'attacking_volleys', 'skill_dribbling', 'skill_curve', 'skill_fk_accuracy', 'skill_long_passing', 'skill_ball_control', 'movement_acceleration', 'movement_sprint_speed', 'movement_agility', 'movement_reactions', 'movement_balance', 'power_shot_power', 'power_jumping', 'power_stamina', 'power_strength', 'power_long_shots', 'mentality_aggression', 'mentality_interceptions', 'mentality_positioning', 'mentality_vision', 'mentality_penalties', 'mentality_composure', 'defending_marking_awareness', 'defending_standing_tackle', 'defending_sliding_tackle', 'ls', 'st', 'rs', 'lw', 'lf', 'cf', 'rf', 'rw', 'lam', 'cam', 'ram', 'lm', 'lcm', 'cm', 'rcm', 'rm', 'lwb', 'ldm', 'cdm', 'rdm', 'rwb', 'lb', 'lcb', 'cb', 'rcb', 'rb', 'gk']
df_fifa.drop(columns=columns_to_remove, inplace= True)
#df_fifa.head(10)

In [150]:
#TODO: Add gk cols to  df_25 and map the messed up gk values to the right gk cols
#TODO: Matchup cols -> both df need identical calls for vertical concatenation
#TODO: Handle nans in gk/outfield cols
#TODO: Merge df_fifa and df_25 (vertical concat)
# Then ready for merging with freb stats
# Really just need fuzzy name matching between fifa and fbref, then can merge on that
#df_25.head(10)