In [None]:
import pandas as pd
# Directory the CSV files in this notebook are read from (and the combined CSV is
# written to). The path is relative, so it resolves against the working directory.
RAW_DATA_DIR = '../data/raw'

In [None]:
# Load both FIFA 25 exports with every column read as text.
df_25 = pd.read_csv(f'{RAW_DATA_DIR}/fifa_25.csv', dtype=str)

# Second export: drop rows whose 'gender' is 'F', discard the 'gender' and 'rank'
# columns, and rename 'player_name' to 'name' so it lines up with df_25.
df_25_2 = (
    pd.read_csv(f'{RAW_DATA_DIR}/fifa_25_2.csv', dtype=str)
    .loc[lambda frame: frame['gender'] != 'F']
    .drop(columns=['gender', 'rank'])
    .rename(columns={'player_name': 'name'})
)

df_25.head(10)

In [None]:
# Columns of the first FIFA 25 export that are carried forward.
cols_to_keep = [
    'name', 'full_name', 'description', 'value', 'wage', 'image',
    'height_cm', 'weight_kg', 'dob', 'positions', 'overall_rating',
    'potential', 'preferred_foot', 'weak_foot', 'skill_moves',
    'international_reputation', 'body_type', 'real_face', 'specialities',
    'release_clause', 'club_id', 'club_name', 'club_league_id',
    'club_league_name', 'club_logo', 'club_rating', 'club_position',
    'club_kit_number', 'club_joined', 'club_contract_valid_until',
    'country_id', 'country_name', 'country_league_id', 'country_league_name',
    'country_flag', 'country_rating', 'country_position', 'country_kit_number',
]
# Keep only relevant columns. The explicit .copy() makes df_25 an independent
# frame, so the column assignments in later cells write to it directly instead
# of raising pandas' SettingWithCopyWarning about assigning into a slice.
df_25 = df_25[cols_to_keep].copy()

In [None]:
# Derive the short 'name' from 'description': take the text in front of the
# first '(' and trim surrounding whitespace.
text_before_paren = df_25['description'].str.split('(').str[0]
df_25['name'] = text_before_paren.str.strip()

In [None]:
import re
import unicodedata

def extract_clean_name(row):
    """
    Build an accent-free long name for one player row.

    If ``row['description']`` has the form "Name (Full Name, born ...)", the
    text between "(" and ", born" is used. Otherwise (e.g. "Rodri (born ...)",
    or a missing description) ``row['full_name']`` is used.

    Accents are removed by NFD-decomposing the text and dropping combining
    marks (Unicode category "Mn"); other characters are left untouched.

    Parameters
    ----------
    row : mapping
        Anything indexable by 'description' and 'full_name', such as a
        DataFrame row passed by ``DataFrame.apply(..., axis=1)``.

    Returns
    -------
    str
        The cleaned, stripped name. If neither field provides text (for
        example both are NaN), the missing ``full_name`` value is returned
        unchanged instead of raising.
    """
    description = row['description']
    full_name = row['full_name']

    # A missing description is NaN (a float), which re.search cannot handle,
    # so the pattern is only tried on real strings.
    match = None
    if isinstance(description, str):
        # Match: (Full Name, born OR (Full Name Surname, born
        match = re.search(r'\(([^,]+),\s*born', description)

    name = match.group(1) if match else full_name

    # Nothing to normalize when the fallback itself is missing.
    if not isinstance(name, str):
        return name

    # Decompose unicode characters and remove the combining marks (accents).
    decomposed = unicodedata.normalize('NFD', name)
    name = ''.join(char for char in decomposed if unicodedata.category(char) != 'Mn')

    return name.strip()

# Manual check on two sample rows: one with the full name inside the
# parentheses, one where only the birth date is in the parentheses.
sample_rows = [
    {
        'description': 'Mohamed Salah (Mohamed Salah Hamed Ghaly, born 15 June 1992) is an Egyptian footballer...',
        'full_name': 'Mohamed Salah Hamed Ghalyمحمد صلاح',
    },
    {
        'description': 'Rodri (born 22 June 1996) is a Spanish footballer...',
        'full_name': 'Rodrigo Hernández Cascante',
    },
]
test_data = pd.DataFrame(sample_rows, columns=['description', 'full_name'])

cleaned_names = test_data.apply(extract_clean_name, axis=1)
print("Test results:")
print(cleaned_names)
print("\nExpected: Mohamed Salah Hamed Ghaly, Rodrigo Hernandez Cascante")

In [None]:
# Overwrite 'full_name' with the cleaned long name computed for every row.
cleaned_full_names = df_25.apply(extract_clean_name, axis=1)
df_25['full_name'] = cleaned_full_names

In [None]:
# Use nationality of df_25_2 to fill in missing country_name in df_25.
# Only one nationality per name is kept, so the left merge cannot multiply
# df_25 rows when a name appears more than once in df_25_2.
df_25_2_subset = (
    df_25_2[['name', 'nationality']]
    .drop_duplicates(subset='name', keep='first')
    .rename(columns={'nationality': 'country_name'})
)
df_25 = df_25.merge(
    df_25_2_subset,
    on='name',
    how='left',
    suffixes=('', '_from_25_2'),
    validate='many_to_one',
)
# Existing country_name values win; the df_25_2 value only fills the gaps.
df_25['country_name'] = df_25['country_name'].combine_first(df_25['country_name_from_25_2'])
df_25 = df_25.drop(columns=['country_name_from_25_2'])

# Change dob to year 'born'. strftime yields a plain year string and leaves
# unparseable dates missing. Casting .dt.year to str instead turns every year
# into '1992.0' (and gaps into the text 'nan') as soon as one dob fails to
# parse, because a single NaT makes the year column float.
df_25['born'] = pd.to_datetime(df_25['dob'], errors='coerce').dt.strftime('%Y')
df_25 = df_25.drop(columns=['dob'])

In [None]:
# Remove exact duplicate rows first, then keep the first row for each
# (name, club_name, born) combination, and tag the rows with the FIFA version.
df_25 = (
    df_25
    .drop_duplicates()
    .drop_duplicates(subset=['name', 'club_name', 'born'], keep='first')
    .assign(fifa_version='25.0')
)

In [None]:
# Columns present in df_25_2 that df_25 does not have
cols = set(df_25_2.columns).difference(df_25.columns)
print(cols)

In [None]:
# Columns in df_25_2 to add to df_25
cols_to_add = ['pac', 'def', 'pas', 'dri', 'sho', 'phy', 'age', 'weight_(in kg)', 'height_(in cm)', 'position']

# The join key is the name alone, so df_25_2 is reduced to one row per name
# first. Without this, every df_25 player whose name occurs N times in df_25_2
# would come out of the merge as N rows, and nothing later removes them.
# validate= makes pandas raise if the right side is ever not unique.
df_25_2_subset = (
    df_25_2[['name'] + cols_to_add]
    .drop_duplicates(subset='name', keep='first')
)
df_25 = df_25.merge(df_25_2_subset, on='name', how='left', validate='many_to_one')

# df_25 is now updated with additional columns from df_25_2
df_25.head(10)

In [None]:
# Keep only rows where every one of these columns has a value
# (rows with a NaN in any of them are removed).
cols_to_check = ['pac', 'def', 'pas', 'dri', 'sho', 'phy', 'age']
df_25 = df_25.dropna(subset=cols_to_check, how='any')

df_25.head(10)

In [None]:
# Convert value from €115.5M to numeric 115500000.
# Step 1: make sure every 'value' entry is text before it is parsed.
value_as_text = df_25['value'].astype(str)
df_25['value'] = value_as_text
def convert_value_to_numeric(value_str):
    """Convert a euro amount written as text into a whole number of euros.

    Accepted forms are '€115.5M' (millions), '€1.5K' (thousands) and a plain
    amount without a suffix such as '€500' or '€0'. Leading and trailing
    whitespace is ignored.

    Parameters
    ----------
    value_str : str or missing value
        The amount as text. None, NaN and the literal text 'nan' (what
        ``astype(str)`` produces for NaN) count as missing.

    Returns
    -------
    int or None
        The amount in euros, or None when the input is missing or does not
        start with a recognisable euro amount.
    """
    if pd.isna(value_str) or value_str == 'nan':
        return None
    value_str = str(value_str).strip()

    # The K/M suffix is optional: amounts below one thousand carry no suffix
    # and would otherwise be silently turned into None.
    match = re.match(r'€(\d+(?:\.\d*)?|\.\d+)([MK])?', value_str)
    if not match:
        return None

    multiplier = {'M': 1_000_000, 'K': 1_000, None: 1}[match.group(2)]
    # round() rather than int(): int() truncates, so a float product that lands
    # a hair under the intended whole number would lose a euro.
    return round(float(match.group(1)) * multiplier)
# Parse the already-stringified 'value' column, then stringify and parse 'wage'.
df_25['value'] = df_25['value'].apply(convert_value_to_numeric)
wage_as_text = df_25['wage'].astype(str)
df_25['wage'] = wage_as_text.apply(convert_value_to_numeric)
df_25.head(10)

In [None]:
# Inspect the current df_25 column names
fifa_25_columns = df_25.columns.tolist()
print("\nFIFA 25 columns:", fifa_25_columns)

# Look at the first three cleaned full names
first_full_names = df_25['full_name'].head(3).tolist()
print("Sample FIFA 25 full_name:", first_full_names)

In [None]:
# Rename the two name columns to the names used by the FIFA schema and
# (re)set the fifa_version column to '25'.
name_column_map = {'name': 'short_name', 'full_name': 'long_name'}
df_25 = df_25.rename(columns=name_column_map).assign(fifa_version='25')

# Show the resulting column names in alphabetical order
sorted_fifa_25_columns = sorted(df_25.columns.tolist())
print("\nFIFA 25 columns:", sorted_fifa_25_columns)

In [None]:
# Remove over-detailed columns from df_fifa
# Keep goal-keeper stats
columns_to_remove = [
    'attacking_crossing', 'attacking_finishing', 'attacking_heading_accuracy',
    'attacking_short_passing', 'attacking_volleys',
    'skill_dribbling', 'skill_curve', 'skill_fk_accuracy',
    'skill_long_passing', 'skill_ball_control',
    'movement_acceleration', 'movement_sprint_speed', 'movement_agility',
    'movement_reactions', 'movement_balance',
    'power_shot_power', 'power_jumping', 'power_stamina', 'power_strength',
    'power_long_shots',
    'mentality_aggression', 'mentality_interceptions', 'mentality_positioning',
    'mentality_vision', 'mentality_penalties', 'mentality_composure',
    'defending_marking_awareness', 'defending_standing_tackle',
    'defending_sliding_tackle',
    'ls', 'st', 'rs', 'lw', 'lf', 'cf', 'rf', 'rw',
    'lam', 'cam', 'ram', 'lm', 'lcm', 'cm', 'rcm', 'rm',
    'lwb', 'ldm', 'cdm', 'rdm', 'rwb',
    'lb', 'lcb', 'cb', 'rcb', 'rb', 'gk',
]

# Read the multi-version FIFA file as text and discard the columns listed above.
df_fifa = pd.read_csv(f'{RAW_DATA_DIR}/fifa_data.csv', dtype=str)
df_fifa = df_fifa.drop(columns=columns_to_remove)
df_fifa.head(10)

In [None]:
#TODO: Match up cols -> both dfs need identical cols for vertical concatenation
#TODO: Handle nans in gk/outfield cols
#TODO: Merge df_fifa and df_25 (vertical concat)
# Then ready for merging with fbref stats
# Really just need fuzzy name matching between fifa and fbref, then can merge on that | NAH


In [None]:
# Rows whose 'positions' text contains 'GK' (missing positions count as not GK)
is_gk = df_25['positions'].str.contains('GK', na=False)

# For those rows, each generic stat column is copied into a goalkeeping_* column:
#   goalkeeping column        <- source column
gk_stat_source = {
    'goalkeeping_speed': 'def',
    'goalkeeping_kicking': 'pas',
    'goalkeeping_positioning': 'phy',
    'goalkeeping_reflexes': 'dri',
    'goalkeeping_handling': 'sho',
    'goalkeeping_diving': 'pac',
}
for gk_col, source_col in gk_stat_source.items():
    df_25.loc[is_gk, gk_col] = df_25.loc[is_gk, source_col]

# Once copied, the goalkeepers' generic stat columns are overwritten with '10'.
df_25.loc[is_gk, ['pac', 'def', 'pas', 'dri', 'sho', 'phy']] = '10'

In [None]:
# Non-GK rows of df_25 get '10' in every goalkeeping_* column
gk_stat_cols = [
    'goalkeeping_speed',
    'goalkeeping_kicking',
    'goalkeeping_positioning',
    'goalkeeping_reflexes',
    'goalkeeping_handling',
    'goalkeeping_diving',
]
df_25.loc[~is_gk, gk_stat_cols] = '10'

# In df_fifa only 'goalkeeping_speed' is set to '10' for non-GK rows
fifa_is_gk = df_fifa['player_positions'].str.contains('GK', na=False)
df_fifa.loc[~fifa_is_gk, ['goalkeeping_speed']] = '10'

In [None]:
# Columns of df_fifa that are not carried into the combined table.
cols_to_drop = [
    'player_id', 'release_clause_eur', 'player_url', 'league_level',
    'fifa_update', 'update_as_of', 'club_position', 'club_loaned_from',
    'club_joined_date', 'club_contract_valid_until_year', 'nationality_id',
    'nation_team_id', 'nation_position', 'nation_jersey_number', 'work_rate',
    'real_face', 'player_traits',
]
df_fifa = df_fifa.drop(columns=cols_to_drop)

# Convert dob to birth year 'born'. strftime yields a plain year string and
# leaves unparseable dates missing. Casting .dt.year to str instead turns every
# year into '1992.0' (and gaps into the text 'nan') as soon as one dob fails to
# parse, because a single NaT makes the year column float.
df_fifa['born'] = pd.to_datetime(df_fifa['dob'], errors='coerce').dt.strftime('%Y')
df_fifa = df_fifa.drop(columns=['dob'])
df_fifa.head(5)

In [None]:
# NOTE(review): this list is defined but the drop that uses it is disabled, so
# these columns currently stay in df_25 - confirm whether that is intended.
cols_to_drop = [
    'description', 'real_face', 'country_flag', 'release_clause',
    'club_position', 'club_rating', 'club_joined', 'club_contract_valid_until',
    'country_id', 'country_league_name', 'country_league_id', 'country_rating',
    'country_position', 'country_kit_number', 'weight_(in kg)', 'height_(in cm)',
]
#df_25 = df_25.drop(columns=cols_to_drop)

# 'positions' (the column the GK mask is built from) becomes 'player_positions'.
# The 'position' column merged in from df_25_2 is dropped instead of also being
# renamed, because two source columns mapped to 'player_positions' would leave
# df_25 with duplicate column labels. Dropping 'player_positions' itself, as
# this cell did before, raises KeyError on a fresh kernel: that column does not
# exist until the rename below has run.
df_25 = df_25.drop(columns=['club_logo', 'image', 'position'])
df_25 = df_25.rename(columns={
    'specialities': 'player_tags',
    'club_league_name': 'league_name',
    'club_id': 'club_team_id',
    'pac': 'pace',
    'dri': 'dribbling',
    'phy': 'physic',
    'pas': 'passing',
    'sho': 'shooting',
    'def': 'defending',
    'value': 'value_eur',
    'wage': 'wage_eur',
    'overall_rating': 'overall',
    'positions': 'player_positions',
    'club_kit_number': 'club_jersey_number',
    'country_name': 'nationality_name',
    'club_league_id': 'league_id',
})
df_25.head(5)

In [None]:
# Report how the column sets of df_fifa and df_25 differ and overlap
fifa_cols = set(df_fifa.columns)
df_25_cols = set(df_25.columns)

only_in_fifa = sorted(fifa_cols - df_25_cols)
only_in_25 = sorted(df_25_cols - fifa_cols)
shared_cols = fifa_cols & df_25_cols

print("Columns in df_fifa but NOT in df_25:")
print(only_in_fifa)

print("\nColumns in df_25 but NOT in df_fifa:")
print(only_in_25)

print("\nCommon columns:")
print(sorted(shared_cols))

print(f"\ndf_fifa has {len(fifa_cols)} columns")
print(f"df_25 has {len(df_25_cols)} columns")
print(f"Common columns: {len(shared_cols)}")

In [None]:
# Stack df_fifa on top of df_25 with a fresh 0..n-1 index
frames_to_stack = [df_fifa, df_25]
df_combined = pd.concat(frames_to_stack, ignore_index=True)

# Row and column counts before and after stacking
for summary_line in (
    f"df_fifa rows: {len(df_fifa)}",
    f"df_25 rows: {len(df_25)}",
    f"Combined rows: {len(df_combined)}",
    f"\nColumns: {len(df_combined.columns)}",
):
    print(summary_line)
df_combined.head()


In [None]:
# Rows whose 'player_positions' contains 'GK' get '10' in all six outfield stats
outfield_stats = ['pace', 'shooting', 'passing', 'dribbling', 'defending', 'physic']
is_gk_combined = df_combined['player_positions'].str.contains('GK', na=False)
df_combined.loc[is_gk_combined, outfield_stats] = '10'

In [None]:
# Normalize fifa_version to integer text (e.g. '25.0' -> '25').
version_text = df_combined['fifa_version'].astype(float).astype(int).astype(str)
df_combined['fifa_version'] = version_text

# Create 'season' that is ('fifa version' - 1) + 'fifa version', e.g. '25' -> '2425'.
season_text = version_text.apply(lambda version: f"{int(float(version))-1}{version}")
df_combined['season'] = season_text.astype(str)
df_combined.head()

In [None]:
# Write the combined table as CSV (without the index) into RAW_DATA_DIR
combined_csv_path = f'{RAW_DATA_DIR}/fifa_combined.csv'
df_combined.to_csv(combined_csv_path, index=False)