In [1]:
import pandas as pd
RAW_DATA_DIR = '../data/raw'

In [2]:
# Read both FIFA 25 exports with every column as text.
df_25 = pd.read_csv(f'{RAW_DATA_DIR}/fifa_25.csv', dtype=str)
# Secondary export: keep rows whose gender is not 'F', drop the helper
# columns, and rename player_name -> name to match df_25.
df_25_2 = (
    pd.read_csv(f'{RAW_DATA_DIR}/fifa_25_2.csv', dtype=str)
    .loc[lambda frame: frame['gender'] != 'F']
    .drop(columns=['gender', 'rank'])
    .rename(columns={'player_name': 'name'})
)
df_25.head(10)

Unnamed: 0,player_id,version,name,full_name,description,image,height_cm,weight_kg,dob,positions,...,composure,defensive_awareness,standing_tackle,sliding_tackle,gk_diving,gk_handling,gk_kicking,gk_positioning,gk_reflexes,play_styles
0,231866,2025-06-03,Rodri -,Rodrigo Hernández Cascante,Rodri (born 22 June 1996) is a Spanish footbal...,https://cdn.sofifa.net/players/231/866/25_120.png,190,82,1996-06-22,"CDM,CM",...,,,,,,,,,,
1,209331,2025-06-03,Mohamed Salah Hamed Ghaly -,Mohamed Salah Hamed Ghalyمحمد صلاح,"Mohamed Salah (Mohamed Salah Hamed Ghaly, born...",https://cdn.sofifa.net/players/209/331/25_120.png,175,72,1992-06-15,"RM,RW",...,,,,,,,,,,
2,252371,2025-06-03,Jude Victor William Bellingham -,Jude Victor William Bellingham,Jude Bellingham (Jude Victor William Bellingha...,https://cdn.sofifa.net/players/252/371/25_120.png,186,75,2003-06-29,"CAM,CM",...,,,,,,,,,,
3,239085,2025-06-03,Erling Braut Håland -,Erling Braut Håland,"Erling Haaland (Erling Braut Håland, born 21 J...",https://cdn.sofifa.net/players/239/085/25_120.png,195,94,2000-07-21,ST,...,,,,,,,,,,
4,238794,2025-06-03,Vini Jr. -,Vinicius José Paixão de Oliveira Junior,Vini Jr. (Vinicius José Paixão de Oliveira Jun...,https://cdn.sofifa.net/players/238/794/25_120.png,176,73,2000-07-12,"LW,ST",...,,,,,,,,,,
5,231747,2025-06-03,Kylian Mbappé Lottin -,Kylian Mbappé Lottin,"Kylian Mbappé (Kylian Mbappé Lottin, born 20 D...",https://cdn.sofifa.net/players/231/747/25_120.png,182,75,1998-12-20,"ST,LW",...,,,,,,,,,,
6,203376,2025-06-03,Virgil van Dijk -,Virgil van Dijk,Virgil van Dijk (born 8 July 1991) is a Dutch ...,https://cdn.sofifa.net/players/203/376/25_120.png,193,92,1991-07-08,CB,...,,,,,,,,,,
7,202126,2025-06-03,Harry Edward Kane -,Harry Edward Kane,"Harry Kane (Harry Edward Kane, born 28 July 19...",https://cdn.sofifa.net/players/202/126/25_120.png,188,86,1993-07-28,ST,...,,,,,,,,,,
8,256630,2025-06-03,Florian Richard Wirtz -,Florian Richard Wirtz,"Florian Wirtz (Florian Richard Wirtz, born 3 M...",https://cdn.sofifa.net/players/256/630/25_120.png,177,71,2003-05-03,"CAM,ST",...,,,,,,,,,,
9,212831,2025-06-03,Alisson -,Alisson Ramsés Becker,"Alisson (Alisson Ramsés Becker, born 2 October...",https://cdn.sofifa.net/players/212/831/25_120.png,193,91,1992-10-02,GK,...,,,,,,,,,,


In [3]:
cols_to_keep = ['name','full_name', 'description', 'value','wage','image', 'height_cm', 'weight_kg', 'dob', 'positions', 'overall_rating', 'potential','preferred_foot', 'weak_foot', 'skill_moves', 'international_reputation', 'body_type', 'real_face', 'specialities', 'release_clause', 'club_id', 'club_name', 'club_league_id', 'club_league_name', 'club_logo', 'club_rating', 'club_position', 'club_kit_number', 'club_joined', 'club_contract_valid_until', 'country_id', 'country_name', 'country_league_id', 'country_league_name', 'country_flag', 'country_rating', 'country_position', 'country_kit_number']
# Keep only relevant columns.
# .copy() makes df_25 an independent frame: later cells assign new values to
# its columns, and assigning into a bare column selection triggers pandas'
# SettingWithCopyWarning.
df_25 = df_25[cols_to_keep].copy()

In [4]:
# Short display name = the text of `description` before its first '(',
# with surrounding whitespace removed.
text_before_paren = df_25['description'].str.split('(', n=1).str.get(0)
df_25['name'] = text_before_paren.str.strip()

In [5]:
import re
import unicodedata

def extract_clean_name(row):
    """
    Build an accent-free full name for one player row.

    The name is taken from ``row['description']`` when it has the form
    "Short Name (Full Name, born ...)". Otherwise ``row['full_name']`` is
    used (e.g. for "Rodri (born ..."). After NFD decomposition, combining
    marks (Unicode category ``Mn``) are removed, so "Hernández" becomes
    "Hernandez". Characters that do not decompose into base + mark are kept.

    Parameters
    ----------
    row : mapping
        Anything indexable by 'description' and 'full_name', such as a
        DataFrame row or a dict.

    Returns
    -------
    str or None
        The cleaned name, or None when neither field holds a usable string.
    """
    description = row['description']
    full_name = row['full_name']

    name = None
    # Missing cells arrive as float NaN rather than str; re.search would raise
    # TypeError on them, so only search real strings.
    if isinstance(description, str):
        # Match: (Full Name, born
        match = re.search(r'\(([^,]+),\s*born', description)
        if match:
            name = match.group(1).strip()

    if name is None:
        # No name in parentheses (like "Rodri (born"): fall back to full_name.
        if not isinstance(full_name, str):
            return None
        name = full_name

    # Decompose unicode characters, then drop the combining marks.
    name = unicodedata.normalize('NFD', name)
    name = ''.join(char for char in name if unicodedata.category(char) != 'Mn')

    return name.strip()

# Smoke test covering both description shapes: one with the full name inside
# the parentheses, one without (falls back to full_name).
test_data = pd.DataFrame([
    {
        'description': 'Mohamed Salah (Mohamed Salah Hamed Ghaly, born 15 June 1992) is an Egyptian footballer...',
        'full_name': 'Mohamed Salah Hamed Ghalyمحمد صلاح',
    },
    {
        'description': 'Rodri (born 22 June 1996) is a Spanish footballer...',
        'full_name': 'Rodrigo Hernández Cascante',
    },
])

print("Test results:")
print(test_data.apply(extract_clean_name, axis=1))
print("\nExpected: Mohamed Salah Hamed Ghaly, Rodrigo Hernandez Cascante")

Test results:
0     Mohamed Salah Hamed Ghaly
1    Rodrigo Hernandez Cascante
dtype: object

Expected: Mohamed Salah Hamed Ghaly, Rodrigo Hernandez Cascante


In [6]:
# Replace full_name with the cleaned, accent-free name computed row by row.
df_25 = df_25.assign(full_name=df_25.apply(extract_clean_name, axis=1))

In [7]:
# Use nationality of df_25_2 to fill in missing country_name in df_25.
# The lookup is reduced to one row per name before merging: a name that
# occurs several times in df_25_2 would otherwise multiply the matching
# df_25 rows. Rows without a name are removed as well, because pandas merge
# matches missing keys to each other.
df_25_2_subset = (
    df_25_2[['name', 'nationality']]
    .rename(columns={'nationality': 'country_name_from_25_2'})
    .dropna(subset=['name'])
    .drop_duplicates(subset='name', keep='first')
)
# validate= makes pandas raise if the lookup is ever not unique per name.
df_25 = df_25.merge(df_25_2_subset, on='name', how='left', validate='many_to_one')
df_25['country_name'] = df_25['country_name'].combine_first(df_25['country_name_from_25_2'])
df_25 = df_25.drop(columns=['country_name_from_25_2'])
# Change dob to year 'born' (text such as '1996').
# strftime keeps unparseable dates as missing values. The previous
# `.dt.year.astype('str')` turned the whole column into floats once any date
# failed to parse, producing '1996.0' and the literal string 'nan'.
df_25['born'] = pd.to_datetime(df_25['dob'], errors='coerce').dt.strftime('%Y')
df_25 = df_25.drop(columns=['dob'])

In [8]:
# Remove exact duplicate rows, then keep the first row for each
# (name, club_name, born) combination, and tag the rows with their version.
df_25 = (
    df_25
    .drop_duplicates()
    .drop_duplicates(subset=['name', 'club_name', 'born'], keep='first')
    .assign(fifa_version='25.0')
)

In [9]:
# List the df_25_2 columns that df_25 does not have.
cols = set(df_25_2.columns).difference(df_25.columns)
print(cols)

{'preffered_foot', 'pac+-', 'sho+-', 'dri+-', 'image_url', 'dri', 'phy', 'def', 'pac', 'club', 'phy+-', 'sho', 'pas+-', 'nationality', 'league', 'position', 'weight_(in kg)', 'height_(in cm)', 'alt_position', 'pas', 'def+-', 'ovr', 'age'}


In [10]:
# Columns in df_25_2 to add to df_25
cols_to_add = ['pac', 'def', 'pas', 'dri', 'sho', 'phy', 'age', 'weight_(in kg)', 'height_(in cm)', 'position']
# One lookup row per name: the merge key is the name alone, so a name that
# appears more than once in df_25_2 would duplicate the matching df_25 rows,
# and nothing downstream removes those duplicates. Rows without a name are
# dropped because pandas merge matches missing keys to each other.
df_25_2_subset = (
    df_25_2[['name'] + cols_to_add]
    .dropna(subset=['name'])
    .drop_duplicates(subset='name', keep='first')
)
# validate= makes pandas raise if the lookup is ever not unique per name,
# so the row count of df_25 cannot change here.
df_25 = df_25.merge(df_25_2_subset, on='name', how='left', validate='many_to_one')
len(df_25), len(df_25_2)

(18396, 15905)

In [11]:
# Discard every row that is missing at least one of the columns below
# (the values merged in from df_25_2).
cols_to_check = ['pac', 'def', 'pas', 'dri', 'sho', 'phy', 'age']
df_25 = df_25.dropna(subset=cols_to_check, how='any')
df_25.head(10)

Unnamed: 0,name,full_name,description,value,wage,image,height_cm,weight_kg,positions,overall_rating,...,pac,def,pas,dri,sho,phy,age,weight_(in kg),height_(in cm),position
0,Rodri,Rodrigo Hernandez Cascante,Rodri (born 22 June 1996) is a Spanish footbal...,€115.5M,€440K,https://cdn.sofifa.net/players/231/866/25_120.png,190,82,"CDM,CM",91,...,66,87,86,84,80,85,28,82,191,CDM
1,Mohamed Salah,Mohamed Salah Hamed Ghaly,"Mohamed Salah (Mohamed Salah Hamed Ghaly, born...",€104M,€350K,https://cdn.sofifa.net/players/209/331/25_120.png,175,72,"RM,RW",91,...,89,45,82,88,87,75,32,72,175,RW
2,Jude Bellingham,Jude Victor William Bellingham,Jude Bellingham (Jude Victor William Bellingha...,€174.5M,€280K,https://cdn.sofifa.net/players/252/371/25_120.png,186,75,"CAM,CM",90,...,80,78,83,88,87,83,21,75,186,CAM
3,Erling Haaland,Erling Braut Haland,"Erling Haaland (Erling Braut Håland, born 21 J...",€157M,€270K,https://cdn.sofifa.net/players/239/085/25_120.png,195,94,ST,90,...,88,45,70,81,92,88,24,94,195,ST
4,Vini Jr.,Vinicius Jose Paixao de Oliveira Junior,Vini Jr. (Vinicius José Paixão de Oliveira Jun...,€171.5M,€340K,https://cdn.sofifa.net/players/238/794/25_120.png,176,73,"LW,ST",90,...,95,29,81,91,84,69,24,73,176,LW
5,Kylian Mbappé,Kylian Mbappe Lottin,"Kylian Mbappé (Kylian Mbappé Lottin, born 20 D...",€160M,€380K,https://cdn.sofifa.net/players/231/747/25_120.png,182,75,"ST,LW",90,...,97,36,80,92,90,78,25,75,182,ST
6,Virgil van Dijk,Virgil van Dijk,Virgil van Dijk (born 8 July 1991) is a Dutch ...,€77.5M,€220K,https://cdn.sofifa.net/players/203/376/25_120.png,193,92,CB,90,...,78,89,71,71,60,86,33,92,193,CB
7,Harry Kane,Harry Edward Kane,"Harry Kane (Harry Edward Kane, born 28 July 19...",€117.5M,€170K,https://cdn.sofifa.net/players/202/126/25_120.png,188,86,ST,90,...,65,49,84,83,93,82,31,86,188,ST
8,Florian Wirtz,Florian Richard Wirtz,"Florian Wirtz (Florian Richard Wirtz, born 3 M...",€143.5M,€125K,https://cdn.sofifa.net/players/256/630/25_120.png,177,71,"CAM,ST",89,...,81,50,87,89,78,67,21,71,177,CAM
9,Alisson,Alisson Ramses Becker,"Alisson (Alisson Ramsés Becker, born 2 October...",€54.5M,€130K,https://cdn.sofifa.net/players/212/831/25_120.png,193,91,GK,89,...,86,56,85,89,85,90,32,91,193,GK


In [12]:
# Convert value from €115.5M to numeric 115500000.
# Step 1: make sure the column is text (missing values become 'nan').
df_25 = df_25.astype({'value': str})
def convert_value_to_numeric(value_str):
    """
    Convert a euro amount such as '€115.5M' to a whole number of euros.

    Recognised forms are a '€' sign followed by a decimal number and either
    an 'M' (millions) or 'K' (thousands) suffix, or no suffix at all when the
    number ends the string ('€0', '€500').

    Parameters
    ----------
    value_str : str or None
        Raw amount. None/NaN and the literal string 'nan' count as missing.

    Returns
    -------
    int or None
        The amount in euros, or None when the input is missing or does not
        have one of the recognised forms.
    """
    if pd.isna(value_str) or value_str == 'nan':
        return None
    value_str = value_str.strip()
    # The number must be well formed (one optional decimal point) so float()
    # cannot fail. It must be followed by a suffix or by the end of the string,
    # so unknown suffixes are rejected rather than read as plain euros.
    match = re.match(r'€(\d+(?:\.\d*)?|\.\d+)(?:([MK])|$)', value_str)
    if not match:
        return None
    multiplier = {'M': 1_000_000, 'K': 1_000, None: 1}[match.group(2)]
    # round() rather than int(): a float product such as 1.005 * 1000 comes
    # out as 1004.9999999999999 and int() would truncate it to 1004.
    return round(float(match.group(1)) * multiplier)
# Parse the value column, then do the same for wage after making it text.
df_25['value'] = df_25['value'].apply(convert_value_to_numeric)
df_25['wage'] = df_25['wage'].astype(str).apply(convert_value_to_numeric)
df_25.head(10)

Unnamed: 0,name,full_name,description,value,wage,image,height_cm,weight_kg,positions,overall_rating,...,pac,def,pas,dri,sho,phy,age,weight_(in kg),height_(in cm),position
0,Rodri,Rodrigo Hernandez Cascante,Rodri (born 22 June 1996) is a Spanish footbal...,115500000.0,440000.0,https://cdn.sofifa.net/players/231/866/25_120.png,190,82,"CDM,CM",91,...,66,87,86,84,80,85,28,82,191,CDM
1,Mohamed Salah,Mohamed Salah Hamed Ghaly,"Mohamed Salah (Mohamed Salah Hamed Ghaly, born...",104000000.0,350000.0,https://cdn.sofifa.net/players/209/331/25_120.png,175,72,"RM,RW",91,...,89,45,82,88,87,75,32,72,175,RW
2,Jude Bellingham,Jude Victor William Bellingham,Jude Bellingham (Jude Victor William Bellingha...,174500000.0,280000.0,https://cdn.sofifa.net/players/252/371/25_120.png,186,75,"CAM,CM",90,...,80,78,83,88,87,83,21,75,186,CAM
3,Erling Haaland,Erling Braut Haland,"Erling Haaland (Erling Braut Håland, born 21 J...",157000000.0,270000.0,https://cdn.sofifa.net/players/239/085/25_120.png,195,94,ST,90,...,88,45,70,81,92,88,24,94,195,ST
4,Vini Jr.,Vinicius Jose Paixao de Oliveira Junior,Vini Jr. (Vinicius José Paixão de Oliveira Jun...,171500000.0,340000.0,https://cdn.sofifa.net/players/238/794/25_120.png,176,73,"LW,ST",90,...,95,29,81,91,84,69,24,73,176,LW
5,Kylian Mbappé,Kylian Mbappe Lottin,"Kylian Mbappé (Kylian Mbappé Lottin, born 20 D...",160000000.0,380000.0,https://cdn.sofifa.net/players/231/747/25_120.png,182,75,"ST,LW",90,...,97,36,80,92,90,78,25,75,182,ST
6,Virgil van Dijk,Virgil van Dijk,Virgil van Dijk (born 8 July 1991) is a Dutch ...,77500000.0,220000.0,https://cdn.sofifa.net/players/203/376/25_120.png,193,92,CB,90,...,78,89,71,71,60,86,33,92,193,CB
7,Harry Kane,Harry Edward Kane,"Harry Kane (Harry Edward Kane, born 28 July 19...",117500000.0,170000.0,https://cdn.sofifa.net/players/202/126/25_120.png,188,86,ST,90,...,65,49,84,83,93,82,31,86,188,ST
8,Florian Wirtz,Florian Richard Wirtz,"Florian Wirtz (Florian Richard Wirtz, born 3 M...",143500000.0,125000.0,https://cdn.sofifa.net/players/256/630/25_120.png,177,71,"CAM,ST",89,...,81,50,87,89,78,67,21,71,177,CAM
9,Alisson,Alisson Ramses Becker,"Alisson (Alisson Ramsés Becker, born 2 October...",54500000.0,130000.0,https://cdn.sofifa.net/players/212/831/25_120.png,193,91,GK,89,...,86,56,85,89,85,90,32,91,193,GK


In [13]:
# Inspect the current df_25 schema and a few cleaned full names before the
# columns are renamed.
fifa_25_column_names = list(df_25.columns)
print("\nFIFA 25 columns:", fifa_25_column_names)

print("Sample FIFA 25 full_name:", list(df_25['full_name'].head(3)))


FIFA 25 columns: ['name', 'full_name', 'description', 'value', 'wage', 'image', 'height_cm', 'weight_kg', 'positions', 'overall_rating', 'potential', 'preferred_foot', 'weak_foot', 'skill_moves', 'international_reputation', 'body_type', 'real_face', 'specialities', 'release_clause', 'club_id', 'club_name', 'club_league_id', 'club_league_name', 'club_logo', 'club_rating', 'club_position', 'club_kit_number', 'club_joined', 'club_contract_valid_until', 'country_id', 'country_name', 'country_league_id', 'country_league_name', 'country_flag', 'country_rating', 'country_position', 'country_kit_number', 'born', 'fifa_version', 'pac', 'def', 'pas', 'dri', 'sho', 'phy', 'age', 'weight_(in kg)', 'height_(in cm)', 'position']
Sample FIFA 25 full_name: ['Rodrigo Hernandez Cascante', 'Mohamed Salah Hamed Ghaly', 'Jude Victor William Bellingham']


In [14]:
# Standardize FIFA 25 columns to match FIFA schema
df_25 = df_25.rename(columns={
    'name': 'short_name',
    'full_name': 'long_name'
})

# Version label. df_fifa is read as text and its versions look like '24.0',
# so the same textual form is used here; writing '25' would leave two
# different formats in the combined fifa_version column.
df_25['fifa_version'] = '25.0'

# Check that columns align
print("\nFIFA 25 columns:", sorted(df_25.columns.tolist()))


FIFA 25 columns: ['age', 'body_type', 'born', 'club_contract_valid_until', 'club_id', 'club_joined', 'club_kit_number', 'club_league_id', 'club_league_name', 'club_logo', 'club_name', 'club_position', 'club_rating', 'country_flag', 'country_id', 'country_kit_number', 'country_league_id', 'country_league_name', 'country_name', 'country_position', 'country_rating', 'def', 'description', 'dri', 'fifa_version', 'height_(in cm)', 'height_cm', 'image', 'international_reputation', 'long_name', 'overall_rating', 'pac', 'pas', 'phy', 'position', 'positions', 'potential', 'preferred_foot', 'real_face', 'release_clause', 'sho', 'short_name', 'skill_moves', 'specialities', 'value', 'wage', 'weak_foot', 'weight_(in kg)', 'weight_kg']


In [15]:
# Per-attribute and per-position rating columns are too detailed for this
# analysis and are removed; the goalkeeping_* columns are kept.
columns_to_remove = ['attacking_crossing', 'attacking_finishing', 'attacking_heading_accuracy', 'attacking_short_passing', 'attacking_volleys', 'skill_dribbling', 'skill_curve', 'skill_fk_accuracy', 'skill_long_passing', 'skill_ball_control', 'movement_acceleration', 'movement_sprint_speed', 'movement_agility', 'movement_reactions', 'movement_balance', 'power_shot_power', 'power_jumping', 'power_stamina', 'power_strength', 'power_long_shots', 'mentality_aggression', 'mentality_interceptions', 'mentality_positioning', 'mentality_vision', 'mentality_penalties', 'mentality_composure', 'defending_marking_awareness', 'defending_standing_tackle', 'defending_sliding_tackle', 'ls', 'st', 'rs', 'lw', 'lf', 'cf', 'rf', 'rw', 'lam', 'cam', 'ram', 'lm', 'lcm', 'cm', 'rcm', 'rm', 'lwb', 'ldm', 'cdm', 'rdm', 'rwb', 'lb', 'lcb', 'cb', 'rcb', 'rb', 'gk']
# Read the historical FIFA table as text and drop those columns in one step.
df_fifa = (
    pd.read_csv(f'{RAW_DATA_DIR}/fifa_data.csv', dtype=str)
    .drop(columns=columns_to_remove)
)
df_fifa.head(10)

Unnamed: 0,player_id,player_url,fifa_version,fifa_update,update_as_of,short_name,long_name,player_positions,overall,potential,...,passing,dribbling,defending,physic,goalkeeping_diving,goalkeeping_handling,goalkeeping_kicking,goalkeeping_positioning,goalkeeping_reflexes,goalkeeping_speed
0,231747,/player/231747/kylian-mbappe/240002,24.0,2.0,2023-09-22,K. Mbappé,Kylian Mbappé Lottin,"ST, LW",91,94,...,80.0,92.0,36.0,78.0,13,5,7,11,6,
1,239085,/player/239085/erling-haaland/240002,24.0,2.0,2023-09-22,E. Haaland,Erling Braut Haaland,ST,91,94,...,66.0,80.0,45.0,88.0,7,14,13,11,7,
2,192985,/player/192985/kevin-de-bruyne/240002,24.0,2.0,2023-09-22,K. De Bruyne,Kevin De Bruyne,"CM, CAM",91,91,...,94.0,87.0,65.0,78.0,15,13,5,10,13,
3,158023,/player/158023/lionel-messi/240002,24.0,2.0,2023-09-22,L. Messi,Lionel Andrés Messi Cuccittini,"CF, CAM",90,90,...,90.0,94.0,33.0,64.0,6,11,15,14,8,
4,165153,/player/165153/karim-benzema/240002,24.0,2.0,2023-09-22,K. Benzema,Karim Benzema,"CF, ST",90,90,...,83.0,87.0,39.0,78.0,13,11,5,5,7,
5,188545,/player/188545/robert-lewandowski/240002,24.0,2.0,2023-09-22,R. Lewandowski,Robert Lewandowski,ST,90,90,...,80.0,87.0,44.0,84.0,15,6,12,8,10,
6,192119,/player/192119/thibaut-courtois/240002,24.0,2.0,2023-09-22,T. Courtois,Thibaut Nicolas Marc Courtois,GK,90,90,...,,,,,85,89,76,90,93,46.0
7,202126,/player/202126/harry-kane/240002,24.0,2.0,2023-09-22,H. Kane,Harry Kane,ST,90,90,...,84.0,83.0,49.0,83.0,8,10,11,14,11,
8,238794,/player/238794/vinicius-jose-de-oliveira-junio...,24.0,2.0,2023-09-22,Vini Jr.,Vinícius José Paixão de Oliveira Júnior,LW,89,94,...,78.0,90.0,29.0,68.0,5,7,7,7,10,
9,212831,/player/212831/alisson-ramses-becker/240002,24.0,2.0,2023-09-22,Alisson,Alisson Ramsés Becker,GK,89,90,...,,,,,86,85,85,90,89,56.0


In [16]:
#TODO: Match up columns -> both DataFrames need identical columns for vertical concatenation
#TODO: Handle NaNs in GK/outfield columns
#TODO: Merge df_fifa and df_25 (vertical concat)
# Then ready for merging with FBref stats
# Really just need fuzzy name matching between FIFA and FBref, then can merge on that | NAH


In [17]:
# Rows whose `positions` text contains 'GK' are treated as goalkeepers.
is_gk = df_25['positions'].str.contains('GK', na=False)

# Mapping used by this notebook for goalkeeper rows:
# generic stat column -> goalkeeping_* column that receives its value.
gk_stat_targets = {
    'def': 'goalkeeping_speed',
    'pas': 'goalkeeping_kicking',
    'phy': 'goalkeeping_positioning',
    'dri': 'goalkeeping_reflexes',
    'sho': 'goalkeeping_handling',
    'pac': 'goalkeeping_diving',
}
for generic_col, gk_col in gk_stat_targets.items():
    df_25.loc[is_gk, gk_col] = df_25.loc[is_gk, generic_col]

# Once copied, the generic stats of goalkeeper rows are set to the text '10'.
df_25.loc[is_gk, list(gk_stat_targets)] = '10'

In [18]:
# df_25: rows that are not goalkeepers get '10' in every goalkeeping_* column.
gk_columns = [
    'goalkeeping_speed',
    'goalkeeping_kicking',
    'goalkeeping_positioning',
    'goalkeeping_reflexes',
    'goalkeeping_handling',
    'goalkeeping_diving',
]
df_25.loc[~is_gk, gk_columns] = '10'
# df_fifa: same placeholder, applied only to goalkeeping_speed for rows whose
# player_positions does not contain 'GK'.
fifa_is_gk = df_fifa['player_positions'].str.contains('GK', na=False)
df_fifa.loc[~fifa_is_gk, ['goalkeeping_speed']] = '10'

In [19]:
# Columns of df_fifa that are not carried into the combined table.
cols_to_drop = ['player_id', 'release_clause_eur', 'player_url', 'league_level', 'fifa_update', 'update_as_of', 'club_position', 'club_loaned_from', 'club_joined_date', 'club_contract_valid_until_year', 'nationality_id', 'nation_team_id', 'nation_position', 'nation_jersey_number', 'work_rate', 'real_face', 'player_traits']
df_fifa = df_fifa.drop(columns=cols_to_drop)
# Convert dob to born year (text such as '1998').
# strftime keeps unparseable dates as missing values. `.dt.year.astype('str')`
# would turn the whole column into floats as soon as one date failed to
# parse, producing '1998.0' and the literal string 'nan'.
df_fifa['born'] = pd.to_datetime(df_fifa['dob'], errors='coerce').dt.strftime('%Y')
df_fifa = df_fifa.drop(columns=['dob'])
df_fifa.head(5)

Unnamed: 0,fifa_version,short_name,long_name,player_positions,overall,potential,value_eur,wage_eur,age,height_cm,...,dribbling,defending,physic,goalkeeping_diving,goalkeeping_handling,goalkeeping_kicking,goalkeeping_positioning,goalkeeping_reflexes,goalkeeping_speed,born
0,24.0,K. Mbappé,Kylian Mbappé Lottin,"ST, LW",91,94,181500000.0,230000.0,24,182,...,92,36,78,13,5,7,11,6,10,1998
1,24.0,E. Haaland,Erling Braut Haaland,ST,91,94,185000000.0,340000.0,22,195,...,80,45,88,7,14,13,11,7,10,2000
2,24.0,K. De Bruyne,Kevin De Bruyne,"CM, CAM",91,91,103000000.0,350000.0,32,181,...,87,65,78,15,13,5,10,13,10,1991
3,24.0,L. Messi,Lionel Andrés Messi Cuccittini,"CF, CAM",90,90,41000000.0,23000.0,36,169,...,94,33,64,6,11,15,14,8,10,1987
4,24.0,K. Benzema,Karim Benzema,"CF, ST",90,90,51000000.0,95000.0,35,185,...,87,39,78,13,11,5,5,7,10,1987


In [24]:
# Columns of df_25 that have no counterpart in df_fifa.
cols_to_drop = ['description', 'real_face', 'country_flag', 'release_clause', 'club_position', 'club_rating', 'club_joined', 'club_contract_valid_until', 'country_id','country_league_name', 'country_league_id', 'country_rating', 'country_position', 'country_kit_number', 'weight_(in kg)', 'height_(in cm)']
# `positions` (multi-valued) is dropped here and the single-valued `position`
# becomes `player_positions` below, matching the output recorded for this
# cell. Previously the cell dropped 'player_positions', which only exists
# after the rename, so a fresh run raised KeyError. It also mapped both
# `positions` and `position` to 'player_positions', giving duplicate names.
df_25 = df_25.drop(columns=cols_to_drop + ['club_logo', 'image', 'positions'])
# Rename the remaining columns to the df_fifa names.
df_25 = df_25.rename(columns={
    'specialities': 'player_tags',
    'club_league_name' : 'league_name',
    'club_id' : 'club_team_id',
    'pac' : 'pace',
    'dri' : 'dribbling',
    'phy' : 'physic',
    'pas' : 'passing',
    'sho' : 'shooting',
    'def' : 'defending',
    'value' : 'value_eur',
    'wage' : 'wage_eur',
    'overall_rating' : 'overall',
    'club_kit_number' : 'club_jersey_number',
    'country_name' : 'nationality_name',
    'club_league_id' : 'league_id',
    'position' : 'player_positions'
})
df_25.head(5)

Unnamed: 0,short_name,long_name,value_eur,wage_eur,height_cm,weight_kg,overall,potential,preferred_foot,weak_foot,...,shooting,physic,age,player_positions,goalkeeping_speed,goalkeeping_kicking,goalkeeping_positioning,goalkeeping_reflexes,goalkeeping_handling,goalkeeping_diving
0,Rodri,Rodrigo Hernandez Cascante,115500000.0,440000.0,190,82,91,91,Right,4,...,80,85,28,CDM,10,10,10,10,10,10
1,Mohamed Salah,Mohamed Salah Hamed Ghaly,104000000.0,350000.0,175,72,91,91,Left,3,...,87,75,32,RW,10,10,10,10,10,10
2,Jude Bellingham,Jude Victor William Bellingham,174500000.0,280000.0,186,75,90,94,Right,4,...,87,83,21,CAM,10,10,10,10,10,10
3,Erling Haaland,Erling Braut Haland,157000000.0,270000.0,195,94,90,92,Left,3,...,92,88,24,ST,10,10,10,10,10,10
4,Vini Jr.,Vinicius Jose Paixao de Oliveira Junior,171500000.0,340000.0,176,73,90,94,Right,4,...,84,69,24,LW,10,10,10,10,10,10


In [26]:
# Report how the column sets of df_fifa and df_25 differ and overlap.
fifa_cols = set(df_fifa.columns)
df_25_cols = set(df_25.columns)
shared_cols = fifa_cols & df_25_cols

column_report = [
    ("Columns in df_fifa but NOT in df_25:", fifa_cols - df_25_cols),
    ("\nColumns in df_25 but NOT in df_fifa:", df_25_cols - fifa_cols),
    ("\nCommon columns:", shared_cols),
]
for heading, names in column_report:
    print(heading)
    print(sorted(names))

print(f"\ndf_fifa has {len(fifa_cols)} columns")
print(f"df_25 has {len(df_25_cols)} columns")
print(f"Common columns: {len(shared_cols)}")

Columns in df_fifa but NOT in df_25:
[]

Columns in df_25 but NOT in df_fifa:
[]

Common columns:
['age', 'body_type', 'born', 'club_jersey_number', 'club_name', 'club_team_id', 'defending', 'dribbling', 'fifa_version', 'goalkeeping_diving', 'goalkeeping_handling', 'goalkeeping_kicking', 'goalkeeping_positioning', 'goalkeeping_reflexes', 'goalkeeping_speed', 'height_cm', 'international_reputation', 'league_id', 'league_name', 'long_name', 'nationality_name', 'overall', 'pace', 'passing', 'physic', 'player_positions', 'player_tags', 'potential', 'preferred_foot', 'shooting', 'short_name', 'skill_moves', 'value_eur', 'wage_eur', 'weak_foot', 'weight_kg']

df_fifa has 36 columns
df_25 has 36 columns
Common columns: 36


In [27]:
# Stack the historical table and the FIFA 25 table into one frame with a
# fresh 0..n-1 index.
frames_to_stack = [df_fifa, df_25]
df_combined = pd.concat(frames_to_stack, ignore_index=True)

print(f"df_fifa rows: {len(df_fifa)}")
print(f"df_25 rows: {len(df_25)}")
print(f"Combined rows: {len(df_combined)}")
print(f"\nColumns: {df_combined.shape[1]}")
df_combined.head()

df_fifa rows: 180021
df_25 rows: 13580
Combined rows: 193601

Columns: 36


Unnamed: 0,fifa_version,short_name,long_name,player_positions,overall,potential,value_eur,wage_eur,age,height_cm,...,dribbling,defending,physic,goalkeeping_diving,goalkeeping_handling,goalkeeping_kicking,goalkeeping_positioning,goalkeeping_reflexes,goalkeeping_speed,born
0,24.0,K. Mbappé,Kylian Mbappé Lottin,"ST, LW",91,94,181500000.0,230000.0,24,182,...,92,36,78,13,5,7,11,6,10,1998
1,24.0,E. Haaland,Erling Braut Haaland,ST,91,94,185000000.0,340000.0,22,195,...,80,45,88,7,14,13,11,7,10,2000
2,24.0,K. De Bruyne,Kevin De Bruyne,"CM, CAM",91,91,103000000.0,350000.0,32,181,...,87,65,78,15,13,5,10,13,10,1991
3,24.0,L. Messi,Lionel Andrés Messi Cuccittini,"CF, CAM",90,90,41000000.0,23000.0,36,169,...,94,33,64,6,11,15,14,8,10,1987
4,24.0,K. Benzema,Karim Benzema,"CF, ST",90,90,51000000.0,95000.0,35,185,...,87,39,78,13,11,5,5,7,10,1987


In [28]:
# Write the combined table to CSV without the index column.
combined_csv_path = f'{RAW_DATA_DIR}/fifa_combined.csv'
df_combined.to_csv(combined_csv_path, index=False)