# Transfermarkt Big5 final Dataframe Cleaning

## Table of Contents:
### 1. Import libraries and Data
### 2. Data Cleaning


### 1. Import libraries and Data

In [1]:
#Import libraries
import pandas as pd
import numpy as np
import os

In [2]:
#Remove scientific notation
pd.options.display.float_format='{:.4f}'.format

In [3]:
%matplotlib inline

In [4]:
#Create import path
path = r'C:\Users\Utilizador\OneDrive\Documents\CareerFoundry - Data Analyst\Data Immersion\A6\24-03-2024 TransferMarkt'

In [5]:
#import data
big5 = pd.read_pickle(os.path.join(path,'02 Data','Prepared Data','Clean_recent_big5.pkl'))

### 2. Data Cleaning

In [6]:
big5.shape

(521278, 23)

In [7]:
big5.columns

Index(['appearance_id', 'player_id', 'club_id', 'current_club_id', 'date',
       'player_name', 'competition_id', 'last_season',
       'country_of_citizenship', 'date_of_birth', 'sub_position', 'position',
       'foot', 'current_club_name', 'market_value_in_eur',
       'highest_market_value_in_eur', 'club_name', 'domestic_competition_id',
       'competition_name', 'competition_country', 'domestic_league_code',
       'confederation', 'is_major_national_league'],
      dtype='object')

In [8]:
#Delete unneeded variables
big5_df = big5.drop (columns = ['appearance_id','club_id','current_club_id','competition_id','domestic_competition_id','confederation', 'is_major_national_league'])

In [9]:
big5_df.shape

(521278, 16)

In [10]:
big5_df.columns

Index(['player_id', 'date', 'player_name', 'last_season',
       'country_of_citizenship', 'date_of_birth', 'sub_position', 'position',
       'foot', 'current_club_name', 'market_value_in_eur',
       'highest_market_value_in_eur', 'club_name', 'competition_name',
       'competition_country', 'domestic_league_code'],
      dtype='object')

In [11]:
#Change order of the variables
desired_column_order = ['player_id','player_name', 'date_of_birth', 'country_of_citizenship', 'position', 'sub_position', 'foot', 'current_club_name', 
                        'last_season', 'market_value_in_eur', 'highest_market_value_in_eur', 'competition_name', 'competition_country', 'date',
                        'domestic_league_code', 'club_name']  

# Reorder the columns of the DataFrame
big5_df = big5_df[desired_column_order]


In [12]:
big5_df.columns

Index(['player_id', 'player_name', 'date_of_birth', 'country_of_citizenship',
       'position', 'sub_position', 'foot', 'current_club_name', 'last_season',
       'market_value_in_eur', 'highest_market_value_in_eur',
       'competition_name', 'competition_country', 'date',
       'domestic_league_code', 'club_name'],
      dtype='object')

In [13]:
big5_df.head()

Unnamed: 0,player_id,player_name,date_of_birth,country_of_citizenship,position,sub_position,foot,current_club_name,last_season,market_value_in_eur,highest_market_value_in_eur,competition_name,competition_country,date,domestic_league_code,club_name
29,26267,Fernandinho,1985-05-04,Brazil,Midfield,Defensive Midfield,right,Manchester City Football Club,2021.0,600000.0,32000000.0,ukrainian-super-cup,Ukraine,2012-07-09,UKR1,FC Shakhtar Donetsk
39,55735,Henrikh Mkhitaryan,1989-01-21,Armenia,Midfield,Central Midfield,both,Football Club Internazionale Milano S.p.A.,2023.0,6000000.0,37000000.0,ukrainian-super-cup,Ukraine,2012-07-09,UKR1,FC Shakhtar Donetsk
48,75615,Douglas Costa,1990-09-14,Brazil,Attack,Right Winger,left,Juventus Football Club,2020.0,1500000.0,55000000.0,ukrainian-super-cup,Ukraine,2012-07-09,UKR1,FC Shakhtar Donetsk
101,73185,Lucas Pérez,1988-09-10,Spain,Attack,Centre-Forward,left,Cádiz Club de Fútbol S.A.D,2022.0,800000.0,17000000.0,premier-liga,Ukraine,2012-07-13,UKR1,Karpaty Lviv (-2021)
289,42678,Salvatore Bocchetti,1986-11-30,Italy,Defender,Centre-Back,left,Verona Hellas Football Club,2020.0,200000.0,14000000.0,russian-super-cup,Russia,2012-07-14,RU1,FC Rubin Kazan


In [14]:
#check data type
big5_df.dtypes

player_id                       object
player_name                     object
date_of_birth                   object
country_of_citizenship          object
position                        object
sub_position                    object
foot                            object
current_club_name               object
last_season                    float64
market_value_in_eur            float64
highest_market_value_in_eur    float64
competition_name                object
competition_country             object
date                            object
domestic_league_code            object
club_name                       object
dtype: object

In [15]:
#change float 'last_season' to integer
big5_df[['last_season','market_value_in_eur','highest_market_value_in_eur']] = big5_df[['last_season','market_value_in_eur','highest_market_value_in_eur']].astype('int64')
big5_df[['last_season','market_value_in_eur','highest_market_value_in_eur']].value_counts()


last_season  market_value_in_eur  highest_market_value_in_eur
2023         15000000             15000000                       4916
             35000000             35000000                       3831
             10000000             10000000                       3491
             20000000             20000000                       3451
             40000000             40000000                       3048
                                                                 ... 
2022         125000               450000                            1
2020         200000               450000                            1
2022         150000               400000                            1
2021         200000               250000                            1
2020         25000                25000                             1
Name: count, Length: 1958, dtype: int64

In [16]:
#check for mixed type data
for col in big5_df.columns.tolist():
      weird = (big5_df[[col]].map(type) != big5_df[[col]].iloc[0].apply(type)).any(axis = 1)
      if len (big5_df[weird]) > 0:
        print (col)

No mixed types

In [17]:
#check for missing values
big5_df.isnull().sum()

player_id                      0
player_name                    0
date_of_birth                  0
country_of_citizenship         0
position                       0
sub_position                   0
foot                           0
current_club_name              0
last_season                    0
market_value_in_eur            0
highest_market_value_in_eur    0
competition_name               0
competition_country            0
date                           0
domestic_league_code           0
club_name                      0
dtype: int64

No missing values

In [18]:
#check for duplicates
big5_df_dups = big5_df[big5_df.duplicated()]

No duplicates

In [19]:
big5_df['sub_position'].value_counts(dropna=False)

sub_position
Centre-Back           94597
Centre-Forward        75296
Central Midfield      69377
Defensive Midfield    47423
Goalkeeper            42459
Right-Back            41701
Left-Back             36814
Attacking Midfield    33825
Right Winger          33435
Left Winger           33394
Left Midfield          4870
Right Midfield         4588
Second Striker         3453
nan                      46
Name: count, dtype: int64

In [20]:
# Convert 'date_of_birth' to datetime object - (used AI-ChatGPT for this)
from datetime import datetime

# Assuming 'date_of_birth' is your date variable in YYYY-MM-DD format
# Convert 'date_of_birth' to datetime object
big5_df['date_of_birth'] = pd.to_datetime(big5_df['date_of_birth'])

# Calculate age based on 'date_of_birth' and current date
current_date = datetime.now()
big5_df['age'] = (current_date - big5_df['date_of_birth']).dt.days // 365

# Display the DataFrame with the new 'age' variable
big5_df.head()


Unnamed: 0,player_id,player_name,date_of_birth,country_of_citizenship,position,sub_position,foot,current_club_name,last_season,market_value_in_eur,highest_market_value_in_eur,competition_name,competition_country,date,domestic_league_code,club_name,age
29,26267,Fernandinho,1985-05-04,Brazil,Midfield,Defensive Midfield,right,Manchester City Football Club,2021,600000,32000000,ukrainian-super-cup,Ukraine,2012-07-09,UKR1,FC Shakhtar Donetsk,38.0
39,55735,Henrikh Mkhitaryan,1989-01-21,Armenia,Midfield,Central Midfield,both,Football Club Internazionale Milano S.p.A.,2023,6000000,37000000,ukrainian-super-cup,Ukraine,2012-07-09,UKR1,FC Shakhtar Donetsk,35.0
48,75615,Douglas Costa,1990-09-14,Brazil,Attack,Right Winger,left,Juventus Football Club,2020,1500000,55000000,ukrainian-super-cup,Ukraine,2012-07-09,UKR1,FC Shakhtar Donetsk,33.0
101,73185,Lucas Pérez,1988-09-10,Spain,Attack,Centre-Forward,left,Cádiz Club de Fútbol S.A.D,2022,800000,17000000,premier-liga,Ukraine,2012-07-13,UKR1,Karpaty Lviv (-2021),35.0
289,42678,Salvatore Bocchetti,1986-11-30,Italy,Defender,Centre-Back,left,Verona Hellas Football Club,2020,200000,14000000,russian-super-cup,Russia,2012-07-14,RU1,FC Rubin Kazan,37.0


In [21]:
big5_df.describe()

Unnamed: 0,date_of_birth,last_season,market_value_in_eur,highest_market_value_in_eur,age
count,521159,521278.0,521278.0,521278.0,521159.0
mean,1993-09-30 05:04:33.081343872,2022.428,10857626.8325,24759677.178,30.0375
min,1977-09-13 00:00:00,2020.0,25000.0,25000.0,16.0
25%,1990-06-22 00:00:00,2022.0,1200000.0,7000000.0,27.0
50%,1993-10-23 00:00:00,2023.0,4000000.0,15000000.0,30.0
75%,1997-02-11 00:00:00,2023.0,12000000.0,32000000.0,33.0
max,2007-10-02 00:00:00,2023.0,180000000.0,200000000.0,46.0
std,,0.9302,17739417.2305,26717803.604,4.6045


In [22]:
big5_df.head()

Unnamed: 0,player_id,player_name,date_of_birth,country_of_citizenship,position,sub_position,foot,current_club_name,last_season,market_value_in_eur,highest_market_value_in_eur,competition_name,competition_country,date,domestic_league_code,club_name,age
29,26267,Fernandinho,1985-05-04,Brazil,Midfield,Defensive Midfield,right,Manchester City Football Club,2021,600000,32000000,ukrainian-super-cup,Ukraine,2012-07-09,UKR1,FC Shakhtar Donetsk,38.0
39,55735,Henrikh Mkhitaryan,1989-01-21,Armenia,Midfield,Central Midfield,both,Football Club Internazionale Milano S.p.A.,2023,6000000,37000000,ukrainian-super-cup,Ukraine,2012-07-09,UKR1,FC Shakhtar Donetsk,35.0
48,75615,Douglas Costa,1990-09-14,Brazil,Attack,Right Winger,left,Juventus Football Club,2020,1500000,55000000,ukrainian-super-cup,Ukraine,2012-07-09,UKR1,FC Shakhtar Donetsk,33.0
101,73185,Lucas Pérez,1988-09-10,Spain,Attack,Centre-Forward,left,Cádiz Club de Fútbol S.A.D,2022,800000,17000000,premier-liga,Ukraine,2012-07-13,UKR1,Karpaty Lviv (-2021),35.0
289,42678,Salvatore Bocchetti,1986-11-30,Italy,Defender,Centre-Back,left,Verona Hellas Football Club,2020,200000,14000000,russian-super-cup,Russia,2012-07-14,RU1,FC Rubin Kazan,37.0


In [23]:
#Change order of the variables
desired_column_order1 = ['player_id','player_name', 'date_of_birth', 'age','country_of_citizenship', 'position', 'sub_position', 'foot', 'current_club_name', 
                        'last_season', 'market_value_in_eur', 'highest_market_value_in_eur', 'competition_name', 'competition_country', 'date',
                        'domestic_league_code', 'club_name']  

# Reorder the columns of the DataFrame
big5_df = big5_df[desired_column_order1]

In [24]:
big5_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 521278 entries, 29 to 1556968
Data columns (total 17 columns):
 #   Column                       Non-Null Count   Dtype         
---  ------                       --------------   -----         
 0   player_id                    521278 non-null  object        
 1   player_name                  521278 non-null  object        
 2   date_of_birth                521159 non-null  datetime64[ns]
 3   age                          521159 non-null  float64       
 4   country_of_citizenship       521278 non-null  object        
 5   position                     521278 non-null  object        
 6   sub_position                 521278 non-null  object        
 7   foot                         521278 non-null  object        
 8   current_club_name            521278 non-null  object        
 9   last_season                  521278 non-null  int64         
 10  market_value_in_eur          521278 non-null  int64         
 11  highest_market_value_in_eur  

#### Assign numerical ids to string data needed for correlation testing

competition_contry

In [25]:
#Define mapping of competition_countries to numerical IDs
country_mapping = {
    'Belgium': 1,
    'Denmark': 2,
    'England': 3,
    'France': 4,
    'Germany': 5,
    'Greece': 6,
    'Italy': 7,
    'Netherlands': 9,
    'Portugal': 10,
    'Russia': 11,
    'Scotland': 12,
    'Spain': 13,
    'Turkey': 14,
    'Ukraine': 15,
    'nan': 0
    }

In [26]:
#Apply mapping to create new variable 'countries#'
big5_df['comp_countries#'] = big5_df['competition_country'].map(country_mapping)

In [27]:
#check new variable and check accuracy
big5_df.head()

Unnamed: 0,player_id,player_name,date_of_birth,age,country_of_citizenship,position,sub_position,foot,current_club_name,last_season,market_value_in_eur,highest_market_value_in_eur,competition_name,competition_country,date,domestic_league_code,club_name,comp_countries#
29,26267,Fernandinho,1985-05-04,38.0,Brazil,Midfield,Defensive Midfield,right,Manchester City Football Club,2021,600000,32000000,ukrainian-super-cup,Ukraine,2012-07-09,UKR1,FC Shakhtar Donetsk,15
39,55735,Henrikh Mkhitaryan,1989-01-21,35.0,Armenia,Midfield,Central Midfield,both,Football Club Internazionale Milano S.p.A.,2023,6000000,37000000,ukrainian-super-cup,Ukraine,2012-07-09,UKR1,FC Shakhtar Donetsk,15
48,75615,Douglas Costa,1990-09-14,33.0,Brazil,Attack,Right Winger,left,Juventus Football Club,2020,1500000,55000000,ukrainian-super-cup,Ukraine,2012-07-09,UKR1,FC Shakhtar Donetsk,15
101,73185,Lucas Pérez,1988-09-10,35.0,Spain,Attack,Centre-Forward,left,Cádiz Club de Fútbol S.A.D,2022,800000,17000000,premier-liga,Ukraine,2012-07-13,UKR1,Karpaty Lviv (-2021),15
289,42678,Salvatore Bocchetti,1986-11-30,37.0,Italy,Defender,Centre-Back,left,Verona Hellas Football Club,2020,200000,14000000,russian-super-cup,Russia,2012-07-14,RU1,FC Rubin Kazan,11


'position'

In [28]:
#Define mapping of 'position' to numerical IDs
position_mapping = {'Defender': 1,
                'Midfield': 2,
                'Attack': 3,
                'Goalkeeper': 4,
                'Missing': 0,}

#Apply mapping to create new variable 'position#'
big5_df['position#'] = big5_df['position'].map(position_mapping)

#check new variable and check accuracy 
big5_df.head()

Unnamed: 0,player_id,player_name,date_of_birth,age,country_of_citizenship,position,sub_position,foot,current_club_name,last_season,market_value_in_eur,highest_market_value_in_eur,competition_name,competition_country,date,domestic_league_code,club_name,comp_countries#,position#
29,26267,Fernandinho,1985-05-04,38.0,Brazil,Midfield,Defensive Midfield,right,Manchester City Football Club,2021,600000,32000000,ukrainian-super-cup,Ukraine,2012-07-09,UKR1,FC Shakhtar Donetsk,15,2
39,55735,Henrikh Mkhitaryan,1989-01-21,35.0,Armenia,Midfield,Central Midfield,both,Football Club Internazionale Milano S.p.A.,2023,6000000,37000000,ukrainian-super-cup,Ukraine,2012-07-09,UKR1,FC Shakhtar Donetsk,15,2
48,75615,Douglas Costa,1990-09-14,33.0,Brazil,Attack,Right Winger,left,Juventus Football Club,2020,1500000,55000000,ukrainian-super-cup,Ukraine,2012-07-09,UKR1,FC Shakhtar Donetsk,15,3
101,73185,Lucas Pérez,1988-09-10,35.0,Spain,Attack,Centre-Forward,left,Cádiz Club de Fútbol S.A.D,2022,800000,17000000,premier-liga,Ukraine,2012-07-13,UKR1,Karpaty Lviv (-2021),15,3
289,42678,Salvatore Bocchetti,1986-11-30,37.0,Italy,Defender,Centre-Back,left,Verona Hellas Football Club,2020,200000,14000000,russian-super-cup,Russia,2012-07-14,RU1,FC Rubin Kazan,11,1


 'sub_position'

In [29]:
#Define mapping of sub_position to numerical IDs
sub_position_mapping = {'Centre-Back': 1,
'Centre-Forward': 2,
'Central Midfield': 3,
'Defensive Midfield': 4,
'Goalkeeper': 5,
'Right-Back': 6,
'Left-Back': 7,
'Attacking Midfield': 8,
'Right Winger': 9,
'Left Winger': 10,
'Left Midfield': 11,
'Right Midfield': 12,
'Second Striker': 13,
}

#Apply mapping to create new variable 'sub_position#'
big5_df['sub_position#'] = big5_df['sub_position'].map(sub_position_mapping)

#check new variable and check accuracy 
big5_df.head()

Unnamed: 0,player_id,player_name,date_of_birth,age,country_of_citizenship,position,sub_position,foot,current_club_name,last_season,market_value_in_eur,highest_market_value_in_eur,competition_name,competition_country,date,domestic_league_code,club_name,comp_countries#,position#,sub_position#
29,26267,Fernandinho,1985-05-04,38.0,Brazil,Midfield,Defensive Midfield,right,Manchester City Football Club,2021,600000,32000000,ukrainian-super-cup,Ukraine,2012-07-09,UKR1,FC Shakhtar Donetsk,15,2,4.0
39,55735,Henrikh Mkhitaryan,1989-01-21,35.0,Armenia,Midfield,Central Midfield,both,Football Club Internazionale Milano S.p.A.,2023,6000000,37000000,ukrainian-super-cup,Ukraine,2012-07-09,UKR1,FC Shakhtar Donetsk,15,2,3.0
48,75615,Douglas Costa,1990-09-14,33.0,Brazil,Attack,Right Winger,left,Juventus Football Club,2020,1500000,55000000,ukrainian-super-cup,Ukraine,2012-07-09,UKR1,FC Shakhtar Donetsk,15,3,9.0
101,73185,Lucas Pérez,1988-09-10,35.0,Spain,Attack,Centre-Forward,left,Cádiz Club de Fútbol S.A.D,2022,800000,17000000,premier-liga,Ukraine,2012-07-13,UKR1,Karpaty Lviv (-2021),15,3,2.0
289,42678,Salvatore Bocchetti,1986-11-30,37.0,Italy,Defender,Centre-Back,left,Verona Hellas Football Club,2020,200000,14000000,russian-super-cup,Russia,2012-07-14,RU1,FC Rubin Kazan,11,1,1.0


'foot'

In [30]:
#Define mapping of 'foot' to numerical IDs
foot_mapping = {'right': 1,
'left': 2,
'both' :3,}

#Apply mapping to create new variable 'foot#'
big5_df['foot#'] = big5_df['foot'].map(foot_mapping)

#check new variable and check accuracy 
big5_df.head()

Unnamed: 0,player_id,player_name,date_of_birth,age,country_of_citizenship,position,sub_position,foot,current_club_name,last_season,...,highest_market_value_in_eur,competition_name,competition_country,date,domestic_league_code,club_name,comp_countries#,position#,sub_position#,foot#
29,26267,Fernandinho,1985-05-04,38.0,Brazil,Midfield,Defensive Midfield,right,Manchester City Football Club,2021,...,32000000,ukrainian-super-cup,Ukraine,2012-07-09,UKR1,FC Shakhtar Donetsk,15,2,4.0,1.0
39,55735,Henrikh Mkhitaryan,1989-01-21,35.0,Armenia,Midfield,Central Midfield,both,Football Club Internazionale Milano S.p.A.,2023,...,37000000,ukrainian-super-cup,Ukraine,2012-07-09,UKR1,FC Shakhtar Donetsk,15,2,3.0,3.0
48,75615,Douglas Costa,1990-09-14,33.0,Brazil,Attack,Right Winger,left,Juventus Football Club,2020,...,55000000,ukrainian-super-cup,Ukraine,2012-07-09,UKR1,FC Shakhtar Donetsk,15,3,9.0,2.0
101,73185,Lucas Pérez,1988-09-10,35.0,Spain,Attack,Centre-Forward,left,Cádiz Club de Fútbol S.A.D,2022,...,17000000,premier-liga,Ukraine,2012-07-13,UKR1,Karpaty Lviv (-2021),15,3,2.0,2.0
289,42678,Salvatore Bocchetti,1986-11-30,37.0,Italy,Defender,Centre-Back,left,Verona Hellas Football Club,2020,...,14000000,russian-super-cup,Russia,2012-07-14,RU1,FC Rubin Kazan,11,1,1.0,2.0


country_of_citizenship

In [31]:
#Define mapping of 'country_of_citizenship' to numerical IDs
country_of_citizenship_mapping = {'Albania': 1,
'Algeria': 2,
'Angola': 3,
'Argentina': 4,
'Armenia': 5,
'Australia': 6,
'Austria': 7,
'Belgium': 8,
'Benin': 9,
'Bosnia-Herzegovina': 10,
'Brazil': 11,
'Bulgaria': 12,
'Burkina Faso': 13,
'Burundi': 14,
'Cameroon': 15,
'Canada': 16,
'Cape Verde': 17,
'Central African Republic': 18,
'Chad': 19,
'Chile': 20,
'China': 21,
'Colombia': 22,
'Comoros': 23,
'Congo': 24,
'Costa Rica': 25,
'Cote dIvoire': 26,
'Croatia': 27,
'Cuba': 28,
'Cyprus': 29,
'Czech Republic': 30,
'Denmark': 31,
'Dominican Republic': 32,
'DR Congo': 33,
'Ecuador': 34,
'Egypt': 35,
'England': 36,
'Equatorial Guinea': 37,
'Estonia': 38,
'Faroe Islands': 39,
'Finland': 40,
'France': 41,
'French Guiana': 42,
'Gabon': 43,
'Georgia': 44,
'Germany': 45,
'Ghana': 46,
'Greece': 47,
'Grenada': 48,
'Guadeloupe': 49,
'Guinea': 50,
'Guinea-Bissau': 51,
'Haiti': 52,
'Honduras': 53,
'Hungary': 54,
'Iceland': 55,
'Iran': 56,
'Ireland': 57,
'Israel': 58,
'Italy': 59,
'Jamaica': 60,
'Japan': 61,
'Jordan': 62,
'Kenya': 63,
'Korea, South': 64,
'Kosovo': 65,
'Latvia': 66,
'Libya': 67,
'Lithuania': 68,
'Luxembourg': 69,
'Madagascar': 70,
'Mali': 71,
'Malta': 72,
'Martinique': 73,
'Mauritania': 74,
'Mexico': 75,
'Moldova': 76,
'Montenegro': 77,
'Montserrat': 78,
'Morocco': 79,
'Mozambique': 80,
'nan': 81,
'Netherlands': 82,
'Neukaledonien': 83,
'New Zealand': 84,
'Nigeria': 85,
'North Macedonia': 86,
'Northern Ireland': 87,
'Norway': 88,
'Panama': 89,
'Paraguay': 90,
'Peru': 91,
'Philippines': 92,
'Poland': 93,
'Portugal': 94,
'Romania': 95,
'Russia': 96,
'Scotland': 97,
'Senegal': 98,
'Serbia': 99,
'Sierra Leone': 100,
'Slovakia': 101,
'Slovenia': 102,
'South Africa': 103,
'Spain': 104,
'St. Kitts & Nevis': 105,
'Suriname': 106,
'Sweden': 107,
'Switzerland': 108,
'Syria': 109,
'The Gambia': 110,
'Togo': 111,
'Tunisia': 112,
'Turkey': 113,
'Türkiye': 114,
'Ukraine': 115,
'United States': 116,
'Uruguay': 117,
'Uzbekistan': 118,
'Venezuela': 119,
'Wales': 120,
'Zambia': 121,
'Zimbabwe': 122,
}

# Replace 'Cote d'Ivoire' with 'Cote dIvoire'
big5_df['country_of_citizenship'] = big5_df['country_of_citizenship'].replace("Cote d'Ivoire", "Cote dIvoire")

#Apply mapping to create new variable 'country_of_citizen#'
big5_df['country_of_citizen#'] = big5_df['country_of_citizenship'].map(country_of_citizenship_mapping)

#check new variable and check accuracy 
big5_df.head()

Unnamed: 0,player_id,player_name,date_of_birth,age,country_of_citizenship,position,sub_position,foot,current_club_name,last_season,...,competition_name,competition_country,date,domestic_league_code,club_name,comp_countries#,position#,sub_position#,foot#,country_of_citizen#
29,26267,Fernandinho,1985-05-04,38.0,Brazil,Midfield,Defensive Midfield,right,Manchester City Football Club,2021,...,ukrainian-super-cup,Ukraine,2012-07-09,UKR1,FC Shakhtar Donetsk,15,2,4.0,1.0,11
39,55735,Henrikh Mkhitaryan,1989-01-21,35.0,Armenia,Midfield,Central Midfield,both,Football Club Internazionale Milano S.p.A.,2023,...,ukrainian-super-cup,Ukraine,2012-07-09,UKR1,FC Shakhtar Donetsk,15,2,3.0,3.0,5
48,75615,Douglas Costa,1990-09-14,33.0,Brazil,Attack,Right Winger,left,Juventus Football Club,2020,...,ukrainian-super-cup,Ukraine,2012-07-09,UKR1,FC Shakhtar Donetsk,15,3,9.0,2.0,11
101,73185,Lucas Pérez,1988-09-10,35.0,Spain,Attack,Centre-Forward,left,Cádiz Club de Fútbol S.A.D,2022,...,premier-liga,Ukraine,2012-07-13,UKR1,Karpaty Lviv (-2021),15,3,2.0,2.0,104
289,42678,Salvatore Bocchetti,1986-11-30,37.0,Italy,Defender,Centre-Back,left,Verona Hellas Football Club,2020,...,russian-super-cup,Russia,2012-07-14,RU1,FC Rubin Kazan,11,1,1.0,2.0,59


'current_club_name'

In [32]:
#Define mapping of 'current_club_name' to numerical IDs
current_club_mapping = {'1. FC Union Berlin': 1,
'1. Fußball- und Sportverein Mainz 05': 2,
'1. Fußballclub Heidenheim 1846': 3,
'1. Fußball-Club Köln': 4,
'AC Ajaccio': 5,
'AJ Auxerre': 6,
'Angers SCO': 7,
'Arminia Bielefeld': 8,
'Arsenal Football Club': 9,
'AS Saint-Étienne': 10,
'Association Football Club Bournemouth': 11,
'Association sportive de Monaco Football Club': 12,
'Associazione Calcio Fiorentina': 13,
'Associazione Calcio Milan': 14,
'Associazione Calcio Monza': 15,
'Associazione Sportiva Roma': 16,
'Aston Villa Football Club': 17,
'Atalanta Bergamasca Calcio S.p.a.': 18,
'Athletic Club Bilbao': 19,
'Bayer 04 Leverkusen Fußball': 20,
'Benevento Calcio': 21,
'Bologna Football Club 1909': 22,
'Borussia Dortmund': 23,
'Borussia Verein für Leibesübungen 1900 Mönchengladbach': 24,
'Brentford Football Club': 25,
'Brighton and Hove Albion Football Club': 26,
'Burnley Football Club': 27,
'Cádiz Club de Fútbol S.A.D': 28,
'Cagliari Calcio': 29,
'Chelsea Football Club': 30,
'Clermont Foot 63': 31,
'Club Atlético de Madrid S.A.D.': 32,
'Club Atlético Osasuna': 33,
'Crystal Palace Football Club': 34,
'Deportivo Alavés S.A.D.': 35,
'Dijon FCO': 36,
'Eintracht Frankfurt Fußball AG': 37,
'Elche CF': 38,
'Empoli Football Club S.r.l.': 39,
'ESTAC Troyes': 40,
'Everton Football Club': 41,
'FC Augsburg 1907': 42,
'FC Bayern München': 43,
'FC Crotone': 44,
'FC Girondins Bordeaux': 45,
'FC Schalke 04': 46,
'Football Club de Metz': 47,
'Football Club de Nantes': 48,
'Football Club Internazionale Milano S.p.A.': 49,
'Football Club Lorient-Bretagne Sud': 50,
'Frosinone Calcio S.r.l.': 51,
'Fulham Football Club': 52,
'Futbol Club Barcelona': 53,
'Genoa Cricket and Football Club': 54,
'Getafe Club de Fútbol S.A.D. Team Dubai': 55,
'Girona Fútbol Club S. A. D.': 56,
'Granada Club de Fútbol S.A.D.': 57,
'Hertha BSC': 58,
'Juventus Football Club': 59,
'Le Havre Athletic Club': 60,
'Leeds United': 61,
'Leicester City': 62,
'Levante UD': 63,
'Lille Olympique Sporting Club Lille Métropole': 64,
'Liverpool Football Club': 65,
'Luton Town Football Club': 66,
'Manchester City Football Club': 67,
'Manchester United Football Club': 68,
'Montpellier Hérault Sport Club': 69,
'Newcastle United Football Club': 70,
'Nîmes Olympique': 71,
'Norwich City': 72,
'Nottingham Forest Football Club': 73,
'Olympique de Marseille': 74,
'Olympique Gymnaste Club Nice Côte dAzur': 75,
'Olympique Lyonnais': 76,
'Paris Saint-Germain Football Club': 77,
'Parma Calcio 1913': 78,
'Racing Club de Lens': 79,
'Racing Club de Strasbourg Alsace': 80,
'RasenBallsport Leipzig': 81,
'Rayo Vallecano de Madrid S.A.D.': 82,
'RCD Espanyol Barcelona': 83,
'Real Betis Balompié S.A.D.': 84,
'Real Club Celta de Vigo S. A. D.': 85,
'Real Club Deportivo Mallorca S.A.D.': 86,
'Real Madrid Club de Fútbol': 87,
'Real Sociedad de Fútbol S.A.D.': 88,
'Real Valladolid CF': 89,
'SD Eibar': 90,
'SD Huesca': 91,
'Sevilla Fútbol Club S.A.D.': 92,
'Sheffield United Football Club': 93,
'Società Sportiva Calcio Napoli': 94,
'Società Sportiva Lazio S.p.A.': 95,
'Southampton FC': 96,
'Spezia Calcio': 97,
'Sport-Club Freiburg': 98,
'Sportverein Darmstadt 1898 e. V.': 99,
'Sportverein Werder Bremen von 1899': 100,
'SpVgg Greuther Fürth': 101,
'Stade brestois 29': 102,
'Stade de Reims': 103,
'Stade Rennais Football Club': 104,
'Torino Calcio': 105,
'Tottenham Hotspur Football Club': 106,
'Toulouse Football Club': 107,
'TSG 1899 Hoffenheim Fußball-Spielbetriebs GmbH': 108,
'U.S. Salernitana 1919 S.r.l.': 109,
'UC Sampdoria': 110,
'Udinese Calcio': 111,
'Unión Deportiva Almería S.A.D.': 112,
'Unión Deportiva Las Palmas S.A.D.': 113,
'Unione Sportiva Lecce': 114,
'Unione Sportiva Sassuolo Calcio': 115,
'US Cremonese': 116,
'Valencia Club de Fútbol S. A. D.': 117,
'Venezia FC': 118,
'Verein für Bewegungsspiele Stuttgart 1893': 119,
'Verein für Leibesübungen Bochum 1848 – Fußballgemeinschaft': 120,
'Verein für Leibesübungen Wolfsburg': 121,
'Verona Hellas Football Club': 122,
'Villarreal Club de Fútbol S.A.D.': 123,
'Watford FC': 124,
'West Bromwich Albion': 125,
'West Ham United Football Club': 126,
'Wolverhampton Wanderers Football Club': 127,
 }

# Replace 'Olympique Gymnaste Club Nice Côte d'Azur' with 'Olympique Gymnaste Club Nice Côte dAzur'
big5_df['current_club_name'] = big5_df['current_club_name'].replace("Olympique Gymnaste Club Nice Côte d'Azur", 
                                                                              "Olympique Gymnaste Club Nice Côte dAzur")

#Apply mapping to create new variable ''current_club_name#'
big5_df['current_club_name#'] = big5_df['current_club_name'].map(current_club_mapping)

#check new variable and check accuracy 
big5_df.head()

Unnamed: 0,player_id,player_name,date_of_birth,age,country_of_citizenship,position,sub_position,foot,current_club_name,last_season,...,competition_country,date,domestic_league_code,club_name,comp_countries#,position#,sub_position#,foot#,country_of_citizen#,current_club_name#
29,26267,Fernandinho,1985-05-04,38.0,Brazil,Midfield,Defensive Midfield,right,Manchester City Football Club,2021,...,Ukraine,2012-07-09,UKR1,FC Shakhtar Donetsk,15,2,4.0,1.0,11,67
39,55735,Henrikh Mkhitaryan,1989-01-21,35.0,Armenia,Midfield,Central Midfield,both,Football Club Internazionale Milano S.p.A.,2023,...,Ukraine,2012-07-09,UKR1,FC Shakhtar Donetsk,15,2,3.0,3.0,5,49
48,75615,Douglas Costa,1990-09-14,33.0,Brazil,Attack,Right Winger,left,Juventus Football Club,2020,...,Ukraine,2012-07-09,UKR1,FC Shakhtar Donetsk,15,3,9.0,2.0,11,59
101,73185,Lucas Pérez,1988-09-10,35.0,Spain,Attack,Centre-Forward,left,Cádiz Club de Fútbol S.A.D,2022,...,Ukraine,2012-07-13,UKR1,Karpaty Lviv (-2021),15,3,2.0,2.0,104,28
289,42678,Salvatore Bocchetti,1986-11-30,37.0,Italy,Defender,Centre-Back,left,Verona Hellas Football Club,2020,...,Russia,2012-07-14,RU1,FC Rubin Kazan,11,1,1.0,2.0,59,122


'club_name'

In [33]:
#Define mapping of 'club_name' to numerical IDs
club_name_mapping = {'1. FC Union Berlin': 1,
'1. Fußball- und Sportverein Mainz 05': 2,
'1. Fußballclub Heidenheim 1846': 3,
'1. Fußball-Club Köln': 4,
'1.FC Nuremberg': 5,
'A.G.S Asteras Tripolis': 6,
'Aalborg BK': 7,
'Aarhus Gymnastik Forening': 8,
'Aberdeen Football Club': 9,
'AC Ajaccio': 10,
'AC Carpi': 11,
'AC Horsens': 12,
'Académica Coimbra': 13,
'Adana Demirspor Kulübü': 14,
'ADO Den Haag': 15,
'AE Larisa': 16,
'AFC Ajax Amsterdam': 17,
'AJ Auxerre': 18,
'Alanyaspor': 19,
'Alkmaar Zaanstreek': 20,
'Amiens SC': 21,
'Angers SCO': 22,
'Antalyaspor': 23,
'Anzhi Makhachkala ( -2022)': 24,
'AO FK Zenit Sankt-Peterburg': 25,
'AO Platanias': 26,
'APO Levadiakos': 27,
'APS Atromitos Athinon': 28,
'Arminia Bielefeld': 29,
'Arsenal Football Club': 30,
'Arsenal Kyiv': 31,
'Arsenal Tula': 32,
'AS Nancy-Lorraine': 33,
'AS Saint-Étienne': 34,
'Association Football Club Bournemouth': 35,
'Association sportive de Monaco Football Club': 36,
'Associazione Calcio Fiorentina': 37,
'Associazione Calcio Milan': 38,
'Associazione Calcio Monza': 39,
'Associazione Sportiva Roma': 40,
'Aston Villa Football Club': 41,
'Atalanta Bergamasca Calcio S.p.a.': 42,
'Athletic Club Bilbao': 43,
'Athlitiki Enosi Konstantinoupoleos': 44,
'B SAD': 45,
'Bayer 04 Leverkusen Fußball': 46,
'Beerschot V.A.': 47,
'Benevento Calcio': 48,
'Beşiktaş Jimnastik Kulübü': 49,
'Boavista Futebol Clube': 50,
'Bologna Football Club 1909': 51,
'Borussia Dortmund': 52,
'Borussia Verein für Leibesübungen 1900 Mönchengladbach': 53,
'Brentford Football Club': 54,
'Brescia Calcio': 55,
'Brighton and Hove Albion Football Club': 56,
'Brøndby Idrætsforening': 57,
'Burnley Football Club': 58,
'Bursaspor': 59,
'Cádiz Club de Fútbol S.A.D': 60,
'Cagliari Calcio': 61,
'Cardiff City': 62,
'Catania FC': 63,
'Çaykur Rizespor Kulübü': 64,
'CD Leganés': 65,
'CD Nacional': 66,
'CD Tondela': 67,
'Cercle Brugge Koninklijke Sportvereniging': 68,
'Cesena FC': 69,
'CF Os Belenenses': 70,
'Chelsea Football Club': 71,
'Chievo Verona': 72,
'Clermont Foot 63': 73,
'Club Atlético de Madrid S.A.D.': 74,
'Club Atlético Osasuna': 75,
'Club Brugge Koninklijke Voetbalvereniging': 76,
'Córdoba CF': 77,
'Crystal Palace Football Club': 78,
'CS Marítimo': 79,
'De Graafschap Doetinchem': 80,
'Delfino Pescara 1936': 81,
'Deportivo Alavés S.A.D.': 82,
'Deportivo de La Coruña': 83,
'Desna Chernigiv': 84,
'Desportivo Aves (- 2020)': 85,
'Dijon FCO': 86,
'Dnipro Dnipropetrovsk (-2020)': 87,
'Dundee Football Club': 88,
'Dundee United FC': 89,
'EA Guingamp': 90,
'Eindhovense Voetbalvereniging Philips Sport Vereniging': 91,
'Eintracht Braunschweig': 92,
'Eintracht Frankfurt Fußball AG': 93,
'Elazigspor': 94,
'Elche CF': 95,
'Empoli Football Club S.r.l.': 96,
'Erzurumspor FK': 97,
'Esbjerg fB': 98,
'ESTAC Troyes': 99,
'Everton Football Club': 100,
'Excelsior Rotterdam': 101,
'Fatih Karagümrük Sportif Faaliyetler San. Tic. A.Ş.': 102,
'FC Augsburg 1907': 103,
'FC Bayern München': 104,
'FC Crotone': 105,
'FC Dordrecht': 106,
'FC Emmen': 107,
'FC Girondins Bordeaux': 108,
'FC Groningen': 109,
'FC Ingolstadt 04': 110,
'FC Orenburg': 111,
'FC Paços de Ferreira': 112,
'FC Rubin Kazan': 113,
'FC Schalke 04': 114,
'FC Shakhtar Donetsk': 115,
'FC Sochaux-Montbéliard': 116,
'Fenerbahçe Spor Kulübü': 117,
'Feyenoord Rotterdam': 118,
'FK Dinamo Moskva': 119,
'FK Khimki': 120,
'FK Krasnodar': 121,
'FK Mariupol': 122,
'FK Oleksandriya': 123,
'FK Rostov': 124,
'FK Sevastopol (- 2014)': 125,
'FK Sochi': 126,
'FK Spartak Moskva': 127,
'FK Ufa': 128,
'FK Ural Yekaterinburg': 129,
'FK Zarya Lugansk': 130,
'Fodbold Club Midtjylland': 131,
'Fodbold Club Nordsjælland': 132,
'Football Club de Metz': 133,
'Football Club de Nantes': 134,
'Football Club Internazionale Milano S.p.A.': 135,
'Football Club København': 136,
'Football Club Lorient-Bretagne Sud': 137,
'Football Club Twente': 138,
'Football Club Utrecht': 139,
'Football Club Volendam': 140,
'Fortuna Düsseldorf': 141,
'Fortuna Sittardia Combinatie': 142,
'Frosinone Calcio S.r.l.': 143,
'Fulham Football Club': 144,
'Futbol Club Barcelona': 145,
'Futbolniy Klub Dynamo Kyiv': 146,
'Futebol Clube de Arouca': 147,
'Futebol Clube de Famalicão': 148,
'Futebol Clube de Vizela': 149,
'Futebol Clube do Porto': 150,
'Galatasaray Spor Kulübü': 151,
'Gaziantep Futbol Kulübü A.Ş.': 152,
'Gaziantepspor (- 2020)': 153,
'Genclerbirligi Ankara': 154,
'Genoa Cricket and Football Club': 155,
'Getafe Club de Fútbol S.A.D. Team Dubai': 156,
'GFC Ajaccio': 157,
'Gil Vicente Futebol Clube': 158,
'Girona Fútbol Club S. A. D.': 159,
'Go Ahead Eagles': 160,
'Granada Club de Fútbol S.A.D.': 161,
'Grupo Desportivo de Chaves': 162,
'Grupo Desportivo Estoril Praia': 163,
'GS Ergotelis': 164,
'Hamburger SV': 165,
'Hamilton Academical FC': 166,
'Hannover 96': 167,
'Hatayspor Futbol Kulübü': 168,
'Heart of Midlothian Football Club': 169,
'Heracles Almelo': 170,
'Hertha BSC': 171,
'Hibernian Football Club': 172,
'Hobro IK': 173,
'Huddersfield Town': 174,
'Hull City': 175,
'Inverness Caledonian Thistle FC': 176,
'İstanbul Başakşehir Futbol Kulübü': 177,
'Juventus Football Club': 178,
'Kardemir Karabükspor': 179,
'Karpaty Lviv (-2021)': 180,
'Kasımpaşa Spor Kulübü': 181,
'Kayseri Erciyesspor': 182,
'Kayserispor Kulübü': 183,
'Kilmarnock Football Club': 184,
'Königliche Allgemeine Sportvereinigung Eupen': 185,
'Koninklijke Atletiek Associatie Gent': 186,
'Koninklijke Racing Club Genk': 187,
'Koninklijke Sint-Truidense Voetbalvereniging': 188,
'Koninklijke Voetbal Club Westerlo': 189,
'Koninklijke Voetbalclub Kortrijk': 190,
'Konyaspor': 191,
'KV Oostende': 192,
'Le Havre Athletic Club': 193,
'Leeds United': 194,
'Leicester City': 195,
'Levante UD': 196,
'Lierse SK (- 2018)': 197,
'Lille Olympique Sporting Club Lille Métropole': 198,
'Liverpool Football Club': 199,
'Livingston Football Club': 200,
'Luton Town Football Club': 201,
'Lyngby Boldklubben af 1921': 202,
'Makina ve Kimya Endüstrisi Ankaragücü Spor Kulübü': 203,
'Málaga CF': 204,
'Manchester City Football Club': 205,
'Manchester United Football Club': 206,
'Metalist Kharkiv (- 2016)': 207,
'Middlesbrough FC': 208,
'Montpellier Hérault Sport Club': 209,
'Moreirense Futebol Clube': 210,
'Motherwell Football Club': 211,
'NAC Breda': 212,
'Neos Podosferikos Syllogos Volou': 213,
'Newcastle United Football Club': 214,
'Nijmegen Eendracht Combinatie': 215,
'Nîmes Olympique': 216,
'Norwich City': 217,
'Nottingham Forest Football Club': 218,
'Odense Boldklub': 219,
'Olympiakos Syndesmos Filathlon Peiraios': 220,
'Olympique de Marseille': 221,
'Olympique Gymnaste Club Nice Côte dAzur': 222,
'Olympique Lyonnais': 223,
'Omilos Filathlon Irakliou FC': 224,
'Oud-Heverlee Leuven': 225,
'Palermo FC': 226,
'Panathinaikos Athlitikos Omilos': 227,
'Panipirotikos Athlitikos Syllogos Giannina': 228,
'Panthessalonikios Athlitikos Omilos Konstantinoupoliton': 229,
'Paris Saint-Germain Football Club': 230,
'Parma Calcio 1913': 231,
'Partick Thistle FC': 232,
'PFK CSKA Moskva': 233,
'PFK Krylya Sovetov Samara': 234,
'PFK Tambov (-2021)': 235,
'Portimonense Futebol SAD': 236,
'Prins Hendrik Ende Desespereert Nimmer Combinatie Zwolle': 237,
'Queens Park Rangers': 238,
'Racing Club de Lens': 239,
'Racing Club de Strasbourg Alsace': 240,
'Rangers Football Club': 241,
'RasenBallsport Leipzig': 242,
'Rayo Vallecano de Madrid S.A.D.': 243,
'RCD Espanyol Barcelona': 244,
'Reading FC': 245,
'Real Betis Balompié S.A.D.': 246,
'Real Club Celta de Vigo S. A. D.': 247,
'Real Club Deportivo Mallorca S.A.D.': 248,
'Real Madrid Club de Fútbol': 249,
'Real Sociedad de Fútbol S.A.D.': 250,
'Real Valladolid CF': 251,
'Real Zaragoza': 252,
'RFC Seraing': 253,
'RFK Akhmat Grozny': 254,
'Rio Ave Futebol Clube': 255,
'Roda JC Kerkrade': 256,
'Rooms Katholieke Combinatie Waalwijk': 257,
'Ross County Football Club': 258,
'Royal Antwerp Football Club': 259,
'Royal Charleroi Sporting Club': 260,
'Royal Excel Mouscron (-2022)': 261,
'Royal Sporting Club Anderlecht': 262,
'Royal Standard Club de Liège': 263,
'Royale Union Saint-Gilloise': 264,
'Saint Johnstone Football Club': 265,
'Saint Mirren Football Club': 266,
'SC Bastia': 267,
'SC Cambuur Leeuwarden': 268,
'SC Olhanense': 269,
'SC Paderborn 07': 270,
'SD Eibar': 271,
'SD Huesca': 272,
'Sevilla Fútbol Club S.A.D.': 273,
'Sheffield United Football Club': 274,
'Siena FC': 275,
'Silkeborg Idrætsforening': 276,
'Sivasspor Kulübü': 277,
'SK Beveren': 278,
'SM Caen': 279,
'Società Sportiva Calcio Napoli': 280,
'Società Sportiva Lazio S.p.A.': 281,
'SönderjyskE': 282,
'Southampton FC': 283,
'SPAL': 284,
'Sparta Rotterdam': 285,
'Spezia Calcio': 286,
'Sport Klub Dnipro-1': 287,
'Sport Lisboa e Benfica': 288,
'Sport-Club Freiburg': 289,
'Sportclub Heerenveen': 290,
'Sporting Clube de Braga': 291,
'Sporting Clube de Portugal': 292,
'Sporting Gijón': 293,
'Sportverein Darmstadt 1898 e. V.': 294,
'Sportverein Werder Bremen von 1899': 295,
'SpVgg Greuther Fürth': 296,
'Stade brestois 29': 297,
'Stade de Reims': 298,
'Stade Rennais Football Club': 299,
'Stichting Betaald Voetbal Vitesse Arnhem': 300,
'Stoke City': 301,
'Sunderland AFC': 302,
'SV Zulte Waregem': 303,
'Swansea City': 304,
'The Celtic Football Club': 305,
'Thonon Évian Grand Genève FC': 306,
'Torino Calcio': 307,
'Tottenham Hotspur Football Club': 308,
'Toulouse Football Club': 309,
'Trabzonspor Kulübü': 310,
'TSG 1899 Hoffenheim Fußball-Spielbetriebs GmbH': 311,
'U.S. Salernitana 1919 S.r.l.': 312,
'UC Sampdoria': 313,
'Udinese Calcio': 314,
'Unión Deportiva Almería S.A.D.': 315,
'Unión Deportiva Las Palmas S.A.D.': 316,
'Unione Sportiva Lecce': 317,
'Unione Sportiva Sassuolo Calcio': 318,
'US Cremonese': 319,
'US Livorno 1915': 320,
'Valencia Club de Fútbol S. A. D.': 321,
'Valenciennes FC': 322,
'Vejle Boldklub': 323,
'Vendsyssel FF': 324,
'Venezia FC': 325,
'Verein für Bewegungsspiele Stuttgart 1893': 326,
'Verein für Leibesübungen Bochum 1848 – Fußballgemeinschaft': 327,
'Verein für Leibesübungen Wolfsburg': 328,
'Veria NPS': 329,
'Verona Hellas Football Club': 330,
'Villarreal Club de Fútbol S.A.D.': 331,
'Vitória Setúbal FC': 332,
'Vitória Sport Clube': 333,
'Vorskla Poltava': 334,
'VVV-Venlo': 335,
'Watford FC': 336,
'West Bromwich Albion': 337,
'West Ham United Football Club': 338,
'Wigan Athletic': 339,
'Willem II Tilburg': 340,
'Wolverhampton Wanderers Football Club': 341,
'Yellow-Red Koninklijke Voetbalclub Mechelen': 342,
'Yeni Malatyaspor': 343,
'Футбольный клуб "Локомотив" Москва': 344,
}

# Replace 'Olympique Gymnaste Club Nice Côte d'Azur' with 'Olympique Gymnaste Club Nice Côte dAzur'
big5_df['club_name'] = big5_df['club_name'].replace("Olympique Gymnaste Club Nice Côte d'Azur", 
                                                                              "Olympique Gymnaste Club Nice Côte dAzur")
#Apply mapping to create new variable 'position#'
big5_df['club_name#'] = big5_df['club_name'].map(club_name_mapping)

#check new variable and check accuracy 
big5_df.head()

Unnamed: 0,player_id,player_name,date_of_birth,age,country_of_citizenship,position,sub_position,foot,current_club_name,last_season,...,date,domestic_league_code,club_name,comp_countries#,position#,sub_position#,foot#,country_of_citizen#,current_club_name#,club_name#
29,26267,Fernandinho,1985-05-04,38.0,Brazil,Midfield,Defensive Midfield,right,Manchester City Football Club,2021,...,2012-07-09,UKR1,FC Shakhtar Donetsk,15,2,4.0,1.0,11,67,115
39,55735,Henrikh Mkhitaryan,1989-01-21,35.0,Armenia,Midfield,Central Midfield,both,Football Club Internazionale Milano S.p.A.,2023,...,2012-07-09,UKR1,FC Shakhtar Donetsk,15,2,3.0,3.0,5,49,115
48,75615,Douglas Costa,1990-09-14,33.0,Brazil,Attack,Right Winger,left,Juventus Football Club,2020,...,2012-07-09,UKR1,FC Shakhtar Donetsk,15,3,9.0,2.0,11,59,115
101,73185,Lucas Pérez,1988-09-10,35.0,Spain,Attack,Centre-Forward,left,Cádiz Club de Fútbol S.A.D,2022,...,2012-07-13,UKR1,Karpaty Lviv (-2021),15,3,2.0,2.0,104,28,180
289,42678,Salvatore Bocchetti,1986-11-30,37.0,Italy,Defender,Centre-Back,left,Verona Hellas Football Club,2020,...,2012-07-14,RU1,FC Rubin Kazan,11,1,1.0,2.0,59,122,113


In [34]:
big5_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 521278 entries, 29 to 1556968
Data columns (total 24 columns):
 #   Column                       Non-Null Count   Dtype         
---  ------                       --------------   -----         
 0   player_id                    521278 non-null  object        
 1   player_name                  521278 non-null  object        
 2   date_of_birth                521159 non-null  datetime64[ns]
 3   age                          521159 non-null  float64       
 4   country_of_citizenship       521278 non-null  object        
 5   position                     521278 non-null  object        
 6   sub_position                 521278 non-null  object        
 7   foot                         521278 non-null  object        
 8   current_club_name            521278 non-null  object        
 9   last_season                  521278 non-null  int64         
 10  market_value_in_eur          521278 non-null  int64         
 11  highest_market_value_in_eur  

#### Check missing data

In [35]:
#check for missing values
big5_df.isnull().sum()

player_id                         0
player_name                       0
date_of_birth                   119
age                             119
country_of_citizenship            0
position                          0
sub_position                      0
foot                              0
current_club_name                 0
last_season                       0
market_value_in_eur               0
highest_market_value_in_eur       0
competition_name                  0
competition_country               0
date                              0
domestic_league_code              0
club_name                         0
comp_countries#                   0
position#                         0
sub_position#                    46
foot#                          1210
country_of_citizen#               0
current_club_name#                0
club_name#                        0
dtype: int64

In [36]:
age_nan = big5_df[big5_df['age'].isnull()==True]
age_nan

Unnamed: 0,player_id,player_name,date_of_birth,age,country_of_citizenship,position,sub_position,foot,current_club_name,last_season,...,date,domestic_league_code,club_name,comp_countries#,position#,sub_position#,foot#,country_of_citizen#,current_club_name#,club_name#
5601,58124,Julian Schieber,NaT,,Germany,Attack,Centre-Forward,left,FC Augsburg 1907,2020,...,2012-08-12,L1,Borussia Dortmund,5,3,2.0000,2.0000,45,42,52
5998,58124,Julian Schieber,NaT,,Germany,Attack,Centre-Forward,left,FC Augsburg 1907,2020,...,2012-08-18,L1,Borussia Dortmund,5,3,2.0000,2.0000,45,42,52
14540,58124,Julian Schieber,NaT,,Germany,Attack,Centre-Forward,left,FC Augsburg 1907,2020,...,2012-09-01,L1,Borussia Dortmund,5,3,2.0000,2.0000,45,42,52
20068,58124,Julian Schieber,NaT,,Germany,Attack,Centre-Forward,left,FC Augsburg 1907,2020,...,2012-09-18,,Borussia Dortmund,0,3,2.0000,2.0000,45,42,52
21709,58124,Julian Schieber,NaT,,Germany,Attack,Centre-Forward,left,FC Augsburg 1907,2020,...,2012-09-22,L1,Borussia Dortmund,5,3,2.0000,2.0000,45,42,52
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
903868,58124,Julian Schieber,NaT,,Germany,Attack,Centre-Forward,left,FC Augsburg 1907,2020,...,2019-05-05,L1,FC Augsburg 1907,5,3,2.0000,2.0000,45,42,103
908252,58124,Julian Schieber,NaT,,Germany,Attack,Centre-Forward,left,FC Augsburg 1907,2020,...,2019-05-18,L1,FC Augsburg 1907,5,3,2.0000,2.0000,45,42,103
916113,58124,Julian Schieber,NaT,,Germany,Attack,Centre-Forward,left,FC Augsburg 1907,2020,...,2019-08-10,L1,FC Augsburg 1907,5,3,2.0000,2.0000,45,42,103
937823,58124,Julian Schieber,NaT,,Germany,Attack,Centre-Forward,left,FC Augsburg 1907,2020,...,2019-09-28,L1,FC Augsburg 1907,5,3,2.0000,2.0000,45,42,103


In [37]:
#missing date_of_birth and ages can be droped
big5_df.dropna(subset=['age', 'date_of_birth'],inplace = True)

In [38]:
sub_position_nan = big5_df[big5_df['sub_position#'].isnull()==True]
sub_position_nan
#missing sub_position data can be deleted

Unnamed: 0,player_id,player_name,date_of_birth,age,country_of_citizenship,position,sub_position,foot,current_club_name,last_season,...,date,domestic_league_code,club_name,comp_countries#,position#,sub_position#,foot#,country_of_citizen#,current_club_name#,club_name#
994664,131480,Il-lok Yun,1992-03-07,32.0,"Korea, South",Missing,,right,Montpellier Hérault Sport Club,2020,...,2020-02-05,FR1,Montpellier Hérault Sport Club,4,0,,1.0,64,69,209
998010,131480,Il-lok Yun,1992-03-07,32.0,"Korea, South",Missing,,right,Montpellier Hérault Sport Club,2020,...,2020-02-14,FR1,Montpellier Hérault Sport Club,4,0,,1.0,64,69,209
1001654,131480,Il-lok Yun,1992-03-07,32.0,"Korea, South",Missing,,right,Montpellier Hérault Sport Club,2020,...,2020-02-22,FR1,Montpellier Hérault Sport Club,4,0,,1.0,64,69,209
1005113,131480,Il-lok Yun,1992-03-07,32.0,"Korea, South",Missing,,right,Montpellier Hérault Sport Club,2020,...,2020-02-29,FR1,Montpellier Hérault Sport Club,4,0,,1.0,64,69,209
1010111,131480,Il-lok Yun,1992-03-07,32.0,"Korea, South",Missing,,right,Montpellier Hérault Sport Club,2020,...,2020-03-08,FR1,Montpellier Hérault Sport Club,4,0,,1.0,64,69,209
1048928,158704,Jon Ander Garrido,1989-10-09,34.0,Spain,Missing,,right,Cádiz Club de Fútbol S.A.D,2022,...,2020-10-01,ES1,Cádiz Club de Fútbol S.A.D,13,0,,1.0,104,28,60
1054106,158704,Jon Ander Garrido,1989-10-09,34.0,Spain,Missing,,right,Cádiz Club de Fútbol S.A.D,2022,...,2020-10-17,ES1,Cádiz Club de Fútbol S.A.D,13,0,,1.0,104,28,60
1058855,131480,Il-lok Yun,1992-03-07,32.0,"Korea, South",Missing,,right,Montpellier Hérault Sport Club,2020,...,2020-10-25,FR1,Montpellier Hérault Sport Club,4,0,,1.0,64,69,209
1062022,158704,Jon Ander Garrido,1989-10-09,34.0,Spain,Missing,,right,Cádiz Club de Fútbol S.A.D,2022,...,2020-10-30,ES1,Cádiz Club de Fútbol S.A.D,13,0,,1.0,104,28,60
1063597,131480,Il-lok Yun,1992-03-07,32.0,"Korea, South",Missing,,right,Montpellier Hérault Sport Club,2020,...,2020-11-01,FR1,Montpellier Hérault Sport Club,4,0,,1.0,64,69,209


In [39]:
#missing sub_position data can be deleted
big5_df.dropna(subset=['sub_position#'],inplace = True)

In [40]:
foot_nan = big5_df[big5_df['foot#'].isnull()==True]
foot_nan
#missing foot data can be deleted

Unnamed: 0,player_id,player_name,date_of_birth,age,country_of_citizenship,position,sub_position,foot,current_club_name,last_season,...,date,domestic_league_code,club_name,comp_countries#,position#,sub_position#,foot#,country_of_citizen#,current_club_name#,club_name#
8197,45882,Fabri,1987-12-31,36.0000,Spain,Goalkeeper,Goalkeeper,,Fulham Football Club,2020,...,2012-08-19,ES1,Real Betis Balompié S.A.D.,13,4,5.0000,,104,52,246
10188,77757,Thomas Kaminski,1992-10-23,31.0000,Belgium,Goalkeeper,Goalkeeper,,Luton Town Football Club,2023,...,2012-08-25,BE1,Royal Sporting Club Anderlecht,1,4,5.0000,,8,66,262
10786,45882,Fabri,1987-12-31,36.0000,Spain,Goalkeeper,Goalkeeper,,Fulham Football Club,2020,...,2012-08-25,ES1,Real Betis Balompié S.A.D.,13,4,5.0000,,104,52,246
149067,77757,Thomas Kaminski,1992-10-23,31.0000,Belgium,Goalkeeper,Goalkeeper,,Luton Town Football Club,2023,...,2013-09-14,BE1,Royal Sporting Club Anderlecht,1,4,5.0000,,8,66,262
160126,77757,Thomas Kaminski,1992-10-23,31.0000,Belgium,Goalkeeper,Goalkeeper,,Luton Town Football Club,2023,...,2013-09-29,BE1,Royal Sporting Club Anderlecht,1,4,5.0000,,8,66,262
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1555784,807165,Bamo Meïté,2001-12-03,22.0000,Cote dIvoire,Defender,Centre-Back,,Olympique de Marseille,2023,...,2024-03-17,FR1,Olympique de Marseille,4,1,1.0000,,26,74,221
1555804,619461,Mohamed Bamba,2001-12-10,22.0000,Cote dIvoire,Attack,Centre-Forward,,Football Club Lorient-Bretagne Sud,2023,...,2024-03-17,FR1,Football Club Lorient-Bretagne Sud,4,3,2.0000,,26,50,137
1555820,1028162,Antoine Joujou,2003-03-12,21.0000,France,Attack,Centre-Forward,,Le Havre Athletic Club,2023,...,2024-03-17,FR1,Le Havre Athletic Club,4,3,2.0000,,41,60,193
1555852,1178474,Amadou Koné,2005-05-14,18.0000,Cote dIvoire,Midfield,Attacking Midfield,,Stade de Reims,2023,...,2024-03-17,FR1,Stade de Reims,4,2,8.0000,,26,103,298


In [41]:
#missing foot data can be deleted
big5_df.dropna(subset=['foot#'],inplace = True)

In [42]:
#check for missing values
big5_df.isnull().sum()

player_id                      0
player_name                    0
date_of_birth                  0
age                            0
country_of_citizenship         0
position                       0
sub_position                   0
foot                           0
current_club_name              0
last_season                    0
market_value_in_eur            0
highest_market_value_in_eur    0
competition_name               0
competition_country            0
date                           0
domestic_league_code           0
club_name                      0
comp_countries#                0
position#                      0
sub_position#                  0
foot#                          0
country_of_citizen#            0
current_club_name#             0
club_name#                     0
dtype: int64

##### Check for mixed types and data type changes

In [43]:
#check for mixed type data
for col in big5_df.columns.tolist():
      weird = (big5_df[[col]].map(type) != big5_df[[col]].iloc[0].apply(type)).any(axis = 1)
      if len (big5_df[weird]) > 0:
        print (col)

In [44]:
#change float to integer
big5_df[['age','sub_position#','foot#','country_of_citizen#']] = big5_df[['age','sub_position#','foot#','country_of_citizen#']].astype('int64')
big5_df[['age','sub_position#','foot#','country_of_citizen#']].dtypes

age                    int64
sub_position#          int64
foot#                  int64
country_of_citizen#    int64
dtype: object

In [45]:
big5_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 519905 entries, 29 to 1556968
Data columns (total 24 columns):
 #   Column                       Non-Null Count   Dtype         
---  ------                       --------------   -----         
 0   player_id                    519905 non-null  object        
 1   player_name                  519905 non-null  object        
 2   date_of_birth                519905 non-null  datetime64[ns]
 3   age                          519905 non-null  int64         
 4   country_of_citizenship       519905 non-null  object        
 5   position                     519905 non-null  object        
 6   sub_position                 519905 non-null  object        
 7   foot                         519905 non-null  object        
 8   current_club_name            519905 non-null  object        
 9   last_season                  519905 non-null  int64         
 10  market_value_in_eur          519905 non-null  int64         
 11  highest_market_value_in_eur  

In [46]:
big5_df[['competition_country']].value_counts()

competition_country
Spain                  95988
England                90807
Italy                  88507
Germany                76429
France                 72224
nan                    46537
Netherlands            13046
Belgium                 9495
Portugal                8307
Turkey                  4209
Denmark                 3833
Russia                  3581
Scotland                3386
Greece                  1884
Ukraine                 1672
Name: count, dtype: int64

In [47]:
#change float to integer
big5_df[['comp_countries#']] = big5_df[['comp_countries#']].astype('int64')
big5_df[['comp_countries#']].dtypes

comp_countries#    int64
dtype: object

In [48]:
#check for missing values
big5_df.isnull().sum()

player_id                      0
player_name                    0
date_of_birth                  0
age                            0
country_of_citizenship         0
position                       0
sub_position                   0
foot                           0
current_club_name              0
last_season                    0
market_value_in_eur            0
highest_market_value_in_eur    0
competition_name               0
competition_country            0
date                           0
domestic_league_code           0
club_name                      0
comp_countries#                0
position#                      0
sub_position#                  0
foot#                          0
country_of_citizen#            0
current_club_name#             0
club_name#                     0
dtype: int64

In [49]:
#Change order of the variables
desired_column_order = ['player_id','player_name', 'date_of_birth', 'age','country_of_citizenship', 'country_of_citizen#','position', 'position#','sub_position', 'sub_position#','foot', 'foot#', 'current_club_name','current_club_name#', 
                        'last_season', 'market_value_in_eur', 'highest_market_value_in_eur', 'competition_name', 'competition_country', 'comp_countries#', 'date',
                        'domestic_league_code', 'club_name', 'club_name#']  

# Reorder the columns of the DataFrame
big5_df = big5_df[desired_column_order]

In [50]:
big5_df.head()

Unnamed: 0,player_id,player_name,date_of_birth,age,country_of_citizenship,country_of_citizen#,position,position#,sub_position,sub_position#,...,last_season,market_value_in_eur,highest_market_value_in_eur,competition_name,competition_country,comp_countries#,date,domestic_league_code,club_name,club_name#
29,26267,Fernandinho,1985-05-04,38,Brazil,11,Midfield,2,Defensive Midfield,4,...,2021,600000,32000000,ukrainian-super-cup,Ukraine,15,2012-07-09,UKR1,FC Shakhtar Donetsk,115
39,55735,Henrikh Mkhitaryan,1989-01-21,35,Armenia,5,Midfield,2,Central Midfield,3,...,2023,6000000,37000000,ukrainian-super-cup,Ukraine,15,2012-07-09,UKR1,FC Shakhtar Donetsk,115
48,75615,Douglas Costa,1990-09-14,33,Brazil,11,Attack,3,Right Winger,9,...,2020,1500000,55000000,ukrainian-super-cup,Ukraine,15,2012-07-09,UKR1,FC Shakhtar Donetsk,115
101,73185,Lucas Pérez,1988-09-10,35,Spain,104,Attack,3,Centre-Forward,2,...,2022,800000,17000000,premier-liga,Ukraine,15,2012-07-13,UKR1,Karpaty Lviv (-2021),180
289,42678,Salvatore Bocchetti,1986-11-30,37,Italy,59,Defender,1,Centre-Back,1,...,2020,200000,14000000,russian-super-cup,Russia,11,2012-07-14,RU1,FC Rubin Kazan,113


In [51]:
big5_df.tail()

Unnamed: 0,player_id,player_name,date_of_birth,age,country_of_citizenship,country_of_citizen#,position,position#,sub_position,sub_position#,...,last_season,market_value_in_eur,highest_market_value_in_eur,competition_name,competition_country,comp_countries#,date,domestic_league_code,club_name,club_name#
1556964,632349,Jarell Quansah,2003-01-29,21,England,36,Defender,1,Centre-Back,1,...,2023,12000000,12000000,fa-cup,England,3,2024-03-17,GB1,Liverpool Football Club,199
1556965,69633,Christian Eriksen,1992-02-14,32,Denmark,31,Midfield,2,Central Midfield,3,...,2023,10000000,100000000,fa-cup,England,3,2024-03-17,GB1,Manchester United Football Club,206
1556966,712117,Bobby Clark,2005-02-07,19,England,36,Midfield,2,Central Midfield,3,...,2023,1000000,1000000,fa-cup,England,3,2024-03-17,GB1,Liverpool Football Club,199
1556967,811779,Alejandro Garnacho,2004-07-01,19,Argentina,4,Attack,3,Left Winger,10,...,2023,40000000,40000000,fa-cup,England,3,2024-03-17,GB1,Manchester United Football Club,206
1556968,820374,Kobbie Mainoo,2005-04-19,18,England,36,Midfield,2,Central Midfield,3,...,2023,35000000,35000000,fa-cup,England,3,2024-03-17,GB1,Manchester United Football Club,206


In [52]:
big5_df.shape

(519905, 24)

In [53]:
#export big5_df
big5_df.to_pickle(os.path.join(path, '02 Data','Prepared Data','wrangled_big5.pkl'))