# Data collection of soccer players

## Libraries

In [145]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split as tts
from scipy.stats import norm, skew, kurtosis, anderson, kstest
import seaborn as sns

## Open csv files

In [146]:
age_df = pd.read_csv("AGE.csv")
date_of_birth =  pd.read_csv('players.csv', usecols= ['player_id', 'birth_date'])
mv_df =  pd.read_csv('market_value_df.csv')
player_data =  pd.read_csv('player table189.csv')
stc_data = pd.read_csv('player_season_preprocessed.csv')

### clean age_df data

In [147]:
age_df.loc[age_df['Age'].str.contains('†'), 'Age'] = age_df['Age'].str.replace('†', '')

### clean player_df data

In [148]:
player_df = player_data.drop(player_data.columns[0], axis=1)
player_df = player_df.drop('birth_date', axis=1)

height_median = player_df['height'].median()
# Replace NaN values with the median of the 'height' column
player_df['height'] = player_df['height'].fillna(height_median)

player_df['main_position'].replace(['Attack Left Winger','Attack Centre Forward', 'Attack Right Winger', 'Attack Second Striker'], 'striker', inplace=True)
player_df['main_position'].replace(['Defender Centre Back', 'Defender Right Back', 'Defender Left Back'], 'defender', inplace=True)
player_df['main_position'].replace(['midfield Central Midfield', 'midfield Attacking Midfield', 'midfield Defensive Midfield', 'midfield', 'midfield Left Midfield', 'midfield Right Midfield'], 'midfielder', inplace=True)

player_df['goals_scored'].replace('-', np.nan, inplace=True)
player_df['goals_assisted'].replace('-', np.nan, inplace=True)
player_df['total_appearence'].replace('-', np.nan, inplace=True)
player_df['agent'] = player_df['agent'].replace('', np.nan)

player_df['goals_conceded'] = np.where(player_df['main_position'] != 'Goalkeeper', 'not defined',
                       np.where(player_df['goals_conceded'] == '-', np.nan, player_df['goals_conceded']))

player_df['clean_sheets'] = np.where(player_df['main_position'] != 'Goalkeeper', 'not defined',
                       np.where(player_df['clean_sheets'] == '-', np.nan, player_df['clean_sheets']))

In [149]:
print(age_df[age_df['Age'].str.contains('†', na=False)])
print(player_df['main_position'].unique())

Empty DataFrame
Columns: [season, Age, player_id, season_short]
Index: []
['Goalkeeper' 'defender' 'midfielder' 'striker']


### clean stc_df data

In [150]:
stc_df = stc_data

stc_df['Appearances'].replace('-', np.nan, inplace=True)
stc_df['PPG'].replace('-', np.nan, inplace=True)
stc_df['PPG'].replace('0,00', 0, inplace=True)
stc_df['Goals'].replace('-', np.nan, inplace=True)
stc_df['Own goals'].replace('-', np.nan, inplace=True)
stc_df['Substitutions on'].replace('-', np.nan, inplace=True)
stc_df['Substitutions off'].replace('-', np.nan, inplace=True)
stc_df['Yellow cards'].replace('-', np.nan, inplace=True)
stc_df['Second yellow cards'].replace('-', np.nan, inplace=True)
stc_df['Red cards'].replace('-', np.nan, inplace=True)
stc_df['Assists'].replace('-', np.nan, inplace=True)
stc_df['Penalty goals'].replace('-', np.nan, inplace=True)
stc_df['Minutes per goal'].replace('-', np.nan, inplace=True)

stc_df['Season']= stc_df['Season'].astype(str)

ppg_median = stc_df['PPG'].median()
# Replace NaN values with the median of the 'PPG' column
stc_df['PPG'] = stc_df['PPG'].fillna(ppg_median)

In [151]:
def year_transformer(raw_year):
    year = raw_year.split('/')[0]
    if len(year) == 2:
        if int(year) <=22:
            return '20' + year
        else:
            return '19' + year
    elif len(year) == 4:
        return year

In [152]:
stc_df['Season'] = stc_df['Season'].apply(lambda x: year_transformer(x))

In [153]:
stc_df.rename(columns={'Season':'season'}, inplace=True)

In [154]:
stc_df['season'] = stc_df['season'].astype(int)

In [155]:
stc_df.head()

Unnamed: 0,player_id,season,Age,Competition,Club,club_id,position_group,New_season,Squad,Appearances,...,Own goals,Assists,Penalty goals,Yellow cards,Second yellow cards,Red cards,Substitutions on,Substitutions off,Minutes played,Minutes per goal
0,2857,2017,32,Premier League,Leicester City,1003,Goalkeeper,False,10,2,...,0,0,0,0,0,0,0,0,180,0
1,16911,2017,30,Premier League,Leicester City,1003,Goalkeeper,False,33,33,...,1,0,0,3,0,0,0,1,2966,0
2,56810,2017,28,Premier League,Leicester City,1003,midfield,False,15,12,...,0,1,0,3,0,0,3,4,761,0
3,61560,2017,27,Premier League,Leicester City,1003,midfield,False,35,34,...,0,7,0,5,0,1,4,10,2543,1272
4,65467,2017,29,Premier League,Leicester City,1003,midfield,False,29,19,...,0,0,0,0,0,0,2,7,1414,471


In [156]:
print(stc_df.dtypes)

player_id                int64
season                   int64
Age                      int64
Competition             object
Club                    object
club_id                  int64
position_group          object
New_season                bool
Squad                    int64
Appearances              int64
club_clean_sheet         int64
player_clean_sheet       int64
Goals conceded           int64
PPG                    float64
Goals                    int64
Own goals                int64
Assists                  int64
Penalty goals            int64
Yellow cards             int64
Second yellow cards      int64
Red cards                int64
Substitutions on         int64
Substitutions off        int64
Minutes played           int64
Minutes per goal         int64
dtype: object


In [157]:
stc_df.shape

(7432, 25)

In [158]:
np.shape(mv_df)

(12594, 3)

In [159]:
np.shape(stc_df)

(7432, 25)

In [189]:
np.unique(stc_df['position_group'])

array(['Attack', 'Defender', 'Goalkeeper', 'midfield'], dtype=object)

In [209]:
Goalkeeper_df = stc_df[stc_df['position_group'] == 'Goalkeeper'] 
striker_df = stc_df[stc_df['position_group'] == 'Attack'] 
defender_df = stc_df[stc_df['position_group'] == 'Defender'] 
midfielder_df = stc_df[stc_df['position_group'] == 'midfield']

In [210]:
np.unique(Goalkeeper_df['position_group'])

array(['Goalkeeper'], dtype=object)

In [211]:
Goalkeeper_df.shape

(794, 25)

In [212]:
Goalkeeper_df.drop(['Goals', 'Own goals', 'Substitutions on', 'Substitutions off',
                   'Yellow cards', 'Second yellow cards', 'Red cards', 'Minutes played',
                   'Minutes per goal'], inplace=True, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Goalkeeper_df.drop(['Goals', 'Own goals', 'Substitutions on', 'Substitutions off',


In [213]:
Goalkeeper_df_m = Goalkeeper_df.merge(mv_df, on=['player_id', 'season'], how='left') 

In [214]:
Goalkeeper_df_m.shape

(801, 17)

In [215]:
Goalkeeper_df_m = Goalkeeper_df_m.dropna(subset=['market_value'])
Goalkeeper_df_m 

Unnamed: 0,player_id,season,Age,Competition,Club,club_id,position_group,New_season,Squad,Appearances,club_clean_sheet,player_clean_sheet,Goals conceded,PPG,Assists,Penalty goals,market_value
0,2857,2017,32,Premier League,Leicester City,1003,Goalkeeper,False,10,2,8,0,6,1.50,0,0,1000000.0
3,16911,2021,34,Premier League,Leicester City,1003,Goalkeeper,False,38,37,7,7,58,1.32,0,0,4000000.0
9,14044,2019,35,Premier League,Liverpool FC,31,Goalkeeper,False,2,0,17,0,0,0.00,0,0,250000.0
15,340918,2020,21,Premier League,Liverpool FC,31,Goalkeeper,False,15,2,12,1,1,1.50,0,0,2500000.0
17,71271,2021,34,Premier League,Liverpool FC,31,Goalkeeper,False,5,0,21,0,0,0.00,0,0,1000000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
783,79045,2020,29,Serie A,Spezia Calcio,3522,Goalkeeper,False,20,7,5,1,12,1.43,0,0,2000000.0
785,79045,2021,30,Serie A,Spezia Calcio,3522,Goalkeeper,False,36,7,8,0,19,0.57,0,0,850000.0
792,208166,2018,20,Serie A,ACF Fiorentina,430,Goalkeeper,False,20,3,10,1,2,1.33,0,0,10000000.0
795,448632,2019,19,Ligue 1,Stade Reims,1421,Goalkeeper,True,1,0,12,0,0,0.00,0,0,350000.0


In [217]:
striker_df.shape

(1940, 25)

In [218]:
striker_df.drop(['Own goals', 'Substitutions on', 'Substitutions off',
                   'Yellow cards', 'Second yellow cards', 'Red cards', 'Minutes played',
                'club_clean_sheet', 'player_clean_sheet', 'Goals conceded'], inplace=True, axis=1)
striker_df_m = striker_df.merge(mv_df, on=['player_id', 'season'], how='left')
striker_df_m = striker_df_m.dropna(subset=['market_value'])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  striker_df.drop(['Own goals', 'Substitutions on', 'Substitutions off',


In [219]:
striker_df_m

Unnamed: 0,player_id,season,Age,Competition,Club,club_id,position_group,New_season,Squad,Appearances,PPG,Goals,Assists,Penalty goals,Minutes per goal,market_value
1,174915,2017,29,Premier League,Leicester City,1003,Attack,False,19,12,1.00,1,1,0,239,18000000.0
2,197838,2017,30,Premier League,Leicester City,1003,Attack,False,37,37,1.19,20,1,5,163,20000000.0
4,295330,2017,20,Premier League,Leicester City,1003,Attack,True,32,21,0.81,3,3,0,274,15000000.0
5,398065,2017,19,Premier League,Leicester City,1003,Attack,False,6,3,1.33,0,0,0,0,250000.0
6,197838,2021,34,Premier League,Leicester City,1003,Attack,False,27,25,1.56,15,2,0,120,5000000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2014,296783,2017,20,Ligue 1,LOSC Lille,1082,Attack,False,36,28,1.11,2,1,0,542,2500000.0
2016,427161,2017,21,Ligue 1,LOSC Lille,1082,Attack,False,14,14,0.93,5,1,0,238,3000000.0
2017,435485,2017,21,Ligue 1,LOSC Lille,1082,Attack,False,37,34,0.82,5,3,0,403,7000000.0
2018,129990,2017,26,Ligue 1,ESTAC Troyes,1095,Attack,False,29,26,0.69,6,0,0,199,3500000.0


In [220]:
defender_df.drop([], inplace=True, axis=1)
defender_df_m = defender_df.merge(mv_df, on=['player_id', 'season'], how='left')
defender_df_m = defender_df_m.dropna(subset=['market_value'])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  defender_df.drop([], inplace=True, axis=1)


In [221]:
defender_df_m

Unnamed: 0,player_id,season,Age,Competition,Club,club_id,position_group,New_season,Squad,Appearances,...,Assists,Penalty goals,Yellow cards,Second yellow cards,Red cards,Substitutions on,Substitutions off,Minutes played,Minutes per goal,market_value
0,177907,2017,24,Premier League,Leicester City,1003,Defender,False,38,38,...,3,0,7,0,0,0,0,3420,1710,25000000.0
1,214056,2017,22,Premier League,Leicester City,1003,Defender,False,19,8,...,0,0,0,1,0,2,2,490,0,5000000.0
2,316125,2017,20,Premier League,Leicester City,1003,Defender,False,32,24,...,2,0,1,1,0,4,3,1852,0,8000000.0
5,42412,2021,33,Premier League,Leicester City,1003,Defender,False,20,18,...,1,0,3,0,0,2,4,1350,1350,4000000.0
6,99331,2021,28,Premier League,Leicester City,1003,Defender,True,25,10,...,0,0,2,0,0,4,1,563,0,13000000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2568,126673,2017,25,Ligue 1,LOSC Lille,1082,Defender,False,17,14,...,0,0,2,0,0,0,3,1194,0,3000000.0
2569,126711,2017,25,Ligue 1,LOSC Lille,1082,Defender,False,23,23,...,5,0,2,0,0,2,2,1816,0,7500000.0
2570,296422,2017,20,Ligue 1,LOSC Lille,1082,Defender,True,34,27,...,0,0,5,0,1,2,6,2057,0,2000000.0
2572,170544,2017,24,Ligue 1,ESTAC Troyes,1095,Defender,False,34,34,...,0,0,7,0,2,0,1,2959,0,6000000.0


In [222]:
midfielder_df.drop([], inplace=True, axis=1)
midfielder_df_m = midfielder_df.merge(mv_df, on=['player_id', 'season'], how='left')
midfielder_df_m = midfielder_df_m.dropna(subset=['market_value'])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  midfielder_df.drop([], inplace=True, axis=1)


In [223]:
midfielder_df_m

Unnamed: 0,player_id,season,Age,Competition,Club,club_id,position_group,New_season,Squad,Appearances,...,Assists,Penalty goals,Yellow cards,Second yellow cards,Red cards,Substitutions on,Substitutions off,Minutes played,Minutes per goal,market_value
1,61560,2017,27,Premier League,Leicester City,1003,midfield,False,35,34,...,7,0,5,0,1,4,10,2543,1272,10000000.0
2,65467,2017,29,Premier League,Leicester City,1003,midfield,False,29,19,...,0,0,0,0,0,2,7,1414,471,10000000.0
4,341501,2017,19,Premier League,Leicester City,1003,midfield,False,8,8,...,1,0,1,0,0,4,2,375,0,1000000.0
7,129588,2021,27,Premier League,Leicester City,1003,midfield,False,3,0,...,0,0,0,0,0,0,0,0,0,13000000.0
8,249565,2021,24,Premier League,Leicester City,1003,midfield,False,33,32,...,4,2,3,0,0,3,5,2634,439,55000000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2222,135617,2017,25,Ligue 1,LOSC Lille,1082,midfield,False,32,31,...,4,0,3,1,0,4,9,2298,766,5500000.0
2224,277635,2017,20,Ligue 1,LOSC Lille,1082,midfield,False,9,6,...,1,0,0,0,0,5,0,195,0,600000.0
2225,290256,2017,22,Ligue 1,LOSC Lille,1082,midfield,False,2,1,...,0,0,0,0,0,1,0,14,0,5500000.0
2227,410425,2017,20,Ligue 1,LOSC Lille,1082,midfield,False,30,24,...,1,0,3,0,0,9,3,1555,778,8000000.0


In [None]:
Goalkeeper_df_m.to_csv('Goalkeeper_df.csv')
striker_df_m.to_csv('striker_df.csv')
defender_df_m.to_csv('defender_df.csv')
midfielder_df_m.to_csv('midfielder_df.csv')