# Data collection of soccer players

## Libraries

In [122]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split as tts
from scipy.stats import norm, skew, kurtosis, anderson, kstest
import seaborn as sns

## Open csv files

In [123]:
age_df = pd.read_csv("AGE.csv")
date_of_birth =  pd.read_csv('players.csv', usecols= ['player_id', 'birth_date'])
mv_df =  pd.read_csv('market_value_df.csv')
player_data =  pd.read_csv('player table189.csv')
stc_data = pd.read_csv('player_season_preprocessed.csv')

### clean age_df data

In [124]:
age_df.loc[age_df['Age'].str.contains('†'), 'Age'] = age_df['Age'].str.replace('†', '')

### clean player_df data

In [125]:
player_df = player_data.drop(player_data.columns[0], axis=1)
player_df = player_df.drop('birth_date', axis=1)

height_median = player_df['height'].median()
# Replace NaN values with the median of the 'height' column
player_df['height'] = player_df['height'].fillna(height_median)

player_df['main_position'].replace(['Attack Left Winger','Attack Centre Forward', 'Attack Right Winger', 'Attack Second Striker'], 'striker', inplace=True)
player_df['main_position'].replace(['Defender Centre Back', 'Defender Right Back', 'Defender Left Back'], 'defender', inplace=True)
player_df['main_position'].replace(['midfield Central Midfield', 'midfield Attacking Midfield', 'midfield Defensive Midfield', 'midfield', 'midfield Left Midfield', 'midfield Right Midfield'], 'midfielder', inplace=True)

player_df['goals_scored'].replace('-', np.nan, inplace=True)
player_df['goals_assisted'].replace('-', np.nan, inplace=True)
player_df['total_appearence'].replace('-', np.nan, inplace=True)
player_df['agent'] = player_df['agent'].replace('', np.nan)

player_df['goals_conceded'] = np.where(player_df['main_position'] != 'Goalkeeper', 'not defined',
                       np.where(player_df['goals_conceded'] == '-', np.nan, player_df['goals_conceded']))

player_df['clean_sheets'] = np.where(player_df['main_position'] != 'Goalkeeper', 'not defined',
                       np.where(player_df['clean_sheets'] == '-', np.nan, player_df['clean_sheets']))

In [126]:
print(age_df[age_df['Age'].str.contains('†', na=False)])
print(player_df['main_position'].unique())

Empty DataFrame
Columns: [season, Age, player_id, season_short]
Index: []
['Goalkeeper' 'defender' 'midfielder' 'striker']


### clean stc_df data

In [127]:
stc_df = stc_data.drop(stc_data.columns[0], axis=1)

stc_df['Appearances'].replace('-', np.nan, inplace=True)
stc_df['PPG'].replace('-', np.nan, inplace=True)
stc_df['PPG'].replace('0,00', 0, inplace=True)
stc_df['Goals'].replace('-', np.nan, inplace=True)
stc_df['Own goals'].replace('-', np.nan, inplace=True)
stc_df['Substitutions on'].replace('-', np.nan, inplace=True)
stc_df['Substitutions off'].replace('-', np.nan, inplace=True)
stc_df['Yellow cards'].replace('-', np.nan, inplace=True)
stc_df['Second yellow cards'].replace('-', np.nan, inplace=True)
stc_df['Red cards'].replace('-', np.nan, inplace=True)
stc_df['Assists'].replace('-', np.nan, inplace=True)
stc_df['Penalty goals'].replace('-', np.nan, inplace=True)
stc_df['Minutes per goal'].replace('-', np.nan, inplace=True)

stc_df = pd.merge(stc_df, player_df[['player_id', 'main_position']], on='player_id', suffixes=('_stc_df', '_player_df'))
stc_df['Goals conceded'] = np.where(stc_df['main_position'] != 'Goalkeeper', 'not defined',
                       np.where(stc_df['Goals conceded'] == '-', np.nan, stc_df['Goals conceded']))

stc_df['Clean sheets'] = np.where(stc_df['main_position'] != 'Goalkeeper', 'not defined',
                       np.where(stc_df['Clean sheets'] == '-', np.nan, stc_df['Clean sheets']))

ppg_median = stc_df['PPG'].median()
# Replace NaN values with the median of the 'PPG' column
stc_df['PPG'] = stc_df['PPG'].fillna(ppg_median)

KeyError: 'player_id'

In [69]:
def year_transformer(raw_year):
    year = raw_year.split('/')[0]
    if len(year) == 2:
        if int(year) <=22:
            return '20' + year
        else:
            return '19' + year
    elif len(year) == 4:
        return year

In [70]:
stc_df['Season'] = stc_df['Season'].apply(lambda x: year_transformer(x))

In [79]:
stc_df.rename(columns={'Season':'season'}, inplace=True)

In [97]:
stc_df['season'] = stc_df['season'].astype(int)

In [98]:
stc_df.head()

Unnamed: 0,player_id,season,Competition,Club,Squad,Appearances,PPG,Goals,Own goals,Substitutions on,...,Yellow cards,Second yellow cards,Red cards,Goals conceded,Clean sheets,Minutes played,Assists,Penalty goals,Minutes per goal,main_position
0,108390,2021,Champions League,Real Madrid,13,13,2.08,,,,...,,,,14,5.0,1.230',,,,Goalkeeper
1,108390,2021,LaLiga,Real Madrid,37,36,2.36,,,,...,1.0,,,29,16.0,3.240',,,,Goalkeeper
2,108390,2021,Copa del Rey,Real Madrid,1,1,0.0,,,,...,,,,1,,90',,,,Goalkeeper
3,108390,2021,Supercopa,Real Madrid,2,2,3.0,,,,...,,,,2,1.0,210',,,,Goalkeeper
4,108390,2020,LaLiga,Real Madrid,38,38,2.21,,,,...,,,,28,17.0,3.420',,,,Goalkeeper


In [99]:
print(stc_df.dtypes)

player_id               int64
season                  int64
Competition            object
Club                   object
Squad                   int64
Appearances            object
PPG                    object
Goals                  object
Own goals              object
Substitutions on       object
Substitutions off      object
Yellow cards           object
Second yellow cards    object
Red cards              object
Goals conceded         object
Clean sheets           object
Minutes played         object
Assists                object
Penalty goals          object
Minutes per goal       object
main_position          object
dtype: object


In [100]:
stc_df.shape

(50187, 21)

In [101]:
np.shape(mv_df)

(12594, 3)

In [102]:
np.shape(stc_df)

(50187, 21)

In [103]:
Goalkeeper_df = stc_df[stc_df['main_position'] == 'Goalkeeper'] 
striker_df = stc_df[stc_df['main_position'] == 'striker'] 
defender_df = stc_df[stc_df['main_position'] == 'defender'] 
midfielder_df = stc_df[stc_df['main_position'] == 'midfielder']

In [104]:
np.unique(Goalkeeper_df['main_position'])

array(['Goalkeeper'], dtype=object)

In [105]:
Goalkeeper_df.shape

(5831, 21)

In [106]:
Goalkeeper_df.drop(['Goals', 'Own goals', 'Substitutions on', 'Substitutions off',
                   'Yellow cards', 'Second yellow cards', 'Red cards', 'Minutes played',
                   ], inplace=True, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Goalkeeper_df.drop(['Goals', 'Own goals', 'Substitutions on', 'Substitutions off',


In [107]:

Goalkeeper_df_m = Goalkeeper_df.merge(mv_df, on=['player_id', 'season'], how='left') 

In [108]:
Goalkeeper_df_m.shape

(5864, 14)

In [110]:
Goalkeeper_df_m = Goalkeeper_df_m.dropna(subset=['market_value'])
Goalkeeper_df_m 

Unnamed: 0,player_id,season,Competition,Club,Squad,Appearances,PPG,Goals conceded,Clean sheets,Assists,Penalty goals,Minutes per goal,main_position,market_value
28,404839,2021,Champions League,Real Madrid,13,,1.5,,,,,,Goalkeeper,2500000.0
29,404839,2021,LaLiga,Real Madrid,34,2,0.50,2,,,,,Goalkeeper,2500000.0
30,404839,2021,Copa del Rey,Real Madrid,3,2,3.00,2,,,,,Goalkeeper,2500000.0
31,404839,2021,Supercopa,Real Madrid,2,,1.5,,,,,,Goalkeeper,2500000.0
32,404839,2020,LaLiga,Real Madrid,38,,1.5,,,,,,Goalkeeper,3000000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5814,45494,2016,Premier League,Crystal Palace,37,29,1.03,46,7,,,,Goalkeeper,4000000.0
5815,45494,2016,FA Cup,Crystal Palace,2,1,0,3,,,,,Goalkeeper,4000000.0
5816,45494,2016,EFL Cup,Crystal Palace,1,1,0,2,,,,,Goalkeeper,4000000.0
5820,285033,2021,Premier League,Burnley FC,9,,1.5,,,,,,Goalkeeper,300000.0
