# Data collection of soccer players

## Libraries

In [3]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
from scipy.stats import norm, skew, kurtosis, anderson, kstest
import fitter
from fitter import Fitter, get_common_distributions, get_distributions
import seaborn as sns

## Load the CSV files

In [45]:
# Read the raw input tables. The paths are relative, so they are resolved
# against the notebook's current working directory.
age_df = pd.read_csv('AGE.csv')
# Only the two listed columns are loaded from players.csv.
date_of_birth = pd.read_csv('players.csv', usecols=['player_id', 'birth_date'])
mv_df = pd.read_csv('market_value_df.csv')
player_data = pd.read_csv('player table189.csv')
stc_data = pd.read_csv('statistics table189.csv')


### Clean the `age_df` data

In [34]:
# Remove the '†' marker from the 'Age' strings.
# The replacement is applied to the whole column instead of to rows selected
# with a str.contains mask: str.contains yields NaN for missing ages, and a
# boolean .loc mask that contains NaN raises ValueError. Missing ages simply
# stay NaN here. regex=False makes '†' a literal match on every pandas version.
age_df['Age'] = age_df['Age'].str.replace('†', '', regex=False)

### Clean the `player_df` data

In [42]:
# Start from the raw player table without its first column and without
# 'birth_date'.
player_df = (
    player_data
    .drop(columns=player_data.columns[0])
    .drop(columns='birth_date')
)

# Impute missing heights with the median of the 'height' column.
height_median = player_df['height'].median()
player_df['height'] = player_df['height'].fillna(value=height_median)

# Collapse the detailed position labels into three coarse groups. Labels that
# are not listed (e.g. 'Goalkeeper') are left untouched.
# The result is assigned back to the column: calling
# player_df[col].replace(..., inplace=True) is chained assignment through an
# inplace method, which warns on recent pandas and does not update player_df
# when Copy-on-Write is enabled.
position_groups = {
    'striker': ['Attack Left Winger', 'Attack Centre Forward', 'Attack Right Winger', 'Attack Second Striker'],
    'defender': ['Defender Centre Back', 'Defender Right Back', 'Defender Left Back'],
    'midfielder': ['midfield Central Midfield', 'midfield Attacking Midfield', 'midfield Defensive Midfield',
                   'midfield', 'midfield Left Midfield', 'midfield Right Midfield'],
}
position_lookup = {
    label: group
    for group, labels in position_groups.items()
    for label in labels
}
player_df['main_position'] = player_df['main_position'].replace(position_lookup)

# '-' is used as a "no value" placeholder in these columns; turn it into NaN.
for column in ('goals_scored', 'goals_assisted', 'total_appearence'):
    player_df[column] = player_df[column].replace('-', np.nan)
# An empty agent string is treated as missing as well.
player_df['agent'] = player_df['agent'].replace('', np.nan)

# 'goals_conceded' and 'clean_sheets' are kept only for goalkeepers:
#   - rows whose main_position is not 'Goalkeeper' get the sentinel 'not defined'
#   - goalkeeper rows holding the '-' placeholder become NaN
#   - every other goalkeeper value is kept as is
# The values are handled as object arrays on purpose. Mixing the string
# sentinel with a numeric array in np.where would promote the result to a
# string dtype, turning NaN into the text 'nan' and numbers into text.
is_goalkeeper = (player_df['main_position'] == 'Goalkeeper').to_numpy()
for column in ('goals_conceded', 'clean_sheets'):
    raw_values = player_df[column].to_numpy(dtype=object)
    cleaned_values = np.where(raw_values == '-', np.nan, raw_values)
    player_df[column] = np.where(is_goalkeeper, cleaned_values, 'not defined')

In [None]:
# Sanity checks: list any rows whose 'Age' still contains '†' (missing ages
# count as "no match"), then show the distinct main_position labels.
print(age_df.loc[age_df['Age'].str.contains('†', na=False)])
print(player_df.loc[:, 'main_position'].unique())

### Clean the `stc_df` data

In [81]:
# Start from the raw statistics table without its first column.
stc_df = stc_data.drop(stc_data.columns[0], axis=1)

# '-' is used as a "no value" placeholder in these columns; turn it into NaN.
# Each result is assigned back to its column: calling
# stc_df[col].replace(..., inplace=True) is chained assignment through an
# inplace method, which warns on recent pandas and does not update stc_df
# when Copy-on-Write is enabled.
placeholder_columns = [
    'Appearances', 'PPG', 'Goals', 'Own goals',
    'Substitutions on', 'Substitutions off',
    'Yellow cards', 'Second yellow cards', 'Red cards',
    'Assists', 'Penalty goals', 'Minutes per goal',
]
for column in placeholder_columns:
    stc_df[column] = stc_df[column].replace('-', np.nan)

# PPG may contain text with a decimal comma (the original code special-cased
# the literal '0,00'). Normalise any decimal comma and convert the column to
# a numeric dtype so that later numeric operations (e.g. the median) work on
# numbers rather than on mixed text/number objects.
stc_df['PPG'] = pd.to_numeric(
    stc_df['PPG'].map(lambda value: value.replace(',', '.') if isinstance(value, str) else value)
)

# Attach each player's main_position to the statistics rows.
# NOTE: pd.merge defaults to an inner join, so statistics rows whose player_id
# is not present in player_df are dropped here.
stc_df = pd.merge(stc_df, player_df[['player_id', 'main_position']], on='player_id', suffixes=('_stc_df', '_player_df'))

# 'Goals conceded' and 'Clean sheets' are kept only for goalkeepers:
#   - rows whose main_position is not 'Goalkeeper' get the sentinel 'not defined'
#   - goalkeeper rows holding the '-' placeholder become NaN
#   - every other goalkeeper value is kept as is
# The values are handled as object arrays on purpose. Mixing the string
# sentinel with a numeric array in np.where would promote the result to a
# string dtype, turning NaN into the text 'nan' and numbers into text.
is_goalkeeper = (stc_df['main_position'] == 'Goalkeeper').to_numpy()
for column in ('Goals conceded', 'Clean sheets'):
    raw_values = stc_df[column].to_numpy(dtype=object)
    cleaned_values = np.where(raw_values == '-', np.nan, raw_values)
    stc_df[column] = np.where(is_goalkeeper, cleaned_values, 'not defined')

# Impute missing 'PPG' entries with the median of the 'PPG' column.
ppg_median = stc_df['PPG'].median()
stc_df['PPG'] = stc_df['PPG'].fillna(value=ppg_median)

In [None]:
# Show the dtype of every stc_df column.
print(stc_df.dtypes)

In [91]:
# Preview the first rows of stc_df (DataFrame.head() returns 5 rows by default).
stc_df.head()

Unnamed: 0,player_id,Season,Competition,Club,Squad,Appearances,PPG,Goals,Own goals,Substitutions on,...,Yellow cards,Second yellow cards,Red cards,Goals conceded,Clean sheets,Minutes played,Assists,Penalty goals,Minutes per goal,main_position
0,108390,21/22,Champions League,Real Madrid,13,13,2.08,,,,...,,,,14,5.0,1.230',,,,Goalkeeper
1,108390,21/22,LaLiga,Real Madrid,37,36,2.36,,,,...,1.0,,,29,16.0,3.240',,,,Goalkeeper
2,108390,21/22,Copa del Rey,Real Madrid,1,1,0.0,,,,...,,,,1,,90',,,,Goalkeeper
3,108390,21/22,Supercopa,Real Madrid,2,2,3.0,,,,...,,,,2,1.0,210',,,,Goalkeeper
4,108390,20/21,LaLiga,Real Madrid,38,38,2.21,,,,...,,,,28,17.0,3.420',,,,Goalkeeper
