In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [31]:
# Constant variables
TABLE_SIZE = 1546
TRAIN_RATIO = 0.8

In [155]:
# Joins train and test datasets together to simplify the process
def concat_df(df1, df2):
    return pd.concat([df1, df2], ignore_index=True)

# Divides the overall dataset into train and test 
def divide_df(df):
    train_size = int(TABLE_SIZE*TRAIN_RATIO)
    
    return df.loc[:train_size - 1], df.loc[train_size:] 

In [158]:
train_data = pd.read_csv('data/train_stats.csv')
test_data = pd.read_csv('data/test_stats.csv')
all_data = concat_df(train_data, test_data)

In [159]:
all_data.drop(['Player_URL', 'Team_URL'], axis=1, inplace=True)

In [160]:
# Cleans the columns containing commas 
def clean_comma(column):
    column = column.strip()
    column = column[:column.find(',')] + '' + column[column.find(',') + 1:]
    return column

# Cleans the columns containing '\t' symbol
def clean_tab(column):
    return column.strip().replace('\t', '')

In [161]:
comma_columns = ['Team', 'Position']
tab_columns = ['Goals', 'Assists', 'Yel', 'Red', 'SpG', 'AerialsWon', 'MotM', 'Tackles',
               'Inter', 'Fouls', 'Offsides', 'Clear', 'Drb_x', 'Blocks', 'OwnG', 'KeyP',
               'Drb_y', 'Fouled', 'Off', 'Disp', 'UnsTch', 'Crosses', 'LongB', 'ThrB']

for column in comma_columns:
    all_data[column] = all_data[column].apply(clean_comma)
    
for column in tab_columns:
    all_data[column] = all_data[column].apply(clean_tab)

In [162]:
# Some columns with integer values contain symbol '-' instead of 0. The function fixes it
def remove_dashes(column):
    if column == '-':
        return 0
    return column

In [163]:
dash_columns = ['Goals', 'Assists', 'Yel', 'Red', 'SpG', 'AerialsWon', 'MotM', 'Tackles',
                'Inter', 'Fouls', 'Offsides', 'Clear', 'Drb_x', 'Drb_y', 'Blocks', 'OwnG', 'KeyP',
                'Fouled', 'Off', 'Disp', 'UnsTch', 'Crosses', 'LongB', 'ThrB']

for column in dash_columns:
    all_data[column] = all_data[column].apply(remove_dashes)

In [164]:
# Now we have 'Value' variable containing market values in thousands or millions, but we want to get just an integer
def value_scaling(value):
    value = value.strip()
    
    if value.endswith('k'):
        return int(float(value[value.find('€')+1:value.find('k')]) * 10**3)
    elif value.endswith('m'):
        return int(float(value[value.find('€')+1:value.find('m')]) * 10**6)

In [165]:
all_data['Value'] = all_data['Value'].apply(value_scaling)

In [169]:
# We can divide 'Apps' variable because the number of appearances in starting squad is demonstrated in parentheses 
def appearances_division(apps):
    return apps[:apps.find('(')], apps[apps.find('(')+1:apps.find(')')]

In [170]:
all_data['Overall_Apps'], all_data['Start_Apps'] = all_data['Apps'].apply(appearances_division)

ValueError: too many values to unpack (expected 2)

In [166]:
df = all_data.sample(3)

for column in df.columns:
    print(f'Column: {column}')
    display(df[column])

Column: Name


1278     Raúl Jiménez
500       Jon Pacheco
112     Daichi Kamada
Name: Name, dtype: object

Column: Team


1278           Fulham
500     Real Sociedad
112             Lazio
Name: Team, dtype: object

Column: Age


1278    33
500     23
112     27
Name: Age, dtype: int64

Column: Position


1278              FW
500             D(C)
112       AM(CLR),FW
Name: Position, dtype: object

Column: Apps


1278     18(6)
500      16(7)
112     17(12)
Name: Apps, dtype: object

Column: Mins


1278    1404
500     1521
112     1546
Name: Mins, dtype: int64

Column: Goals


1278    7
500     1
112     2
Name: Goals, dtype: object

Column: Assists


1278    0
500     1
112     2
Name: Assists, dtype: object

Column: Yel


1278    3
500     8
112     3
Name: Yel, dtype: object

Column: Red


1278    1
500     0
112     0
Name: Red, dtype: object

Column: SpG


1278    1.7
500     0.3
112       1
Name: SpG, dtype: object

Column: PS


1278    69.7
500     87.3
112     89.0
Name: PS, dtype: float64

Column: AerialsWon


1278    1.8
500     2.5
112     0.1
Name: AerialsWon, dtype: object

Column: MotM


1278    1
500     2
112     1
Name: MotM, dtype: object

Column: Rating


1278    6.67
500     6.61
112     6.62
Name: Rating, dtype: float64

Column: Tackles


1278    0.8
500     1.1
112     1.8
Name: Tackles, dtype: object

Column: Inter


1278    0.2
500     0.4
112     0.6
Name: Inter, dtype: object

Column: Fouls


1278    0.7
500     1.3
112     1.3
Name: Fouls, dtype: object

Column: Offsides


1278      0
500     0.9
112       0
Name: Offsides, dtype: object

Column: Clear


1278    0.5
500     2.3
112     0.2
Name: Clear, dtype: object

Column: Drb_x


1278    0.5
500     0.3
112     0.8
Name: Drb_x, dtype: object

Column: Blocks


1278    0.1
500     0.3
112     0.2
Name: Blocks, dtype: object

Column: OwnG


1278    0
500     0
112     0
Name: OwnG, dtype: object

Column: KeyP


1278    0.5
500     0.1
112     0.8
Name: KeyP, dtype: object

Column: Drb_y


1278    0.4
500     0.2
112     0.5
Name: Drb_y, dtype: object

Column: Fouled


1278    0.9
500       0
112     0.4
Name: Fouled, dtype: object

Column: Off


1278    0.5
500       0
112     0.1
Name: Off, dtype: object

Column: Disp


1278    1.3
500     0.1
112     0.5
Name: Disp, dtype: object

Column: UnsTch


1278    1.8
500     0.3
112     0.9
Name: UnsTch, dtype: object

Column: AvgP


1278    15.5
500     50.6
112     31.2
Name: AvgP, dtype: float64

Column: Crosses


1278    0.1
500       0
112       0
Name: Crosses, dtype: object

Column: LongB


1278    0.5
500     2.4
112     0.9
Name: LongB, dtype: object

Column: ThrB


1278    0.1
500       0
112     0.1
Name: ThrB, dtype: object

Column: Value


1278     5000000
500     15000000
112     18000000
Name: Value, dtype: int64

In [167]:
display(all_data.sample(30))

Unnamed: 0,Name,Team,Age,Position,Apps,Mins,Goals,Assists,Yel,Red,...,Drb_y,Fouled,Off,Disp,UnsTch,AvgP,Crosses,LongB,ThrB,Value
195,Samuele Birindelli,Monza,24,"D(R),M(R)",22(13),2016,0,2,7,0,...,0.3,0.9,0.0,0.3,1.0,25.6,0.5,1.4,0.0,3500000
888,Youri Tielemans,Aston Villa,27,M(C),17(15),1622,2,6,3,0,...,0.5,0.9,0.0,0.7,0.5,28.3,0.1,1.1,0.2,25000000
226,Bilal Boutobba,Clermont Foot,25,AM(R),10(17),921,1,1,1,0,...,0.6,1.0,0.1,0.6,1.0,12.3,0.6,0.2,0.1,1800000
746,Mama Baldé,Lyon,28,"D(R),M(CLR),FW",4(16),543,2,3,1,0,...,0.6,0.5,0.2,0.6,1.4,6.5,0.0,0.2,0.1,4000000
89,Thomas Kaminski,Luton,31,GK,38,3406,0,0,3,0,...,0.1,0.1,0.0,0.0,0.0,34.5,0.0,8.2,0.0,3000000
443,Danny Ings,West Ham,31,"AM(C),FW",3(17),392,1,0,1,0,...,0.3,0.3,0.1,0.2,0.5,5.3,0.0,0.1,0.0,8000000
1124,Jay Rodríguez,Burnley,34,"AM(CLR),FW",7(14),815,2,1,0,0,...,0.3,0.3,0.3,0.2,1.0,10.2,0.0,0.3,0.0,1000000
910,Brajan Gruda,Mainz,20,AM(C),19(9),1652,4,3,5,0,...,2.5,1.7,0.3,1.8,2.5,19.0,0.4,0.4,0.1,20000000
294,Alessio Romagnoli,Lazio,29,D(CL),26(3),2410,0,0,9,0,...,0.0,0.7,0.0,0.0,0.3,60.3,0.0,1.7,0.0,15000000
189,Lorenzo Pellegrini,Roma,28,M(CL),20(9),1867,8,3,7,0,...,0.7,1.2,0.1,0.7,1.4,26.6,0.9,0.5,0.1,25000000
