# The Football Lab: Predicting Player Impact by Position

In [72]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as stats
import sklearn 

In [73]:

# Load the two season exports with identical reader options.
# Both files are read with pandas' default separator; the cells below split
# each record on ";" to recover the real columns.
read_options = {"encoding": "latin1"}
raw_dataset_23 = pd.read_csv("datasets/2022-2023_Football_Player_Stats.csv", **read_options)
raw_dataset_22 = pd.read_csv("datasets/2021-2022_Football_Player_Stats.csv", **read_options)


## Cleaning & Structuring the Dataset

### Structuring the DataSet
In this section we reshape the raw datasets so they are easier to work with.  
Each file was loaded as a single column of semicolon-separated text, so we start by splitting it into proper columns and restoring the column names as the header.

In [74]:
## Data Preprocessing
# The file was loaded with the default "," separator, so every record sits in
# one string column and the real header is that column's name. Split both the
# header and the records on ";" to recover the actual table.
#
# The single-column guard makes this cell safe to re-run: once the frame has
# been split it has many columns and is left untouched (re-running the split
# on an already-split frame would otherwise discard everything but "Rk").
if raw_dataset_22.shape[1] == 1:
    header_22 = raw_dataset_22.columns[0].split(";")
    raw_dataset_22 = raw_dataset_22.iloc[:, 0].str.split(";", expand=True)
    # Raises if the header and the records disagree on the number of fields,
    # instead of silently producing unnamed columns.
    raw_dataset_22.columns = header_22

# Every column is still text (object dtype) at this point; numeric conversion
# happens later.
raw_dataset_22.info()


<class 'pandas.core.frame.DataFrame'>
Index: 2921 entries, 0 to 2920
Columns: 143 entries, Rk to AerWon%
dtypes: object(143)
memory usage: 3.2+ MB


In [75]:
## Data Preprocessing
# Same treatment as the 2021-2022 file: the records were loaded as one
# semicolon-separated string column whose name is the real header, so split
# both on ";" to rebuild the table.
#
# The single-column guard makes this cell safe to re-run: once the frame has
# been split it has many columns and is left untouched.
if raw_dataset_23.shape[1] == 1:
    header_23 = raw_dataset_23.columns[0].split(";")
    raw_dataset_23 = raw_dataset_23.iloc[:, 0].str.split(";", expand=True)
    # Raises if the header and the records disagree on the number of fields,
    # instead of silently producing unnamed columns.
    raw_dataset_23.columns = header_23

raw_dataset_23.head()

2689,Rk,Player,Nation,Pos,Squad,Comp,Age,Born,MP,Starts,...,Off,Crs,TklW,PKwon,PKcon,OG,Recov,AerWon,AerLost,AerWon%
0,1,Brenden Aaronson,USA,MFFW,Leeds United,Premier League,22,2000,20,19,...,0.17,2.54,0.51,0,0,0.0,4.86,0.34,1.19,22.2
1,2,Yunis Abdelhamid,MAR,DF,Reims,Ligue 1,35,1987,22,22,...,0.05,0.18,1.59,0,0,0.0,6.64,2.18,1.23,64.0
2,3,Himad Abdelli,FRA,MFFW,Angers,Ligue 1,23,1999,14,8,...,0.0,1.05,1.4,0,0,0.0,8.14,0.93,1.05,47.1
3,4,Salis Abdul Samed,GHA,MF,Lens,Ligue 1,22,2000,20,20,...,0.0,0.35,0.8,0,0,0.05,6.6,0.5,0.5,50.0
4,5,Laurent Abergel,FRA,MF,Lorient,Ligue 1,30,1993,15,15,...,0.0,0.23,2.02,0,0,0.0,6.51,0.31,0.39,44.4


Here we keep only the columns that the two datasets have in common, and we drop the rows of players who appear more than once (currently applied to the 2021-2022 dataset only).

In [76]:
# Keep only the columns that exist in both seasons.
common_columns = raw_dataset_23.columns.intersection(raw_dataset_22.columns)
raw_dataset_22, raw_dataset_23 = raw_dataset_22[common_columns], raw_dataset_23[common_columns]

# Drop every 2021-2022 row whose player name occurs more than once.
# keep=False flags *all* occurrences, so no row of such a player survives.
repeated_player_22 = raw_dataset_22['Player'].duplicated(keep=False)
raw_dataset_22 = raw_dataset_22[~repeated_player_22]
# NOTE(review): the equivalent filter for the 2022-2023 frame is disabled below,
# so the two seasons are treated differently — confirm this is intended.
#raw_dataset_23 = raw_dataset_23[~raw_dataset_23['Player'].duplicated(keep=False)]

In [None]:
# Identifier columns that stay as text.
id_columns = ["Player", "Nation", "Pos", "Comp"]

# Statistic columns that arrive as text and must be converted to numbers.
columns_to_convert = [
    "Min", "MP", "SoT%", "PasTotCmp%",
    "Goals", "Assists", "TklWon", "Shots", "SoT", "ShoDist", "ShoPK",
    "PasTotCmp", "ScaSh", "GCA", "PKatt", "Touches", "PasTotDist", "PasAss",
    "CrsPA", "ScaPassLive", "Carries", "CarPrgDist", "PPA", "Tkl", "Int",
    "Clr", "BlkSh", "TouDef3rd", "AerWon", "Err", "Fls", "PasLonCmp",
    "PasLonAtt", "PasDead", "CrdR", "Recov",
]

# Columns carried into the working datasets: identifiers first, then the
# statistics (same content and order as before, without repeating the list).
columns_to_keep = id_columns + columns_to_convert

# Not used in this cell; by its name, the columns to exclude from the
# minutes-based scaling step.
column_not_to_norm = ["Player", "Nation", "Pos", "Comp", "Min", "MP", "SoT%", "PasTotCmp%"]

# Minutes played must be numeric in the raw frames as well. `assign` returns a
# new frame, so nothing is written into a slice of an earlier frame, and
# pd.to_numeric is applied to the whole column at once; unparsable values
# become NaN.
raw_dataset_22 = raw_dataset_22.assign(Min=pd.to_numeric(raw_dataset_22["Min"], errors="coerce"))
raw_dataset_23 = raw_dataset_23.assign(Min=pd.to_numeric(raw_dataset_23["Min"], errors="coerce"))

# Explicit copies: the working datasets are modified below and in later cells,
# and must not be flagged as views of the raw frames.
dataset_22 = raw_dataset_22[columns_to_keep].copy()
dataset_23 = raw_dataset_23[columns_to_keep].copy()

# Replace the text columns with numeric ones. Assigning whole columns (rather
# than writing through `.loc[:, col]` into the existing object columns) lets
# each column take a real int/float dtype.
dataset_22[columns_to_convert] = dataset_22[columns_to_convert].apply(pd.to_numeric, errors="coerce")
dataset_23[columns_to_convert] = dataset_23[columns_to_convert].apply(pd.to_numeric, errors="coerce")

In this section we scale certain columns by the minutes played (value × Min / 90) so that they represent totals for the season.

In [78]:
# Scale the listed statistics by minutes played and round to a whole number:
#     value = round(value * Minutes Played / 90)
#
# NOTE(review): this treats every listed column as a per-90 figure in both
# files. The 2022-2023 "Goals" values look implausibly large after scaling
# (e.g. 18 and 20 in the preview rows), so confirm that column is not already
# a season total in that file.
# NOTE: re-running this cell scales the values again; re-run the previous cell
# first if you need to repeat it.

columns_to_normalize = ["Goals", "Assists", "TklWon" ,"Shots", "SoT", "ShoDist", "ShoPK", "PasTotCmp", "ScaSh", "GCA", "PKatt", "Touches", "PasTotDist", "PasAss", "CrsPA", "ScaPassLive", "Carries", "CarPrgDist", "PPA", "Tkl", "Int", "Clr","BlkSh", "TouDef3rd", "AerWon", "Err", "Fls", "PasLonCmp", "PasLonAtt", "PasDead", "CrdR", "Recov"]

for season_frame in (dataset_22, dataset_23):
    # Coerce first so the arithmetic and the rounding are numeric even when the
    # columns still hold Python objects (DataFrame.round skips object columns).
    stat_values = season_frame[columns_to_normalize].apply(pd.to_numeric, errors="coerce")
    minutes_factor = pd.to_numeric(season_frame["Min"], errors="coerce") / 90
    # One vectorised row-wise multiply instead of a Python loop over every cell.
    season_frame[columns_to_normalize] = stat_values.mul(minutes_factor, axis=0).round(0)

dataset_22.head()

2921,Player,Nation,Pos,Comp,Min,MP,SoT%,PasTotCmp%,Goals,Assists,...,BlkSh,TouDef3rd,AerWon,Err,Fls,PasLonCmp,PasLonAtt,PasDead,CrdR,Recov
0,Max Aarons,ENG,DF,Premier League,2881,34,15.4,75.5,0.0,2.0,...,22.0,746.0,15.0,0.0,31.0,98.0,217.0,339.0,0.0,177.0
1,Yunis Abdelhamid,MAR,DF,Ligue 1,2983,34,33.3,82.4,2.0,0.0,...,26.0,1087.0,67.0,0.0,43.0,187.0,303.0,100.0,1.0,224.0
2,Salis Abdul Samed,GHA,MF,Ligue 1,2462,31,27.8,91.7,1.0,0.0,...,1.0,621.0,24.0,0.0,45.0,184.0,214.0,20.0,3.0,240.0
3,Laurent Abergel,FRA,MF,Ligue 1,2956,34,23.3,81.6,0.0,2.0,...,3.0,450.0,14.0,0.0,46.0,205.0,308.0,28.0,0.0,291.0
4,Charles Abi,FRA,FW,Ligue 1,45,1,0.0,33.3,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,2.0


In [79]:
# Preview the first five rows of the 2022-2023 working dataset.
dataset_23.head(n=5)

2689,Player,Nation,Pos,Comp,Min,MP,SoT%,PasTotCmp%,Goals,Assists,...,BlkSh,TouDef3rd,AerWon,Err,Fls,PasLonCmp,PasLonAtt,PasDead,CrdR,Recov
0,Brenden Aaronson,USA,MFFW,Premier League,1596,20,18.5,74.9,18.0,2.0,...,2.0,77.0,6.0,1.0,11.0,16.0,42.0,51.0,0.0,86.0
1,Yunis Abdelhamid,MAR,DF,Ligue 1,1980,22,5.3,81.5,0.0,1.0,...,17.0,605.0,48.0,1.0,29.0,83.0,154.0,82.0,0.0,146.0
2,Himad Abdelli,FRA,MFFW,Ligue 1,770,14,33.3,80.8,0.0,0.0,...,1.0,98.0,8.0,0.0,15.0,38.0,51.0,10.0,0.0,70.0
3,Salis Abdul Samed,GHA,MF,Ligue 1,1799,20,25.0,91.6,20.0,0.0,...,7.0,240.0,10.0,0.0,49.0,67.0,86.0,28.0,0.0,132.0
4,Laurent Abergel,FRA,MF,Ligue 1,1165,15,0.0,87.3,0.0,1.0,...,3.0,249.0,4.0,0.0,12.0,34.0,51.0,10.0,0.0,84.0
