In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [None]:
file_path = '/content/fbref_PL_2024-25.csv'
df = pd.read_csv(file_path)

In [None]:
df.head()

Unnamed: 0,Rk,Player,Nation,Pos,Squad,Age,Born,MP,Starts,Min,...,Gls.1,Ast.1,G+A.1,G-PK.1,G+A-PK,xG.1,xAG.1,xG+xAG,npxG.1,npxG+xAG.1
0,1,Max Aarons,eng ENG,DF,Bournemouth,24.0,2000.0,3,1,86,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Joshua Acheampong,eng ENG,DF,Chelsea,18.0,2006.0,4,2,170,...,0.0,0.0,0.0,0.0,0.0,0.12,0.0,0.12,0.12,0.12
2,3,Tyler Adams,us USA,MF,Bournemouth,25.0,1999.0,28,21,1965,...,0.0,0.14,0.14,0.0,0.14,0.07,0.05,0.12,0.07,0.12
3,4,Tosin Adarabioyo,eng ENG,DF,Chelsea,26.0,1997.0,22,15,1409,...,0.06,0.06,0.13,0.06,0.13,0.06,0.01,0.07,0.06,0.07
4,5,Simon Adingra,ci CIV,"FW,MF",Brighton,22.0,2002.0,29,12,1097,...,0.16,0.16,0.33,0.16,0.33,0.2,0.2,0.4,0.2,0.4




# Exporlitory Data Analysis (EDA) And Data Validation

In [None]:
print(df['Player'].duplicated(keep = False).sum())

24


In [None]:
duplicated_players = df[df['Player'].duplicated(keep = False)].sort_values(by = 'Player')
print(duplicated_players)

      Rk                  Player   Nation    Pos            Squad   Age  \
138  139             Axel Disasi   fr FRA     DF          Chelsea  26.0   
137  138             Axel Disasi   fr FRA     DF      Aston Villa  26.0   
12    13          Carlos Alcaraz   ar ARG     MF      Southampton  21.0   
13    14          Carlos Alcaraz   ar ARG  FW,MF          Everton  21.0   
179  180           Evan Ferguson   ie IRL     FW         Brighton  19.0   
178  179           Evan Ferguson   ie IRL     FW         West Ham  19.0   
416  417  Jaden Philogene Bidace  eng ENG  FW,DF     Ipswich Town  22.0   
415  416  Jaden Philogene Bidace  eng ENG  FW,MF      Aston Villa  22.0   
545  546       James Ward-Prowse  eng ENG     MF         West Ham  29.0   
546  547       James Ward-Prowse  eng ENG     MF  Nott'ham Forest  29.0   
24    25        Joachim Andersen   dk DEN     DF           Fulham  28.0   
23    24        Joachim Andersen   dk DEN     DF   Crystal Palace  28.0   
41    42             Jord

It seems that the duplicate players are ones that transfered to another club during the season. Since the players performance at those different clubs are relevant, I will still include them in the data

In [None]:
print(df.isnull().sum())

Rk            0
Player        0
Nation        4
Pos           0
Squad         0
Age           4
Born          4
MP            0
Starts        0
Min           0
90s           0
Gls           0
Ast           0
G+A           0
G-PK          0
PK            0
PKatt         0
CrdY          0
CrdR          0
xG            0
npxG          0
xAG           0
npxG+xAG      0
PrgC          0
PrgP          0
PrgR          0
Gls.1         0
Ast.1         0
G+A.1         0
G-PK.1        0
G+A-PK        0
xG.1          0
xAG.1         0
xG+xAG        0
npxG.1        0
npxG+xAG.1    0
dtype: int64


In [None]:
null_values = df[df.isnull().any(axis = 1)]
print(null_values)

      Rk         Player Nation    Pos           Squad  Age  Born  MP  Starts  \
17    18  Olabade Aluko    NaN     DF  Leicester City  NaN   NaN   1       0   
171  172     Jake Evans    NaN     FW  Leicester City  NaN   NaN   4       0   
327  328    Mateus Mane    NaN     MF          Wolves  NaN   NaN   1       0   
359  360   Jeremy Monga    NaN  FW,MF  Leicester City  NaN   NaN   7       0   

     Min  ...  Gls.1  Ast.1  G+A.1  G-PK.1  G+A-PK  xG.1  xAG.1  xG+xAG  \
17     2  ...    0.0    0.0    0.0     0.0     0.0  0.00   0.00    0.00   
171   24  ...    0.0    0.0    0.0     0.0     0.0  0.00   0.00    0.00   
327    2  ...    0.0    0.0    0.0     0.0     0.0  0.00   0.00    0.00   
359  112  ...    0.0    0.0    0.0     0.0     0.0  0.13   0.01    0.14   

     npxG.1  npxG+xAG.1  
17     0.00        0.00  
171    0.00        0.00  
327    0.00        0.00  
359    0.13        0.14  

[4 rows x 36 columns]


The values that have the "NaN" value are not relevant, so I will just drop them

In [None]:
null_values = df[df.isnull().any(axis = 1)]
df = df.dropna()

Now to verify the dataset

In [None]:
y = df['Gls']
X = df.select_dtypes(include=np.number).drop(columns=['Rk', 'Born', 'Gls'], errors='ignore')
X = X.fillna(X.mean())
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
models = {
    'Linear Regression' : LinearRegression(),
    'Lasso Regression' : Lasso(),
    'Ridge Regression' : Ridge(),
    'Decision Tree' : DecisionTreeRegressor(random_state = 42),
    'Random Forest' : RandomForestRegressor(random_state = 42),
    'Gradient Boosting' : GradientBoostingRegressor(random_state=42),
    'Support Vector Regressor': SVR()
}

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"{name}")
    print(f"Mean Absolute Error: {mae:.2f}")
    print(f"Mean Squared Error: {mse:.2f}")
    print(f"R² Score: {r2:.2f}")
    print("---------------------------------")

Linear Regression
Mean Absolute Error: 0.00
Mean Squared Error: 0.00
R² Score: 1.00
---------------------------------
Lasso Regression
Mean Absolute Error: 0.31
Mean Squared Error: 0.41
R² Score: 0.97
---------------------------------
Ridge Regression
Mean Absolute Error: 0.00
Mean Squared Error: 0.00
R² Score: 1.00
---------------------------------
Decision Tree
Mean Absolute Error: 0.26
Mean Squared Error: 1.05
R² Score: 0.93
---------------------------------
Random Forest
Mean Absolute Error: 0.13
Mean Squared Error: 0.18
R² Score: 0.99
---------------------------------
Gradient Boosting
Mean Absolute Error: 0.07
Mean Squared Error: 0.07
R² Score: 1.00
---------------------------------
Support Vector Regressor
Mean Absolute Error: 1.75
Mean Squared Error: 11.79
R² Score: 0.19
---------------------------------


With the exception of the support vector regressor, all the models showcase a near perfect R^2 score, with both linear and ridge regression being perfect. This indicates that the dataset has a high explanatory power and is highly linear



# Model Training

In [None]:
outputs_df = df.groupby(['Player']).agg({'G+A.1':'mean','xG+xAG':'mean','90s':'sum'})
outputs_df = outputs_df.rename(columns={'G+A.1':'G/A Per 90','xG+xAG':'Expected G/A per 90', '90s':'90s Played'})
outputs_df = outputs_df.sort_values(by = 'G/A Per 90', ascending = False)
outputs_df = outputs_df.reset_index()
outputs_df

Unnamed: 0,Player,G/A Per 90,Expected G/A per 90,90s Played
0,Dane Scarlett,2.65,0.22,0.4
1,Matheus França,1.64,0.42,0.6
2,Romain Esse,1.30,0.48,1.5
3,Mohamed Salah,1.25,1.05,37.5
4,Jáder Durán,0.99,0.71,7.1
...,...,...,...,...
553,Jonny Evans,0.00,0.04,3.5
554,Aaron Ramsdale,0.00,0.00,30.0
555,Jorge Cuenca,0.00,0.01,3.9
556,Alfie Dorrington,0.00,0.00,0.2


As evident by the first 2 players showcased, there are a lot of players who, despite having a high G/A per 90, have not played a lot. Therefore, I will drop players from the dataset who do not have 10 or more 90s played. This should filter out the players who did not play a substantial amount of games and reduce bias in the model.     

In [None]:
outputs_df = outputs_df[outputs_df['90s Played'] >= 10]
outputs_df

Unnamed: 0,Player,G/A Per 90,Expected G/A per 90,90s Played
3,Mohamed Salah,1.25,1.050,37.5
6,Alexander Isak,0.95,0.810,30.6
11,Rodrigo Muniz,0.84,0.610,10.7
12,Ollie Watkins,0.83,0.650,28.9
13,Bukayo Saka,0.83,0.750,19.2
...,...,...,...,...
537,Cheick Doucouré,0.00,0.080,10.0
544,José Sá,0.00,0.010,29.0
545,Joël Veltman,0.00,0.050,18.9
548,Joachim Andersen,0.00,0.045,29.7


Now I want to see how players performed vs their expected performance by comparing their G/A Per 90 vs expected G/A Per 90

In [None]:
df['Real vs Expected G/A Per 90'] = df['G+A.1'] - df['xG+xAG']
df[['Player','Real vs Expected G/A Per 90','90s']].sort_values(by = 'Real vs Expected G/A Per 90', ascending = False)

Unnamed: 0,Player,Real vs Expected G/A Per 90,90s
472,Dane Scarlett,2.43,0.4
195,Matheus França,1.22,0.6
173,Romain Esse,0.82,1.5
204,Tyrique George,0.52,2.0
481,Ryan Sessegnon,0.51,6.4
...,...,...,...
369,Jakub Moder,-1.45,0.1
12,Carlos Alcaraz,-1.48,0.1
39,Jordan Ayew,-2.17,0.2
529,Takehiro Tomiyasu,-2.37,0.1


Now let's do the same thing and filter out the players who played less than 10 90s

In [None]:
Best_RVEGA90 = df[['Player','G+A.1', 'xG+xAG', 'Real vs Expected G/A Per 90','90s']].sort_values(by = 'Real vs Expected G/A Per 90', ascending = False)
Best_RVEGA90 = Best_RVEGA.rename(columns={'G+A.1':'G/A Per 90','xG+xAG':'Expected G/A Per 90'})
Best_RVEGA90 = Best_RVEGA[Best_RVEGA['90s'] >= 10]
Best_RVEGA90

Unnamed: 0,Player,G/A Per 90,Expected G/A Per 90,Real vs Expected G/A Per 90,90s
330,James Maddison,0.80,0.50,0.30,20.1
356,Dwight McNeil,0.66,0.41,0.25,15.2
166,Anthony Elanga,0.61,0.36,0.25,27.8
379,Rodrigo Muniz,0.84,0.61,0.23,10.7
135,Amad Diallo,0.66,0.45,0.21,21.1
...,...,...,...,...,...
322,Jesper Lindstrøm,0.07,0.30,-0.23,13.8
520,Mathys Tel,0.30,0.55,-0.25,10.1
29,Cameron Archer,0.12,0.39,-0.27,16.0
237,Jack Harrison,0.04,0.31,-0.27,23.1


I also want to see the season total

In [None]:
df['Real vs Expected G/A'] = df['G+A'] - df['npxG+xAG']

In [None]:
Best_RVEGA = df[['Player','G+A', 'npxG+xAG', 'Real vs Expected G/A','90s']].sort_values(by = 'Real vs Expected G/A', ascending = False)
Best_RVEGA = Best_RVEGA.rename(columns={'G+A':'G/A','npxG+xAG':'Expected G/A'})
Best_RVEGA = Best_RVEGA[Best_RVEGA['90s'] >= 10]
Best_RVEGA

Unnamed: 0,Player,G/A,Expected G/A,Real vs Expected G/A,90s
462,Mohamed Salah,47,32.4,14.6,37.5
351,Bryan Mbeumo,27,15.9,11.1,37.9
580,Chris Wood,23,13.6,9.4,32.9
299,Justin Kluivert,18,10.5,7.5,26.0
261,Alexander Isak,29,21.6,7.4,30.6
...,...,...,...,...,...
331,Noni Madueke,10,13.9,-3.9,22.6
451,Andrew Robertson,1,5.8,-4.8,27.6
450,Andrew Robertson,1,5.8,-4.8,27.6
469,Ismaila Sarr,14,19.2,-5.2,30.1
