In [1]:
import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
import numpy as np
import matplotlib as mpl 

In [2]:
# Load the player statistics table; the path is relative to the notebook's working directory.
playerData = pd.read_csv(filepath_or_buffer="MasterStats.csv")

In [3]:
# Preview the first five rows as a quick sanity check of the load.
playerData.head(n=5)

Unnamed: 0,full_name,element_type,now_cost,total_points,minutes,goals_scored,assists,goals_conceded,clean_sheets,influence,...,player_name,time,xG,xA,shots,key_passes,yellow_cards,npxG,xGChain,xGBuildup
0,Çaglar Söyüncü,2,45,66,2497,1,0,44,5,606.4,...,Caglar Söyüncü,2500,1.008084,0.087372,8,3,5,1.008084,5.909221,5.82185
1,Çaglar Söyüncü,2,53,59,1816,1,0,30,5,403.6,...,Caglar Söyüncü,1802,0.539208,0.326975,10,7,2,0.539208,7.390435,7.368668
2,Aaron Connolly,4,52,38,783,2,1,17,1,104.2,...,Aaron Connolly,799,4.464137,0.160784,23,6,0,4.464137,4.783492,0.191225
3,Aaron Cresswell,2,57,153,3170,0,11,41,11,669.4,...,Aaron Cresswell,3176,0.883464,7.39094,19,58,3,0.883464,10.620283,8.392441
4,Aaron Cresswell,2,50,115,2726,2,4,38,7,589.2,...,Aaron Cresswell,2734,0.912433,3.670589,18,31,3,0.912433,10.543697,9.624681


In [4]:
# Remove the 'time' column (presumably because it nearly duplicates 'minutes' in the
# preview above -- confirm). Using drop(..., errors='ignore') instead of `del` keeps this
# cell safe to re-run: a second execution is a no-op rather than a KeyError.
playerData = playerData.drop(columns=['time'], errors='ignore')

In [5]:
# Distinct position codes present in the data (the values of the element_type column).
positions = playerData["element_type"].unique()

In [6]:
# One empty placeholder frame per position code; the following cell replaces each
# placeholder with the matching rows of playerData.
positionDataFrameDict = dict((positionCode, pd.DataFrame()) for positionCode in positions)

In [7]:
# Store, for every position code, the rows of playerData with that element_type.
# A single .loc selection replaces the former `playerData[:][mask]` (a redundant full
# slice followed by chained indexing); .copy() makes each subset an independent frame.
for key in positionDataFrameDict:
    positionDataFrameDict[key] = playerData.loc[playerData.element_type == key].copy()

In [8]:
# To illustrate the split above: key 1 is the goalkeeper position code, so this
# should display a table containing only goalkeepers.
positionDataFrameDict[1]

Unnamed: 0,full_name,element_type,now_cost,total_points,minutes,goals_scored,assists,goals_conceded,clean_sheets,influence,...,id,player_name,xG,xA,shots,key_passes,yellow_cards,npxG,xGChain,xGBuildup
5,Aaron Ramsdale,1,50,135,3060,0,0,39,12,690.2,...,5603,Aaron Ramsdale,0.0,0.0,0,0,1,0.0,3.995809,3.995809
6,Aaron Ramsdale,1,46,123,3420,0,0,63,5,1023.0,...,5603,Aaron Ramsdale,0.0,0.077672,0,2,1,0.0,2.899687,2.822016
27,Alex McCarthy,1,44,94,2700,0,0,57,7,741.2,...,635,Alex McCarthy,0.0,0.0,0,0,2,0.0,2.763433,2.763433
28,Alex McCarthy,1,45,55,1530,0,0,27,5,334.4,...,635,Alex McCarthy,0.0,0.023409,0,1,1,0.0,2.008902,1.985492
36,Alisson Ramses Becker,1,55,176,3240,0,1,24,20,721.4,...,1257,Alisson,0.0,0.128068,0,1,0,0.0,9.494231,9.366163
37,Alisson Ramses Becker,1,60,140,2970,1,0,32,10,776.4,...,1257,Alisson,0.107682,0.0,1,0,1,0.107682,3.559325,3.559325
42,Alphonse Areola,1,45,124,3240,0,0,48,9,876.2,...,2310,Alphonse Areola,0.0,0.0,0,0,2,0.0,3.55533,3.55533
72,Bailey Peacock-Farrell,1,40,3,360,0,0,14,0,82.6,...,8482,Bailey Peacock-Farrell,0.0,0.0,0,0,0,0.0,0.007731,0.007731
90,Bernd Leno,1,50,131,3131,0,0,37,11,702.2,...,181,Bernd Leno,0.0,0.0,0,0,0,0.0,4.326601,4.326601
91,Bernd Leno,1,45,10,360,0,0,9,1,85.0,...,181,Bernd Leno,0.0,0.0,0,0,0,0.0,0.189383,0.189383


Because goalkeepers score points slightly differently from other players, the goalkeeper model uses a slightly different set of predictor variables than the other models.

In [9]:
# Goalkeeper feature list. Compared with the list used for the other positions it
# leaves out goals_scored, assists, shots and npxG.
GKpredictors_names = [
    'now_cost', 'minutes', 'goals_conceded', 'clean_sheets', 'influence',
    'creativity', 'threat', 'xG', 'xA', 'key_passes', 'yellow_cards',
    'xGChain', 'xGBuildup',
]
# Target: season points total; predictors: the columns listed above.
GKtarget = positionDataFrameDict[1]['total_points']
GKpredictors = positionDataFrameDict[1].loc[:, GKpredictors_names]

In [10]:
# Hold out a validation set for the goalkeeper data, using train_test_split's default
# proportions; random_state pins the shuffle so the split is repeatable.
train_X, val_X, train_y, val_y = train_test_split(
    GKpredictors,
    GKtarget,
    random_state=1,
)

In [11]:
# Baseline goalkeeper tree with no cap on leaf nodes, fitted on the training split only.
# fit() returns the estimator itself, so the cell still displays the fitted model.
GK_model = DecisionTreeRegressor(random_state=1).fit(train_X, train_y)
GK_model

In [12]:
# Predict total_points for the held-out goalkeeper rows with the baseline tree.
GKpredictions = GK_model.predict(X=val_X)

In [13]:
# mean_absolute_error is documented as (y_true, y_pred); pass the actual values first.
# MAE is symmetric, so the reported number is unchanged -- this only fixes the argument order.
GKmae = mean_absolute_error(val_y, GKpredictions)
GKmae

14.538461538461538

In [14]:
# Summary statistics for the goalkeeper subset (key 1); the total_points mean gives
# the scale against which the MAE above is judged.
positionDataFrameDict[1].describe()

Unnamed: 0,element_type,now_cost,total_points,minutes,goals_scored,assists,goals_conceded,clean_sheets,influence,creativity,threat,id,xG,xA,shots,key_passes,yellow_cards,npxG,xGChain,xGBuildup
count,49.0,49.0,49.0,49.0,49.0,49.0,49.0,49.0,49.0,49.0,49.0,49.0,49.0,49.0,49.0,49.0,49.0,49.0,49.0,49.0
mean,1.0,48.469388,103.632653,2482.081633,0.020408,0.081633,36.244898,8.346939,632.028571,6.314286,0.632653,3822.142857,0.002198,0.050439,0.020408,0.612245,1.285714,0.002198,3.06245,3.021183
std,0.0,4.996257,49.076341,1056.493056,0.142857,0.276642,17.971657,5.109432,307.659297,10.069964,2.555173,3224.520507,0.015383,0.117101,0.142857,1.016865,1.06066,0.015383,2.222681,2.197975
min,1.0,40.0,3.0,360.0,0.0,0.0,2.0,0.0,82.6,0.0,0.0,181.0,0.0,0.0,0.0,0.0,0.0,0.0,0.007731,0.007731
25%,1.0,45.0,57.0,1710.0,0.0,0.0,25.0,4.0,464.2,0.0,0.0,745.0,0.0,0.0,0.0,0.0,0.0,0.0,1.276374,1.276374
50%,1.0,48.0,124.0,3060.0,0.0,0.0,37.0,8.0,698.4,0.0,0.0,2385.0,0.0,0.0,0.0,0.0,1.0,0.0,2.763433,2.763433
75%,1.0,52.0,136.0,3330.0,0.0,0.0,48.0,11.0,846.6,10.0,0.0,6532.0,0.0,0.040996,0.0,1.0,2.0,0.0,4.120815,4.120815
max,1.0,61.0,186.0,3420.0,1.0,1.0,79.0,20.0,1213.6,43.0,17.0,9740.0,0.107682,0.636247,1.0,5.0,4.0,0.107682,9.952974,9.952974


The mean absolute error of the model tells us that, on average, the model's predictions are about 14.5 points away from the actual value. As shown above, goalkeepers score an average of 103.633 points per season, so the error is only about 14% of the average. This is a fairly accurate model, but by changing the parameters of the model, we may be able to increase the accuracy - we will do this below.

In [15]:
def getMAEFromNodeNumber(nodes: int) -> float:
    """Return the validation MAE of a decision tree limited to ``nodes`` leaf nodes.

    A fresh ``DecisionTreeRegressor(max_leaf_nodes=nodes, random_state=1)`` is fitted
    on the notebook-level ``train_X`` / ``train_y`` and scored on ``val_X`` / ``val_y``.
    Those names are looked up when the function is called, so the result depends on
    whichever split is currently bound to them.

    Args:
        nodes: Upper limit on the number of leaf nodes of the tree.

    Returns:
        Mean absolute error of the tree's predictions on the validation split.
    """
    # Local names are kept distinct from the notebook-level GK_model / GKpredictions / GKmae.
    candidate = DecisionTreeRegressor(max_leaf_nodes=nodes, random_state=1)
    candidate.fit(train_X, train_y)
    val_predictions = candidate.predict(val_X)
    # mean_absolute_error is documented as (y_true, y_pred): actual values go first.
    return mean_absolute_error(val_y, val_predictions)
    
# Candidate leaf-node limits for the goalkeeper tree.
# NOTE(review): the recorded run picked 20, the top of this range, so a wider range
# may be worth testing.
nodesToTest = list(range(2, 21))

# Score every candidate once, then take the smallest MAE. min() keeps the first of any
# tied candidates (the smallest leaf count), matching a strict "<" comparison, and no
# sentinel starting values are needed.
maeByNodes = {nodesNum: getMAEFromNodeNumber(nodesNum) for nodesNum in nodesToTest}
bestNodes = min(maeByNodes, key=maeByNodes.get)
bestMAE = maeByNodes[bestNodes]

print(f"The best number of nodes is {bestNodes}, giving an mae of {bestMAE}")

The best number of nodes is 20, giving an mae of 14.3974358974359


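Among the leaf-node limits we tested (2 to 20), the best is 20, which gives an MAE of 14.40, or roughly 13.9% of the average. Because 20 is the largest value in the tested range, a higher limit might do better still. We will use this model as our final model.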

In [16]:
# Final goalkeeper model: leaf cap of 20, refitted on every goalkeeper row rather than
# only the training split. fit() returns the estimator, which the cell then displays.
GKmodel = DecisionTreeRegressor(random_state=1, max_leaf_nodes=20).fit(GKpredictors, GKtarget)
GKmodel

In [17]:
# Feature list shared by the defender, midfielder and attacker models.
predictors_names = [
    'now_cost', 'minutes', 'goals_scored', 'assists', 'goals_conceded',
    'clean_sheets', 'influence', 'creativity', 'threat', 'xG', 'xA', 'shots',
    'key_passes', 'yellow_cards', 'npxG', 'xGChain', 'xGBuildup',
]

# Defender rows are stored under key 2.
DEFtarget = positionDataFrameDict[2]['total_points']
DEFpredictors = positionDataFrameDict[2].loc[:, predictors_names]

In [18]:
# Rebind the shared train/validation names to the defender data
# (default split proportions, fixed shuffle seed).
train_X, val_X, train_y, val_y = train_test_split(
    DEFpredictors,
    DEFtarget,
    random_state=1,
)

def getMAEFromNodeNumber(nodes: int) -> float:
    """Return the validation MAE of a decision tree limited to ``nodes`` leaf nodes.

    A fresh ``DecisionTreeRegressor(max_leaf_nodes=nodes, random_state=1)`` is fitted
    on the notebook-level ``train_X`` / ``train_y`` and scored on ``val_X`` / ``val_y``.
    Those names are looked up when the function is called; in this cell they hold the
    defender split created just above.

    Args:
        nodes: Upper limit on the number of leaf nodes of the tree.

    Returns:
        Mean absolute error of the tree's predictions on the validation split.
    """
    candidate = DecisionTreeRegressor(max_leaf_nodes=nodes, random_state=1)
    candidate.fit(train_X, train_y)
    val_predictions = candidate.predict(val_X)
    # mean_absolute_error is documented as (y_true, y_pred): actual values go first.
    return mean_absolute_error(val_y, val_predictions)

# Candidate leaf-node limits for the defender tree.
nodesToTest = list(range(2, 51))

# Score every candidate once, then take the smallest MAE. min() keeps the first of any
# tied candidates (the smallest leaf count), matching a strict "<" comparison, and no
# sentinel starting values are needed.
maeByNodes = {nodesNum: getMAEFromNodeNumber(nodesNum) for nodesNum in nodesToTest}
bestNodes = min(maeByNodes, key=maeByNodes.get)
bestMAE = maeByNodes[bestNodes]

print(f"The best number of nodes is {bestNodes}, giving an mae of {bestMAE}")

The best number of nodes is 32, giving an mae of 9.790444121606093


In [19]:
# Summary statistics for the defender subset (key 2), giving the scale of total_points
# against which the MAE above can be read.
positionDataFrameDict[2].describe()

Unnamed: 0,element_type,now_cost,total_points,minutes,goals_scored,assists,goals_conceded,clean_sheets,influence,creativity,threat,id,xG,xA,shots,key_passes,yellow_cards,npxG,xGChain,xGBuildup
count,281.0,281.0,281.0,281.0,281.0,281.0,281.0,281.0,281.0,281.0,281.0,281.0,281.0,281.0,281.0,281.0,281.0,281.0,281.0,281.0
mean,2.0,47.854093,68.697509,1807.359431,0.975089,1.41637,27.010676,5.967972,390.681851,187.139858,161.879004,4386.476868,1.074786,1.218651,12.295374,11.715302,3.096085,1.069369,5.980839,4.979837
std,0.0,6.144399,41.3945,868.883834,1.193473,2.05138,14.4947,4.257226,208.577109,216.654362,128.335941,3285.041329,1.14474,1.617171,10.752489,13.642531,2.316406,1.126983,4.527829,3.751925
min,2.0,38.0,4.0,330.0,0.0,0.0,2.0,0.0,48.0,1.9,0.0,76.0,0.0,0.0,0.0,0.0,0.0,0.0,0.064611,0.064611
25%,2.0,45.0,36.0,996.0,0.0,0.0,15.0,2.0,200.2,56.4,64.0,885.0,0.337133,0.225423,5.0,3.0,1.0,0.337133,2.60645,2.167426
50%,2.0,45.0,63.0,1844.0,1.0,1.0,25.0,5.0,390.8,102.7,135.0,4391.0,0.784475,0.655803,10.0,7.0,3.0,0.765355,5.043203,4.156065
75%,2.0,50.0,96.0,2557.0,1.0,2.0,38.0,9.0,551.2,241.0,223.0,7430.0,1.447223,1.557334,16.0,15.0,4.0,1.44503,7.895178,6.755061
max,2.0,78.0,208.0,3410.0,8.0,12.0,71.0,21.0,865.2,1449.1,851.0,10291.0,9.653353,12.687638,81.0,90.0,11.0,9.653353,26.887523,21.152134


In [20]:
# Final defender model: leaf cap of 32 (the search result above), refitted on every
# defender row. fit() returns the estimator, which the cell then displays.
DEFmodel = DecisionTreeRegressor(random_state=1, max_leaf_nodes=32).fit(DEFpredictors, DEFtarget)
DEFmodel

In [21]:
# Midfielder rows are stored under key 3; same target and feature list as the defender model.
MIDtarget = positionDataFrameDict[3]['total_points']
MIDpredictors = positionDataFrameDict[3].loc[:, predictors_names]
# Rebind the shared train/validation names to the midfielder data.
train_X, val_X, train_y, val_y = train_test_split(
    MIDpredictors,
    MIDtarget,
    random_state=1,
)

def getMAEFromNodeNumber(nodes: int) -> float:
    """Return the validation MAE of a decision tree limited to ``nodes`` leaf nodes.

    A fresh ``DecisionTreeRegressor(max_leaf_nodes=nodes, random_state=1)`` is fitted
    on the notebook-level ``train_X`` / ``train_y`` and scored on ``val_X`` / ``val_y``.
    Those names are looked up when the function is called; in this cell they hold the
    midfielder split created just above.

    Args:
        nodes: Upper limit on the number of leaf nodes of the tree.

    Returns:
        Mean absolute error of the tree's predictions on the validation split.
    """
    candidate = DecisionTreeRegressor(max_leaf_nodes=nodes, random_state=1)
    candidate.fit(train_X, train_y)
    val_predictions = candidate.predict(val_X)
    # mean_absolute_error is documented as (y_true, y_pred): actual values go first.
    return mean_absolute_error(val_y, val_predictions)

# Candidate leaf-node limits for the midfielder tree.
nodesToTest = list(range(2, 51))

# Score every candidate once, then take the smallest MAE. min() keeps the first of any
# tied candidates (the smallest leaf count), matching a strict "<" comparison, and no
# sentinel starting values are needed.
maeByNodes = {nodesNum: getMAEFromNodeNumber(nodesNum) for nodesNum in nodesToTest}
bestNodes = min(maeByNodes, key=maeByNodes.get)
bestMAE = maeByNodes[bestNodes]

print(f"The best number of nodes is {bestNodes}, giving an mae of {bestMAE}")

The best number of nodes is 39, giving an mae of 10.258796656018877


In [22]:
# Summary statistics for the midfielder subset (key 3), giving the scale of total_points
# against which the MAE above can be read.
positionDataFrameDict[3].describe()

Unnamed: 0,element_type,now_cost,total_points,minutes,goals_scored,assists,goals_conceded,clean_sheets,influence,creativity,threat,id,xG,xA,shots,key_passes,yellow_cards,npxG,xGChain,xGBuildup
count,322.0,322.0,322.0,322.0,322.0,322.0,322.0,322.0,322.0,322.0,322.0,322.0,322.0,322.0,322.0,322.0,322.0,322.0,322.0,322.0
mean,3.0,57.456522,76.888199,1729.636646,3.189441,3.031056,25.388199,6.130435,376.099379,375.384783,369.152174,4304.854037,3.135806,2.575186,29.586957,24.39441,2.981366,2.811955,8.887331,5.01557
std,0.0,14.753414,45.745401,807.041732,3.962222,2.919314,13.638921,3.923935,235.962035,270.266885,330.792919,3244.699357,3.413217,2.209112,22.905472,18.476803,2.517161,2.977462,5.985376,3.614473
min,3.0,41.0,8.0,307.0,0.0,0.0,2.0,0.0,13.6,13.5,0.0,87.0,0.0,0.0,0.0,0.0,0.0,0.0,0.422484,0.10413
25%,3.0,50.0,41.25,1096.5,0.25,1.0,15.0,3.0,179.95,161.3,130.25,775.0,0.816706,0.957511,13.25,10.0,1.0,0.766324,4.436822,2.462495
50%,3.0,54.0,70.0,1682.0,2.0,2.0,24.0,6.0,349.2,319.75,284.0,5220.5,1.987776,2.0052,23.0,20.0,3.0,1.805837,7.787331,4.073399
75%,3.0,60.0,101.0,2386.0,5.0,4.0,34.0,9.0,521.45,520.575,481.75,7162.75,4.258121,3.689828,43.0,34.0,4.0,3.793918,11.466703,6.765273
max,3.0,130.0,265.0,3420.0,23.0,17.0,68.0,18.0,1292.6,1414.9,2230.0,10408.0,24.364297,11.474996,139.0,95.0,12.0,19.797284,32.158807,23.955902


In [23]:
# Final midfielder model: leaf cap of 39 (the search result above), refitted on every
# midfielder row. fit() returns the estimator, which the cell then displays.
MIDmodel = DecisionTreeRegressor(random_state=1, max_leaf_nodes=39).fit(MIDpredictors, MIDtarget)
MIDmodel

In [24]:
# Attacker rows are stored under key 4; same target and feature list as the defender model.
ATTtarget = positionDataFrameDict[4]['total_points']
ATTpredictors = positionDataFrameDict[4].loc[:, predictors_names]
# Rebind the shared train/validation names to the attacker data.
train_X, val_X, train_y, val_y = train_test_split(
    ATTpredictors,
    ATTtarget,
    random_state=1,
)

def getMAEFromNodeNumber(nodes: int) -> float:
    """Return the validation MAE of a decision tree limited to ``nodes`` leaf nodes.

    A fresh ``DecisionTreeRegressor(max_leaf_nodes=nodes, random_state=1)`` is fitted
    on the notebook-level ``train_X`` / ``train_y`` and scored on ``val_X`` / ``val_y``.
    Those names are looked up when the function is called; in this cell they hold the
    attacker split created just above.

    Args:
        nodes: Upper limit on the number of leaf nodes of the tree.

    Returns:
        Mean absolute error of the tree's predictions on the validation split.
    """
    candidate = DecisionTreeRegressor(max_leaf_nodes=nodes, random_state=1)
    candidate.fit(train_X, train_y)
    val_predictions = candidate.predict(val_X)
    # mean_absolute_error is documented as (y_true, y_pred): actual values go first.
    return mean_absolute_error(val_y, val_predictions)

# Candidate leaf-node limits for the attacker tree.
# NOTE(review): this search starts at 10, whereas the other position searches start at 2
# -- confirm the narrower range is intentional.
nodesToTest = list(range(10, 51))

# Score every candidate once, then take the smallest MAE. min() keeps the first of any
# tied candidates (the smallest leaf count), matching a strict "<" comparison, and no
# sentinel starting values are needed.
maeByNodes = {nodesNum: getMAEFromNodeNumber(nodesNum) for nodesNum in nodesToTest}
bestNodes = min(maeByNodes, key=maeByNodes.get)
bestMAE = maeByNodes[bestNodes]

print(f"The best number of nodes is {bestNodes}, giving an mae of {bestMAE}")

The best number of nodes is 40, giving an mae of 16.642857142857142


In [25]:
# Summary statistics for the attacker subset (key 4), giving the scale of total_points
# against which the MAE above can be read.
positionDataFrameDict[4].describe()

Unnamed: 0,element_type,now_cost,total_points,minutes,goals_scored,assists,goals_conceded,clean_sheets,influence,creativity,threat,id,xG,xA,shots,key_passes,yellow_cards,npxG,xGChain,xGBuildup
count,83.0,83.0,83.0,83.0,83.0,83.0,83.0,83.0,83.0,83.0,83.0,83.0,83.0,83.0,83.0,83.0,83.0,83.0,83.0,83.0
mean,4.0,68.096386,89.710843,1694.084337,6.650602,3.674699,26.144578,5.542169,383.277108,270.274699,718.819277,3647.024096,7.565397,2.29947,47.361446,18.373494,2.373494,6.886767,10.344338,2.838752
std,0.0,15.963842,46.871264,814.340496,4.852542,3.231357,12.811135,3.778312,250.911936,166.314404,412.13768,3062.748897,5.085905,1.955053,28.148811,12.128088,2.162377,4.500161,6.75266,2.319425
min,4.0,42.0,13.0,305.0,0.0,0.0,2.0,0.0,21.0,26.9,142.0,65.0,0.603317,0.056096,7.0,2.0,0.0,0.603317,0.989015,0.086718
25%,4.0,55.0,53.0,1116.5,3.0,1.0,16.5,3.0,179.6,131.2,405.5,659.5,3.674926,0.940529,26.0,9.0,1.0,3.54786,5.152175,1.140691
50%,4.0,65.0,87.0,1477.0,5.0,3.0,25.0,5.0,319.8,242.7,617.0,3294.0,6.189151,1.665206,42.0,16.0,2.0,5.84697,8.729951,2.162384
75%,4.0,78.5,121.5,2379.0,9.5,5.0,37.5,8.5,548.8,362.85,965.5,6085.0,10.342017,3.120362,62.0,25.0,3.0,9.394748,13.773423,3.524607
max,4.0,119.0,242.0,3328.0,23.0,14.0,51.0,14.0,1318.2,674.0,1798.0,9738.0,22.174859,9.818288,138.0,49.0,8.0,19.130183,31.892183,11.412372


In [26]:
# Final attacker model: leaf cap of 40 (the search result above), refitted on every
# attacker row. fit() returns the estimator, which the cell then displays.
ATTmodel = DecisionTreeRegressor(random_state=1, max_leaf_nodes=40).fit(ATTpredictors, ATTtarget)
ATTmodel

In [27]:
models = [GKmodel, DEFmodel, MIDmodel, ATTmodel]
%store models

Stored 'models' (list)
