In [60]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import timedelta
from statsmodels.graphics.tsaplots import plot_acf
import warnings
warnings.filterwarnings("ignore")
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.datasets import fetch_openml

In [78]:
# Directory holding the semicolon-delimited source tables. The location is
# machine-specific; it is kept in a single constant so the notebook can be pointed
# at another checkout by editing one line instead of every read_csv call.
DATA_DIR = '/Users/jon/Documents/fpl_points_prediction/ERD/tables'

match_data = pd.read_csv(DATA_DIR + '/match_data.csv', delimiter=";", index_col=False)
player_match_data = pd.read_csv(DATA_DIR + '/player_match_data_withid.csv', delimiter=";", index_col=False)

#Merge main two tables
# Outer join on match_id: rows without a partner in the other table are kept and get
# NaN in the columns that come from the missing side.
merged_data = player_match_data.merge(match_data, on='match_id', how='outer')

# Columns removed before any feature engineering (one drop instead of five copies).
merged_data = merged_data.drop(columns=[
    'value',
    'own_goals',        # Variation isn't large enough (original note: 104)
    'red_cards',        # Variation isn't large enough (original note: 111)
    'npxG',             # not used as a feature
    'player_match_id',
])

#Turning 'home' and 'away' into players team and opponent team for analysis
# Every new column takes the home-side value when was_home == 1 and the away-side
# value otherwise. This is done with a vectorised np.where per column instead of a
# row-wise DataFrame.apply, which runs a Python lambda once per row per column.
# Rows where was_home is missing compare unequal to 1 and therefore still fall
# through to the away-side value, exactly as the row-wise version did.
is_home = merged_data['was_home'] == 1

perspective_columns = [
    # (new column,            value when home,  value when away)
    ('p_team_spi',            'h_team_spi',     'a_team_spi'),
    ('oppn_spi',              'a_team_spi',     'h_team_spi'),
    ('prob_p_team_win',       'prob_h_win',     'prob_a_win'),
    ('prob_oppn_win',         'prob_a_win',     'prob_h_win'),
    ('p_team_proj_score',     'h_proj_score',   'a_proj_score'),
    ('oppn_team_proj_score',  'a_proj_score',   'h_proj_score'),
    ('importance_p_team',     'importance_h',   'importance_a'),
    ('importance_oppn_team',  'importance_a',   'importance_h'),
    # Disabled in the original notebook; left here so they can be switched back on.
    #('score_p_team',         'h_score',        'a_score'),
    #('score_oppn_team',      'a_score',        'h_score'),
    #('xg_p_team',            'h_xg',           'a_xg'),
    #('xg_oppn_team',         'a_xg',           'h_xg'),
]
for new_col, when_home, when_away in perspective_columns:
    merged_data[new_col] = np.where(is_home, merged_data[when_home], merged_data[when_away])

#Additional features created
# Positive when the opponent's SPI is higher than the player's team SPI.
merged_data['opp_adv_spi'] = merged_data['oppn_spi'] - merged_data['p_team_spi']

# Create lagged variables and lagged moving-average variables.
# Rows are ordered by player_id and then by date before any per-player shifting.
# NOTE(review): the tables are read without parse_dates, so 'date' is sorted with
# whatever dtype read_csv inferred - confirm that this ordering is chronological.
lagged_data = merged_data.sort_values(by=['player_id', 'date'])

#Lag / moving-average helper (applied to every variable in the loop below, not only total points)
def average_form(var, max_lag=4):
    """Add per-player lags of ``var`` and their moving averages to ``lagged_data``.

    For every k in 1..max_lag a column ``<var>-k`` is created holding the value of
    ``var`` k rows earlier within the same ``player_id`` group (NaN where the group
    has fewer than k earlier rows). For every k in 2..max_lag a column
    ``<var>_lag_avgk`` is created holding the mean of ``<var>-1`` .. ``<var>-k``;
    it is NaN whenever any of those lags is NaN.

    The module-level ``lagged_data`` frame is modified in place. "Earlier" means
    earlier in the frame's current row order, so the frame has to be sorted
    beforehand.

    Parameters
    ----------
    var : str
        Name of the column of ``lagged_data`` to lag.
    max_lag : int, default 4
        Largest lag to create. The default produces exactly the columns
        ``-1`` .. ``-4`` and ``_lag_avg2`` .. ``_lag_avg4``.

    Raises
    ------
    ValueError
        If ``max_lag`` is smaller than 1.
    """
    if max_lag < 1:
        raise ValueError("max_lag must be at least 1")

    # Build the grouped view once instead of once per lag.
    per_player = lagged_data.groupby('player_id')[var]

    lag_names = []
    for k in range(1, max_lag + 1):
        lag_name = '{}-{}'.format(var, k)
        lagged_data[lag_name] = per_player.shift(k)
        lag_names.append(lag_name)

    # Accumulate lag-1 + lag-2 + ... left to right so each average is the same
    # floating-point sum as writing the addition out by hand.
    running_total = lagged_data[lag_names[0]]
    for k in range(2, max_lag + 1):
        running_total = running_total + lagged_data[lag_names[k - 1]]
        lagged_data['{}_lag_avg{}'.format(var, k)] = running_total / k

# Per-match statistics that receive lag and lagged-moving-average columns.
LAG_FEATURES = [
    'total_points', 'xP', 'bonus', 'bps', 'minutes', 'goals', 'shots', 'xG', 'xA',
    'assists', 'key_passes', 'npg', 'xGChain', 'xGBuildup', 'yellow_cards',
    'clean_sheets', 'goals_conceded', 'penalties_missed', 'penalties_saved', 'saves',
    'influence', 'creativity', 'threat', 'ict_index',
]
for var in LAG_FEATURES:
    average_form(var)

#remove date now used for lagging
lagged_data = lagged_data.drop(columns='date')
#Drop rows with NaN values caused by lagging.
# dropna() with no arguments removes every row that has a NaN in ANY column, so rows
# with missing values in non-lag columns are removed here as well.
lagged_data = lagged_data.dropna()

#Drop game specific data that is not needed for the model
lagged_data = lagged_data.drop(
    columns=[
        # per-match player statistics (their lagged versions are kept)
        'xP', 'minutes', 'bonus', 'bps', 'goals', 'shots', 'xG', 'xA',
        'assists', 'key_passes', 'npg', 'xGChain', 'xGBuildup',
        'yellow_cards', 'clean_sheets', 'goals_conceded',
        'penalties_missed', 'penalties_saved', 'saves',
        'influence', 'creativity', 'threat', 'ict_index',
        # fixture-level home/away columns
        'round',
        'h_team_spi', 'a_team_spi', 'prob_h_win', 'prob_a_win',
        'h_proj_score', 'a_proj_score', 'importance_h', 'importance_a',
        'h_score', 'a_score', 'h_xg', 'a_xg',
        'match_id', 'season_id_y', 'h_nsxg', 'a_nsxg',
    ],
)

# NOTE(review): earlier notes on this cell said results were better with 'minutes'
# included and that match_id would not be dropped because it is needed later, yet both
# 'minutes' and 'match_id' are in the list above - confirm which is intended.
# 'season_id_x' is not dropped here.

#Replace total points with return
# Binary target: 1 when total_points is strictly greater than 5, otherwise 0.
lagged_data['return'] = lagged_data['total_points'].gt(5).astype('int64')
#lagged_data['return'] = lagged_data['total_points'].apply(lambda x: '0' if x <= 2 else '1' if x <= 5 else '2' if x <= 8 else '3')

#Drop highly correlated columns and total points
# total_points has to go because the target above is derived directly from it.
lagged_data = lagged_data.drop(columns=['prob_p_team_win', 'prob_oppn_win', 'total_points'])

In [79]:
# Number of columns left in the modelling frame (features plus the target).
lagged_data.shape[1]

187

In [74]:
#Reduce features
# NOTE: column_importance_zero is assigned in a cell further down this notebook, so that
# cell has to be executed before this one (a fresh Restart & Run All fails here otherwise).
lagged_data_fi = lagged_data.drop(columns=column_importance_zero)

#Limit time or not?
#lagged_data_fi = lagged_data_fi[lagged_data_fi['minutes_lag_avg4'] > 60]

RAND_STATE = 42 # for reproducible shuffling
TT_RATIO = 0.25 # test/train

# Pick the modelling frame in ONE place. Before, lagged_data_fi was built above and then
# ignored, because X and y were read straight from lagged_data - the feature reduction
# was silently a no-op. False keeps that all-feature behaviour; set it to True to train
# on the reduced feature set.
USE_REDUCED_FEATURES = False
model_data = lagged_data_fi if USE_REDUCED_FEATURES else lagged_data

# X,y
y = model_data['return']
X = model_data.drop(columns=['return'])

# test-train
# train_test_split is already imported in the notebook's import cell.
# NOTE(review): this is a shuffled row-level split, so matches of the same player can
# land on both sides of it - confirm that is acceptable for these lag-based features.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TT_RATIO, random_state=RAND_STATE)

In [70]:
#undersampling
# This cell is currently disabled: everything between the triple quotes below is one
# string literal, so down_samp_rand is never defined and X_train / y_train are not
# resampled here. The value_counts output shown under the cell cannot be produced by
# the cell as written, so it is presumably left over from a run with the code enabled.
'''
def down_samp_rand(X, y, ratio=1):
        from imblearn.under_sampling import RandomUnderSampler
        rus = RandomUnderSampler(sampling_strategy=ratio, random_state=RAND_STATE)
        X_rus, y_rus = rus.fit_resample(X, y)
        return X_rus, y_rus

X_train, y_train = down_samp_rand(X_train,y_train)
y_train.value_counts()
'''


0    3795
1    3795
Name: return, dtype: int64

In [75]:
#Upsampling

from imblearn.over_sampling import SMOTE

# Seeded with the notebook-wide RAND_STATE so the synthetic samples (and every metric
# computed downstream) are the same on each run; an unseeded SMOTE() draws different
# samples every time.
smote = SMOTE(random_state=RAND_STATE)
#fit training data with upsampling
# Only the training split is resampled; X_test / y_test are left untouched.
# X_train / y_train are overwritten, so re-running this cell resamples the already
# resampled data - re-run the train/test split cell first to start from the raw split.
X_train, y_train = smote.fit_resample(X_train, y_train)
y_train.value_counts()


0    15045
1    15045
Name: return, dtype: int64

In [76]:
#Modelling using AdaBoost with Decision Tree

from sklearn.ensemble import AdaBoostClassifier
from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, f1_score


def _performance_table(train_true, train_pred, test_true, test_pred):
    """Return accuracy / precision / recall for the train and test predictions.

    The result is a DataFrame with the columns 'Error_metric', 'Train' and 'Test' and
    one row per metric. Precision and recall are computed for the positive class
    (pos_label=1).
    """
    metrics = [
        ('Accuracy', accuracy_score, {}),
        ('Precision', precision_score, {'pos_label': 1}),
        ('Recall', recall_score, {'pos_label': 1}),
    ]
    return pd.DataFrame({
        'Error_metric': [label for label, _, _ in metrics],
        'Train': [score(train_true, train_pred, **kwargs) for _, score, kwargs in metrics],
        'Test': [score(test_true, test_pred, **kwargs) for _, score, kwargs in metrics],
    })


#DT with all features
# A depth-1 tree (decision stump): the baseline on its own and the weak learner below.
tree = DecisionTreeClassifier(criterion='gini', max_depth=1, random_state=RAND_STATE)
tree = tree.fit(X_train, y_train)
y_train_pred = tree.predict(X_train)
y_test_pred = tree.predict(X_test)

print("Decision Tree with Max Depth 1")
performance_log_data = _performance_table(y_train, y_train_pred, y_test, y_test_pred)

display(performance_log_data)

# The weak learner is passed positionally: the keyword was renamed from
# `base_estimator` to `estimator` in newer scikit-learn releases, while the first
# positional argument works with both spellings.
ada = AdaBoostClassifier(tree, n_estimators=1000, learning_rate=0.5, random_state=RAND_STATE)
ada.fit(X_train, y_train)
y_train_pred_ada = ada.predict(X_train)
y_test_pred_ada = ada.predict(X_test)

performance_log_data_ada = _performance_table(y_train, y_train_pred_ada, y_test, y_test_pred_ada)

print("Ada Boost Performance")
display(performance_log_data_ada)

Decision Tree with Max Depth 1


Unnamed: 0,Error_metric,Train,Test
0,Accuracy,0.639482,0.51465
1,Precision,0.607192,0.236575
2,Recall,0.790096,0.618565


Ada Boost Performance


Unnamed: 0,Error_metric,Train,Test
0,Accuracy,0.883583,0.794427
1,Precision,0.973188,0.486068
2,Recall,0.7889,0.122465


In [None]:
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
from sklearn.model_selection import cross_val_score

# plot_confusion_matrix has been removed from scikit-learn; building the matrix with
# confusion_matrix and drawing it with ConfusionMatrixDisplay gives the same figure and
# works on both older and current releases.
fig, ax = plt.subplots(1, 2, figsize=(14, 4))
panels = [
    (ax[0], "Train Set", X_train, y_train),
    (ax[1], "Test Set", X_test, y_test),
]
for panel_ax, panel_title, features, truth in panels:
    matrix = confusion_matrix(truth, ada.predict(features), labels=ada.classes_)
    ConfusionMatrixDisplay(matrix, display_labels=ada.classes_).plot(ax=panel_ax, values_format='d')
    panel_ax.set_title(panel_title)

#Cross Validation
# NOTE(review): X_train / y_train at this point are the SMOTE-resampled training data,
# so synthetic samples and the real rows they were interpolated from can end up in
# different folds; these scores are likely optimistic compared with unseen data.
folds=3
cross_val_scores = cross_val_score(ada, X_train, y_train, cv=folds)
print("cv scores over {:d} iterations: \n".format(folds))
cross_val_scores

In [63]:
# Columns removed by the "Reduce features" cell (presumably the features whose
# importance came out as zero in an earlier model run - confirm the source).
# That cell reads this name, so this cell has to be executed before it.
# The list is assembled from (statistic, suffixes) pairs; the resulting order is the
# same as the hand-written list it replaces.
_EVERY_LAG = ('-1', '-2', '-3', '-4', '_lag_avg2', '_lag_avg3', '_lag_avg4')

_zero_importance_spec = [
    ('total_points', ('-2', '-3')),
    ('bonus', _EVERY_LAG),
    ('minutes', ('-1', '-2', '-3', '-4', '_lag_avg2')),
    ('goals', _EVERY_LAG),
    ('shots', _EVERY_LAG),
    ('xG', ('-1', '-2', '-3')),
    ('xA', ('-1', '-2', '-4', '_lag_avg2')),
    ('assists', _EVERY_LAG),
    ('key_passes', _EVERY_LAG),
    ('npg', _EVERY_LAG),
    ('yellow_cards', _EVERY_LAG),
    ('clean_sheets', _EVERY_LAG),
    ('goals_conceded', _EVERY_LAG),
    ('penalties_missed', _EVERY_LAG),
    ('penalties_saved', _EVERY_LAG),
    ('saves', _EVERY_LAG),
    ('threat', ('-1', '-2', '-3')),
]

column_importance_zero = ['was_home', 'season_id_x'] + [
    stat + suffix
    for stat, suffixes in _zero_importance_spec
    for suffix in suffixes
]