In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import timedelta
from statsmodels.graphics.tsaplots import plot_acf
import warnings
warnings.filterwarnings("ignore")
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.datasets import fetch_openml

In [2]:
# Directory holding the semicolon-delimited CSV tables. It is defined once so
# the notebook can be pointed at another copy of the data by editing one line.
TABLES_DIR = '/Users/jon/Documents/fpl_points_prediction/ERD/tables'


def _read_table(file_name):
    """Read one ';'-delimited CSV file from TABLES_DIR.

    Parameters
    ----------
    file_name : str
        File name (with extension) inside TABLES_DIR.

    Returns
    -------
    pandas.DataFrame
        The parsed table; ``index_col=False`` keeps the first column as data
        rather than as the index.
    """
    return pd.read_csv(f'{TABLES_DIR}/{file_name}', delimiter=";", index_col=False)


teams = _read_table('teams.csv')
positions = _read_table('positions.csv')
seasons = _read_table('years.csv')  # note: the seasons table is stored as years.csv
players = _read_table('players.csv')
match_data = _read_table('match_data.csv')
player_match_data = _read_table('player_match_data_withid.csv')

In [3]:
# Merge the two main tables: player-level rows joined to match-level rows on
# match_id. The outer join keeps rows that have no partner on the other side.
merged_data = player_match_data.merge(match_data, on='match_id', how='outer')

# Columns removed straight after the merge.
unused_columns = [
    'season_id_y',
    'value',
    'own_goals',        # Variation isn't large enough 104
    'red_cards',        # Variation isn't large enough 111
    'npxG',             # not using npg
    'h_nsxg',           # not using nsxg
    'a_nsxg',           # not using nsxg
    'player_match_ID',
]
merged_data = merged_data.drop(columns=unused_columns).drop_duplicates()

In [4]:
# Re-express the home/away match columns from the point of view of the player's
# own team ("p_team") and the opponent ("oppn"). Each new column takes the
# first source column where was_home == 1 and the second one otherwise.
#
# This is done with vectorised np.where calls instead of row-wise
# DataFrame.apply(axis=1), which runs a Python lambda once per row per column.
is_home = merged_data['was_home'] == 1

# (new column, source when home, source when away) -- order is kept identical
# to the original column creation order so downstream column positions match.
home_away_columns = [
    ('p_team_spi', 'h_team_spi', 'a_team_spi'),
    ('oppn_spi', 'a_team_spi', 'h_team_spi'),
    ('prob_p_team_win', 'prob_h_win', 'prob_a_win'),
    ('prob_oppn_win', 'prob_a_win', 'prob_h_win'),
    ('p_team_proj_score', 'h_proj_score', 'a_proj_score'),
    ('oppn_team_proj_score', 'a_proj_score', 'h_proj_score'),
    ('importance_p_team', 'importance_h', 'importance_a'),
    ('importance_oppn_team', 'importance_a', 'importance_h'),
    ('score_p_team', 'h_score', 'a_score'),
    ('score_oppn_team', 'a_score', 'h_score'),
    ('xg_p_team', 'h_xg', 'a_xg'),
    ('xg_oppn_team', 'a_xg', 'h_xg'),
]
for new_col, when_home, when_away in home_away_columns:
    merged_data[new_col] = np.where(is_home, merged_data[when_home], merged_data[when_away])

# Opponent's SPI advantage over the player's team (positive = stronger opponent).
merged_data['opp_adv_spi'] = merged_data['oppn_spi'] - merged_data['p_team_spi']

In [49]:
# Create lagged variables and lagged moving-average variables.
# Rows are first ordered by player_id and then by date.
sort_keys = ['player_id', 'date']
lagged_data = merged_data.sort_values(by=sort_keys)

def average_form(var, window=4, data=None):
    """Add a per-player average of the previous ``window`` values of ``var``.

    The new column is named ``f"{var}_lag_avg{window}"`` and, for each row,
    holds the mean of ``var`` shifted by 1..``window`` rows within the same
    ``player_id`` group. The current row's own value is never included. Rows
    with fewer than ``window`` earlier rows in their group (or with a missing
    value among them) get NaN.

    Parameters
    ----------
    var : str
        Name of the column to average.
    window : int, default 4
        Number of preceding rows to average (the original fixed value).
    data : pandas.DataFrame, optional
        Frame to read from and add the column to. Defaults to the notebook's
        global ``lagged_data``. Rows are expected to be ordered already
        within each player; this function does not sort.

    Returns
    -------
    None
        The frame is modified in place.
    """
    frame = lagged_data if data is None else data
    # Build the group-by once instead of once per lag.
    per_player = frame.groupby('player_id')[var]
    lag_total = per_player.shift(1)
    for lag in range(2, window + 1):
        lag_total = lag_total + per_player.shift(lag)
    frame[f'{var}_lag_avg{window}'] = lag_total / window

# Columns that each receive a lagged-average companion column via average_form.
form_columns = [
    'total_points', 'xP', 'bonus', 'bps', 'minutes', 'goals', 'shots', 'xG', 'xA',
    'assists', 'key_passes', 'npg', 'xGChain', 'xGBuildup', 'yellow_cards',
    'clean_sheets', 'goals_conceded', 'penalties_missed', 'penalties_saved',
    'saves', 'influence', 'creativity', 'threat', 'ict_index',
]
# The enumerate index was never used, so iterate over the names directly.
for var in form_columns:
    average_form(var)

In [50]:
# Binary target: 1 when total_points is strictly greater than 5, otherwise 0.
lagged_data['return'] = (lagged_data['total_points'] > 5).astype('int64')

In [51]:
# Columns removed before modelling, grouped by the kind of column they are.
raw_player_columns = [
    'total_points', 'xP', 'bonus', 'bps', 'goals', 'shots', 'xG', 'xA',
    'assists', 'key_passes', 'npg', 'xGChain', 'xGBuildup', 'yellow_cards',
    'clean_sheets', 'goals_conceded', 'penalties_missed', 'penalties_saved',
    'saves', 'influence', 'creativity', 'threat', 'ict_index',
]
raw_match_columns = [
    'h_team_spi', 'a_team_spi', 'prob_h_win', 'prob_a_win',
    'h_proj_score', 'a_proj_score', 'importance_h', 'importance_a',
    'h_score', 'a_score', 'h_xg', 'a_xg',
    'score_p_team', 'score_oppn_team', 'xg_p_team', 'xg_oppn_team',
]
identifier_columns = [
    'match_id', 'player_id', 'player_team_id', 'season_id_x', 'round',
    'h_team_id', 'a_team_id',
]
other_columns = ['date', 'prob_p_team_win', 'prob_oppn_win']

lagged_data = lagged_data.drop(
    columns=other_columns + raw_player_columns + raw_match_columns + identifier_columns
)

In [52]:
# Optional experiment (disabled): keep only rows with position_id == 1.
# lagged_data = lagged_data[lagged_data['position_id'] == 1]

# Discard every row that has a missing value, then remove the raw 'minutes' column.
lagged_data = lagged_data.dropna().drop(columns=['minutes'])

# Optional experiment (disabled): drop the columns listed in column_importance_zero.
# lagged_data = lagged_data.drop(column_importance_zero, axis=1)

In [53]:
RAND_STATE = 42  # seed, for reproducible shuffling
TT_RATIO = 0.20  # test/train split ratio

# Feature matrix: everything except the target column. Target: the 'return' label.
X = lagged_data.drop(columns=['return'])
y = lagged_data['return']

In [54]:
# Hold out TT_RATIO of the rows as the test set; the shuffle is seeded with RAND_STATE.
# (train_test_split is already imported in the notebook's first cell.)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TT_RATIO,
    random_state=RAND_STATE,
)

In [55]:
def down_samp_rand(X, y, ratio=1):
    """Randomly under-sample ``X`` and ``y`` with imblearn's RandomUnderSampler.

    Parameters
    ----------
    X, y
        Features and labels, passed unchanged to ``fit_resample``.
    ratio : default 1
        Forwarded as the sampler's ``sampling_strategy``.

    Returns
    -------
    tuple
        The resampled ``(X, y)`` pair returned by ``fit_resample``.

    The sampler is seeded with the notebook-level ``RAND_STATE``.
    """
    # Imported here so imbalanced-learn is only needed when this is called.
    from imblearn.under_sampling import RandomUnderSampler

    sampler = RandomUnderSampler(sampling_strategy=ratio, random_state=RAND_STATE)
    return sampler.fit_resample(X, y)

# Down-sample the training split only; X_test / y_test are not passed in.
X_train, y_train = down_samp_rand(X=X_train, y=y_train)
# Show the class counts after resampling.
y_train.value_counts()

0    4063
1    4063
Name: return, dtype: int64

In [57]:
#Modelling using AdaBoost with Decision Tree

from sklearn.ensemble import AdaBoostClassifier
from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, f1_score

# Decision tree of depth 1, fitted on all feature columns.
tree = DecisionTreeClassifier(criterion='gini', max_depth=1, random_state=42).fit(X_train, y_train)
y_train_pred = tree.predict(X_train)
y_test_pred = tree.predict(X_test)

print("Decision Tree with Max Depth 1")

# (row label, scoring function) pairs; precision and recall score class 1.
stump_scorers = [
    ('Accuracy', accuracy_score),
    ('Precision', lambda truth, pred: precision_score(truth, pred, pos_label=1)),
    ('Recall', lambda truth, pred: recall_score(truth, pred, pos_label=1)),
]
performance_log_data = pd.DataFrame({
    'Error_metric': [label for label, scorer in stump_scorers],
    'Train': [scorer(y_train, y_train_pred) for label, scorer in stump_scorers],
    'Test': [scorer(y_test, y_test_pred) for label, scorer in stump_scorers],
})

display(performance_log_data)

Decision Tree with Max Depth 1


Unnamed: 0,Error_metric,Train,Test
0,Accuracy,0.607556,0.553742
1,Precision,0.591653,0.266717
2,Recall,0.694315,0.692308


In [58]:
# Feature importance for the decision tree, printed from largest to smallest.

importances = tree.feature_importances_
sorted_idx = np.argsort(importances)[::-1]
for feature, weight in zip(X.columns[sorted_idx], importances[sorted_idx]):
    print(f"{feature}: {weight}")

# Only one feature gets a non-zero importance because the tree has depth 1.

transfers_in: 1.0
ict_index_lag_avg4: 0.0
oppn_team_proj_score: 0.0
bonus_lag_avg4: 0.0
xP_lag_avg4: 0.0
total_points_lag_avg4: 0.0
opp_adv_spi: 0.0
importance_oppn_team: 0.0
importance_p_team: 0.0
p_team_proj_score: 0.0
minutes_lag_avg4: 0.0
oppn_spi: 0.0
p_team_spi: 0.0
prob_tie: 0.0
transfers_out: 0.0
selected: 0.0
was_home: 0.0
bps_lag_avg4: 0.0
goals_lag_avg4: 0.0
threat_lag_avg4: 0.0
shots_lag_avg4: 0.0
creativity_lag_avg4: 0.0
influence_lag_avg4: 0.0
saves_lag_avg4: 0.0
penalties_saved_lag_avg4: 0.0
penalties_missed_lag_avg4: 0.0
goals_conceded_lag_avg4: 0.0
clean_sheets_lag_avg4: 0.0
yellow_cards_lag_avg4: 0.0
xGBuildup_lag_avg4: 0.0
xGChain_lag_avg4: 0.0
npg_lag_avg4: 0.0
key_passes_lag_avg4: 0.0
assists_lag_avg4: 0.0
xA_lag_avg4: 0.0
xG_lag_avg4: 0.0
position_id: 0.0


In [59]:
# AdaBoost ensemble that uses the depth-1 decision tree as its base learner.
ada_params = dict(n_estimators=5000, learning_rate=0.5, random_state=42)
try:
    # scikit-learn >= 1.2 names the base-learner argument `estimator`.
    ada = AdaBoostClassifier(estimator=tree, **ada_params)
except TypeError:
    # Older scikit-learn only accepts the original `base_estimator` keyword.
    ada = AdaBoostClassifier(base_estimator=tree, **ada_params)
ada.fit(X_train, y_train)
y_train_pred_ada = ada.predict(X_train)
y_test_pred_ada = ada.predict(X_test)

# Accuracy / precision / recall (for class 1) on the training and test splits.
performance_log_data_ada = pd.DataFrame({'Error_metric': ['Accuracy','Precision','Recall'],
                               'Train': [accuracy_score(y_train, y_train_pred_ada),
                                         precision_score(y_train, y_train_pred_ada, pos_label=1),
                                         recall_score(y_train, y_train_pred_ada, pos_label=1)],
                               'Test': [accuracy_score(y_test, y_test_pred_ada),
                                        precision_score(y_test, y_test_pred_ada, pos_label=1),
                                        recall_score(y_test, y_test_pred_ada, pos_label=1)]})

display(performance_log_data_ada)

Unnamed: 0,Error_metric,Train,Test
0,Accuracy,0.750923,0.640924
1,Precision,0.744309,0.315248
2,Recall,0.76446,0.664694


In [48]:
# Table of AdaBoost feature importances, one row per training column.
feature_names = X_train.columns
df = pd.DataFrame({
    'columns_name': list(feature_names),
    'score_feature_importance': ada.feature_importances_,
})

# The original called sort_values() and discarded the result, so nothing was
# shown. Display the sorted view explicitly; `df` itself keeps its row order.
display(df.sort_values(by=['score_feature_importance'], ascending=False))

# Names of the columns whose importance is below 0.05 (despite the "zero" in
# the variable name, the cut-off is 0.05, not 0).
column_importance_zero = df[df['score_feature_importance'] < 0.05]['columns_name'].tolist()
column_importance_zero

[]