In [1]:
## Importing required libraries
import sqlite3
import pandas as pd
import numpy as np
import seaborn as sns
import itertools
import matplotlib.pyplot as plt
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier 
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import linear_model
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report, accuracy_score
from sklearn.calibration import CalibratedClassifierCV
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.metrics import make_scorer
from time import time
from sklearn.decomposition import PCA, FastICA
from sklearn.pipeline import Pipeline
import warnings

warnings.simplefilter("ignore")

In [2]:
database = 'database.sqlite'
conn = sqlite3.connect(database)

In [3]:
#Fetching required data tables
country_data = pd.read_sql("SELECT * FROM Country;", conn)
league_data = pd.read_sql("SELECT * FROM League;", conn)
match_data = pd.read_sql("SELECT * FROM Match;", conn)
player_data = pd.read_sql("SELECT * FROM Player;", conn)
player_attr_data = pd.read_sql("SELECT * FROM Player_Attributes;", conn)
team_data = pd.read_sql("SELECT * FROM Team;", conn)
team_attr_data = pd.read_sql("SELECT * FROM Team_Attributes;", conn)

In [4]:
match_2015_season = match_data[match_data['season'] == '2015/2016']
epl_2015_season = match_2015_season[match_2015_season['country_id'] == 1729]

In [5]:
epl_2015_season

Unnamed: 0,id,country_id,league_id,season,stage,date,match_api_id,home_team_api_id,away_team_api_id,home_team_goal,...,SJA,VCH,VCD,VCA,GBH,GBD,GBA,BSH,BSD,BSA
4388,4389,1729,1729,2015/2016,1,2015-08-09 00:00:00,1987032,9825,8654,0,...,,1.30,5.75,12.00,,,,,,
4389,4390,1729,1729,2015/2016,1,2015-08-08 00:00:00,1987033,8678,10252,0,...,,2.00,3.50,4.20,,,,,,
4390,4391,1729,1729,2015/2016,1,2015-08-08 00:00:00,1987034,8455,10003,2,...,,1.40,5.00,9.50,,,,,,
4391,4392,1729,1729,2015/2016,1,2015-08-08 00:00:00,1987035,8668,9817,2,...,,1.73,3.90,5.40,,,,,,
4392,4393,1729,1729,2015/2016,1,2015-08-08 00:00:00,1987036,8197,8472,4,...,,2.00,3.40,4.33,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4763,4764,1729,1729,2015/2016,9,2015-10-17 00:00:00,1988795,8466,8197,2,...,,1.75,3.90,5.00,,,,,,
4764,4765,1729,1729,2015/2016,9,2015-10-19 00:00:00,1988796,10003,10194,0,...,,2.05,3.40,4.10,,,,,,
4765,4766,1729,1729,2015/2016,9,2015-10-17 00:00:00,1988797,8586,8650,0,...,,2.45,3.50,3.00,,,,,,
4766,4767,1729,1729,2015/2016,9,2015-10-17 00:00:00,1988798,9817,9825,0,...,,6.25,4.20,1.60,,,,,,


In [6]:
season_sliced_columns = ['season', 'match_api_id', 'home_team_api_id', 'away_team_api_id', 'home_team_goal', 'away_team_goal',\
                 'home_player_1', 'home_player_2','home_player_3', 'home_player_4', 'home_player_5', 'home_player_6',\
                 'home_player_7', 'home_player_8', 'home_player_9','home_player_10', 'home_player_11', 'away_player_1',\
                 'away_player_2', 'away_player_3', 'away_player_4', 'away_player_5','away_player_6', 'away_player_7',\
                 'away_player_8', 'away_player_9','away_player_10', 'away_player_11']

In [7]:
epl_2015_season_sliced = epl_2015_season[season_sliced_columns]

In [8]:
epl_2015_season_sliced

Unnamed: 0,season,match_api_id,home_team_api_id,away_team_api_id,home_team_goal,away_team_goal,home_player_1,home_player_2,home_player_3,home_player_4,...,away_player_2,away_player_3,away_player_4,away_player_5,away_player_6,away_player_7,away_player_8,away_player_9,away_player_10,away_player_11
4388,2015/2016,1987032,9825,8654,0,2,30859.0,26154.0,35606.0,46539.0,...,109897.0,35110.0,49543.0,155782.0,37169.0,575789.0,148302.0,25496.0,18506.0,192899.0
4389,2015/2016,1987033,8678,10252,0,1,30974.0,35515.0,24625.0,156013.0,...,183500.0,24208.0,161414.0,473853.0,261313.0,179410.0,182223.0,23991.0,154280.0,23264.0
4390,2015/2016,1987034,8455,10003,2,2,170323.0,31306.0,23783.0,30627.0,...,111800.0,155050.0,24948.0,102356.0,127130.0,144996.0,95955.0,157729.0,52563.0,26344.0
4391,2015/2016,1987035,8668,9817,2,2,31465.0,77690.0,263653.0,23268.0,...,213809.0,41927.0,40548.0,35712.0,41365.0,30966.0,24915.0,37411.0,71724.0,72436.0
4392,2015/2016,1987036,8197,8472,4,2,37770.0,67850.0,38899.0,23571.0,...,22964.0,26108.0,165526.0,180330.0,35443.0,25150.0,109058.0,24159.0,30348.0,42598.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4763,2015/2016,1988795,8466,8197,2,2,35496.0,160627.0,26552.0,209405.0,...,49571.0,23571.0,38899.0,43061.0,139671.0,173317.0,319300.0,214570.0,20694.0,286119.0
4764,2015/2016,1988796,10003,10194,0,1,30973.0,89185.0,155050.0,24948.0,...,34036.0,119541.0,200962.0,37194.0,23253.0,39109.0,176300.0,96643.0,110148.0,172321.0
4765,2015/2016,1988797,8586,8650,0,0,26295.0,159833.0,160599.0,37762.0,...,157838.0,22764.0,94043.0,314605.0,95327.0,307021.0,38807.0,184536.0,37234.0,426202.0
4766,2015/2016,1988798,9817,9825,0,3,30455.0,213809.0,41927.0,40548.0,...,427438.0,35606.0,46539.0,38521.0,159594.0,37436.0,75489.0,36378.0,50047.0,31013.0


### We want to handle 2015/16 season

In [9]:
starting_date = '2015-09-01'
ending_date = '2016-01-01'

In [10]:
higher_up = player_attr_data[player_attr_data['date'] > starting_date]
player_overall_df = higher_up[higher_up['date'] <  ending_date].drop_duplicates(subset = ["player_api_id"])


## currently we are only concerned with overall_rating

player_overall_df = player_overall_df[['player_api_id', 'overall_rating']]

In [11]:
player_overall_df

Unnamed: 0,player_api_id,overall_rating
1,505942,67.0
8,155782,73.0
39,162549,66.0
65,30572,69.0
87,23780,70.0
...,...,...
183823,107281,73.0
183856,491794,58.0
183878,99031,80.0
183896,192132,64.0


## Join Match Data and Player Overall

In [12]:
experimenting_df = epl_2015_season_sliced

In [13]:
experimenting_df

Unnamed: 0,season,match_api_id,home_team_api_id,away_team_api_id,home_team_goal,away_team_goal,home_player_1,home_player_2,home_player_3,home_player_4,...,away_player_2,away_player_3,away_player_4,away_player_5,away_player_6,away_player_7,away_player_8,away_player_9,away_player_10,away_player_11
4388,2015/2016,1987032,9825,8654,0,2,30859.0,26154.0,35606.0,46539.0,...,109897.0,35110.0,49543.0,155782.0,37169.0,575789.0,148302.0,25496.0,18506.0,192899.0
4389,2015/2016,1987033,8678,10252,0,1,30974.0,35515.0,24625.0,156013.0,...,183500.0,24208.0,161414.0,473853.0,261313.0,179410.0,182223.0,23991.0,154280.0,23264.0
4390,2015/2016,1987034,8455,10003,2,2,170323.0,31306.0,23783.0,30627.0,...,111800.0,155050.0,24948.0,102356.0,127130.0,144996.0,95955.0,157729.0,52563.0,26344.0
4391,2015/2016,1987035,8668,9817,2,2,31465.0,77690.0,263653.0,23268.0,...,213809.0,41927.0,40548.0,35712.0,41365.0,30966.0,24915.0,37411.0,71724.0,72436.0
4392,2015/2016,1987036,8197,8472,4,2,37770.0,67850.0,38899.0,23571.0,...,22964.0,26108.0,165526.0,180330.0,35443.0,25150.0,109058.0,24159.0,30348.0,42598.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4763,2015/2016,1988795,8466,8197,2,2,35496.0,160627.0,26552.0,209405.0,...,49571.0,23571.0,38899.0,43061.0,139671.0,173317.0,319300.0,214570.0,20694.0,286119.0
4764,2015/2016,1988796,10003,10194,0,1,30973.0,89185.0,155050.0,24948.0,...,34036.0,119541.0,200962.0,37194.0,23253.0,39109.0,176300.0,96643.0,110148.0,172321.0
4765,2015/2016,1988797,8586,8650,0,0,26295.0,159833.0,160599.0,37762.0,...,157838.0,22764.0,94043.0,314605.0,95327.0,307021.0,38807.0,184536.0,37234.0,426202.0
4766,2015/2016,1988798,9817,9825,0,3,30455.0,213809.0,41927.0,40548.0,...,427438.0,35606.0,46539.0,38521.0,159594.0,37436.0,75489.0,36378.0,50047.0,31013.0


In [14]:
columns_to_loop = ['home_player_1', 'home_player_2','home_player_3', 'home_player_4', 'home_player_5', 'home_player_6',\
                 'home_player_7', 'home_player_8', 'home_player_9','home_player_10', 'home_player_11', 'away_player_1',\
                 'away_player_2', 'away_player_3', 'away_player_4', 'away_player_5','away_player_6', 'away_player_7',\
                 'away_player_8', 'away_player_9','away_player_10', 'away_player_11']

In [15]:
column_names_overall = ['home_player_1_overall', 'home_player_2_overall','home_player_3_overall', 'home_player_4_overall', 'home_player_5_overall', 'home_player_6_overall',\
                 'home_player_7_overall', 'home_player_8_overall', 'home_player_9_overall','home_player_10_overall', 'home_player_11_overall', 'away_player_1_overall',\
                 'away_player_2_overall', 'away_player_3_overall', 'away_player_4_overall', 'away_player_5_overall','away_player_6_overall', 'away_player_7_overall',\
                 'away_player_8_overall', 'away_player_9_overall','away_player_10_overall', 'away_player_11_overall']

In [16]:
experimenting_df.columns

Index(['season', 'match_api_id', 'home_team_api_id', 'away_team_api_id',
       'home_team_goal', 'away_team_goal', 'home_player_1', 'home_player_2',
       'home_player_3', 'home_player_4', 'home_player_5', 'home_player_6',
       'home_player_7', 'home_player_8', 'home_player_9', 'home_player_10',
       'home_player_11', 'away_player_1', 'away_player_2', 'away_player_3',
       'away_player_4', 'away_player_5', 'away_player_6', 'away_player_7',
       'away_player_8', 'away_player_9', 'away_player_10', 'away_player_11'],
      dtype='object')

In [17]:
for i in range(len(column_names_overall)):
    experimenting_df = experimenting_df.merge(player_overall_df,\
                                left_on= columns_to_loop[i], right_on='player_api_id', how='left', suffixes=('_1', '_2'))\
                                .rename(columns={"overall_rating": column_names_overall[i]})

In [18]:
experimenting_df.columns

Index(['season', 'match_api_id', 'home_team_api_id', 'away_team_api_id',
       'home_team_goal', 'away_team_goal', 'home_player_1', 'home_player_2',
       'home_player_3', 'home_player_4', 'home_player_5', 'home_player_6',
       'home_player_7', 'home_player_8', 'home_player_9', 'home_player_10',
       'home_player_11', 'away_player_1', 'away_player_2', 'away_player_3',
       'away_player_4', 'away_player_5', 'away_player_6', 'away_player_7',
       'away_player_8', 'away_player_9', 'away_player_10', 'away_player_11',
       'player_api_id_1', 'home_player_1_overall', 'player_api_id_2',
       'home_player_2_overall', 'player_api_id_1', 'home_player_3_overall',
       'player_api_id_2', 'home_player_4_overall', 'player_api_id_1',
       'home_player_5_overall', 'player_api_id_2', 'home_player_6_overall',
       'player_api_id_1', 'home_player_7_overall', 'player_api_id_2',
       'home_player_8_overall', 'player_api_id_1', 'home_player_9_overall',
       'player_api_id_2', 'home_p

### Create WIN, LOSE, DRAW labels

In [19]:
conditions = [
    (experimenting_df['home_team_goal'] > experimenting_df['away_team_goal']),
    (experimenting_df['home_team_goal'] == experimenting_df['away_team_goal']),
    (experimenting_df['home_team_goal'] < experimenting_df['away_team_goal']),
    ]

conditions_values = [3.0, 1.0, 0.0]

experimenting_df['results'] = np.select(conditions, conditions_values)


In [21]:
np.unique(experimenting_df['results'])

array([0., 1., 3.])

### Obtain Final Prepared Dataset

In [22]:
final_df_columns = ['home_player_1_overall', 'home_player_2_overall','home_player_3_overall', 'home_player_4_overall', 'home_player_5_overall', 'home_player_6_overall',\
                 'home_player_7_overall', 'home_player_8_overall', 'home_player_9_overall','home_player_10_overall', 'home_player_11_overall', 'away_player_1_overall',\
                 'away_player_2_overall', 'away_player_3_overall', 'away_player_4_overall', 'away_player_5_overall','away_player_6_overall', 'away_player_7_overall',\
                 'away_player_8_overall', 'away_player_9_overall','away_player_10_overall', 'away_player_11_overall', 'results']

In [23]:
prepared_df = experimenting_df[final_df_columns]

In [24]:
prepared_df

Unnamed: 0,home_player_1_overall,home_player_2_overall,home_player_3_overall,home_player_4_overall,home_player_5_overall,home_player_6_overall,home_player_7_overall,home_player_8_overall,home_player_9_overall,home_player_10_overall,...,away_player_3_overall,away_player_4_overall,away_player_5_overall,away_player_6_overall,away_player_7_overall,away_player_8_overall,away_player_9_overall,away_player_10_overall,away_player_11_overall,results
0,85.0,79.0,83.0,83.0,80.0,79.0,85.0,82.0,87.0,79.0,...,79.0,79.0,73.0,76.0,62.0,75.0,81.0,76.0,75.0,0.0
1,76.0,72.0,71.0,72.0,71.0,74.0,71.0,73.0,71.0,69.0,...,79.0,76.0,75.0,76.0,75.0,74.0,76.0,76.0,74.0,0.0
2,86.0,80.0,83.0,85.0,82.0,84.0,87.0,82.0,84.0,89.0,...,78.0,81.0,74.0,77.0,77.0,79.0,78.0,77.0,78.0,1.0
3,80.0,81.0,77.0,81.0,66.0,79.0,80.0,80.0,77.0,80.0,...,76.0,71.0,77.0,77.0,76.0,66.0,75.0,77.0,75.0,1.0
4,76.0,72.0,78.0,73.0,71.0,70.0,72.0,68.0,75.0,78.0,...,78.0,73.0,74.0,74.0,77.0,75.0,78.0,79.0,79.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
375,71.0,78.0,81.0,77.0,78.0,77.0,76.0,78.0,75.0,79.0,...,73.0,78.0,74.0,71.0,72.0,76.0,68.0,78.0,75.0,1.0
376,78.0,76.0,78.0,81.0,74.0,77.0,77.0,64.0,79.0,77.0,...,76.0,76.0,74.0,76.0,76.0,80.0,80.0,79.0,75.0,0.0
377,85.0,79.0,81.0,82.0,77.0,72.0,80.0,80.0,83.0,78.0,...,81.0,81.0,77.0,79.0,75.0,80.0,84.0,80.0,76.0,1.0
378,75.0,77.0,76.0,71.0,69.0,77.0,71.0,73.0,75.0,66.0,...,83.0,83.0,80.0,79.0,85.0,82.0,87.0,86.0,81.0,0.0


In [27]:
prepared_df = prepared_df.dropna()

In [28]:
prepared_df

Unnamed: 0,home_player_1_overall,home_player_2_overall,home_player_3_overall,home_player_4_overall,home_player_5_overall,home_player_6_overall,home_player_7_overall,home_player_8_overall,home_player_9_overall,home_player_10_overall,...,away_player_3_overall,away_player_4_overall,away_player_5_overall,away_player_6_overall,away_player_7_overall,away_player_8_overall,away_player_9_overall,away_player_10_overall,away_player_11_overall,results
0,85.0,79.0,83.0,83.0,80.0,79.0,85.0,82.0,87.0,79.0,...,79.0,79.0,73.0,76.0,62.0,75.0,81.0,76.0,75.0,0.0
1,76.0,72.0,71.0,72.0,71.0,74.0,71.0,73.0,71.0,69.0,...,79.0,76.0,75.0,76.0,75.0,74.0,76.0,76.0,74.0,0.0
2,86.0,80.0,83.0,85.0,82.0,84.0,87.0,82.0,84.0,89.0,...,78.0,81.0,74.0,77.0,77.0,79.0,78.0,77.0,78.0,1.0
3,80.0,81.0,77.0,81.0,66.0,79.0,80.0,80.0,77.0,80.0,...,76.0,71.0,77.0,77.0,76.0,66.0,75.0,77.0,75.0,1.0
4,76.0,72.0,78.0,73.0,71.0,70.0,72.0,68.0,75.0,78.0,...,78.0,73.0,74.0,74.0,77.0,75.0,78.0,79.0,79.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
375,71.0,78.0,81.0,77.0,78.0,77.0,76.0,78.0,75.0,79.0,...,73.0,78.0,74.0,71.0,72.0,76.0,68.0,78.0,75.0,1.0
376,78.0,76.0,78.0,81.0,74.0,77.0,77.0,64.0,79.0,77.0,...,76.0,76.0,74.0,76.0,76.0,80.0,80.0,79.0,75.0,0.0
377,85.0,79.0,81.0,82.0,77.0,72.0,80.0,80.0,83.0,78.0,...,81.0,81.0,77.0,79.0,75.0,80.0,84.0,80.0,76.0,1.0
378,75.0,77.0,76.0,71.0,69.0,77.0,71.0,73.0,75.0,66.0,...,83.0,83.0,80.0,79.0,85.0,82.0,87.0,86.0,81.0,0.0


In [29]:
np.any(np.isnan(prepared_df))

False

### Train the Model (Logistic Regression)

In [30]:
features = prepared_df.drop('results', axis=1)

In [31]:
features

Unnamed: 0,home_player_1_overall,home_player_2_overall,home_player_3_overall,home_player_4_overall,home_player_5_overall,home_player_6_overall,home_player_7_overall,home_player_8_overall,home_player_9_overall,home_player_10_overall,...,away_player_2_overall,away_player_3_overall,away_player_4_overall,away_player_5_overall,away_player_6_overall,away_player_7_overall,away_player_8_overall,away_player_9_overall,away_player_10_overall,away_player_11_overall
0,85.0,79.0,83.0,83.0,80.0,79.0,85.0,82.0,87.0,79.0,...,73.0,79.0,79.0,73.0,76.0,62.0,75.0,81.0,76.0,75.0
1,76.0,72.0,71.0,72.0,71.0,74.0,71.0,73.0,71.0,69.0,...,73.0,79.0,76.0,75.0,76.0,75.0,74.0,76.0,76.0,74.0
2,86.0,80.0,83.0,85.0,82.0,84.0,87.0,82.0,84.0,89.0,...,74.0,78.0,81.0,74.0,77.0,77.0,79.0,78.0,77.0,78.0
3,80.0,81.0,77.0,81.0,66.0,79.0,80.0,80.0,77.0,80.0,...,77.0,76.0,71.0,77.0,77.0,76.0,66.0,75.0,77.0,75.0
4,76.0,72.0,78.0,73.0,71.0,70.0,72.0,68.0,75.0,78.0,...,72.0,78.0,73.0,74.0,74.0,77.0,75.0,78.0,79.0,79.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
375,71.0,78.0,81.0,77.0,78.0,77.0,76.0,78.0,75.0,79.0,...,71.0,73.0,78.0,74.0,71.0,72.0,76.0,68.0,78.0,75.0
376,78.0,76.0,78.0,81.0,74.0,77.0,77.0,64.0,79.0,77.0,...,77.0,76.0,76.0,74.0,76.0,76.0,80.0,80.0,79.0,75.0
377,85.0,79.0,81.0,82.0,77.0,72.0,80.0,80.0,83.0,78.0,...,80.0,81.0,81.0,77.0,79.0,75.0,80.0,84.0,80.0,76.0
378,75.0,77.0,76.0,71.0,69.0,77.0,71.0,73.0,75.0,66.0,...,76.0,83.0,83.0,80.0,79.0,85.0,82.0,87.0,86.0,81.0


In [32]:
target = prepared_df['results']

In [35]:
logisticRegr = LogisticRegression()

knn = KNeighborsClassifier()


In [36]:
x_train, x_test, y_train, y_test = train_test_split(features, target, test_size=0.25, random_state=0)

In [37]:
y_train

142    1.0
278    0.0
66     3.0
246    3.0
12     0.0
      ... 
351    3.0
200    0.0
117    1.0
47     3.0
174    0.0
Name: results, Length: 264, dtype: float64

In [39]:
logisticRegr.fit(x_train, y_train)

LogisticRegression()

In [40]:
y_pred = logisticRegr.predict(x_test)

### Accuracy of Logistic Regression

In [43]:
accuracy = accuracy_score(y_test, y_pred)
accuracy

In [45]:
logisticRegr.score(x_train,y_train)

0.5340909090909091

In [46]:
logisticRegr_ovr = LogisticRegression(multi_class = "ovr", solver="newton-cg")

In [47]:
logisticRegr_ovr.fit(x_train, y_train)

LogisticRegression(multi_class='ovr', solver='newton-cg')

In [48]:
logisticRegr_ovr.score(x_train, y_train)

0.5189393939393939

In [49]:
y_pred_ovr = logisticRegr_ovr.predict(x_test)

In [50]:
accuracy = accuracy_score(y_test, y_pred_ovr)

In [51]:
accuracy

0.4431818181818182

In [52]:
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import roc_auc_score


### Confusion Matrix for Logisitic Regression with One vs Rest

In [53]:
report = confusion_matrix(y_test, y_pred_ovr)

In [54]:
report

array([[ 6,  3, 19],
       [ 7,  9, 12],
       [ 6,  2, 24]])

### We found that best logistic regression score is 44% training score is 52%

In [55]:
knn = KNeighborsClassifier()

In [56]:
knn.fit(x_train, y_train)

KNeighborsClassifier()

In [57]:
y_pred_knn = knn.predict(x_test)

In [59]:
knn.score(x_train, y_train)

0.571969696969697

In [60]:
accuracy_score_knn = accuracy_score(y_test, y_pred_knn)

In [61]:
accuracy_score_knn

0.3409090909090909

### We found that best knn score is 34% training score is 57% Very prone to overfitting

In [62]:
gauss_nb = GaussianNB()

In [63]:
gauss_nb.fit(x_train, y_train)

GaussianNB()

In [64]:
gauss_nb.score(x_train, y_train)

0.5265151515151515

In [65]:
y_pred_gaussian_nb = gauss_nb.predict(x_test)

In [66]:
accuracy_gaussian_nb = accuracy_score(y_test, y_pred_gaussian_nb)

In [67]:
accuracy_gaussian_nb

0.375

### Best Naive Gaussian score is 37.5% training score is 52%

In [68]:
rand_forest = RandomForestClassifier()

In [69]:
rand_forest.fit(x_train, y_train)

RandomForestClassifier()

In [70]:
y_pred_rand_forest = rand_forest.predict(x_test)

In [71]:
accuracy_score_rand_forest = accuracy_score(y_test, y_pred_rand_forest)

In [72]:
accuracy_score_rand_forest

0.36363636363636365

### Hyperparameter Tuning for Random Forest

In [74]:
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)
{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}

In [75]:
rf_random = RandomizedSearchCV(estimator = rand_forest, param_distributions = random_grid,\
                               n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)

In [76]:
rf_random.fit(x_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]},
                   random_state=42, verbose=2)

In [77]:
rf_random.best_params_

{'n_estimators': 400,
 'min_samples_split': 10,
 'min_samples_leaf': 4,
 'max_features': 'sqrt',
 'max_depth': 90,
 'bootstrap': True}

In [83]:
best_random = rf_random.best_estimator_

In [84]:
best_random.fit(x_train, y_train)

RandomForestClassifier(max_depth=90, max_features='sqrt', min_samples_leaf=4,
                       min_samples_split=10, n_estimators=400)

In [85]:
best_random.score(x_train, y_train)

0.8977272727272727

In [86]:
y_pred_best_rand = best_random.predict(x_test)

In [87]:
accuracy_score_best_rand_pred = accuracy_score(y_test, y_pred_best_rand)
accuracy_score_best_rand_pred

0.42045454545454547

### Grid Search and Cross Validation for hyperparameter tuning

In [88]:
from sklearn.model_selection import GridSearchCV

# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [True],
    'max_depth': [80, 90, 100, 110],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300, 1000]
}
# Create a based model
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rand_forest, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)

In [89]:
grid_search.fit(x_train, y_train)

Fitting 3 folds for each of 144 candidates, totalling 432 fits


GridSearchCV(cv=3, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'bootstrap': [True], 'max_depth': [80, 90, 100, 110],
                         'min_samples_leaf': [3, 4, 5],
                         'min_samples_split': [8, 10, 12],
                         'n_estimators': [100, 200, 300, 1000]},
             verbose=2)

In [90]:
grid_search.best_estimator_

RandomForestClassifier(max_depth=90, min_samples_leaf=3, min_samples_split=12,
                       n_estimators=200)

In [91]:
grid_search_rand_forest = grid_search.best_estimator_

In [92]:
grid_search_rand_forest.fit(x_train, y_train)

RandomForestClassifier(max_depth=90, min_samples_leaf=3, min_samples_split=12,
                       n_estimators=200)

In [93]:
y_pred_grid_search_rand_forest = grid_search_rand_forest.predict(x_test)

In [94]:
accuracy_score_grid_search_rand_forest = accuracy_score(y_test, y_pred_grid_search_rand_forest)
accuracy_score_grid_search_rand_forest

0.42045454545454547

### Decision tree classifier

In [95]:
from sklearn.tree import DecisionTreeClassifier

decision_tree = DecisionTreeClassifier()

In [96]:
decision_tree.fit(x_train, y_train)

DecisionTreeClassifier()

In [97]:
decision_tree.score(x_train, y_train)

1.0

In [98]:
pred_y_decision_tree = decision_tree.predict(x_test)

In [99]:
accuracy_score_decision_tree = accuracy_score(y_test, pred_y_decision_tree)
accuracy_score_decision_tree

0.32954545454545453

### Gradient Boosting

In [100]:
from sklearn.ensemble import GradientBoostingClassifier

In [101]:
grad_boosting = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=5, random_state=0)

In [102]:
grad_boosting.fit(x_train, y_train)

GradientBoostingClassifier(learning_rate=1.0, max_depth=5, random_state=0)

In [103]:
y_pred_grad_boosting = grad_boosting.predict(x_test)

In [104]:
accuracy_score_grad_boosting = accuracy_score(y_test, y_pred_grad_boosting)
accuracy_score_grad_boosting

0.42045454545454547

### Adaboosting

In [105]:
ada_boosting = AdaBoostClassifier()

In [106]:
ada_boosting.fit(x_train, y_train)

AdaBoostClassifier()

In [107]:
y_pred_ada_boosting = ada_boosting.predict(x_test)

In [108]:
accuracy_score_ada_boosting = accuracy_score(y_test, y_pred_ada_boosting)
accuracy_score_ada_boosting

0.38636363636363635

### Prediction of a match between Best Team vs EPL Winner

In [120]:
import csv
with open('best_team_overall.csv', newline='') as csvfile:
    reader = csv.reader(csvfile, delimiter = ' ', quotechar = '|')
    for row in reader:
        overall_data = row

In [151]:
overall_best_team = np.array([float(s) for s in overall_data[0].split(',')])
overall_best_team

array([90., 86., 84., 82., 81., 94., 89., 87., 93., 90., 88.])

In [152]:
overall_best_team

array([90., 86., 84., 82., 81., 94., 89., 87., 93., 90., 88.])

In [130]:
x_test.columns

Index(['home_player_1_overall', 'home_player_2_overall',
       'home_player_3_overall', 'home_player_4_overall',
       'home_player_5_overall', 'home_player_6_overall',
       'home_player_7_overall', 'home_player_8_overall',
       'home_player_9_overall', 'home_player_10_overall',
       'home_player_11_overall', 'away_player_1_overall',
       'away_player_2_overall', 'away_player_3_overall',
       'away_player_4_overall', 'away_player_5_overall',
       'away_player_6_overall', 'away_player_7_overall',
       'away_player_8_overall', 'away_player_9_overall',
       'away_player_10_overall', 'away_player_11_overall'],
      dtype='object')

### We simulate the result of the best team against EPL giant Chelsea

In [173]:
opponent_team_overall

array([86., 80., 78., 85., 82., 77., 87., 82., 84., 89., 85.])

In [172]:
#Chelsea Players Overall
opponent_team_overall = prepared_df.loc[121].values[11:22]

In [179]:
overall_best_team

array([90., 86., 84., 82., 81., 94., 89., 87., 93., 90., 88.])

In [174]:
custom_prediction = np.append(overall_best_team, opponent_team_overall)

In [175]:
custom_prediction

array([90., 86., 84., 82., 81., 94., 89., 87., 93., 90., 88., 86., 80.,
       78., 85., 82., 77., 87., 82., 84., 89., 85.])

In [176]:
testing_df = pd.DataFrame([custom_prediction],\
                          columns=x_test.columns)

In [177]:
testing_df

Unnamed: 0,home_player_1_overall,home_player_2_overall,home_player_3_overall,home_player_4_overall,home_player_5_overall,home_player_6_overall,home_player_7_overall,home_player_8_overall,home_player_9_overall,home_player_10_overall,...,away_player_2_overall,away_player_3_overall,away_player_4_overall,away_player_5_overall,away_player_6_overall,away_player_7_overall,away_player_8_overall,away_player_9_overall,away_player_10_overall,away_player_11_overall
0,90.0,86.0,84.0,82.0,81.0,94.0,89.0,87.0,93.0,90.0,...,80.0,78.0,85.0,82.0,77.0,87.0,82.0,84.0,89.0,85.0


In [178]:
logisticRegr.predict(testing_df)

array([3.])

In [181]:
logisticRegr.predict_proba(testing_df)

array([[0.16941542, 0.09970288, 0.7308817 ]])

### The Logistic Regression model predicts that the Best Custom Team we built will win this match against Chelsea. The model shows 16.9% probability of losing 9.9% probability of drawing and 73% probability of winning the game