In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import scale
import seaborn

In [2]:
# Read the prepared match table from disk and preview its first five rows.
data = pd.read_csv(filepath_or_buffer='final_data.csv')
data.head(n=5)

Unnamed: 0.1,Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTGS,ATGS,HTGC,...,HTLossStreak5,ATWinStreak3,ATWinStreak5,ATLossStreak3,ATLossStreak5,HTGD,ATGD,DiffPts,DiffFormPts,DiffLP
0,0,2000-08-19,Charlton,Man City,4,0,H,0,0,0,...,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0
1,1,2000-08-19,Chelsea,West Ham,4,2,H,0,0,0,...,0,0,0,0,0,0.0,0.0,0.0,0.0,-4.0
2,2,2000-08-19,Coventry,Middlesbrough,1,3,NH,0,0,0,...,0,0,0,0,0,0.0,0.0,0.0,0.0,2.0
3,3,2000-08-19,Derby,Southampton,2,2,NH,0,0,0,...,0,0,0,0,0,0.0,0.0,0.0,0.0,1.0
4,4,2000-08-19,Leeds,Everton,2,0,H,0,0,0,...,0,0,0,0,0,0.0,0.0,0.0,0.0,-10.0


In [3]:
# Headline statistics of the raw match table.
n_matches = data.shape[0]
# Every column but one is counted as a feature here.
n_features = data.shape[1] - 1
n_homewins = len(data[data.FTR == 'H'])
home_winrate = float(n_homewins) / n_matches * 100
print("Total number of matches: {}".format(n_matches))
print("Total number of features: {}".format(n_features))
print("Number of matches won by home team: {}".format(n_homewins))
print("Win rate of home team: {:.2f}%".format(home_winrate))

# Keep only the modelling columns; FTR is the label, everything else is a feature.
df = pd.DataFrame(data, columns=['FTHG','FTAG','HM1','HM2','HM3','AM1','AM2','AM3','HTGD','ATGD','FTR','DiffFormPts','DiffLP'])
X_all = df.drop(columns=['FTR'])
y_all = df['FTR']

# NOTE(review): FTHG/FTAG appear to be the final goal counts of the very match
# whose result (FTR) is being predicted. If so, they leak the label into the
# features and the scores below are not meaningful -- confirm and consider
# dropping them from the column list above.
# NOTE(review): scale() is fitted on all rows before the train/test split, so
# the test rows influence the scaling statistics; consider fitting a scaler on
# the training rows only.
# Standardise the numeric columns in a single call (scale() treats each column
# independently).
cols = ['HTGD', 'ATGD', 'FTHG', 'FTAG', 'DiffLP']
X_all[cols] = scale(X_all[cols])

Total number of matches: 6080
Total number of features: 42
Number of matches won by home team: 2816
Win rate of home team: 46.32%


In [4]:
# Cast each of the six HM*/AM* columns to str, one column at a time.
for form_col in ('HM1', 'HM2', 'HM3', 'AM1', 'AM2', 'AM3'):
    X_all[form_col] = X_all[form_col].astype('str')

def preprocess_features(X):
    ''' One-hot encodes the object-dtype columns of X, leaving other columns as they are.

    Columns are processed in their original order. An object-dtype column is
    replaced by its pd.get_dummies() expansion, whose column names are
    prefixed with the source column name (e.g. "HM1_W"); any other column is
    copied through unchanged.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table to encode. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame on the same index as X holding the encoded columns.
    '''
    # Start from an empty frame on X's index and attach columns one by one.
    output = pd.DataFrame(index = X.index)
    # DataFrame.iteritems() was removed in pandas 2.0; items() yields the same
    # (column name, Series) pairs on old and new pandas alike.
    for col, col_data in X.items():
        if col_data.dtype == object:
            col_data = pd.get_dummies(col_data, prefix = col)
        output = output.join(col_data)
    return output
   
# Swap the raw feature frame for its encoded version, then report the
# resulting column names and how many there are.
X_all = preprocess_features(X_all)
feature_names = list(X_all.columns)
print("Processed feature columns ({} total features):\n{}".format(len(feature_names), feature_names))


Processed feature columns (30 total features):
['FTHG', 'FTAG', 'HM1_D', 'HM1_L', 'HM1_M', 'HM1_W', 'HM2_D', 'HM2_L', 'HM2_M', 'HM2_W', 'HM3_D', 'HM3_L', 'HM3_M', 'HM3_W', 'AM1_D', 'AM1_L', 'AM1_M', 'AM1_W', 'AM2_D', 'AM2_L', 'AM2_M', 'AM2_W', 'AM3_D', 'AM3_L', 'AM3_M', 'AM3_W', 'HTGD', 'ATGD', 'DiffFormPts', 'DiffLP']


In [5]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score,classification_report, confusion_matrix
# Stratified hold-out split; the fixed random_state makes it repeatable.
# NOTE(review): test_size is an absolute row count here. With the 6080 matches
# reported above, 6060 test rows leave only 20 rows for training -- confirm
# this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X_all,
    y_all,
    test_size=6060,
    random_state=42,
    stratify=y_all,
)

In [6]:
from time import time
from sklearn.metrics import f1_score,classification_report, confusion_matrix

def train_classifier(clf, X_train, y_train):
    ''' Fit ``clf`` on the given training data and print how long fitting took.

    The classifier is fitted in place via ``clf.fit``; nothing is returned.
    '''
    began = time()
    clf.fit(X_train, y_train)
    elapsed = time() - began
    print("Trained model in {:.4f} seconds".format(elapsed))
    
def predict_labels(clf, features, target):
    ''' Predict with an already fitted classifier and score the predictions.

    Prints the time spent in ``clf.predict`` and returns a 2-tuple:
    the F1 score computed with "H" as the positive label, and the fraction
    of predictions equal to ``target``.
    '''
    began = time()
    predicted = clf.predict(features)
    elapsed = time() - began
    # Report the timing, then hand back both scores
    print("Made predictions in {:.4f} seconds.".format(elapsed))
    f1 = f1_score(target, predicted, pos_label="H")
    accuracy = sum(target == predicted) / float(len(predicted))
    return f1, accuracy

def train_predict(clf, X_train, y_train, X_test, y_test):
    ''' Fit a classifier, then print its F1 and accuracy on the training and test sets. '''
    model_name = clf.__class__.__name__
    print("Training a {} using a training set size of {}. . .".format(model_name, len(X_train)))
    train_classifier(clf, X_train, y_train)

    # Score on the data the model was fitted on first ...
    train_f1, train_acc = predict_labels(clf, X_train, y_train)
    print("F1 score and accuracy score for training set: {:.4f} , {:.4f}.".format(train_f1, train_acc))

    # ... then on the held-out rows.
    test_f1, test_acc = predict_labels(clf, X_test, y_test)
    print("F1 score and accuracy score for test set: {:.4f} , {:.4f}.".format(test_f1, test_acc))

In [7]:
# Three candidate models, each with its own fixed seed.
clf_A = LogisticRegression(random_state = 42)
clf_B = SVC(random_state = 912, kernel='rbf',gamma='auto')
clf_C = xgb.XGBClassifier(seed = 82)

# Fit and score every candidate on the same split, with a blank line after
# each report.
for candidate in (clf_A, clf_B, clf_C):
    train_predict(candidate, X_train, y_train, X_test, y_test)
    print('')

Training a LogisticRegression using a training set size of 20. . .
Trained model in 0.0054 seconds
Made predictions in 0.0019 seconds.
F1 score and accuracy score for training set: 1.0000 , 1.0000.
Made predictions in 0.0079 seconds.
F1 score and accuracy score for test set: 0.8400 , 0.8536.

Training a SVC using a training set size of 20. . .
Trained model in 0.0028 seconds
Made predictions in 0.0022 seconds.
F1 score and accuracy score for training set: 0.9412 , 0.9500.
Made predictions in 0.0138 seconds.
F1 score and accuracy score for test set: 0.7642 , 0.8087.

Training a XGBClassifier using a training set size of 20. . .
Trained model in 0.0215 seconds
Made predictions in 0.0017 seconds.
F1 score and accuracy score for training set: 1.0000 , 1.0000.




Made predictions in 0.0148 seconds.
F1 score and accuracy score for test set: 0.7409 , 0.7517.

