In [8]:
import pandas as pd
#produces a prediction model in the form of an ensemble of weak prediction models, typically decision trees
import xgboost as xgb
#the outcome (dependent variable) has only a limited number of possible values. 
#Logistic Regression is used when response variable is categorical in nature.
from sklearn.linear_model import LogisticRegression
#A random forest is a meta estimator that fits a number of decision tree classifiers 
#on various sub-samples of the dataset and use averaging to improve the predictive 
#accuracy and control over-fitting.
from sklearn.ensemble import RandomForestClassifier
#a discriminative classifier formally defined by a separating hyperplane.
from sklearn.svm import SVC
#displays data (rich notebook rendering of objects such as DataFrames)
from IPython.display import display
%matplotlib inline

In [16]:
# Load the match dataset from the CSV file into a DataFrame.
data = pd.read_csv('final_dataset_1.csv')

# Preview the first five rows as a quick sanity check.
display(data.head(5))

# Column glossary:
#   FTR         - Full Time Result (H=Home Win, D=Draw, A=Away Win)
#                 NOTE: the previewed rows carry the labels 'H' and 'NH'.
#   HTGD        - Home team goal difference
#   ATGD        - Away team goal difference
#   HTP         - Home team points
#   ATP         - Away team points
#   DiffFormPts - Difference in points
#   DiffLP      - Difference in last year's prediction

# Input  - 12 other features (fouls, shots, goals, misses, corners, red cards, yellow cards)
# Output - Full Time Result (H=Home Win, D=Draw, A=Away Win)

Unnamed: 0,HomeTeam,AwayTeam,FTR,HTP,ATP,HM1,HM2,HM3,AM1,AM2,AM3,HTGD,ATGD,DiffFormPts,DiffLP
0,Arsenal,Leicester,H,0.0,0.0,M,M,M,M,M,M,0.0,0.0,0.0,-6
1,Chelsea,Sunderland,H,0.0,0.0,M,M,M,M,M,M,0.0,0.0,0.0,-2
2,Coventry,Southampton,NH,0.0,0.0,M,M,M,M,M,M,0.0,0.0,0.0,-1
3,Leeds,Derby,NH,0.0,0.0,M,M,M,M,M,M,0.0,0.0,0.0,-13
4,Middlesbrough,Bradford,NH,0.0,0.0,M,M,M,M,M,M,0.0,0.0,0.0,-5


# Data Exploration

In [7]:
# What is the win rate for the home team?

# Each row of the dataset is one match.
n_matches = len(data)

# Every column but one counts as a feature; the remaining one is the
# target variable (the match result).
n_features = len(data.columns) - 1

# Matches whose full-time result is a home win ('H').
n_homewins = len(data.loc[data['FTR'] == 'H'])

# Share of matches won by the home team, expressed as a percentage.
win_rate = (n_homewins / float(n_matches)) * 100

# Report the summary figures.
print("Total number of matches: {}".format(n_matches))
print("Number of features: {}".format(n_features))
print("Number of matches won by home team: {}".format(n_homewins))
print("Win rate of home team: {:.2f}%".format(win_rate))

Total number of matches: 7600
Number of features: 42
Number of matches won by home team: 3544
Win rate of home team: 46.63%


# Preparation of data

In [20]:
# Separate into feature set and target variable.
# FTR = Full Time Result; it is the target, so it is removed from the features
# together with the two team-name columns.
# `columns=` is used instead of the positional axis argument (`drop(labels, 1)`),
# which pandas 2.0 and later no longer accept.
X_all = data.drop(columns=['FTR','HomeTeam','AwayTeam'])
y_all = data['FTR']

# Standardising the data.
from sklearn.preprocessing import scale

# Center to the mean and component-wise scale to unit variance.
# All listed columns are scaled in a single call; DiffFormPts is left as-is.
# NOTE(review): the scaling statistics are computed on the full dataset, i.e.
# before the train/test split performed later in the notebook.
cols = ['HTGD','ATGD','HTP','ATP','DiffLP']
X_all[cols] = scale(X_all[cols])

In [21]:
# Results of the last 3 matches for both sides (HM1-HM3 and AM1-AM3).
# Cast each of these columns to str so they are handled as categorical labels.
for form_col in ('HM1', 'HM2', 'HM3', 'AM1', 'AM2', 'AM3'):
    X_all[form_col] = X_all[form_col].astype('str')

#we want numeric (continuous or integer) input data, so let's convert any categorical vars into dummy variables
def preprocess_features(X):
    ''' Preprocesses the football data and converts categorical variables into dummy variables.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table. Columns with object or pandas string dtype are treated
        as categorical; every other column is passed through unchanged.

    Returns
    -------
    pandas.DataFrame
        A new frame on the same index as X. Each categorical column is replaced
        by one indicator column per distinct value, named "<column>_<value>";
        the original column order is preserved.
    '''

    # Start from an empty frame sharing X's index so every join aligns row-for-row.
    output = pd.DataFrame(index = X.index)

    # Walk the columns in order. `items()` replaces `iteritems()`, which was
    # removed in pandas 2.0.
    for col, col_data in X.items():

        # Text columns may be stored as object or as a pandas string dtype;
        # both are dummy-encoded.
        is_categorical = (col_data.dtype == object
                          or isinstance(col_data.dtype, pd.StringDtype))
        if is_categorical:
            col_data = pd.get_dummies(col_data, prefix = col)

        # Collect the (possibly expanded) column(s).
        output = output.join(col_data)

    return output

# Replace the feature table with its dummy-encoded version and list the resulting columns.
X_all = preprocess_features(X_all)
processed_columns = list(X_all.columns)
print ("Processed feature columns ({} total features):\n{}".format(len(processed_columns), processed_columns))

Processed feature columns (30 total features):
['HTP', 'ATP', 'HM1_D', 'HM1_L', 'HM1_M', 'HM1_W', 'HM2_D', 'HM2_L', 'HM2_M', 'HM2_W', 'HM3_D', 'HM3_L', 'HM3_M', 'HM3_W', 'AM1_D', 'AM1_L', 'AM1_M', 'AM1_W', 'AM2_D', 'AM2_L', 'AM2_M', 'AM2_W', 'AM3_D', 'AM3_L', 'AM3_M', 'AM3_W', 'HTGD', 'ATGD', 'DiffFormPts', 'DiffLP']


In [46]:
# Remove the '_M' indicator column of each of the six form features.
m_dummy_columns = ['{}_M'.format(prefix) for prefix in ('HM1', 'HM2', 'HM3', 'AM1', 'AM2', 'AM3')]
X_all = X_all.drop(columns=m_dummy_columns)

In [58]:
# Show the feature information by displaying the first 31 rows
print ("\nFeature values:")
display(X_all.head(n=31))


Feature values:


Unnamed: 0,HTP,ATP,HM1_D,HM1_L,HM1_W,HM2_D,HM2_L,HM2_W,HM3_D,HM3_L,...,AM2_D,AM2_L,AM2_W,AM3_D,AM3_L,AM3_W,HTGD,ATGD,DiffFormPts,DiffLP
0,-2.259584,-2.312939,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0.013707,-0.018471,0.0,-0.7159
1,-2.259584,-2.312939,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0.013707,-0.018471,0.0,-0.238633
2,-2.259584,-2.312939,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0.013707,-0.018471,0.0,-0.119317
3,-2.259584,-2.312939,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0.013707,-0.018471,0.0,-1.551117
4,-2.259584,-2.312939,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0.013707,-0.018471,0.0,-0.596584
5,-2.259584,-2.312939,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0.013707,-0.018471,0.0,0.596584
6,-2.259584,-2.312939,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0.013707,-0.018471,0.0,1.670434
7,-2.259584,-2.312939,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0.013707,-0.018471,0.0,0.238633
8,-2.259584,-2.312939,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0.013707,-0.018471,0.0,-0.119317
9,-2.259584,-2.312939,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0.013707,-0.018471,0.0,1.4318


In [59]:
from sklearn.model_selection import train_test_split

# Shuffle and split the dataset into training and testing sets.
# test_size is an integer, so it is an absolute number of held-out samples (25),
# not a fraction. The split is stratified on the target and seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X_all,
    y_all,
    test_size=25,
    random_state=2,
    stratify=y_all,
)

In [60]:
from time import time
from sklearn.metrics import f1_score

def train_classifier(clf, X_train, y_train):
    ''' Fits `clf` on the training data and prints how long fitting took.

    The classifier is fitted in place via `clf.fit(X_train, y_train)`;
    nothing is returned.
    '''

    # Time only the fit call itself.
    began = time()
    clf.fit(X_train, y_train)
    finished = time()

    # Report the elapsed wall-clock time.
    elapsed = finished - began
    print ("Trained model in {:.4f} seconds".format(elapsed))
    
def predict_labels(clf, features, target):
    ''' Predicts labels with a fitted classifier and scores them against `target`.

    Prints how long the `clf.predict(features)` call took, then returns a
    2-tuple: the F1 score computed with 'H' as the positive label, and the
    fraction of predictions that equal the corresponding entry of `target`.
    '''

    # Time only the prediction call.
    began = time()
    y_pred = clf.predict(features)
    finished = time()

    # Report the elapsed wall-clock time.
    print ("Made predictions in {:.4f} seconds.".format(finished - began))

    # F1 for the 'H' class first, then plain accuracy (matches / predictions).
    f1 = f1_score(target, y_pred, pos_label='H')
    n_correct = sum(target == y_pred)
    accuracy = n_correct / float(len(y_pred))
    return f1, accuracy


def train_predict(clf, X_train, y_train, X_test, y_test):
    ''' Trains a classifier, then reports its F1 score and accuracy on both sets.

    Fits `clf` on the training data via `train_classifier`, then scores it
    with `predict_labels` on the training set and on the test set, printing
    the results. Nothing is returned.
    '''

    # Announce which model is being trained and on how many samples.
    model_name = clf.__class__.__name__
    print ("Training a {} using a training set size of {}. . .".format(model_name, len(X_train)))

    # Fit the model.
    train_classifier(clf, X_train, y_train)

    # Score on the data the model was fitted on (raw values first, then formatted).
    train_f1, train_acc = predict_labels(clf, X_train, y_train)
    print (train_f1, train_acc)
    print ("F1 score and accuracy score for training set: {:.4f} , {:.4f}.".format(train_f1, train_acc))

    # Score on the held-out data.
    test_f1, test_acc = predict_labels(clf, X_test, y_test)
    print ("F1 score and accuracy score for test set: {:.4f} , {:.4f}.".format(test_f1, test_acc))

In [61]:
# Candidate models. Each one gets a fixed seed so that repeated runs of the
# notebook produce comparable scores; the random forest was previously
# unseeded, which made its results change from run to run.
clf_A = LogisticRegression(random_state = 42)
clf_B = SVC(random_state = 912, kernel='rbf')
clf_C = xgb.XGBClassifier(seed = 52)
clf_D = RandomForestClassifier(n_estimators = 100, random_state = 42)

# Train and evaluate every model on the same split, with a blank line after
# each report.
for clf in (clf_A, clf_B, clf_C, clf_D):
    train_predict(clf, X_train, y_train, X_test, y_test)
    print ('')

Training a LogisticRegression using a training set size of 7575. . .
Trained model in 0.1143 seconds
Made predictions in 0.0000 seconds.
0.6577493457400407 0.6892409240924092
F1 score and accuracy score for training set: 0.6577 , 0.6892.
Made predictions in 0.0036 seconds.
F1 score and accuracy score for test set: 0.5714 , 0.6400.

Training a SVC using a training set size of 7575. . .
Trained model in 5.4952 seconds
Made predictions in 3.2287 seconds.
0.6866238317757009 0.7166996699669967
F1 score and accuracy score for training set: 0.6866 , 0.7167.
Made predictions in 0.0168 seconds.
F1 score and accuracy score for test set: 0.6364 , 0.6800.

Training a XGBClassifier using a training set size of 7575. . .
Trained model in 1.8225 seconds
Made predictions in 0.0605 seconds.
0.8956509257930242 0.904026402640264
F1 score and accuracy score for training set: 0.8957 , 0.9040.
Made predictions in 0.0049 seconds.
F1 score and accuracy score for test set: 0.6667 , 0.7200.

Training a RandomFo

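# Based on the test-set results, RandomForestClassifier appears to be the best model, as it has the highest F1 score and accuracy score on the test set. Note that the test set contains only 25 matches, so the ranking between models is tentative.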