In [1]:
import pandas as pd
import numpy as np
import datetime
import math
import warnings
warnings.filterwarnings("ignore")
from tqdm import tqdm

In [2]:
seasons = []
for i in range(12):
    seasons.append(pd.read_csv("Season" + str(i)+".csv"))

In [3]:
columns_to_delete = ['Unnamed: 0', 'Date', 'Home', 'Away', 'Home Coach', 'Away Coach',
       'Attendance', 'Referees', 'Linesmen', 'Goals', 'Shootouts',
       'Penalties Home', 'Penalties Away', 'Home Players Stats',
       'Away Players Stats', 'Unequal Home', 'Unequal Away', 'All goals Home',
       'All goals Away', 'All missedHome', 'All Missed Away', 'Shots Home',
       'Shots Away', 'Final Status', 'Home Score', 'Away Score', 'OT']

In [4]:
for i in range(len(seasons)):
        seasons[i] = seasons[i].drop(columns_to_delete, axis=1)

In [5]:
from tqdm import tqdm
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score
from sklearn.naive_bayes import GaussianNB

In [6]:
def average(p1, p2, p3, p4, p5):
    weights = [0.4, 0.275, 0.075, 0.175, 0.075]
    res = weights[0]*p1 + weights[1]*p2 + weights[2]*p3 + weights[3]*p4 + weights[4]*p5
    res = res>0.5
    ret = []
    for i in res:
        if i:
            ret.append(1)
        else:
            ret.append(0)
    return ret

We separate the test season into several parts and train model for each part. It is neccessary because 80% win rate in the beginning of a season and in the end have different meanings. In the first case, we have a good start, and in the second one we have a team who wins everyone

In [11]:
index_to_test = 0 
games_window = 30
metrics = []
CV_results = []
for index_to_test in tqdm(range(len(seasons))):
        
        index_cut = 600
        seasons_cut = []
        for i in seasons:
            seasons_cut.append(i[index_cut:])

        training_seasons = []
        for i in range(len(seasons_cut)):
            if i!=index_to_test:
                training_seasons.append(seasons_cut[i])

        train = pd.concat(training_seasons)
        test = seasons_cut[index_to_test]
        columns = seasons[0].columns
        x_columns = []
        for j in columns:
            if (j!='Result'):
                x_columns.append(j)
                

        #columns = [ 'Score Ratio',"Home Power Rating", "Away Power Rating", 'Result']
        #x_columns = ['Score Ratio',"Home Power Rating", "Away Power Rating"]


        y_column = 'Result'
        scaler = StandardScaler()
        train_data = train[columns]
        train_data = train[train['Score Ratio'] != math.inf]
        test_data = test[columns]
        test_data = test[test['Score Ratio'] != math.inf]
        x_train = scaler.fit_transform(train_data[x_columns])
        x_test = scaler.fit_transform(test_data[x_columns])
        y_train = train_data[y_column]
        y_test = test_data[y_column]

        clf = LogisticRegression().fit(x_train, y_train)
        result_clf = clf.predict(x_test)

        svm = SVC(kernel='linear').fit(x_train, y_train)
        result_svm = svm.predict(x_test)

        tree = DecisionTreeClassifier(criterion='entropy',max_depth=5).fit(x_train, y_train)
        result_tree = tree.predict(x_test)


        RF = RandomForestClassifier(max_depth=5, n_estimators=1000)
        RF.fit(x_train, y_train)
        result_forest = RF.predict(x_test)


        knn = KNeighborsClassifier(n_neighbors=8, p=2, metric='minkowski').fit(x_train, y_train)
        result_kNN = knn.predict(x_test)
        assamble_result = average(result_clf, result_svm, result_tree, result_forest, result_kNN)
        
        
        gnb = GaussianNB()
        NBC_result = gnb.fit(x_train, y_train).predict(x_test)

        metrics.append({"Cut": index_cut, "CLF Bets": result_clf.sum(), "CLF Precision": precision_score(y_test, result_clf),
                       "SVM Bets" :  result_svm.sum(), "SVM Precision": precision_score(y_test, result_svm),
                       "Tree Bets" : result_tree.sum(),  "Tree Precision": precision_score(y_test, result_tree),
                       "kNN Bets" : result_kNN.sum(),  "kNN Precision":precision_score(y_test, result_kNN),
                       "Forest Bets" : result_forest.sum(),  "Forest Precision":precision_score(y_test, result_forest),
                       "Assemble Bets" : np.array(assamble_result).sum(),  "Assemble Precision":precision_score(y_test, assamble_result),
                       "NBC Bets": np.array(NBC_result).sum(), "NBC Precision":precision_score(y_test, NBC_result)}) 


        total_result = pd.DataFrame(metrics)
        CV_results_current = []
        algorithms = ['CLF', 'SVM', 'Tree', 'kNN', 'Forest', 'Assemble', 'NBC']
        for i in algorithms:
            CV_results_current.append((total_result[i + ' Bets']*total_result[i +' Precision']).sum()/total_result[i+' Bets'].sum())
        CV_results.append(CV_results_current)   
            
CV_results_DF = pd.DataFrame(CV_results, columns = algorithms, index = range(2009, 2021))
CV_results_DF

100%|██████████████████████████████████████████████████████████████████████████████████| 12/12 [00:42<00:00,  3.54s/it]


Unnamed: 0,CLF,SVM,Tree,kNN,Forest,Assemble,NBC
2009,0.62963,0.636364,0.576923,0.666667,0.607143,0.653846,0.666667
2010,0.666667,0.658537,0.625,0.666667,0.62963,0.680851,0.625
2011,0.678571,0.673469,0.636364,0.672414,0.650794,0.690909,0.636364
2012,0.693548,0.690909,0.625,0.66129,0.652174,0.704918,0.630137
2013,0.670455,0.671053,0.62,0.611111,0.625,0.686747,0.62963
2014,0.625,0.619048,0.598361,0.557143,0.594771,0.637037,0.614943
2015,0.594828,0.606061,0.572727,0.539171,0.592742,0.608295,0.583942
2016,0.580745,0.594891,0.553398,0.508711,0.583333,0.590164,0.556136
2017,0.570071,0.588068,0.555556,0.508152,0.578341,0.579602,0.543967
2018,0.569038,0.587654,0.561845,0.511166,0.579918,0.577681,0.542495


SVM is the best model