In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import random
import math
import os
from IPython.display import clear_output

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from sklearn.preprocessing import StandardScaler    
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import cross_val_score

import pickle

In [None]:
# Identifier columns that are dropped before the frame is used for modelling.
ignore_cols = ['date', 'gameId', 'home_teamID', 'visitor_teamID']


currDir = os.getcwd()

# Build the path with os.path.join so it resolves on every OS; the previous
# hard-coded '\\' separators only worked on Windows.
threeAvgDF = pd.read_csv(os.path.join(currDir, 'data', 'threeAvgsWeighted.csv')).drop(ignore_cols,axis=1)

print('Total (Rows, Columns):', threeAvgDF.shape)
# Features are every column except the last one.
# NOTE(review): this assumes 'hWin' is the last column of the CSV - confirm.
x = threeAvgDF[threeAvgDF.columns[:-1]]
y = list(threeAvgDF['hWin'])

# Fixed random_state keeps the 80/20 split reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.20,random_state=123,shuffle=True)

print('Training (Rows, Columns):', x_train.shape) # 80% of the data to be trained on
print('Testing (Rows, Columns):', x_test.shape) # 20% of the data to be tested on

In [None]:
# Count each label once, then report the class balance of the training labels.
trainHomeWins, trainAwayWins = y_train.count(1), y_train.count(0)
print('Distribution of training data\n\nHome Wins:', trainHomeWins, '| Away Wins:', trainAwayWins)

In [None]:
def shuffleTeams(x_df, y_df, percentToShuffle, statsPerTeam=22):
    """Swap the home and visitor feature blocks for a random subset of games.

    The first ``statsPerTeam`` columns of ``x_df`` are treated as the home
    block and the next ``statsPerTeam`` columns as the visitor block.  For each
    selected row the two blocks are exchanged and the label is flipped
    (truthy -> 0, falsy -> 1).  Any columns after the two blocks are copied
    through unchanged.

    Rows are selected by *position*, not by index label, so the function also
    works on frames whose index is not 0..n-1 (e.g. straight out of
    ``train_test_split``).

    Parameters
    ----------
    x_df : pandas.DataFrame
        Feature frame; it is not modified.
    y_df : sequence
        Labels aligned positionally with the rows of ``x_df``.
    percentToShuffle : float
        Fraction of rows (0..1) to swap; ``floor(percentToShuffle * len(x_df))``
        rows are drawn without replacement via ``random.sample``.
    statsPerTeam : int, optional
        Width of each team's column block (default 22).

    Returns
    -------
    (pandas.DataFrame, list)
        A new frame with a fresh 0..n-1 index and the matching list of labels.

    Raises
    ------
    ValueError
        If ``x_df`` has fewer than ``2 * statsPerTeam`` columns.
    """
    nRows = len(x_df)
    columns = list(x_df.columns)
    visitorEnd = 2 * statsPerTeam

    if len(columns) < visitorEnd:
        raise ValueError(
            f'x_df needs at least {visitorEnd} columns, found {len(columns)}'
        )

    toShuffle = random.sample(population=range(nRows), k=math.floor(percentToShuffle*nRows))
    rows = np.asarray(toShuffle, dtype=int)

    # Work on a copy of the raw values so the caller's frame stays untouched.
    values = x_df.to_numpy(copy=True)

    # Fancy indexing returns a copy, so homeBlock survives the overwrite below.
    homeBlock = values[rows, :statsPerTeam]
    values[rows, :statsPerTeam] = values[rows, statsPerTeam:visitorEnd]
    values[rows, statsPerTeam:visitorEnd] = homeBlock

    # random.sample yields unique positions, so each chosen label flips once.
    ret_y = list(y_df)
    for pos in toShuffle:
        ret_y[pos] = 0 if ret_y[pos] else 1

    retVal = pd.DataFrame(values, columns=columns)

    return retVal, ret_y

# Keep swapping 10% of the training games until home wins account for
# between 52% and 55% of the training labels.
homeWins, awayWins = y_train.count(1), y_train.count(0)


while not (0.52 <= homeWins/(homeWins+awayWins) <= 0.55):

    # Show the running tally; clear_output(wait=True) replaces it on the next print.
    print(f'Home Wins: {homeWins}, Away Wins: {awayWins}')
    clear_output(wait=True)
    x_train, y_train = shuffleTeams(x_train, y_train, 0.1)

    homeWins, awayWins = y_train.count(1), y_train.count(0)


totalGames = homeWins + awayWins
print(f'Home Wins: {homeWins}, Away Wins: {awayWins}')
print(f'Percentage of Home Wins: {homeWins/totalGames}')
print(f'Shape: {x_train.shape}')

### Standardize Input
This rescales every feature so that it has a mean of 0 and a standard deviation of 1.

In [None]:
scaler = StandardScaler()

# Fit the scaler on the training data only, then reuse those statistics for
# the test data.  Calling fit_transform on the test set would re-fit the
# scaler, so train and test would be scaled with different means/variances.
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

### Model Parameters
These values are arbitrary for now; they can be tuned later with more work.

In [None]:
# Number of full passes over the training loader.
EPOCHS = 150
# Number of samples per training batch.
BATCH_SIZE = 32
# Step size passed to the optimizer.
LEARNING_RATE = 2e-5

### Data Loaders

In [None]:
# Train data

class trainData(Dataset):
    """Dataset pairing each feature row with its label for training."""

    def __init__(self, x_data, y_data):
        # Both containers are stored as given and indexed in parallel.
        self.x_data, self.y_data = x_data, y_data

    def __len__(self):
        """Number of samples, taken from the feature container."""
        return len(self.x_data)

    def __getitem__(self, index):
        """Return the ``(features, label)`` pair stored at ``index``."""
        features = self.x_data[index]
        label = self.y_data[index]
        return features, label
    
# Convert features and labels to float32 tensors before wrapping them.
train_data = trainData(
    torch.tensor(x_train, dtype=torch.float32),
    torch.tensor(y_train, dtype=torch.float32),
)

# Test data

class testData(Dataset):
    """Dataset that serves feature rows only (no labels) for inference."""

    def __init__(self, x_data):
        # Stored as given; indexed directly in __getitem__.
        self.x_data = x_data

    def __len__(self):
        """Number of feature rows available."""
        return len(self.x_data)

    def __getitem__(self, index):
        """Return the feature row stored at ``index``."""
        features = self.x_data[index]
        return features
    
# Convert the test features to a float32 tensor before wrapping them.
test_features = torch.tensor(x_test, dtype=torch.float32)
test_data = testData(test_features)

##### Create Data Loaders

In [None]:
# Training batches are reshuffled every epoch.
train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)

# Test samples are served one at a time, in their original order.
test_loader = DataLoader(test_data, batch_size=1)

### Define Neural Net Architecture
This neural network has one hidden layer with 88 nodes, followed by batch normalization and dropout, which randomly zeroes 40% of the hidden activations during training.

In [None]:
class breakingBasketball(nn.Module):
    """Single-hidden-layer binary classifier that outputs one raw logit.

    Forward pass: Linear -> ReLU -> BatchNorm1d -> Dropout -> Linear.  The
    output is a logit (no sigmoid is applied here).

    Parameters
    ----------
    n_features : int, optional
        Number of input features (default 44).
    n_hidden : int, optional
        Width of the hidden layer (default 88).
    dropout_p : float, optional
        Dropout probability applied to the hidden activations (default 0.4).

    The defaults reproduce the original fixed 44 -> 88 -> 1 architecture, so
    ``breakingBasketball()`` behaves exactly as before.
    """

    def __init__(self, n_features=44, n_hidden=88, dropout_p=0.4):
        super().__init__()

        # Sub-modules are created in the same order as before so attribute
        # names (and therefore state_dict keys) are unchanged.
        self.layer_1 = nn.Linear(n_features, n_hidden)
        self.layer_out = nn.Linear(n_hidden, 1)

        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=dropout_p)
        self.batchnorm1 = nn.BatchNorm1d(n_hidden)

    def forward(self, inputs):
        """Map a ``(batch, n_features)`` tensor to ``(batch, 1)`` logits."""
        x = self.relu(self.layer_1(inputs))
        x = self.batchnorm1(x)
        x = self.dropout(x)
        x = self.layer_out(x)

        return x

In [None]:
# Use the first CUDA device when one is available, otherwise the CPU.
cuda_available = torch.cuda.is_available()
device = torch.device("cuda:0" if cuda_available else "cpu")

# nn.Module.to moves the parameters in place and returns the same module.
model = breakingBasketball().to(device)

print(model)

# BCEWithLogitsLoss applies the sigmoid internally, so the model emits raw logits.
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

### Train the model

In [None]:
def binary_acc(y_pred, y_test):
    """Return the accuracy of logits against labels as a whole-number percentage.

    ``y_pred`` holds raw logits; each is passed through a sigmoid and rounded
    to get a 0/1 class, which is compared element-wise with ``y_test``.  The
    number of matches is divided by ``y_test.shape[0]``, scaled to a
    percentage and rounded.  The result is a 0-dim tensor.
    """
    predicted_classes = torch.sigmoid(y_pred).round()
    n_correct = predicted_classes.eq(y_test).sum().float()
    fraction_correct = n_correct / y_test.shape[0]
    return torch.round(fraction_correct * 100)

In [None]:
# Per-epoch history of mean training loss and mean training accuracy.
lossTracker, accTracker = [], []

In [None]:
# Training mode: dropout is active and batch norm uses per-batch statistics.
model.train()
n_batches = len(train_loader)

for e in range(1, EPOCHS+1):
    running_loss, running_acc = 0, 0

    for features, targets in train_loader:
        features = features.to(device)
        # Labels arrive as (batch,); the loss expects (batch, 1) like the logits.
        targets = targets.to(device).unsqueeze(1)

        optimizer.zero_grad()
        logits = model(features)

        loss = criterion(logits, targets)
        acc = binary_acc(logits, targets)

        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        running_acc += acc.item()

    # Means are taken over batches, not over individual samples.
    mean_loss = running_loss/n_batches
    mean_acc = running_acc/n_batches
    lossTracker.append(mean_loss)
    accTracker.append(mean_acc)

    print(f'Epoch {e+0:03}: | Loss: {mean_loss:.5f} | Acc: {mean_acc:.3f}')

In [None]:
# Mean training loss per epoch, saved to disk and shown inline.
fig, ax = plt.subplots()
ax.plot(lossTracker, linewidth=1, c='dodgerblue')
ax.set_xlabel('Epoch Number', fontsize=12)
ax.set_ylabel('Loss Score', fontsize=12)
fig.savefig('LossHomeAdv.png')
plt.show()

In [None]:
# Mean training accuracy per epoch, saved to disk and shown inline.
fig, ax = plt.subplots()
ax.plot(accTracker, linewidth=1, c='dodgerblue')
ax.set_xlabel('Epoch Number', fontsize=12)
ax.set_ylabel('Accuracy (%)', fontsize=12)
fig.savefig('AccHomeAdv.png')
plt.show()

In [None]:
# Predicted class (0 or 1) for every test sample, in loader order.
y_pred_list = []

# Evaluation mode: dropout is disabled and batch norm uses running statistics.
model.eval()
with torch.no_grad():
    for x_batch in test_loader:
        x_batch = x_batch.to(device)
        y_test_pred = torch.sigmoid(model(x_batch))
        # test_loader uses batch_size=1, so the output holds a single value.
        # .item() works on any device, whereas .numpy() raises for CUDA tensors.
        y_pred_list.append(round(y_test_pred.item()))

In [None]:
# Tally how many test games were predicted for each side.
homePredWins, visitorPredWins = y_pred_list.count(1), y_pred_list.count(0)

labels = ('Home Predicted Wins', 'Visitor Predicted Wins')
sizes = [homePredWins, visitorPredWins]


# Bar chart of the predicted class distribution, saved to disk and shown inline.
plt.figure(figsize=(10,6))
plt.bar(x=labels, height=sizes, color='dodgerblue')
plt.xticks(fontsize=20)
plt.yticks(fontsize=18)
plt.ylim(0,700)
plt.savefig('predHomeAdvantageDistribution.png')
plt.show()

In [None]:
# Rows of the matrix are true labels, columns are predicted labels.
matrix = confusion_matrix(y_test, y_pred_list)
true_negatives, false_positives = matrix[0][0], matrix[0][1]
false_negatives, true_positives = matrix[1][0], matrix[1][1]

for caption, count in (
    ('True Negatives:', true_negatives),
    ('False Positives:', false_positives),
    ('False Negatives:', false_negatives),
    ('True Positives:', true_positives),
):
    print(caption, count)

In [None]:
# Test accuracy as a percentage, rounded to three decimals.
correct_predictions = true_negatives+true_positives
all_predictions = true_positives+true_negatives+false_positives+false_negatives
test_acc = round(100*correct_predictions/all_predictions, 3)
test_acc

In [None]:
# Share of correct vs. incorrect test predictions as a pie chart.
labels = 'Correctly Predicted', 'Incorrectly Predicted'
sizes = [test_acc/100, 1-(test_acc/100)]
explode = (0.05, 0.05)

fig, ax = plt.subplots(figsize=[10,6])
# labels was previously built but never passed, leaving the wedges unlabeled.
ax.pie(sizes, explode=explode, labels=labels, autopct='%2.2f%%',
        shadow=True, startangle=90)
ax.axis('equal')

plt.savefig('homeBiasPieChart.png')
plt.show()

In [None]:
# Flat line at the test accuracy, one point per epoch, drawn over the training curve.
testTracker = [test_acc] * len(accTracker)

fig, ax = plt.subplots()
ax.plot(accTracker, linewidth=1, c='dodgerblue')
ax.plot(testTracker, linewidth=1.5, c='red')
ax.set_xlabel('Epoch Number')
ax.set_ylabel('Accuracy (%)')
plt.show()

In [None]:
# Text report of per-class metrics for the test predictions.
report = classification_report(y_test, y_pred_list)
print(report)

In [None]:
filename = 'bestModelNoHomeTeamAdv.pkl'
# The context manager closes the file even if pickling fails; the previous
# bare open() left the handle open.
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

In [None]:
currDir = os.getcwd()
# os.path.join keeps the path portable; the hard-coded '\\' separators only
# resolved on Windows.
teamsDF = pd.read_csv(os.path.join(currDir, 'data', 'teams.csv'))

# Lookup from the 'code' column to the 'name' column; as before, a later row
# overwrites an earlier one with the same code.
teamsDict = dict(zip(teamsDF['code'], teamsDF['name']))

In [None]:
def predictGame(model, gameStats, hasOutcome):
    """Predict one game's winner, print it, and report whether it was correct.

    Parameters
    ----------
    model : torch.nn.Module
        Model returning one logit per input row.  It is not switched to
        evaluation mode here; the caller is expected to have done that.
    gameStats : pandas.Series
        One row of the games table, including the 'date', 'gameId',
        'home_teamID' and 'visitor_teamID' fields (and 'hWin' when
        ``hasOutcome`` is true).  Team IDs must be keys of ``teamsDict``.
    hasOutcome : bool
        Whether ``gameStats`` carries the actual result in 'hWin'.

    Returns
    -------
    int
        1 if ``hasOutcome`` is true and the prediction equals the recorded
        outcome, otherwise 0.
    """
    team1, team2 = gameStats['home_teamID'], gameStats['visitor_teamID']
    hTeam, vTeam = teamsDict[team1], teamsDict[team2]

    ignore_cols = ['date', 'gameId', 'home_teamID', 'visitor_teamID']
    outcome = None
    if hasOutcome:
        # Read the label by name: positional access such as gameStats[-1] on a
        # label-indexed Series is deprecated in pandas.
        outcome = gameStats['hWin']
        ignore_cols.append('hWin')

    # Convert the remaining fields to a single-row float32 batch.
    features = gameStats.drop(ignore_cols).to_numpy(dtype='float32')
    inputs = torch.tensor(features).unsqueeze(0)

    # Run the input on whichever device the model's parameters live on.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        inputs = inputs.to(first_param.device)

    # Inference only, so no gradients need to be tracked.
    with torch.no_grad():
        y_test_pred = torch.sigmoid(model(inputs)).item()
    prediction = round(y_test_pred)

    if prediction:
        print(hTeam, 'will beat', vTeam, '\n')
    else:
        print(vTeam, 'will beat', hTeam, '\n')

    # Both branches scored the prediction identically, so it is done once here.
    return 1 if hasOutcome and outcome == prediction else 0

In [None]:
currDir = os.getcwd()
# Unlike the training frame, this copy keeps the identifier columns.
# os.path.join keeps the path portable across operating systems.
gamesDF = pd.read_csv(os.path.join(currDir, 'data', 'threeAvgsWeighted.csv'))

In [None]:
# NOTE(review): interactive input means this cell cannot run unattended.
homeAdv = int(input('Type 1 if you want home team advantage, 0 if not: '))

bestModel = 'bestModelNoHomeTeamAdv.pkl'
if homeAdv:
    # NOTE(review): this file is not written anywhere in the visible notebook;
    # it must already exist on disk.
    bestModel = 'bestModelHomeTeamAdv.pkl'

# SECURITY: pickle.load can execute arbitrary code, so only load model files
# that you created yourself.  The context manager closes the file handle.
with open(bestModel, 'rb') as model_file:
    loaded_model = pickle.load(model_file)
loaded_model.eval()

In [None]:
# Build one boolean mask per gameId range, then combine them.
gameIds = gamesDF['gameId']

twenty15 = (gameIds >= 41500400) & (gameIds < 41600000)
twenty16 = (gameIds >= 41600400) & (gameIds < 41700000)
fifteenSixteen = twenty15 | twenty16

twenty17 = (gameIds >= 41700400) & (gameIds < 41800000)
twenty18 = (gameIds >= 41800400) & (gameIds < 41900000)
seventeenEighteen = twenty17 | twenty18

firstFour = fifteenSixteen | seventeenEighteen

# The last range has no upper bound.
playoffGames = firstFour | (gameIds >= 41900400)

finals = gamesDF[playoffGames]

# Years already announced, so each header is printed only once.
predictYear = []
for i in range(len(finals)):
    game = finals.iloc[i]
    # 'date' is divided by 10000 and truncated to obtain the year.
    thisYear = int(game['date']/10000)
    if thisYear not in predictYear:
        predictYear.append(thisYear)
        print(thisYear, 'finals\n')
    predictGame(loaded_model, game, True)