# Data Mining Final Project - NBA Game Winning Forecasting
## Game Prediction - Future Game Prediction

In [1]:
# import numpy as np
import pandas as pd
# import time
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import accuracy_score

## Function - featureEng()

In [2]:
# @param X: pandas.DataFrame
# @param featureSel: int
# @return X: pandas.DataFrame
def featureEng(X, featureSel=None):
    """Apply one of the predefined feature-engineering recipes to X.

    The input frame is never modified; every recipe returns a new frame.

    featureSel values:
        None / 0: return X itself, untouched.
        1: add PTS_DIFF = PTS_A - PTS_B.
        2: drop PTS_A and PTS_B.
        3: add PTS_DIFF, then drop PTS_A and PTS_B.
        4: add PTS_DIFF, STL+BLK_A and STL+BLK_B, then drop the raw
           shooting/rebound/foul columns together with PTS, STL and BLK
           of both teams.
        anything else: return X itself, untouched.
    """
    if not featureSel:
        return X

    ptsCols = ['PTS_A', 'PTS_B']

    # DataFrame.assign returns a copy, so the caller's frame keeps its columns.
    if featureSel == 1:
        X = X.assign(PTS_DIFF=X['PTS_A'] - X['PTS_B'])
    elif featureSel == 2:
        X = X.drop(columns=ptsCols)
    elif featureSel == 3:
        X = X.assign(PTS_DIFF=X['PTS_A'] - X['PTS_B']).drop(columns=ptsCols)
    elif featureSel == 4:
        attriToDrop = [
            'FGM_A', 'FGA_A', '3PM_A', '3PA_A', 'FTM_A', 'FTA_A', 'OREB_A', 'DREB_A', 'PF_A',
            'FGM_B', 'FGA_B', '3PM_B', '3PA_B', 'FTM_B', 'FTA_B', 'OREB_B', 'DREB_B', 'PF_B',
        ]
        attriToDrop += ptsCols + ['STL_A', 'STL_B', 'BLK_A', 'BLK_B']
        # Column names containing '+' cannot be keyword arguments, hence the dict.
        X = X.assign(**{
            'PTS_DIFF': X['PTS_A'] - X['PTS_B'],
            'STL+BLK_A': X['STL_A'] + X['BLK_A'],
            'STL+BLK_B': X['STL_B'] + X['BLK_B'],
        }).drop(columns=attriToDrop)
    return X

## Function - attriGen()

In [3]:
# @param df: pandas.DataFrame (from 'nba_preprocessed.csv')
# @param date: str in the format of 'YYYY-MM-DD'
# @param period: int (Number of previous games to be considered)
# @param Team_A, Team_B: str
# @param homeAway: int (None for played game prediction)
# @param featureSel: int
# @return X: pandas.DataFrame
def attriGen(df, date, period, Team_A, Team_B, homeAway=None, featureSel=None):
    """Build the one-row feature frame used to predict Team_A vs. Team_B.

    Each team's most recent `period` games strictly before `date` are
    averaged column-wise; the two averages are placed side by side, prefixed
    with a 'Home/Away_A' column, and passed through featureEng().

    When homeAway is None, the flag is read from the row of `df` that matches
    (date, Team_A, Team_B).

    Raises:
        ValueError: homeAway is None and no such game exists in `df`.
    """
    # True Home/Away at the game day
    if homeAway is None:
        isGameDay = (df.Date_A == date) & (df.Team_A == Team_A) & (df.Team_B == Team_B)
        flags = df.loc[isGameDay, 'Home/Away_A']
        if flags.empty:
            raise ValueError(
                f'No game found on {date} for {Team_A} vs. {Team_B}; '
                'pass homeAway explicitly to predict an unplayed game.'
            )
        # Read the scalar explicitly: int() on a whole Series fails unless it
        # holds exactly one row, and is deprecated even then.
        homeAway = int(flags.iloc[0])

    # Date selections (plain comparison against `date`; with 'YYYY-MM-DD'
    # strings this orders chronologically)
    df = df.loc[df.Date_A < date, :].reset_index(drop=True)
    # Both teams are looked up through the Team_A column, i.e. through the
    # rows recorded from their own side. Only the first 24 columns are kept.
    X_A = df.loc[(df.Team_A == Team_A), :].sort_values(by=['Date_A'], ascending=False).iloc[0:period, 0:24].reset_index(drop=True)
    X_B = df.loc[(df.Team_A == Team_B), :].sort_values(by=['Date_A'], ascending=False).iloc[0:period, 0:24].reset_index(drop=True)

    # Drop unnecessary attributes
    colToDrop = ['Home/Away_A'] + ['Team_A', 'Date_A', 'W/L_A', 'Score_A', 'Opponent_A']
    X_A = X_A.drop(columns=colToDrop)
    X_B = X_B.drop(columns=colToDrop)

    # Rename X_B's columns: swap the trailing two characters for '_B'
    # (assumes every remaining column ends in '_A' -- verify against the CSV)
    X_B = X_B.rename(columns=lambda x: x[0:-2] + '_B')

    # Get X = [Home/Away_A + X_A + X_B]
    X = pd.concat([X_A.mean(), X_B.mean()]).to_frame().transpose()
    X = pd.concat([pd.DataFrame(data={'Home/Away_A': [homeAway]}), X], axis=1)

    # Feature Engineering
    X = featureEng(X, featureSel)

    return X

## Function - futureGamePrediction()

In [4]:
# @param homeAway: int (0 if Team_A is away, 1 if Team_A is home)
def futureGamePrediction(dfFile, modelsLUT, date, period, Team_A, Team_B, homeAway, featureSel):
    """Predict one game with every model in modelsLUT and tabulate the votes.

    Reads the game table from the CSV at `dfFile`, builds the feature row via
    attriGen(), and returns a one-row DataFrame holding the game identifiers,
    each model's prediction, and the win/loss vote counts (w.r.t. Team_A).
    Returns None, after printing a diagnostic, when no game is recorded on
    `date` and homeAway is None.
    """
    games = pd.read_csv(dfFile)
    noGameThatDay = games.loc[games.Date_A == date].empty

    # Without a recorded game the home/away flag cannot be looked up.
    if noGameThatDay and homeAway is None:
        print('Error: Game not found and Home/Away is not defined.')
        print(f'isEmpty = {noGameThatDay}, HomeAway = {homeAway}')
        print('Force return w/o actions.')
        return None

    # Feature row for the requested match-up
    features = attriGen(games, date, period, Team_A, Team_B, homeAway, featureSel)

    # One prediction array per model, keyed by model name
    resultLUT = {name: estimator.predict(features) for name, estimator in modelsLUT.items()}

    # Each model predicted a single row, so element 0 is its vote
    predictList = [prediction[0] for prediction in resultLUT.values()]
    voteForWin = sum(predictList)
    voteForLoss = len(predictList) - voteForWin

    header = ['Date', 'Home/Away_A', 'Team_A', 'Team_B'] + list(resultLUT) + ['Vote for Win', 'Vote for Loss']
    row = [date, homeAway, Team_A, Team_B] + predictList + [voteForWin, voteForLoss]
    return pd.DataFrame([row], columns=header)

In [5]:
import pickle
# Selects which pickled models the loading cell reads:
#   0 = default models, 1 = grid-search models, 2 = default + grid-search models
mode = 1

In [6]:
# Look-up table: display name -> unpickled estimator
modelsLUT = {}

# File stems of the trained models; the token after the last '_' is the
# model's short name.
models = [
    '2016-08-01_to_2019-04-10_feature3_period5_LogiRegr',
    '2016-08-01_to_2019-04-10_feature3_period5_SVM',
    '2016-08-01_to_2019-04-10_feature3_period5_XGBoost',
    '2016-08-01_to_2019-04-10_feature3_period5_RandomForest',
    '2016-08-01_to_2019-04-10_feature3_period5_GBDT',
    '2016-08-01_to_2019-04-10_feature3_period5_AdaBoost',
]

# Short name -> file stem
modelsName = {stem.split('_')[-1]: stem for stem in models}

# Load the default pickle (modes 0 and 2) and/or the grid-search pickle
# (modes 1 and 2) of every model. Grid-search entries get a '_GS' suffix.
# NOTE: pickle.load executes arbitrary code; only load trusted model files.
for model in modelsName:
    if mode not in (0, 1, 2):
        print('Error: mode should be only 0, 1, or 2')
        print('mode = 0: Default models')
        print('mode = 1: Grid search models')
        print('mode = 2: Default + Grid Search models')
        continue
    if mode in (0, 2):
        with open('../model/Z_trainedModel/' + modelsName[model] + '.pkl', 'rb') as f:
            modelsLUT[model] = pickle.load(f)
    if mode in (1, 2):
        with open('../model/Z_trainedModel/' + modelsName[model] + '_gs.pkl', 'rb') as f:
            modelsLUT[model+'_GS'] = pickle.load(f)

  from numpy.core.umath_tests import inner1d


In [7]:
modelsLUT

{'LogiRegr_GS': LogisticRegression(C=1000, class_weight=None, dual=False, fit_intercept=True,
           intercept_scaling=1, max_iter=200, multi_class='ovr', n_jobs=-1,
           penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
           verbose=0, warm_start=False),
 'SVM_GS': SVC(C=0.1, cache_size=200, class_weight=None, coef0=0.0,
   decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
   max_iter=-1, probability=True, random_state=None, shrinking=True,
   tol=0.001, verbose=False),
 'XGBoost_GS': XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
        colsample_bytree=1, gamma=0.3, learning_rate=0.1, max_delta_step=0,
        max_depth=3, min_child_weight=1, missing=nan, n_estimators=100,
        n_jobs=-1, nthread=None, objective='binary:logistic',
        random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
        seed=None, silent=True, subsample=1),
 'RandomForest_GS': RandomForestClassifier(bootstrap=True, c

In [9]:
# Games to predict: the i-th entries of date / Team_A / Team_B / homeAway
# together describe one game.
dfFile = '../crawler/nbaGamePair.csv'
date = ['2019-04-23', '2019-04-23', '2019-04-23', '2019-04-23']
period = 5
Team_A = ['MIL', 'DET', 'UTA', 'HOU']
Team_B = ['DET', 'MIL', 'HOU', 'UTA']
homeAway = [0, 1, 1, 0] # 0 if Team_A is away, 1 if Team_A is home
featureSel = 3

# W/L prediction (w.r.t Team_A: vote for Team_A's win or loss)
# The loop variables carry their own names so the input lists above are not
# overwritten and the cell can be re-run.
reports = [
    futureGamePrediction(dfFile, modelsLUT, gameDate, period, teamA, teamB, teamAHome, featureSel)
    for gameDate, teamA, teamB, teamAHome in zip(date, Team_A, Team_B, homeAway)
]
# One concatenation for all per-game reports
df_all = pd.concat(reports, ignore_index=True)

df_all

  if diff:
  if diff:
  if diff:
  if diff:


Unnamed: 0,Date,Home/Away_A,Team_A,Team_B,LogiRegr_GS,SVM_GS,XGBoost_GS,RandomForest_GS,GBDT_GS,AdaBoost_GS,Vote for Win,Vote for Loss
0,2019-04-23,0,MIL,DET,1,1,1,1,1,1,6,0
1,2019-04-23,1,DET,MIL,0,0,0,0,0,0,0,6
2,2019-04-23,1,UTA,HOU,1,1,1,1,1,1,6,0
3,2019-04-23,0,HOU,UTA,0,0,0,0,0,0,0,6
