# Lab 4 - NBA Game Winning Prediction by Classification
- This report is a simplified version of the Data Mining final project.
- The raw data are team_season_all.csv and team_playoff_all.csv.
- nba_preprocessed.csv is the intermediate data for feature extraction.

## Outline
1. Motivation
2. Problem Definition
3. Data Preprocessing
4. Feature Extraction (Selection & Engineering)
5. Model Training
6. NBA Game Winning Prediction

In [1]:
import numpy as np
import pandas as pd
import time
import csv
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import accuracy_score

# Cross Validation & Grid Search
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

# Classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import AdaBoostClassifier

## 1. Motivation:
- Predict winners of NBA 2018 playoff games

## 2. Problem Definition:
- Input: Averaged team performance of previous 5 games.
- Output: Win or lose the match

## 3. Data Preprocessing:
- Remove NaN, pair teams of games, and check the validity.
- Input: team_season_all.csv and team_playoff_all.csv
- Output: nba_preprocessed.csv
- Note: 這部分我寫成nbaDataPreprocessing.py(有一併上傳)，已經事先跑過並產生nba_preprocessed.csv這個檔案了，所以助教可以不需要再跑。

## 4. Feature Extraction
- Feature Selection & Feature Engineering
- Input: nba_preprocessed.csv
- Output: X(Attributes) and Y(Labels)

#### Function - featureEng()

In [2]:
# Apply one of the predefined feature-engineering recipes to the attribute table.
# The caller's frame is never modified: recipes that add columns work on a copy.
# @param X: pandas.DataFrame with the '<stat>_A' / '<stat>_B' attribute columns
# @param featureSel: int or None, recipe selector
#     None or 0 -> X is returned unchanged
#     1 -> add PTS_DIFF = PTS_A - PTS_B
#     2 -> drop PTS_A and PTS_B
#     3 -> add PTS_DIFF, then drop PTS_A and PTS_B
#     4 -> add PTS_DIFF, STL+BLK_A and STL+BLK_B, then drop the PTS, STL, BLK
#          and FGM/FGA/3PM/3PA/FTM/FTA/OREB/DREB/PF columns of both teams
#     any other value -> X is returned unchanged
# @return X: pandas.DataFrame
def featureEng(X, featureSel=None):
    if not featureSel:
        return X

    attriToDrop = []
    if featureSel in (1, 3, 4):
        # Copy first so the derived columns do not leak into the caller's frame.
        X = X.copy()
        X['PTS_DIFF'] = X['PTS_A'] - X['PTS_B']

    if featureSel in (2, 3):
        attriToDrop = ['PTS_A', 'PTS_B']
    elif featureSel == 4:
        X['STL+BLK_A'] = X['STL_A'] + X['BLK_A']
        X['STL+BLK_B'] = X['STL_B'] + X['BLK_B']
        attriToDrop = [
            'FGM_A', 'FGA_A', '3PM_A', '3PA_A', 'FTM_A', 'FTA_A', 'OREB_A', 'DREB_A', 'PF_A',
            'FGM_B', 'FGA_B', '3PM_B', '3PA_B', 'FTM_B', 'FTA_B', 'OREB_B', 'DREB_B', 'PF_B',
            'PTS_A', 'PTS_B', 'STL_A', 'STL_B', 'BLK_A', 'BLK_B',
        ]

    if attriToDrop:
        X = X.drop(columns=attriToDrop)
    return X

#### Function - featureExtraction()

In [3]:
# Select the games of one team whose statistics are averaged for a prediction.
# @param df_sel: pandas.DataFrame of games dated on or before the game of interest
# @param team: str, value matched against the Team_A column
# @param period: int, maximum number of previous games to keep
# @return pandas.DataFrame: the first 24 columns of the selected games, newest first
def _recentTeamGames(df_sel, team, period):
    games = df_sel.loc[df_sel.Team_A == team, :].sort_values(by=['Date_A'], ascending=False)
    if len(games) == 1:
        # No earlier game exists, so fall back to the only game available.
        window = games.iloc[0:1, 0:24]
    else:
        # Row 0 is the newest game (presumably the game of interest itself), so
        # skip it and keep at most `period` earlier games.
        window = games.iloc[1:period+1, 0:24]
    return window.reset_index(drop=True)


# featureExtraction() outputs X, Y for model training.
# Each row of X holds Home/Away_A followed by the averaged statistics of Team_A
# and Team_B over their previous `period` games; Y holds the W/L_A label of the
# same game.
# @param dfFile: path (or file-like object) of 'nba_preprocessed.csv', read with pd.read_csv
# @param dateStart, dateEnd: str in the format of 'YYYY-MM-DD' (inclusive bounds)
# @param period: int (number of previous games to be averaged)
# @param featureSel: int (recipe forwarded to featureEng())
# @return X, Y: pandas.DataFrame
# @raise ValueError: no game lies inside [dateStart, dateEnd]
def featureExtraction(dfFile, dateStart='1000-01-01', dateEnd='2999-12-31', period=5, featureSel=None):
    df = pd.read_csv(dfFile)

    # Date selection (dates are compared as strings)
    df = df.loc[(df.Date_A >= dateStart) & (df.Date_A <= dateEnd), :].reset_index(drop=True)
    if df.empty:
        raise ValueError(f'No games found between {dateStart} and {dateEnd} in {dfFile}')

    # Get label Y
    Y = df[['W/L_A']].rename(columns={'W/L_A': 'Label'})

    # Identification / outcome columns that must not be averaged
    colToDrop = ['Home/Away_A', 'Team_A', 'Date_A', 'W/L_A', 'Score_A', 'Opponent_A']

    # Get averaged attributes X, one single-row frame per game
    rows = []
    for _, row in df.iterrows():
        df_sel = df.loc[df.Date_A <= row['Date_A'], :].reset_index(drop=True)

        X_A = _recentTeamGames(df_sel, row['Team_A'], period).drop(columns=colToDrop)
        X_B = _recentTeamGames(df_sel, row['Team_B'], period).drop(columns=colToDrop)

        # Team_B's rows come from its own perspective (suffix '_A'); relabel them as '_B'
        X_B = X_B.rename(columns=lambda x: x[0:-2] + '_B')

        # X_single = [Home/Away_A + mean(X_A) + mean(X_B)]
        X_single = pd.DataFrame(data=pd.concat([X_A.mean(), X_B.mean()])).transpose()
        X_single = pd.concat([pd.DataFrame(data={'Home/Away_A': [row['Home/Away_A']]}), X_single], axis=1)
        rows.append(X_single)

    # Stack all rows at once instead of growing the frame inside the loop
    X = pd.concat(rows, ignore_index=True)

    # Feature Engineering
    X = featureEng(X, featureSel)

    return X, Y

In [4]:
# Build the training set from the games dated 2015-08-01 through 2018-04-13,
# averaging the previous 5 games and applying feature recipe 3.
dfFile = 'nba_preprocessed.csv'
dateStart, dateEnd = '2015-08-01', '2018-04-13'
period, featureSel = 5, 3
X, Y = featureExtraction(
    dfFile,
    dateStart=dateStart,
    dateEnd=dateEnd,
    period=period,
    featureSel=featureSel,
)

#### Attributes X
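- The Team_A attributes of the first two rows are the same: a team's first game in the selected date range has no previous games to average, so it falls back to that game's own statistics, and the team's second game then averages over that same single game.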
- _A means Team_A's attributes and _B means Team_B's attributes.

In [5]:
# Preview the first five rows of the attribute table
X.head(n=5)

Unnamed: 0,Home/Away_A,FG%_A,FGM_A,FGA_A,3P%_A,3PM_A,3PA_A,FT%_A,FTM_A,FTA_A,...,FTA_B,REB_B,OREB_B,DREB_B,AST_B,STL_B,BLK_B,TOV_B,PF_B,PTS_DIFF
0,1,0.425,37.0,87.0,0.368,7.0,19.0,0.696,16.0,23.0,...,17.0,50.0,11.0,39.0,26.0,5.0,7.0,10.0,21.0,2.0
1,0,0.425,37.0,87.0,0.368,7.0,19.0,0.696,16.0,23.0,...,28.0,45.0,16.0,29.0,19.0,11.0,5.0,13.0,18.0,-3.0
2,0,0.4815,39.5,82.5,0.434,10.5,23.5,0.773,16.5,21.5,...,30.0,51.0,15.5,35.5,19.5,3.5,3.5,12.5,17.5,7.0
3,1,0.456,37.666667,83.0,0.395333,9.333333,23.0,0.759,17.333333,23.0,...,25.0,52.5,15.5,37.0,22.5,9.5,8.0,15.0,26.5,-9.5
4,1,0.43675,36.5,84.0,0.359,8.75,24.25,0.78525,17.75,22.75,...,22.8,50.6,14.2,36.4,23.0,7.8,8.4,20.2,24.6,-14.7


#### Label Y
- Y = 1 means Team_A wins and Team_B loses
- Y = 0 means Team_A loses and Team_B wins

In [6]:
# Preview the first five labels
Y.head(n=5)

Unnamed: 0,Label
0,1
1,1
2,0
3,1
4,1


## 5. Model Training
- Find optimized model parameters by "Cross Validation and Grid Search (CVGS)"
    - 若是parameter sweep的維度太高，會跑太久，所以這份作業中我將維度降低，實際上在Final Project中，維度設定較大，需要跑一個禮拜左右。
- Classifier candidates:
    - Logistic Regression Classification
    - XGBoost Classification
    - Random Forest Classification
    - AdaBoost Classification

#### Function - CrossValidationGridSearchNested()

In [7]:
# Repeat a cross-validated grid search over several random shuffles and keep the
# best-scoring estimator.
# For each trial i, GridSearchCV tunes est_classifcation on an inner
# StratifiedKFold (random_state=i) fitted on ALL of the data, and the winning
# estimator is then scored with cross_val_score on an outer StratifiedKFold
# (random_state=i+1).
# NOTE: despite the name this is not a true nested CV - the hyper-parameters are
# selected on the same samples that are later used for scoring, so the returned
# score is optimistic.
# @param X_data: attributes accepted by scikit-learn (e.g. pandas.DataFrame)
# @param Y_data: labels; a single-column frame/array is flattened to 1-D
# @param num_trials: int, number of random shuffles
# @param fold_num: int, number of folds of both the inner and the outer CV
# @param est_classifcation: scikit-learn estimator
# @param tuned_param: GridSearchCV param_grid; None or [] skips the grid search
# @param scoring: scikit-learn scoring name (e.g. 'roc_auc')
# @return (max_score, best_estimator): best mean outer-CV score and the estimator
#     that reached it (est_classifcation itself when no grid search was run)
def CrossValidationGridSearchNested(X_data, Y_data, num_trials, fold_num, est_classifcation, tuned_param, scoring):
    # Start below every possible score so that negative scorers
    # (e.g. 'neg_log_loss') can still produce a winner.
    max_score = float('-inf')
    best_estimator = est_classifcation
    is_tuned_param_empty = tuned_param is None or tuned_param == []

    # scikit-learn expects 1-D labels; flatten an (n, 1) column explicitly
    # instead of relying on its (silenced) automatic conversion.
    labels = np.asarray(Y_data)
    if labels.ndim == 2 and labels.shape[1] == 1:
        Y_data = labels.ravel()

    for i in range(num_trials):
        inner_cv = StratifiedKFold(n_splits=fold_num, random_state=i, shuffle=True)
        outer_cv = StratifiedKFold(n_splits=fold_num, random_state=i+1, shuffle=True)

        if is_tuned_param_empty:
            candidate = est_classifcation
        else:
            # Non-nested parameter search on the full data set
            clf = GridSearchCV(estimator=est_classifcation, param_grid=tuned_param, cv=inner_cv, scoring=scoring)
            clf.fit(X_data, Y_data)
            candidate = clf.best_estimator_

        # Score this trial's candidate on the outer split
        param_score = cross_val_score(candidate, X=X_data, y=Y_data, cv=outer_cv, scoring=scoring).mean()

        if param_score > max_score:
            max_score = param_score
            best_estimator = candidate

        progress = (i+1)/num_trials*100
        print(f'> progress = {progress}%')

    return (max_score, best_estimator)

#### Logistic Regression

In [8]:
# perf_counter() is monotonic, so the elapsed time cannot be distorted by
# system clock adjustments during the long search.
startTime = time.perf_counter()

# Model Settings
model = LogisticRegression()
tuned_parameters = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    'max_iter': [100, 200, 300, 400, 500]
}

# Number of random trials (10-fold CV, scored by ROC AUC)
NUM_TRIALS = 3
(max_score, logiRegrCVGS) = CrossValidationGridSearchNested(X, Y, NUM_TRIALS, 10, model, tuned_parameters, 'roc_auc')

print('Execution time =', time.perf_counter() - startTime)

> progress = 33.33333333333333%
> progress = 66.66666666666666%
> progress = 100.0%
Execution time = 188.47410702705383


#### XGBoost

In [9]:
# perf_counter() is monotonic, so the elapsed time cannot be distorted by
# system clock adjustments during the long search.
startTime = time.perf_counter()

# Model Settings
model = XGBClassifier()
tuned_parameters = {
    'max_depth': [3, 5],
    'learning_rate': [0.1, 0.3],
    'n_estimators': [100, 200],
    'gamma': [x/10 for x in range(0, 2)]  # i.e. 0.0 and 0.1
}

# Number of random trials (10-fold CV, scored by ROC AUC)
NUM_TRIALS = 3
(max_score, xgbcCVGS) = CrossValidationGridSearchNested(X, Y, NUM_TRIALS, 10, model, tuned_parameters, 'roc_auc')

print('Execution time =', time.perf_counter() - startTime)

> progress = 33.33333333333333%
> progress = 66.66666666666666%
> progress = 100.0%
Execution time = 428.78781604766846


#### Random Forest

In [10]:
# perf_counter() is monotonic, so the elapsed time cannot be distorted by
# system clock adjustments during the long search.
startTime = time.perf_counter()

# Model Settings
model = RandomForestClassifier()
tuned_parameters = {
    'n_estimators': [800, 1000],
    'criterion': ['entropy'],
    'max_depth': [None, 10]
}

# Number of random trials (10-fold CV, scored by ROC AUC)
NUM_TRIALS = 3
(max_score, randomForestCVGS) = CrossValidationGridSearchNested(X, Y, NUM_TRIALS, 10, model, tuned_parameters, 'roc_auc')

print('Execution time =', time.perf_counter() - startTime)

> progress = 33.33333333333333%
> progress = 66.66666666666666%
> progress = 100.0%
Execution time = 2367.319732904434


#### AdaBoost

In [11]:
# perf_counter() is monotonic, so the elapsed time cannot be distorted by
# system clock adjustments during the long search.
startTime = time.perf_counter()

# Model Settings
model = AdaBoostClassifier()
tuned_parameters = {
    'learning_rate': [0.1, 0.3],
    'n_estimators': [50, 600, 1000],
}

# Number of random trials (10-fold CV, scored by ROC AUC)
NUM_TRIALS = 3
(max_score, adaBoostCVGS) = CrossValidationGridSearchNested(X, Y, NUM_TRIALS, 10, model, tuned_parameters, 'roc_auc')

print('Execution time =', time.perf_counter() - startTime)

> progress = 33.33333333333333%
> progress = 66.66666666666666%
> progress = 100.0%
Execution time = 1300.339076757431


#### Model Fitting

In [12]:
# Fit every classifier twice on the full training set: once with default
# settings and once with the parameters selected by the grid search.
# Y is a single-column frame, while scikit-learn estimators expect 1-D labels;
# flatten it once instead of relying on the (silenced) automatic conversion.
Y_1d = Y.values.ravel()

print('>> Logistic Regression ...')
logiRegr = LogisticRegression()
logiRegr.fit(X, Y_1d)
logiRegrCVGS.fit(X, Y_1d)

print('>> XGBoost ...')
xgbc = XGBClassifier()
xgbc.fit(X, Y_1d)
xgbcCVGS.fit(X, Y_1d)

print('>> Random Forest ...')
randomForest = RandomForestClassifier()
randomForest.fit(X, Y_1d)
randomForestCVGS.fit(X, Y_1d)

print('>> AdaBoost ...')
adaBoost = AdaBoostClassifier()
adaBoost.fit(X, Y_1d)
adaBoostCVGS.fit(X, Y_1d)

>> Logistic Regression ...
>> XGBoost ...
>> Random Forest ...
>> AdaBoost ...


AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=0.1, n_estimators=1000, random_state=None)

In [13]:
# Name -> fitted model lookup; the '...CVGS' entries are the grid-searched variants.
modelsLUT = dict(
    logiRegr=logiRegr,
    logiRegrCVGS=logiRegrCVGS,
    xgbc=xgbc,
    xgbcCVGS=xgbcCVGS,
    randomForest=randomForest,
    randomForestCVGS=randomForestCVGS,
    adaBoost=adaBoost,
    adaBoostCVGS=adaBoostCVGS,
)

## 6. NBA 2018 Playoff Games Winning Prediction

#### Function - attriGen()

In [14]:
# Build the one-row attribute table used to predict a single game: Home/Away_A
# followed by the statistics of Team_A and Team_B averaged over their `period`
# most recent games played strictly before `date`.
# @param df: pandas.DataFrame (content of 'nba_preprocessed.csv')
# @param date: str in the format of 'YYYY-MM-DD'
# @param period: int (Number of previous games to be considered)
# @param Team_A, Team_B: str
# @param homeAway: int (None: look up the true value of the game played on `date`)
# @param featureSel: int (recipe forwarded to featureEng())
# @return X: pandas.DataFrame
# @raise ValueError: homeAway is None and the game is not present in df
def attriGen(df, date, period, Team_A, Team_B, homeAway=None, featureSel=None):
    # True Home/Away at the game day
    if homeAway is None:
        df_gameDay = df.loc[(df.Date_A == date) & (df.Team_A == Team_A) & (df.Team_B == Team_B), :]
        if df_gameDay.empty:
            raise ValueError(f'No game found on {date} for {Team_A} vs {Team_B}; pass homeAway explicitly')
        # Read the scalar explicitly; int() on a whole Series is deprecated in pandas.
        homeAway = int(df_gameDay['Home/Away_A'].iloc[0])

    # Date selections: only games played before the game day may be used
    df = df.loc[df.Date_A < date, :].reset_index(drop=True)
    X_A = df.loc[(df.Team_A == Team_A), :].sort_values(by=['Date_A'], ascending=False).iloc[0:period, 0:24].reset_index(drop=True)
    X_B = df.loc[(df.Team_A == Team_B), :].sort_values(by=['Date_A'], ascending=False).iloc[0:period, 0:24].reset_index(drop=True)

    # Drop unnecessary attributes
    colToDrop = ['Home/Away_A', 'Team_A', 'Date_A', 'W/L_A', 'Score_A', 'Opponent_A']
    X_A = X_A.drop(columns=colToDrop)
    X_B = X_B.drop(columns=colToDrop)

    # Team_B's rows come from its own perspective (suffix '_A'); relabel them as '_B'
    X_B = X_B.rename(columns=lambda x: x[0:-2] + '_B')

    # Get X = [Home/Away_A + mean(X_A) + mean(X_B)]
    X = pd.DataFrame(data=pd.concat([X_A.mean(), X_B.mean()])).transpose()
    X = pd.concat([pd.DataFrame(data={'Home/Away_A': [homeAway]}), X], axis=1)

    # Feature Engineering
    X = featureEng(X, featureSel)

    return X

#### Function - groundTruthGen()

In [15]:
# Return the recorded box score and the true W/L label of one played game.
# @param df: pandas.DataFrame (content of 'nba_preprocessed.csv')
# @param date: str in the format of 'YYYY-MM-DD'
# @param Team_A, Team_B: str
# @param featureSel: int (recipe forwarded to featureEng())
# @return X_groundTruth, Y_groundTruth: pandas.DataFrame
def groundTruthGen(df, date, Team_A, Team_B, featureSel=None):
    # Keep only the row(s) of the requested game
    isGame = (df.Date_A == date) & (df.Team_A == Team_A) & (df.Team_B == Team_B)
    game = df.loc[isGame, :].reset_index(drop=True)

    # The label is Team_A's win/lose flag
    Y_groundTruth = game[['W/L_A']].rename(columns={'W/L_A': 'Label'})

    # Identification and outcome columns are not attributes
    identifiers = [
        'Team_A', 'Date_A', 'W/L_A', 'Score_A', 'Opponent_A',
        'Team_B', 'Date_B', 'W/L_B', 'Home/Away_B', 'Score_B', 'Opponent_B'
    ]
    X_groundTruth = featureEng(game.drop(columns=identifiers), featureSel)

    return X_groundTruth, Y_groundTruth

#### Function - gameAttriGen()

In [16]:
# gameAttriGen() outputs X_attri, Y_truth for game prediction.
# @param dfFile: path (or file-like object) of 'nba_preprocessed.csv', read with pd.read_csv
# @param dateStart, dateEnd: str in the format of 'YYYY-MM-DD' (inclusive bounds)
# @param period: int (Number of previous games to be considered)
# @param Team_A, Team_B: str (If both are None, predict all games within the date range)
# @param featureSel: int (recipe forwarded to featureEng())
# @return X_attri, Y_truth: pandas.DataFrame, or (None, None) when no game is selected
def gameAttriGen(dfFile, dateStart, dateEnd, period=5, Team_A=None, Team_B=None, featureSel=None):
    df = pd.read_csv(dfFile)

    # Date selections
    df_sel = df.loc[(df.Date_A >= dateStart) & (df.Date_A <= dateEnd), :].reset_index(drop=True)

    # Optional team filters
    if Team_A:
        df_sel = df_sel.loc[df_sel.Team_A == Team_A, :]
    if Team_B:
        df_sel = df_sel.loc[df_sel.Opponent_A == Team_B, :]
    games = list(zip(df_sel['Date_A'], df_sel['Team_A'], df_sel['Opponent_A']))

    if not Team_A and not Team_B:
        # Delete duplicates: (Team_A vs Team_B) is the same as (Team_B vs Team_A).
        # The first row of a pairing only registers both orientations; every later
        # row of that pairing is kept, so a mirrored pair contributes one game.
        # Tuple keys cannot collide the way concatenated strings can.
        seen = set()
        kept = []
        for date, x, y in games:
            if (date, x, y) in seen:
                kept.append((date, x, y))
            else:
                seen.add((date, x, y))
                seen.add((date, y, x))
        games = kept

    # Attributes to be predicted and the matching true labels, one frame per game
    attriRows, truthRows = [], []
    for date, teamA, teamB in games:
        attriRows.append(attriGen(df, date, period, teamA, teamB, None, featureSel))
        _, Y_groundTruth = groundTruthGen(df, date, teamA, teamB, featureSel)
        truthRows.append(Y_groundTruth)

    if not attriRows:
        return None, None

    # Stack all rows at once instead of growing the frames inside the loop
    X_attri = pd.concat(attriRows, ignore_index=True)
    Y_truth = pd.concat(truthRows, ignore_index=True)

    return X_attri, Y_truth

#### Function - gamePrediction()

In [17]:
# gamePrediction() prints the prediction accuracy of every model for the selected games.
# @param dfFile: path (or file-like object) of 'nba_preprocessed.csv'
# @param modelsLUT: dict in the format of {'modelName': model}, models already fitted
# @param dateStart, dateEnd: str in the format of 'YYYY-MM-DD'
# @param period: int (Number of previous games to be considered)
# @param Team_A, Team_B: str (If both are None, predict all games within the date range)
# @param featureSel: int
# @return None
# @raise ValueError: no game matches the date range / team selection
def gamePrediction(dfFile, modelsLUT, dateStart, dateEnd, period=5, Team_A=None, Team_B=None, featureSel=None):
    X_attri, Y_truth = gameAttriGen(dfFile, dateStart, dateEnd, period, Team_A, Team_B, featureSel)
    if X_attri is None:
        # gameAttriGen() returns (None, None) when nothing was selected
        raise ValueError(f'No games to predict between {dateStart} and {dateEnd}')

    # Predict once per model and score against the true labels
    accuLUT = {}
    for name, model in modelsLUT.items():
        predicted = model.predict(X_attri)
        accuLUT[name] = accuracy_score(Y_truth, predicted)

    print('---------- Prediction Accuracy ----------')
    print('featureSel =', featureSel)
    for x in accuLUT:
        print(x, '=', accuLUT[x]*100, '%')
    print('------------------------------------')

#### Prediction results of 2018 playoff games
- AdaBoost w/ cross validation and grid search has the highest accuracy.
- Random Forest and AdaBoost are improved significantly by grid search.
- Logistic Regression and XGBoost are not improved by grid search.

In [18]:
# Score every fitted model on all games dated 2018-04-14 through 2018-06-08
# (no team filter), with the same period and feature recipe as in training.
dfFile = 'nba_preprocessed.csv'
dateStart, dateEnd = '2018-04-14', '2018-06-08'
period, featureSel = 5, 3
Team_A = Team_B = None

# W/L prediction
gamePrediction(
    dfFile,
    modelsLUT,
    dateStart,
    dateEnd,
    period=period,
    Team_A=Team_A,
    Team_B=Team_B,
    featureSel=featureSel,
)

  if diff:
  if diff:
  if diff:
  if diff:


---------- Prediction Accuracy ----------
featureSel = 3
logiRegr = 71.95121951219512 %
logiRegrCVGS = 71.95121951219512 %
xgbc = 73.17073170731707 %
xgbcCVGS = 73.17073170731707 %
randomForest = 60.97560975609756 %
randomForestCVGS = 71.95121951219512 %
adaBoost = 69.51219512195121 %
adaBoostCVGS = 76.82926829268293 %
------------------------------------
