In [None]:
# Author: Hari Raval
# Course: COS 424
# Final Project 

In [None]:
# Import all necessary packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import csv
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import linear_model
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils import resample
import pickle

In [None]:
# read in all csv files
# DATA_DIR is the single place to edit when the NBA data lives in a different folder
DATA_DIR = "/Users/HariRaval/Desktop/COS-424/Final-Project/NBA_Data"

game_details_df = pd.read_csv(DATA_DIR + "/games_details.csv")
games_df = pd.read_csv(DATA_DIR + "/games.csv")
players_df = pd.read_csv(DATA_DIR + "/players.csv")
ranking_df = pd.read_csv(DATA_DIR + "/ranking.csv")
teams_df = pd.read_csv(DATA_DIR + "/teams.csv")
# alternate games file (disabled):
# games_df = pd.read_csv(DATA_DIR + "/game2.csv")


In [None]:
# clean up the game data and extract the outcome values

def _minutes_to_decimal(minutes_played):
    """Convert a minutes-played value such as "24:30" into decimal minutes (24.5).

    The part before the colon is read as minutes and the part after it as seconds.
    Values without a colon are treated as a plain number of minutes.
    """
    minutes_part, _, seconds_part = str(minutes_played).partition(":")
    seconds = float(seconds_part) if seconds_part else 0.0
    return float(minutes_part) + seconds / 60.0


games_df = games_df.drop(["GAME_STATUS_TEXT", "GAME_DATE_EST"], axis = 1)
games_df = games_df.dropna()
# the label is pulled out after dropna so it stays aligned with the remaining rows
game_result = games_df["HOME_TEAM_WINS"]
games_df = games_df.drop(["HOME_TEAM_WINS"], axis = 1)

game_details_df = game_details_df.drop(["TEAM_ABBREVIATION", "TEAM_CITY", "PLAYER_NAME", "START_POSITION","COMMENT"], axis = 1)
game_details_df = game_details_df.dropna()
# the regression target is pulled out after dropna so it stays aligned with the remaining rows
game_details_result_df = game_details_df["PLUS_MINUS"]
game_details_df = game_details_df.drop(["PLUS_MINUS"], axis = 1)

# convert the minutes to a decimal numeric value
# (replacing ":" with "." would read the seconds as hundredths of a minute, e.g. "24:30" -> 24.30)
game_details_df["MIN"] = game_details_df["MIN"].map(_minutes_to_decimal)


In [None]:
# Key each frame by its lookup column. set_index removes the column from the frame
# and names the resulting index after it, so no separate index.name assignment is needed.

# games are looked up by GAME_ID
games_df = games_df.set_index("GAME_ID")

# per-player game details are looked up by PLAYER_ID
game_details_df = game_details_df.set_index("PLAYER_ID")

In [None]:
# Model 1A: Gaussian Naive Bayes (predicting game outcome)

def gaussian_naive_bayes_games_outcome(games_X_train, games_y_train, games_X_test, games_y_test):
    """Train a default GaussianNB classifier and print its accuracy and F1 score on the test split.

    Parameters:
        games_X_train, games_y_train: training features and labels passed to fit().
        games_X_test, games_y_test: held-out features and labels used for scoring.

    Returns:
        A tuple (accuracy, fitted GaussianNB model, predictions for games_X_test).
    """
    # Gaussian naive bayes has no hyper parameters tuned here
    model = GaussianNB()
    model.fit(games_X_train, games_y_train)
    test_predictions = model.predict(games_X_test)

    accuracy = accuracy_score(games_y_test, test_predictions)
    # F1 score source: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html
    # label 1 is scored as the positive class
    f1_value = f1_score(games_y_test, test_predictions, zero_division = 1, pos_label = 1)

    accuracy_percent = round(accuracy * 100, 4)
    print("Gaussian Naive Bayes Accuracy (no hyper tuning): ", accuracy_percent, "%")
    print("Gaussian Naive Bayes F1 Score (no hyper tuning): ", f1_value)
    print("-------------------------------------")

    return (accuracy, model, test_predictions)

In [None]:
# Model 2A: Linear SVM (predicting game outcome)

def svm_game_outcomes(games_X_train, games_y_train, games_X_test, games_y_test):
    """Fit a LinearSVC with and without grid-searched hyper parameters and print test metrics.

    Parameters:
        games_X_train, games_y_train: training features and labels.
        games_X_test, games_y_test: held-out features and labels used for scoring.

    Returns:
        A tuple (accuracy of the tuned model, fitted GridSearchCV object,
        tuned predictions for games_X_test).
    """
    # linear svm with no hyper parameter tuning
    svm_lin = LinearSVC()
    svm_lin.fit(games_X_train, games_y_train)
    predicted_values = svm_lin.predict(games_X_test)

    score = accuracy_score(games_y_test, predicted_values)
    # F1 score source: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html
    f1 = f1_score(games_y_test, predicted_values, zero_division = 1, pos_label = 1)
    print("SVM (With Linear Kernel) Accuracy (no hyper tuning): ", round(score * 100, 4), "%")
    print("SVM (with Linear Kernel) F1 Score (no hyper tuning): ", f1 )

    # linear svm with hyper parameter tuning over the regularization strength C and the loss function
    parameters = {'C': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0], 'loss' : ['hinge','squared_hinge']}
    svm_hyper_tuned = GridSearchCV(svm_lin, parameters)
    svm_hyper_tuned.fit(games_X_train, games_y_train)
    predicted_values = svm_hyper_tuned.predict(games_X_test)

    score = accuracy_score(games_y_test, predicted_values)
    # F1 score source: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html
    f1 = f1_score(games_y_test, predicted_values, zero_division = 1, pos_label = 1)
    print("SVM (With Linear Kernel) Accuracy (hyper tuning): ", round(score * 100, 4), "%")
    print("SVM (with Linear Kernel) F1 Score (hyper tuning): ", f1 )
    # the tuned parameter is C (LinearSVC has no alpha), so label it as such
    print("Hypertuned C value: ", svm_hyper_tuned.best_estimator_.C)
    print("Hypertuned loss value: ", svm_hyper_tuned.best_estimator_.loss)
    print("-------------------------------------")

    return (score, svm_hyper_tuned, predicted_values)

In [None]:
# Model 3A: Logistic Regression (predicting game outcome)

def logistic_regression_game_outcomes(games_X_train, games_y_train, games_X_test, games_y_test):
    """Fit liblinear logistic regression with and without a grid search over C and print test metrics.

    Parameters:
        games_X_train, games_y_train: training features and labels.
        games_X_test, games_y_test: held-out features and labels used for scoring.

    Returns:
        A tuple (accuracy of the tuned model, fitted GridSearchCV object,
        tuned predictions for games_X_test).
    """

    def report_test_metrics(estimator, tuning_label):
        # score a fitted estimator on the test split and print accuracy and F1
        test_predictions = estimator.predict(games_X_test)
        accuracy = accuracy_score(games_y_test, test_predictions)
        # F1 score source: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html
        f1_value = f1_score(games_y_test, test_predictions, zero_division = 1, pos_label = 1)
        print("Logistic Regression Accuracy (" + tuning_label + "): ", round(accuracy * 100, 4), "%")
        print("Logistic Regression F1 Score (" + tuning_label + "): ", f1_value)
        return accuracy, test_predictions

    # baseline model: default regularization strength
    baseline_model = LogisticRegression(solver = 'liblinear')
    baseline_model.fit(games_X_train, games_y_train)
    report_test_metrics(baseline_model, "no hyper tuning")

    # tuned model: grid search over the regularization strength C
    c_grid = {'C': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]}
    tuned_search = GridSearchCV(baseline_model, c_grid)
    tuned_search.fit(games_X_train, games_y_train)
    tuned_accuracy, tuned_predictions = report_test_metrics(tuned_search, "hyper tuning")
    print("Hypertuned C value: ", tuned_search.best_estimator_.C)
    print("-------------------------------------")

    return (tuned_accuracy, tuned_search, tuned_predictions)

In [None]:
# Model 4A: Random Forest (predicting game outcome)

def random_forest_game_outcomes(games_X_train, games_y_train, games_X_test, games_y_test, random_state = 42):
    """Fit a random forest with and without grid-searched hyper parameters and print test metrics.

    Parameters:
        games_X_train, games_y_train: training features and labels.
        games_X_test, games_y_test: held-out features and labels used for scoring.
        random_state: seed handed to RandomForestClassifier so repeated runs give the
            same forest. The default 42 matches the seed this notebook passes to
            train_test_split; pass None for an unseeded forest.

    Returns:
        A tuple (accuracy of the tuned model, fitted GridSearchCV object,
        tuned predictions for games_X_test).
    """

    # random forest with no hyper parameter tuning
    random_forest_classifier = RandomForestClassifier(random_state = random_state)
    random_forest_classifier.fit(games_X_train, games_y_train)
    predicted_values = random_forest_classifier.predict(games_X_test)

    score = accuracy_score(games_y_test, predicted_values)
    # F1 score source: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html
    f1 = f1_score(games_y_test, predicted_values, zero_division = 1, pos_label = 1)
    print("Random Forest Accuracy (no hyper tuning): ", round(score * 100, 4), "%")
    print("Random Forest F1 Score (no hyper tuning): ", f1 )

    # random forest with hyper parameter tuning over the number of trees and the tree depth
    parameters = {'n_estimators': [100, 150, 200], 'max_depth' : [None, 10, 20, 30] }
    random_forest_classifier_hyper_tuned = GridSearchCV(random_forest_classifier, parameters)
    random_forest_classifier_hyper_tuned.fit(games_X_train, games_y_train)
    predicted_values = random_forest_classifier_hyper_tuned.predict(games_X_test)

    score = accuracy_score(games_y_test, predicted_values)
    # F1 score source: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html
    f1 = f1_score(games_y_test, predicted_values, zero_division = 1, pos_label = 1)
    print("Random Forest Accuracy (hyper tuning): ", round(score * 100, 4), "%")
    print("Random Forest F1 Score (hyper tuning): ", f1)
    print("Hypertuned n_estimators value: ", random_forest_classifier_hyper_tuned.best_estimator_.n_estimators)
    print("Hypertuned max_depth value: ", random_forest_classifier_hyper_tuned.best_estimator_.max_depth)
    print("-------------------------------------")

    return (score,random_forest_classifier_hyper_tuned, predicted_values)

In [None]:
# Model 1B: Plain Linear Regression (predicting plus/minus outcome)

def linear_regression_plus_minus(game_details_X_train, game_details_y_train, game_details_X_test, game_details_y_test):
    """Fit ordinary least squares regression and print R-squared, MSE and MAE on the test split.

    Parameters:
        game_details_X_train, game_details_y_train: training features and targets.
        game_details_X_test, game_details_y_test: held-out features and targets used for scoring.

    Returns:
        A tuple (mean absolute error on the test split, fitted LinearRegression model,
        predictions for game_details_X_test).
    """
    linear_reg = linear_model.LinearRegression()
    linear_reg.fit(game_details_X_train, game_details_y_train)
    predictions = linear_reg.predict(game_details_X_test)

    # the mean absolute error is both printed and returned, so compute it once
    error = mean_absolute_error(game_details_y_test, predictions)

    print("Linear Regression R-Squared Value (no hyper tuning): ", r2_score(game_details_y_test, predictions))
    print("Linear Regression Mean Squared Error Loss (no hyper tuning): ", mean_squared_error(game_details_y_test, predictions))
    print("Linear Regression Mean Absolute Error: (no hyper tuning) ", error)
    print("-------------------------------------")

    return (error, linear_reg, predictions)

In [None]:
# Model 2B: Lasso Regression (predicting plus/minus outcome)

def lasso_plus_minus(game_details_X_train, game_details_y_train, game_details_X_test, game_details_y_test):
    """Fit lasso regression with and without a grid search over alpha and print test metrics.

    Parameters:
        game_details_X_train, game_details_y_train: training features and targets.
        game_details_X_test, game_details_y_test: held-out features and targets used for scoring.

    Returns:
        A tuple (mean absolute error of the tuned model on the test split,
        fitted GridSearchCV object, tuned predictions for game_details_X_test).
    """
    # lasso regression with no hyper tuning
    lasso = linear_model.Lasso()
    lasso.fit(game_details_X_train, game_details_y_train)
    predictions = lasso.predict(game_details_X_test)
    print("Lasso R-Squared Value (no hyper tuning): ", r2_score(game_details_y_test, predictions))
    print("Lasso Mean Squared Error Loss (no hyper tuning): ", mean_squared_error(game_details_y_test, predictions))
    print("Lasso Mean Absolute Error: (no hyper tuning) ", mean_absolute_error(game_details_y_test, predictions))

    # lasso regression with hyper tuning of the regularization strength alpha
    parameters = {'alpha': [0.0001,0.001,0.01,0.1,1]}
    lasso_hyper_tuned = GridSearchCV(lasso, parameters)
    lasso_hyper_tuned.fit(game_details_X_train, game_details_y_train)
    predictions = lasso_hyper_tuned.predict(game_details_X_test)

    # the tuned mean absolute error is both printed and returned, so compute it once
    error = mean_absolute_error(game_details_y_test, predictions)

    print("Lasso R-Squared Value (hyper tuning): ", r2_score(game_details_y_test, predictions))
    print("Lasso Mean Squared Error Loss (hyper tuning): ", mean_squared_error(game_details_y_test, predictions))
    print("Lasso Mean Absolute Error: (hyper tuning) ", error)
    print("Lasso Hypertuned alpha value for regularization: ", lasso_hyper_tuned.best_estimator_.alpha)
    print("-------------------------------------")

    return (error, lasso_hyper_tuned, predictions)

In [None]:
# Model 3B: Ridge Regression (predicting plus/minus outcome)

def ridge_plus_minus(game_details_X_train,game_details_y_train, game_details_X_test, game_details_y_test):
    """Fit ridge regression with and without a grid search over alpha and print test metrics.

    Parameters:
        game_details_X_train, game_details_y_train: training features and targets.
        game_details_X_test, game_details_y_test: held-out features and targets used for scoring.

    Returns:
        A tuple (mean absolute error of the tuned model on the test split,
        fitted GridSearchCV object, tuned predictions for game_details_X_test).
    """
    # baseline: ridge regression with its default alpha
    baseline_ridge = linear_model.Ridge()
    baseline_ridge.fit(game_details_X_train, game_details_y_train)
    baseline_predictions = baseline_ridge.predict(game_details_X_test)

    baseline_r2 = r2_score(game_details_y_test, baseline_predictions)
    baseline_mse = mean_squared_error(game_details_y_test, baseline_predictions)
    baseline_mae = mean_absolute_error(game_details_y_test, baseline_predictions)
    print("Ridge R-Squared Value (no hyper tuning): ", baseline_r2)
    print("Ridge Mean Squared Error Loss (no hyper tuning): ", baseline_mse)
    print("Ridge Mean Absolute Error (no hyper tuning): ", baseline_mae)

    # tuned: grid search over the regularization strength alpha
    alpha_grid = {'alpha': [0.00001,0.0001,0.001,0.01,0.1,1.0]}
    tuned_search = GridSearchCV(baseline_ridge, alpha_grid)
    tuned_search.fit(game_details_X_train, game_details_y_train)
    tuned_predictions = tuned_search.predict(game_details_X_test)

    tuned_r2 = r2_score(game_details_y_test, tuned_predictions)
    tuned_mse = mean_squared_error(game_details_y_test, tuned_predictions)
    tuned_mae = mean_absolute_error(game_details_y_test, tuned_predictions)
    print("Ridge R-Squared Value (hyper tuning): ", tuned_r2)
    print("Ridge Mean Squared Error Loss (hyper tuning): ", tuned_mse)
    print("Ridge Mean Absolute Error: (hyper tuning) ", tuned_mae)
    print("Ridge Hypertuned alpha value for regularization: ", tuned_search.best_estimator_.alpha)
    print("-------------------------------------")

    return (tuned_mae, tuned_search, tuned_predictions)

In [None]:
# Model 4B: Elastic Net (predicting plus/minus outcome)

def elastic_plus_minus(game_details_X_train, game_details_y_train, game_details_X_test, game_details_y_test):
    """Fit elastic net regression with and without a grid search and print test metrics.

    Parameters:
        game_details_X_train, game_details_y_train: training features and targets.
        game_details_X_test, game_details_y_test: held-out features and targets used for scoring.

    Returns:
        A tuple (mean absolute error of the tuned model on the test split,
        fitted GridSearchCV object, tuned predictions for game_details_X_test).
    """
    # baseline: elastic net with its default alpha and l1_ratio
    baseline_elastic = linear_model.ElasticNet()
    baseline_elastic.fit(game_details_X_train, game_details_y_train)
    baseline_predictions = baseline_elastic.predict(game_details_X_test)

    baseline_r2 = r2_score(game_details_y_test, baseline_predictions)
    baseline_mse = mean_squared_error(game_details_y_test, baseline_predictions)
    baseline_mae = mean_absolute_error(game_details_y_test, baseline_predictions)
    print("Elastic R-Squared Value (no hyper tuning): ", baseline_r2)
    print("Elastic Mean Squared Error Loss (no hyper tuning): ", baseline_mse)
    print("Elastic Mean Absolute Error (no hyper tuning): ", baseline_mae)

    # tuned: grid search over alpha and l1_ratio
    # l1_ratio of 0 or 1 is avoided since that is the same as ridge or lasso
    search_grid = {
        'alpha': [0.0001,0.001,0.01,0.1,1],
        'l1_ratio': [0.1,0.25,0.5,0.75,0.9],
    }
    tuned_search = GridSearchCV(baseline_elastic, search_grid)
    tuned_search.fit(game_details_X_train, game_details_y_train)
    tuned_predictions = tuned_search.predict(game_details_X_test)

    tuned_r2 = r2_score(game_details_y_test, tuned_predictions)
    tuned_mse = mean_squared_error(game_details_y_test, tuned_predictions)
    tuned_mae = mean_absolute_error(game_details_y_test, tuned_predictions)
    print("Elastic R-Squared Value (hyper tuning): ", tuned_r2)
    print("Elastic Mean Squared Error Loss (hyper tuning): ", tuned_mse)
    print("Elastic Mean Absolute Error: (hyper tuning) ", tuned_mae)
    print("Elastic Hypertuned alpha value for regularization: ", tuned_search.best_estimator_.alpha)
    print("Elastic Hypertuned l1 ratio value for regularization: ", tuned_search.best_estimator_.l1_ratio)
    print("-------------------------------------")

    return (tuned_mae, tuned_search, tuned_predictions)

In [None]:
# perform bootstrapping and collect three performance measures per bootstrap sample for regression methods
def perform_bootstrapping_regression(number_bootstraps, game_details_df, game_detail_results_df, model):
    """Refit `model` on bootstrap samples and score each fit on out-of-bag rows.

    Rows are selected by position, so the features and targets must be in the same
    row order; their index labels are not used and may be non-unique or differ.

    Parameters:
        number_bootstraps: number of bootstrap iterations to run.
        game_details_df: pandas object holding the features (one row per observation).
        game_detail_results_df: pandas object holding the targets, same row order as the features.
        model: estimator exposing fit() and predict(); it is refit in place on every iteration.

    Returns:
        A tuple of three ascending-sorted lists with one entry per bootstrap iteration:
        (R-squared values, mean squared errors, mean absolute errors).

    Raises:
        ValueError: if features and targets differ in length, or a bootstrap sample
            leaves no out-of-bag rows to test on.
    """
    number_rows = len(game_details_df)
    if len(game_detail_results_df) != number_rows:
        raise ValueError("features and targets must contain the same number of rows")

    number_train_samples = int(number_rows * 0.60)
    number_test_samples = int(number_rows * 0.40)
    all_positions = np.arange(number_rows)

    # lists to store computed statistics
    r_squared_list = []
    mean_squared_error_list = []
    mean_absolute_error_list = []
    # fit model and compute performance statistics per bootstrap sample
    # Code adapted from: https://machinelearningmastery.com/calculate-bootstrap-confidence-intervals-machine-learning-results-python/
    for i in range(number_bootstraps):
        print("Bootstrap Sample: ", i)
        # draw the training rows with replacement
        train_positions = resample(all_positions, n_samples = number_train_samples)

        # every row that was not drawn for training is out-of-bag and eligible for testing
        out_of_bag_mask = np.ones(number_rows, dtype = bool)
        out_of_bag_mask[train_positions] = False
        out_of_bag_positions = all_positions[out_of_bag_mask]
        if out_of_bag_positions.size == 0:
            raise ValueError("bootstrap sample left no out-of-bag rows to test on")
        test_positions = resample(out_of_bag_positions, n_samples = number_test_samples)

        boot_train_x = game_details_df.iloc[train_positions]
        boot_train_y = game_detail_results_df.iloc[train_positions]
        boot_test_x = game_details_df.iloc[test_positions]
        boot_test_y = game_detail_results_df.iloc[test_positions]

        # fit and predict on the current iteration bootstrap
        model.fit(boot_train_x, boot_train_y)
        predictions = model.predict(boot_test_x)

        # compute and store the appropriate performance statistics
        r_squared_list.append(r2_score(boot_test_y, predictions))
        mean_squared_error_list.append(mean_squared_error(boot_test_y, predictions))
        mean_absolute_error_list.append(mean_absolute_error(boot_test_y, predictions))

    r_squared_list.sort()
    mean_squared_error_list.sort()
    mean_absolute_error_list.sort()

    return (r_squared_list, mean_squared_error_list, mean_absolute_error_list)

In [None]:
# perform bootstrapping and collect accuracy and F1 score per bootstrap sample for classification methods
def perform_bootstrapping_classification(number_bootstraps, game_details_df, game_detail_results_df, model):
    """Refit `model` on bootstrap samples and score each fit on out-of-bag rows.

    Rows are selected by position, so the features and labels must be in the same
    row order; their index labels are not used and may be non-unique or differ.

    Parameters:
        number_bootstraps: number of bootstrap iterations to run.
        game_details_df: pandas object holding the features (one row per observation).
        game_detail_results_df: pandas object holding the labels, same row order as the features.
        model: estimator exposing fit() and predict(); it is refit in place on every iteration.

    Returns:
        A tuple of two ascending-sorted lists with one entry per bootstrap iteration:
        (accuracies, F1 scores with label 1 as the positive class).

    Raises:
        ValueError: if features and labels differ in length, or a bootstrap sample
            leaves no out-of-bag rows to test on.
    """
    number_rows = len(game_details_df)
    if len(game_detail_results_df) != number_rows:
        raise ValueError("features and labels must contain the same number of rows")

    number_train_samples = int(number_rows * 0.60)
    number_test_samples = int(number_rows * 0.40)
    all_positions = np.arange(number_rows)

    # lists to store computed statistics
    accuracy = []
    f_scores = []

    # fit model and compute performance statistics per bootstrap sample
    # Code adapted from: https://machinelearningmastery.com/calculate-bootstrap-confidence-intervals-machine-learning-results-python/
    for i in range(number_bootstraps):
        print("Bootstrap Sample: ", i)
        # draw the training rows with replacement
        train_positions = resample(all_positions, n_samples = number_train_samples)

        # every row that was not drawn for training is out-of-bag and eligible for testing
        out_of_bag_mask = np.ones(number_rows, dtype = bool)
        out_of_bag_mask[train_positions] = False
        out_of_bag_positions = all_positions[out_of_bag_mask]
        if out_of_bag_positions.size == 0:
            raise ValueError("bootstrap sample left no out-of-bag rows to test on")
        test_positions = resample(out_of_bag_positions, n_samples = number_test_samples)

        boot_train_x = game_details_df.iloc[train_positions]
        boot_train_y = game_detail_results_df.iloc[train_positions]
        boot_test_x = game_details_df.iloc[test_positions]
        boot_test_y = game_detail_results_df.iloc[test_positions]

        # fit and predict on the current iteration bootstrap
        model.fit(boot_train_x, boot_train_y)
        predictions = model.predict(boot_test_x)

        # compute and store the appropriate performance statistics
        accuracy.append(accuracy_score(boot_test_y, predictions))
        f_scores.append(f1_score(boot_test_y, predictions, zero_division = 1, pos_label = 1))

    accuracy.sort()
    f_scores.sort()

    return (accuracy, f_scores)

In [None]:
# compute the confidence interval of the provided measurements 
def compute_confidence_interval(alpha, data, lower_bound = 0.0, upper_bound = 1.0):
    """Return the central percentile interval of `data` covering a fraction `alpha` of the values.

    Parameters:
        alpha: coverage of the interval as a fraction, e.g. 0.95 for a 95% interval.
        data: sequence of numeric measurements, e.g. one score per bootstrap sample.
        lower_bound: smallest value the lower end may take; None disables the clamp.
        upper_bound: largest value the upper end may take; None disables the clamp.
            The defaults clamp to [0, 1], which fits accuracy and F1 scores. Pass None
            for measures that are not confined to that range, such as mean squared
            error, mean absolute error or a negative R-squared.

    Returns:
        A tuple (lower, upper) with the two ends of the interval.
    """
    # code below adapted from https://machinelearningmastery.com/calculate-bootstrap-confidence-intervals-machine-learning-results-python/
    tail = (1.0 - alpha) / 2.0
    lower = np.percentile(data, tail * 100)
    upper = np.percentile(data, (alpha + tail) * 100)

    if lower_bound is not None:
        lower = max(lower_bound, lower)
    if upper_bound is not None:
        upper = min(upper_bound, upper)

    return (lower, upper)

In [None]:
# determine the optimal k value for the games dataframe:

k_vals = list(range(1,18,2))

naive_bayes_scores = []
svm_scores = []
log_regression_scores = []
rand_forest_scores = []

# the split is seeded, so it is the same for every k; compute it once instead of once per iteration
games_X_train_full, games_X_test_full, games_y_train, games_y_test = train_test_split(games_df, game_result, random_state=42)

# iterate over all k values and select k features, hypertune the model, and save the accuracies per model
for curr_k in k_vals:

    # rank the features on the training split only, then keep the same k columns in both splits
    chi2_features = SelectKBest(chi2, k = curr_k)
    chi2_features.fit(games_X_train_full, games_y_train)
    selected_columns = games_X_train_full.columns[chi2_features.get_support(indices=True)]
    games_X_train = games_X_train_full[selected_columns]
    games_X_test = games_X_test_full[selected_columns]

    # each model function returns its accuracy as the first element of its result tuple
    score1 = gaussian_naive_bayes_games_outcome(games_X_train, games_y_train, games_X_test, games_y_test)[0]
    naive_bayes_scores.append(score1)

    score2 = svm_game_outcomes(games_X_train, games_y_train, games_X_test, games_y_test)[0]
    svm_scores.append(score2)

    score3 = logistic_regression_game_outcomes(games_X_train, games_y_train, games_X_test, games_y_test)[0]
    log_regression_scores.append(score3)

    score4 = random_forest_game_outcomes(games_X_train, games_y_train, games_X_test, games_y_test)[0]
    rand_forest_scores.append(score4)

In [None]:
# Tabulate the accuracy of every classifier against the number of selected features (games dataframe)

column_names = ["Number of Features", "Naive Bayes Acc.", "Linear SVM Acc.", "Logistic Regression Acc.", "Random Forest Acc."]

# one row per candidate k; the score lists were filled in the same order as k_vals
accuracies_per_k = pd.DataFrame(
    {
        "Number of Features": k_vals,
        "Naive Bayes Acc.": naive_bayes_scores,
        "Linear SVM Acc.": svm_scores,
        "Logistic Regression Acc.": log_regression_scores,
        "Random Forest Acc.": rand_forest_scores,
    },
    columns = column_names,
)

accuracies_per_k

In [None]:
# determine the optimal k value for the game details dataframe:

k_vals = list(range(1,22,2))

linear_regression_errors = []
ridge_errors = []
lasso_errors = []
elastic_errors = []

# the split is seeded, so it is the same for every k; compute it once instead of once per iteration
game_details_X_train_full, game_details_X_test_full, game_details_y_train, game_details_y_test = train_test_split(game_details_df, game_details_result_df, random_state=42)

# iterate over all k values and select k features, hypertune the model, and save the absolute errors per model
for curr_k in k_vals:

    # rank the features on the training split only, then keep the same k columns in both splits
    # NOTE(review): chi2 is a classification score and the target here is continuous;
    # a regression score such as f_regression may be a better fit - confirm before changing
    chi2_features = SelectKBest(chi2, k = curr_k)
    chi2_features.fit(game_details_X_train_full, game_details_y_train)
    selected_columns = game_details_X_train_full.columns[chi2_features.get_support(indices=True)]
    game_details_X_train = game_details_X_train_full[selected_columns]
    game_details_X_test = game_details_X_test_full[selected_columns]

    # each model function returns its mean absolute error as the first element of its result tuple
    score1 = linear_regression_plus_minus(game_details_X_train, game_details_y_train, game_details_X_test, game_details_y_test)[0]
    linear_regression_errors.append(score1)

    score2 = lasso_plus_minus(game_details_X_train, game_details_y_train, game_details_X_test, game_details_y_test)[0]
    lasso_errors.append(score2)

    score3 = ridge_plus_minus(game_details_X_train, game_details_y_train, game_details_X_test, game_details_y_test)[0]
    ridge_errors.append(score3)

    score4 = elastic_plus_minus(game_details_X_train, game_details_y_train, game_details_X_test, game_details_y_test)[0]
    elastic_errors.append(score4)

In [None]:
# Tabulate the mean absolute error of every regressor against the number of selected features (game details dataframe)

column_names = ["Number of Features", "Linear Regression Errors", "Lasso Errors", "Ridge Errors", "Elastic Net Errors"]

# one row per candidate k; the error lists were filled in the same order as k_vals
errors_per_k = pd.DataFrame(
    {
        "Number of Features": k_vals,
        "Linear Regression Errors": linear_regression_errors,
        "Lasso Errors": lasso_errors,
        "Ridge Errors": ridge_errors,
        "Elastic Net Errors": elastic_errors,
    },
    columns = column_names,
)

errors_per_k


In [None]:
# create new data frames that use the optimal number of features

# number of features kept for each task
# NOTE(review): 12 is not one of the odd k values tried by the k search above - confirm it is intended
GAMES_NUM_FEATURES = 12
GAME_DETAILS_NUM_FEATURES = 15

games_X_train, games_X_test, games_y_train, games_y_test = train_test_split(games_df, game_result, random_state=42)
game_details_X_train, game_details_X_test, game_details_y_train, game_details_y_test = train_test_split(game_details_df, game_details_result_df, random_state=42)

# Perform feature selection on the games dataframe for training and testing
# (the selector is fit on the training split only; the chosen columns are then reused everywhere)
chi2_features = SelectKBest(chi2, k = GAMES_NUM_FEATURES)
chi2_features.fit(games_X_train, games_y_train)
games_selected_columns = games_X_train.columns[chi2_features.get_support(indices=True)]
games_X_train = games_X_train[games_selected_columns]
games_X_test = games_X_test[games_selected_columns]

# Perform feature selection on the entire games dataset
games_df = games_df[games_selected_columns]

# Perform feature selection on the game details dataframe for training and testing
chi2_features = SelectKBest(chi2, k = GAME_DETAILS_NUM_FEATURES)
chi2_features.fit(game_details_X_train, game_details_y_train)
game_details_selected_columns = game_details_X_train.columns[chi2_features.get_support(indices=True)]
game_details_X_train = game_details_X_train[game_details_selected_columns]
game_details_X_test = game_details_X_test[game_details_selected_columns]

# Perform feature selection on the entire game details dataframe for use in bootstrapping
game_details_df = game_details_df[game_details_selected_columns]

In [None]:
# Perform data normalization before running final models

# Fit each scaler on the training split only and reuse those statistics for the test split,
# so both splits are scaled identically and no test information enters the preprocessing.
scaler = StandardScaler()
games_X_train = pd.DataFrame(scaler.fit_transform(games_X_train), index = games_X_train.index, columns = games_X_train.columns)
games_X_test = pd.DataFrame(scaler.transform(games_X_test), index = games_X_test.index, columns = games_X_test.columns)

# a fresh scaler for the game details features, which have their own columns
scaler = StandardScaler()
game_details_X_train = pd.DataFrame(scaler.fit_transform(game_details_X_train), index = game_details_X_train.index, columns = game_details_X_train.columns)
game_details_X_test = pd.DataFrame(scaler.transform(game_details_X_test), index = game_details_X_test.index, columns = game_details_X_test.columns)

# drop the GAME ID column since it is no longer needed
# (errors = "ignore" avoids a KeyError when GAME_ID is not among the remaining columns)
game_details_X_train = game_details_X_train.drop(columns = ["GAME_ID"], errors = "ignore")
game_details_X_test = game_details_X_test.drop(columns = ["GAME_ID"], errors = "ignore")

In [None]:
# Fit and evaluate every game outcome classifier on the normalized data

# all four classifiers receive the same (X_train, y_train, X_test, y_test) split
game_outcome_split = (games_X_train, games_y_train, games_X_test, games_y_test)

gaussian_game_outcome = gaussian_naive_bayes_games_outcome(*game_outcome_split)

svm_game_outcome = svm_game_outcomes(*game_outcome_split)

logistic_reg_game_outcome = logistic_regression_game_outcomes(*game_outcome_split)

random_forest_game_outcome = random_forest_game_outcomes(*game_outcome_split)


In [None]:
# perform bootstrapping to build confidence intervals for the classification task

# re-key the outcome labels with the index of the (feature-selected) games
# dataframe so features and labels can be looked up by the same index
game_result = (
    pd.DataFrame(game_result)
    .assign(idx=games_df.index.values)
    .set_index("idx")
)

# (label used in the printed report, object stored at position [1] of each
# model function's result). Position [1] is the object the feature importance
# cell reads `.best_estimator_` from for the svm / logistic / random forest runs.
game_outcome_models = (
    ("naive bayes", gaussian_game_outcome[1]),
    ("linear svm", svm_game_outcome[1]),
    ("logistic regression", logistic_reg_game_outcome[1]),
    ("random forest", random_forest_game_outcome[1]),
)

# NOTE(review): games_df is feature-selected but NOT standardized, while the
# models above were fitted on standardized splits -- confirm that
# perform_bootstrapping_classification scales/refits internally.
for model_label, fitted_model in game_outcome_models:
    # 150 bootstrap rounds per model
    accuracy, f1score = perform_bootstrapping_classification(150, games_df, game_result["HOME_TEAM_WINS"], fitted_model)
    # 95% interval bounds (lower, upper) for each metric
    accuracy_l, accuracy_2 = compute_confidence_interval(0.95, accuracy)
    f1_score_1, f1_score_2 = compute_confidence_interval(0.95, f1score)
    print("95%% Confidence Interval for %s accuracy Value: [%.4f, %.4f] " % (model_label, accuracy_l, accuracy_2))
    print("95%% Confidence Interval for %s F1 score: [%.4f, %.4f] " % (model_label, f1_score_1, f1_score_2))

In [None]:
# Fit and evaluate every plus minus regressor on the normalized data

# all four regressors receive the same (X_train, y_train, X_test, y_test) split
plus_minus_split = (game_details_X_train, game_details_y_train, game_details_X_test, game_details_y_test)

linear_plus_minus_outcome = linear_regression_plus_minus(*plus_minus_split)

lasso_plus_minus_outcome = lasso_plus_minus(*plus_minus_split)

ridge_reg_plus_minus_outcome = ridge_plus_minus(*plus_minus_split)

elastic_forest_plus_minus_outcome = elastic_plus_minus(*plus_minus_split)


In [None]:
# perform bootstrapping to build confidence intervals for the regression task

# re-key the PLUS_MINUS targets with the index of the (feature-selected) game
# details dataframe so features and targets can be looked up by the same index
game_details_result_df = (
    pd.DataFrame(game_details_result_df)
    .assign(idx=game_details_df.index.values)
    .set_index("idx")
)

# (label used in the printed report, object stored at position [1] of each
# regression function's result)
plus_minus_models = (
    ("least squares", linear_plus_minus_outcome[1]),
    ("lasso", lasso_plus_minus_outcome[1]),
    ("ridge", ridge_reg_plus_minus_outcome[1]),
    ("elastic net", elastic_forest_plus_minus_outcome[1]),
)

# NOTE(review): game_details_df still carries GAME_ID and is not standardized,
# while the regressors were fitted on standardized splits with GAME_ID dropped --
# confirm that perform_bootstrapping_regression accounts for this.
for model_label, fitted_model in plus_minus_models:
    # 150 bootstrap rounds per model
    r_squared, mean_squared, mean_absolute = perform_bootstrapping_regression(150, game_details_df, game_details_result_df["PLUS_MINUS"], fitted_model)
    # 95% interval bounds (lower, upper) for each metric
    r_squared_1, r_squared_2 = compute_confidence_interval(0.95, r_squared)
    mean_squared_1, mean_squared_2 = compute_confidence_interval(0.95, mean_squared)
    mean_absolute_1, mean_absolute_2 = compute_confidence_interval(0.95, mean_absolute)
    print("95%% Confidence Interval for %s r squared Value: [%.4f, %.4f] " % (model_label, r_squared_1, r_squared_2))
    print("95%% Confidence Interval for %s mean squared error: [%.4f, %.4f] " % (model_label, mean_squared_1, mean_squared_2))
    print("95%% Confidence Interval for %s mean absolute error: [%.4f, %.4f] " % (model_label, mean_absolute_1, mean_absolute_2))

In [None]:
# Rank the features each game outcome model relies on most

# naive bayes: intentionally skipped
#    not trivial/necessary to do -> according to Ed post

# linear svm: importance = absolute value of the learned coefficient; the
# single-element unpacking requires coef_ to hold exactly one row
features = games_X_train.columns
(importances,) = np.abs(svm_game_outcome[1].best_estimator_.coef_)
svm_features_and_importances = {"Top features": features, "importances": importances}
svm_feature_importance_df = pd.DataFrame(svm_features_and_importances).sort_values(by="importances", ascending=False)
# show the 10 highest-ranked features
print(svm_feature_importance_df.head(10))
print("---------------------------------")

# logistic regression: importance = absolute value of the learned coefficient
features = games_X_train.columns
(importances,) = np.abs(logistic_reg_game_outcome[1].best_estimator_.coef_)
log_reg_features_and_importances = {"Top features": features, "importances": importances}
log_reg_feature_importance_df = pd.DataFrame(log_reg_features_and_importances).sort_values(by="importances", ascending=False)
# show the 10 highest-ranked features
print(log_reg_feature_importance_df.head(10))
print("---------------------------------")

# random forest: use the importances reported by the best estimator itself
features = games_X_train.columns
importances = random_forest_game_outcome[1].best_estimator_.feature_importances_
random_forest_features_and_importances = {"Top features": features, "importances": importances}
random_forest_feature_importance_df = pd.DataFrame(random_forest_features_and_importances).sort_values(by="importances", ascending=False)
# show the 10 highest-ranked features
print(random_forest_feature_importance_df.head(10))
print("---------------------------------")

In [None]:
# determine which features are most important for predicting player plus minus statistic per model
#
# Importance is the absolute value of each fitted coefficient for ALL four
# models, so strongly negative coefficients rank as high as strongly positive
# ones. Feature labels are taken from game_details_X_train (the columns the
# regressors were fitted on, i.e. after GAME_ID was dropped) rather than from
# game_details_df.columns[1:], which is only correct when GAME_ID happens to be
# the first selected column.

# extract/find important features for plain linear regression
# (the linear model result is read directly; the others go through best_estimator_)
importances = abs(linear_plus_minus_outcome[1].coef_)
features = game_details_X_train.columns
linear_reg_features_and_importances = {'Top features':features, 'importances':importances}
linear_reg_feature_importance_df = pd.DataFrame(data = linear_reg_features_and_importances).sort_values("importances", ascending = False)
# output top 10 importances
print(linear_reg_feature_importance_df.head(10))
print("---------------------------------")

# extract/find important features for lasso regression
importances = abs(lasso_plus_minus_outcome[1].best_estimator_.coef_)
features = game_details_X_train.columns
lasso_features_and_importances = {'Top features':features, 'importances':importances}
lasso_feature_importance_df = pd.DataFrame(data = lasso_features_and_importances).sort_values("importances", ascending = False)
# output top 10 importances
print(lasso_feature_importance_df.head(10))
print("---------------------------------")

# extract/find important features for ridge regression
importances = abs(ridge_reg_plus_minus_outcome[1].best_estimator_.coef_)
features = game_details_X_train.columns
ridge_features_and_importances = {'Top features':features, 'importances':importances}
ridge_feature_importance_df = pd.DataFrame(data = ridge_features_and_importances).sort_values("importances", ascending = False)
# output top 10 importances
print(ridge_feature_importance_df.head(10))
print("---------------------------------")

# extract/find important features for elastic net
# (abs() is required here too; ranking the signed coefficients would push the
# most negative -- and possibly most influential -- features to the bottom)
importances = abs(elastic_forest_plus_minus_outcome[1].best_estimator_.coef_)
features = game_details_X_train.columns
elastic_net_features_and_importances = {'Top features':features, 'importances':importances}
elastic_net_feature_importance_df = pd.DataFrame(data = elastic_net_features_and_importances).sort_values("importances", ascending = False)
# output top 10 importances
print(elastic_net_feature_importance_df.head(10))
print("---------------------------------")

In [None]:
# remove the points scored by home and away teams to see the impact on game prediction
#
# NOTE: this cell overwrites games_df, the games_* splits and
# random_forest_game_outcome from the cells above, so it is not idempotent and
# earlier cells must be re-run before revisiting their results.

# drop the points scored columns
games_df = games_df.drop(['PTS_home', 'PTS_away'], axis=1)

# split the data after dropping the points scored columns
games_X_train, games_X_test, games_y_train, games_y_test = train_test_split(games_df, game_result, random_state=42)

# Perform feature selection on the games dataframe for training and testing.
# This must happen BEFORE standardization: chi2 only accepts non-negative
# features, and standardized columns are centred on zero (so they contain
# negative values and chi2 raises a ValueError).
# k is capped at the number of available columns so that dropping the two
# points columns cannot leave fewer columns than requested.
chi2_features = SelectKBest(chi2, k = min(10, games_X_train.shape[1]))
df_kbest_features = chi2_features.fit_transform(games_X_train, games_y_train)
f = chi2_features.get_support(indices=True)
games_X_train = games_X_train[games_X_train.columns[f]]

# reuse the selector fitted on the training split for the test split
test_kbest_features = chi2_features.transform(games_X_test)
f = chi2_features.get_support(indices=True)
games_X_test = games_X_test[games_X_test.columns[f]]

# standardize the data: fit the scaler on the training split only and apply the
# same transformation to the test split, so both are scaled consistently
scaler = StandardScaler()
games_X_train = pd.DataFrame(scaler.fit_transform(games_X_train),index = games_X_train.index,columns = games_X_train.columns)
games_X_test = pd.DataFrame(scaler.transform(games_X_test),index = games_X_test.index,columns = games_X_test.columns)

# run all four models
gaussian_game_outcomes = gaussian_naive_bayes_games_outcome(games_X_train, games_y_train, games_X_test, games_y_test)
svm_lin_game_outcomes = svm_game_outcomes(games_X_train, games_y_train, games_X_test, games_y_test)
logistic_reg_game_outcomes = logistic_regression_game_outcomes(games_X_train, games_y_train, games_X_test, games_y_test)
random_forest_game_outcome = random_forest_game_outcomes(games_X_train, games_y_train, games_X_test, games_y_test)
