In [18]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

from sklearn import metrics
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

# ift6758/data/milestone2/q3_baseline
import sys
sys.path.append('../ift6758/data/milestone2')

    

In [19]:
from q6_plot import read_all_features,\
                        plot_models

ModuleNotFoundError: No module named 'q6_plot'

In [40]:
# part 3 - q1 we got 90.39%, 
# part 5.2 got 91.27%
# part 6 needs to be greater than 90.39%.
# part 6 should be better than the untuned baselines, 
# but it doesn't need to be better than what you get feature selection/hyperparameter tuning.

# Approach 1: Decision Tree/MLPClassifier?
# Approach 2: Hyperparameter Tuning for Decision Tree/MLPClassifier?
    # https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html#sklearn.model_selection.RandomizedSearchCV
# Approach 3: More advanced feature selection strategies (PCA?) https://machinelearningmastery.com/feature-selection-machine-learning-python/
# Approach 4: Combine approaches 1-3?

In [22]:
# dataset = pd.read_csv('/Users/xiaoxinzhou/Documents/IFT6758_M2_CSV_data/all_data_categorical.csv')
dataset = pd.read_csv('/Users/sunjiaao/Courses/IFT6758/m2_CSV_data/all_data_q4_categorical.csv')

X = dataset.iloc[: , :-1]
y = dataset[['Is Goal']]

In [23]:
##############################################################################
# Approach 1: Decision Tree Classifier
##############################################################################

In [76]:
def approach_1(X, y):
    # Create a training and validation split
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.20,
                                                        random_state=50)    
    
    clf = DecisionTreeClassifier(
                        # max_leaf_nodes=3, 
                        # max_depth=30,
                        # random_state=0
    )
    
    clf.fit(X_train, y_train)
    
    y_pred = clf.predict(X_test)
    
    accuracy = metrics.accuracy_score(y_test, y_pred)
    print("Accuracy: %.2f%%" % (accuracy * 100.0))  
    
    false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred)
    roc_auc = auc(false_positive_rate, true_positive_rate)    
    print(f"roc_auc: {roc_auc}")
    
approach_1(X, y)

Accuracy: 84.39%
roc_auc: 0.5681074764127346


In [105]:
###################################################################################################
# Approach 2: Decision Tree Classifier with Randomized search on hyper parameters and Regularization
###################################################################################################

from scipy.stats import randint
from sklearn.model_selection import RandomizedSearchCV


def approach_2(X, y):
    # Create a training and validation split
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.20,
                                                        random_state=50)    
    
    dtc = DecisionTreeClassifier()
    
    # https://machinelearningmastery.com/hyperparameter-optimization-with-random-search-and-grid-search/
    
    space = dict()
    space['splitter'] = ['best', 'random']
    space['max_depth'] = list(range(2, 50)) #np.linspace(1, 32, 32, endpoint=True) #randint(10, 50)
    space['min_samples_split'] = np.linspace(0.1, 1.0, 10, endpoint=True)
    space['min_samples_leaf'] = np.linspace(0.1, 0.5, 5, endpoint=True)
    space['max_features'] = list(range(1, X_train.shape[1]))
    space['max_leaf_nodes'] = list(range(2, 10))
    
    clf = RandomizedSearchCV(dtc, space, random_state=50, verbose=3)
    
    search = clf.fit(X_train, y_train)
    print(search.best_params_)
    y_pred = search.predict(X_test)
    
    accuracy = metrics.accuracy_score(y_test, y_pred)
    print("Accuracy: %.2f%%" % (accuracy * 100.0))  
    
    false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred)
    roc_auc = auc(false_positive_rate, true_positive_rate)    
    print(f"roc_auc: {roc_auc}")
    
approach_2(X, y)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END max_depth=48, max_features=4, max_leaf_nodes=2, min_samples_leaf=0.1, min_samples_split=0.9, splitter=best;, score=0.908 total time=   0.1s
[CV 2/5] END max_depth=48, max_features=4, max_leaf_nodes=2, min_samples_leaf=0.1, min_samples_split=0.9, splitter=best;, score=0.908 total time=   0.1s
[CV 3/5] END max_depth=48, max_features=4, max_leaf_nodes=2, min_samples_leaf=0.1, min_samples_split=0.9, splitter=best;, score=0.908 total time=   0.1s
[CV 4/5] END max_depth=48, max_features=4, max_leaf_nodes=2, min_samples_leaf=0.1, min_samples_split=0.9, splitter=best;, score=0.908 total time=   0.0s
[CV 5/5] END max_depth=48, max_features=4, max_leaf_nodes=2, min_samples_leaf=0.1, min_samples_split=0.9, splitter=best;, score=0.908 total time=   0.1s
[CV 1/5] END max_depth=13, max_features=9, max_leaf_nodes=9, min_samples_leaf=0.1, min_samples_split=0.30000000000000004, splitter=best;, score=0.908 total time=   0.2s
[CV 2

In [117]:
####################################################################################
# Approach 3: Decision Tree Classifier with PCA Feature Reduction
####################################################################################

import xgboost as xgb
from sklearn.decomposition import PCA


def approach_3(X, y):
#     dataset = pd.read_csv('/Users/sunjiaao/Courses/IFT6758/m2_CSV_data/all_data_q4_categorical.csv')

#     X = dataset[['X-Coordinate', 'Y-Coordinate',
#              'Shot Distance', 'Shot Angle', 
#              'Shot Type', 
#              'Was Net Empty', 
#              'Last Event Type', 
#              'Last X-Coordinate', 'Last Y-Coordinate', 
#              'Time from Last Event (seconds)', 
#              'Distance from Last Event', 
#              'Is Rebound',
#              'Change in Shot Angle', 
#              'Speed'
#             ]]
    
#     y = dataset[['Is Goal']]
    
    # Create a training and validation split
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.20,
                                                        random_state=50)    
    
    print(X_train.shape)
    print(X_train)
    pca = PCA(n_components=3)
    X_train_transformed = pca.fit_transform(X_train)
    
    print(X_train_transformed.shape)
    print(X_train_transformed)
    
    clf = DecisionTreeClassifier(
                        # max_leaf_nodes=3, 
                        # max_depth=30,
                        # random_state=0
    )
    
    clf.fit(X_train_transformed, y_train)
    
    X_test_transformed = pca.fit_transform(X_test)
    
    y_pred = clf.predict(X_test_transformed)
    
    accuracy = metrics.accuracy_score(y_test, y_pred)
    print("Accuracy: %.2f%%" % (accuracy * 100.0))  
    
    false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred)
    roc_auc = auc(false_positive_rate, true_positive_rate)    
    print(f"roc_auc: {roc_auc}")
    
approach_3(X, y)

(226136, 12)
        X-Coordinate  Y-Coordinate  Shot Distance  Shot Angle  Shot Type  \
160597           -32           -25      62.241465  113.682088          0   
156619           -45            15      46.486557   71.175290          0   
253246           -40           -11      50.219518  102.652557          2   
88000             57           -19      37.215588   59.300277          0   
10516            -62           -12      29.546573  113.962489          0   
...              ...           ...            ...         ...        ...   
165959            72             4      17.464249  103.240520          0   
186463            57            12      34.176015  110.556045          0   
153709            61             6      28.635642  102.094757          0   
239499            72           -11      89.988888    0.000000          1   
103904            80            -1       9.055385   83.659808          0   

        Was Net Empty  Last Event Type  Last X-Coordinate  Last Y-Coordina

In [104]:
####################################################################################
# Approach 4: Decision Tree Classifier combining approaches 2 and 3
####################################################################################

def approach_4(X, y):
    # Create a training and validation split
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.20,
                                                        random_state=50)    
    
    pca = PCA(n_components=3)
    X_train_transformed = pca.fit_transform(X_train)
    
    dtc = DecisionTreeClassifier()
    
    # https://machinelearningmastery.com/hyperparameter-optimization-with-random-search-and-grid-search/
    
    space = dict()
    space['splitter'] = ['best', 'random']
    space['max_depth'] = list(range(2, 50)) #np.linspace(1, 32, 32, endpoint=True) #randint(10, 50)
    space['min_samples_split'] = np.linspace(0.1, 1.0, 10, endpoint=True)
    space['min_samples_leaf'] = np.linspace(0.1, 0.5, 5, endpoint=True)
    space['max_features'] = list(range(1, X_train.shape[1]))
    space['max_leaf_nodes'] = list(range(2, 10))
    
    clf = RandomizedSearchCV(dtc, space, random_state=50, verbose=3)
    
    search = clf.fit(X_train_transformed, y_train)
    
    X_test_transformed = pca.fit_transform(X_test)
    
    y_pred = search.predict(X_test_transformed)
    
    accuracy = metrics.accuracy_score(y_test, y_pred)
    print("Accuracy: %.2f%%" % (accuracy * 100.0))  
    
    false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred)
    roc_auc = auc(false_positive_rate, true_positive_rate)    
    print(f"roc_auc: {roc_auc}")
    
approach_4(X, y)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END max_depth=48, max_features=4, max_leaf_nodes=2, min_samples_leaf=0.1, min_samples_split=0.9, splitter=best;, score=nan total time=   0.0s
[CV 2/5] END max_depth=48, max_features=4, max_leaf_nodes=2, min_samples_leaf=0.1, min_samples_split=0.9, splitter=best;, score=nan total time=   0.0s
[CV 3/5] END max_depth=48, max_features=4, max_leaf_nodes=2, min_samples_leaf=0.1, min_samples_split=0.9, splitter=best;, score=nan total time=   0.0s
[CV 4/5] END max_depth=48, max_features=4, max_leaf_nodes=2, min_samples_leaf=0.1, min_samples_split=0.9, splitter=best;, score=nan total time=   0.0s
[CV 5/5] END max_depth=48, max_features=4, max_leaf_nodes=2, min_samples_leaf=0.1, min_samples_split=0.9, splitter=best;, score=nan total time=   0.0s
[CV 1/5] END max_depth=13, max_features=9, max_leaf_nodes=9, min_samples_leaf=0.1, min_samples_split=0.30000000000000004, splitter=best;, score=nan total time=   0.0s
[CV 2/5] END max_

35 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
35 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/sunjiaao/opt/anaconda3/envs/ift6758-conda-env/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 681, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/sunjiaao/opt/anaconda3/envs/ift6758-conda-env/lib/python3.9/site-packages/sklearn/tree/_classes.py", line 937, in fit
    super().fit(
  File "/Users/sunjiaao/opt/anaconda3/envs/ift6758-conda-env/lib/python3.9/site-packages/sklearn/tree/_classes.py", line 308, in fit
    raise ValueError("max_features must be in (0, n_features]")
ValueError: max_features must b

In [49]:
# # ref: https://medium.com/@mohtedibf/indepth-parameter-tuning-for-decision-tree-6753118a03c3
# def tree_tune_max_path():
    
#     X_train, X_test, y_train, y_test = train_test_split(X,
#                                                         y,
#                                                         test_size=0.20,
#                                                         random_state=50) 
    
#     max_depths = np.linspace(1, 50, 50, endpoint=True)
#     train_results = []
#     test_results = []
    
#     for max_depth in max_depths:
#         dt = DecisionTreeClassifier(max_depth=max_depth)
#         dt.fit(X_train, y_train)
#         train_pred = dt.predict(X_train)
        
#         false_positive_rate, true_positive_rate, thresholds = roc_curve(y_train, train_pred)
#         roc_auc = auc(false_positive_rate, true_positive_rate)

#         # Add auc score to previous train results
#         train_results.append(roc_auc)
#         y_pred = dt.predict(X_test)
#         false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred)
#         roc_auc = auc(false_positive_rate, true_positive_rate)


#         # Add auc score to previous test results
#         test_results.append(roc_auc)
    
#     from matplotlib.legend_handler import HandlerLine2D
#     line1, = plt.plot(max_depths, train_results, 'b', label="Train AUC")
#     line2, = plt.plot(max_depths, test_results, 'r', label="Test AUC")
    
#     plt.legend(handler_map={line1: HandlerLine2D(numpoints=2)})
#     plt.ylabel('AUC score')
#     plt.xlabel('Tree depth')
#     plt.show()   
# tree_tune_max_path()    

In [50]:
plot_models(X,y,'decision_tree')

NameError: name 'plot_models' is not defined

In [51]:
# ##############################################################################
# # Approch 2: Regularization: Lasso Regression
# # 
# # ref: https://harish-reddy.medium.com/regularization-in-python-699cfbad8622
# ##############################################################################
# from sklearn.linear_model import Lasso
# from sklearn.metrics import r2_score

# def lasso_train(X, y):
    
#     # Create a training and validation split
#     X_train, X_test, y_train, y_test = train_test_split(X,
#                                                         y,
#                                                         test_size=0.20,
#                                                         random_state=50) 
    
#     lassoreg = Lasso(alpha=0.001, normalize=True)
#     lassoreg.fit(X_train, y_train)
    
#     print(f"Lasso score: {lassoreg.score(X_test, y_test)}")
    
#     # y_pred = lassoreg.predict(X_test)
    
#     # accuracy = metrics.accuracy_score(y_test, y_pred)
#     # print("Accuracy: %.2f%%" % (accuracy * 100.0))

#     # print("R-Square Value",r2_score(y_test,y_pred))
#     # print("\n")
#     # print ("mean_absolute_error :",metrics.mean_absolute_error(y_test, y_pred))
#     # print("\n")
#     # print ("mean_squared_error : ",metrics.mean_squared_error(y_test, y_pred))
#     # print("\n")
#     # print ("root_mean_squared_error : ",np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
    
# lasso_train(X, y)    

In [52]:
# ##############################################################################
# # Approch 3: Hyperparameter tuning, cross validation strategies 
# # 
# # ref: 
# # - https://scikit-learn.org/stable/modules/cross_validation.html
# # - https://www.jeremyjordan.me/hyperparameter-tuning/
# # - https://ai.plainenglish.io/hyperparameter-tuning-of-decision-tree-classifier-using-gridsearchcv-2a6ebcaffeda
# ##############################################################################
# from sklearn.model_selection import GridSearchCV

# def tree2_train(X, y, features=['Distance from Net']):
#     # Create a training and validation split
#     X_train, X_test, y_train, y_test = train_test_split(X[features],
#                                                         y,
#                                                         test_size=0.20,
#                                                         random_state=50)    
    
#     clf = DecisionTreeClassifier(max_leaf_nodes=3, random_state=0)
#     clf.fit(X_train, y_train)
    
#     # Cross validation with 10 folds
#     scores = cross_val_score(clf, X, y, cv=10) 
#     print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))
    
#     # Grid search
#     param_dict = {
#         'criterion': ['gini', 'entropy'],
#         'max_depth': range(1, 10),
#         'min_samples_split': range(1, 10),
#         'min_samples_leaf': range(1, 5)
#         }
    
#     grid = GridSearchCV(clf,
#                        param_grid=param_dict,
#                        cv=10,
#                        verbose=1,
#                        n_jobs=-1)
    
#     grid.fit(X_train, y_train)
#     print(f"grid best params: {grid.best_params_}")
#     print(f"grid best estimator: {grid.best_estimator_}")
#     print(f"grid best score: {grid.best_score_}")
    
# tree2_train(X, y)    

In [17]:
####################################################################################
# Approach 3: Hand Select Feature Selection + Some Existing Feature Selection Method
####################################################################################
import xgboost as xgb
from sklearn.decomposition import PCA


def approach_3(X, y):
    # Create a training and validation split
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.20,
                                                        random_state=50)    
    
    clf = DecisionTreeClassifier(
                        # max_leaf_nodes=3, 
                        # max_depth=30,
                        # random_state=0
    )
    
    clf.fit(X_train, y_train)
    
    y_pred = clf.predict(X_test)
    
    accuracy = metrics.accuracy_score(y_test, y_pred)
    print("Accuracy: %.2f%%" % (accuracy * 100.0))  
    
    false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred)
    roc_auc = auc(false_positive_rate, true_positive_rate)    
    print(f"roc_auc: {roc_auc}")
    
approach_3(X, y)



# Read CSV files
dataset = pd.read_csv('/Users/sunjiaao/Courses/IFT6758/m2_CSV_data/all_data_q4_categorical.csv')

# Separate features and labels 
X = dataset[['X-Coordinate', 'Y-Coordinate',
             'Shot Distance', 'Shot Angle', 
             'Shot Type', 
             'Was Net Empty', 
             'Last Event Type', 
             'Last X-Coordinate', 'Last Y-Coordinate', 
             'Time from Last Event (seconds)', 
             'Distance from Last Event', 
             # 'Is Rebound',
             # 'Change in Shot Angle', 
             'Speed'
            ]]

y = dataset[['Is Goal']]

# Create a training and validation split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.20,
                                                    random_state=50)

model = xgb.XGBClassifier()

model.fit(X_train, y_train)

# Make predictions for test data
y_test_pred = model.predict(X_test)
y_test = y_test.to_numpy().flatten()

# Evaluate predictions
accuracy = metrics.accuracy_score(y_test, y_test_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Accuracy: 91.28%


In [38]:
from sklearn.neural_network import MLPClassifier


# Read CSV files
dataset = pd.read_csv('/Users/sunjiaao/Courses/IFT6758/m2_CSV_data/all_data_q4_categorical.csv')

# # Separate features and labels 
# X = dataset[['eventIdx', 'game_id', 'Game Seconds', 'Game Period', 'X-Coordinate', 'Y-Coordinate',
#            'Shot Distance', 'Shot Angle', 'Shot Type', 'Was Net Empty', 'Last Event Type', 'Last X-Coordinate',
#            'Last Y-Coordinate', 'Time from Last Event (seconds)', 'Distance from Last Event', 'Is Rebound',
#            'Change in Shot Angle', 'Speed']]

# Separate features and labels 
X = dataset[['X-Coordinate', 'Y-Coordinate',
             'Shot Distance', 'Shot Angle', 
             'Shot Type', 
             'Was Net Empty', 
             'Last Event Type', 
             'Last X-Coordinate', 'Last Y-Coordinate', 
             'Time from Last Event (seconds)', 
             'Distance from Last Event', 
             # 'Is Rebound',
             # 'Change in Shot Angle', 
             'Speed'
            ]]

y = dataset[['Is Goal']].values.ravel()

# Create a training and validation split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.20,
                                                    random_state=50)

clf = MLPClassifier(hidden_layer_sizes=(8, 8, 8), 
                    activation='relu',
                    solver='adam', 
                    alpha=1e-5,
                    max_iter=200,
                    random_state=1
                   )

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))  

Accuracy: 91.23%
