# Machine Learning Models Showcase

### Load data from database

In [1]:
# imports
import pandas as pd
import sklearn as skl
import psycopg2
import config as c

# import for multiple output per cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
# connection to database
connection = psycopg2.connect(
    host = c.host,
    port = c.port,
    user = c.user,
    password = c.password,
    database = c.database
    )
cursor=connection.cursor()

In [146]:
data_sql = """
SELECT *
FROM comprehensive_dataset;
"""

#  load in tables as dataframes
data_df = pd.read_sql(data_sql, con=connection)
data_df.head()

Unnamed: 0,GUID,Name,HallOfFameStatus,HallofFameClass,YearDrafted,TO_YEAR,Years_Played,HOF_Elgibility_Year,Pick,Team,...,FTA,FT%,OREB,DREB,REB,AST,STL,BLK,TOV,EFF
0,MIKENILES1980,Mike Niles,Not Inducted,,1980,1980,1,1985,,,...,0.8,45.9,0.6,0.7,1.3,0.3,0.2,0.0,0.6,1.4
1,WAYNEROBINSON1980,Wayne Robinson,Not Inducted,,1980,1980,1,1985,31.0,LAL,...,3.0,72.9,1.4,2.2,3.6,1.4,0.6,0.3,1.8,7.8
2,BILLYREID1980,Billy Reid,Not Inducted,,1980,1980,1,1985,182.0,GSW,...,0.7,56.4,0.5,0.6,1.0,1.2,0.6,0.1,1.3,2.8
3,ALEXBRADLEY1981,Alex Bradley,Not Inducted,,1981,1981,1,1986,86.0,NYK,...,1.2,60.4,0.8,0.9,1.7,0.3,0.3,0.1,0.7,3.4
4,GARRYWITTS1981,Garry Witts,Not Inducted,,1981,1981,1,1986,103.0,WSB,...,0.9,82.5,0.6,0.7,1.3,0.8,0.4,0.1,0.8,3.8


In [147]:
data_df.dtypes

GUID                    object
Name                    object
HallOfFameStatus        object
HallofFameClass        float64
YearDrafted              int64
TO_YEAR                  int64
Years_Played             int64
HOF_Elgibility_Year      int64
Pick                   float64
Team                    object
College                 object
GP                       int64
MIN                    float64
PTS                    float64
FGM                    float64
FGA                    float64
FG%                    float64
3P_Made                float64
3PA                    float64
3P%                    float64
FTM                    float64
FTA                    float64
FT%                    float64
OREB                   float64
DREB                   float64
REB                    float64
AST                    float64
STL                    float64
BLK                    float64
TOV                    float64
EFF                    float64
dtype: object

## Preprocessing 

In [148]:
# get_dummies to encode HallofFameClass column
data_df = pd.get_dummies(data_df, prefix='HOF', columns=['HallOfFameStatus'])
data_df.drop(['HOF_Not Inducted'], axis=1, inplace=True) # remove extra get_dummies column
data_df.head()

Unnamed: 0,GUID,Name,HallofFameClass,YearDrafted,TO_YEAR,Years_Played,HOF_Elgibility_Year,Pick,Team,College,...,FT%,OREB,DREB,REB,AST,STL,BLK,TOV,EFF,HOF_Hall of Fame Member
0,MIKENILES1980,Mike Niles,,1980,1980,1,1985,,,,...,45.9,0.6,0.7,1.3,0.3,0.2,0.0,0.6,1.4,0
1,WAYNEROBINSON1980,Wayne Robinson,,1980,1980,1,1985,31.0,LAL,Virginia Polytechnic Institute and State Unive...,...,72.9,1.4,2.2,3.6,1.4,0.6,0.3,1.8,7.8,0
2,BILLYREID1980,Billy Reid,,1980,1980,1,1985,182.0,GSW,University of San Francisco,...,56.4,0.5,0.6,1.0,1.2,0.6,0.1,1.3,2.8,0
3,ALEXBRADLEY1981,Alex Bradley,,1981,1981,1,1986,86.0,NYK,Villanova University,...,60.4,0.8,0.9,1.7,0.3,0.3,0.1,0.7,3.4,0
4,GARRYWITTS1981,Garry Witts,,1981,1981,1,1986,103.0,WSB,College of the Holy Cross,...,82.5,0.6,0.7,1.3,0.8,0.4,0.1,0.8,3.8,0


In [149]:
# confirm HOF members encoded for 0
print(data_df.loc[data_df['GUID'] == 'MICHAELJORDAN1984'])
# This confirms HOF members = 1, non members = 0

                  GUID            Name  HallofFameClass  YearDrafted  TO_YEAR  \
597  MICHAELJORDAN1984  Michael Jordan           2009.0         1984     2002   

     Years_Played  HOF_Elgibility_Year  Pick Team  \
597            19                 2007   3.0  CHI   

                          College  ...   FT%  OREB  DREB  REB  AST  STL  BLK  \
597  University of North Carolina  ...  84.5   2.0   4.5  6.5  5.9  2.4  0.8   

     TOV   EFF  HOF_Hall of Fame Member  
597  3.5  29.2                        1  

[1 rows x 31 columns]


In [150]:
data_df.dtypes

GUID                        object
Name                        object
HallofFameClass            float64
YearDrafted                  int64
TO_YEAR                      int64
Years_Played                 int64
HOF_Elgibility_Year          int64
Pick                       float64
Team                        object
College                     object
GP                           int64
MIN                        float64
PTS                        float64
FGM                        float64
FGA                        float64
FG%                        float64
3P_Made                    float64
3PA                        float64
3P%                        float64
FTM                        float64
FTA                        float64
FT%                        float64
OREB                       float64
DREB                       float64
REB                        float64
AST                        float64
STL                        float64
BLK                        float64
TOV                 

In [151]:
# save
# data_df.to_csv('ml_encoded_data.csv')

In [152]:
# drop columns that will likely error/confuse the ML models
data_df.drop(['GUID', 'Name', 'Team', 'College', # dropping object columns 
              'HallofFameClass', 'YearDrafted', 'TO_YEAR', 'HOF_Elgibility_Year', 'GP', 'MIN' # non performance stats
                ], axis=1, inplace=True)
data_df.head()

Unnamed: 0,Years_Played,Pick,PTS,FGM,FGA,FG%,3P_Made,3PA,3P%,FTM,...,FT%,OREB,DREB,REB,AST,STL,BLK,TOV,EFF,HOF_Hall of Fame Member
0,1,,2.6,1.1,3.1,34.8,0.0,0.1,50.0,0.4,...,45.9,0.6,0.7,1.3,0.3,0.2,0.0,0.6,1.4,0
1,1,31.0,7.9,2.9,6.3,46.0,0.0,0.1,0.0,2.2,...,72.9,1.4,2.2,3.6,1.4,0.6,0.3,1.8,7.8,0
2,1,182.0,3.2,1.4,3.1,45.4,0.0,0.1,0.0,0.4,...,56.4,0.5,0.6,1.0,1.2,0.6,0.1,1.3,2.8,0
3,1,86.0,3.5,1.4,2.6,52.4,0.0,0.0,0.0,0.7,...,60.4,0.8,0.9,1.7,0.3,0.3,0.1,0.7,3.4,0
4,1,103.0,2.9,1.1,1.8,58.3,0.0,0.0,50.0,0.7,...,82.5,0.6,0.7,1.3,0.8,0.4,0.1,0.8,3.8,0


In [153]:
# check for NaNs
data_df.isnull().values.any()

True

In [154]:
# where are the NaNs?
data_df.count()

Years_Played               1217
Pick                        915
PTS                        1217
FGM                        1217
FGA                        1217
FG%                        1217
3P_Made                    1217
3PA                        1217
3P%                        1217
FTM                        1217
FTA                        1217
FT%                        1217
OREB                       1217
DREB                       1217
REB                        1217
AST                        1217
STL                        1217
BLK                        1217
TOV                        1217
EFF                        1217
HOF_Hall of Fame Member    1217
dtype: int64

In [155]:
# fill NaNs with 0
data_df['Pick'] = data_df['Pick'].fillna(0)
data_df.count()

Years_Played               1217
Pick                       1217
PTS                        1217
FGM                        1217
FGA                        1217
FG%                        1217
3P_Made                    1217
3PA                        1217
3P%                        1217
FTM                        1217
FTA                        1217
FT%                        1217
OREB                       1217
DREB                       1217
REB                        1217
AST                        1217
STL                        1217
BLK                        1217
TOV                        1217
EFF                        1217
HOF_Hall of Fame Member    1217
dtype: int64

In [156]:
print(data_df.shape)

(1217, 21)


# Machine Learning Models

### LogReg w/ Oversampling

In [157]:
# ML imports
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import balanced_accuracy_score
from imblearn.metrics import classification_report_imbalanced
import tensorflow as tf

In [174]:
# create our features
X = data_df.drop(['HOF_Hall of Fame Member'], axis=1)

# create our target
y = data_df['HOF_Hall of Fame Member']

In [175]:
# normal train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
Counter(y_train)

Counter({0: 879, 1: 33})

### RandomOverSampler

In [176]:
# import RandomOverSampler
from imblearn.over_sampling import RandomOverSampler

In [180]:
# implement random oversampling
X_resampled, y_resampled = RandomOverSampler(random_state=1).fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({0: 879, 1: 879})

In [181]:
# logistic regression using random oversampled data
model = LogisticRegression(solver='liblinear', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1, solver='liblinear')

In [182]:
# display the confusion matrix
y_pred = model.predict(X_test)

# calculated the balanced accuracy score
acc_score = balanced_accuracy_score(y_test, y_pred)

In [183]:
# display the confusion matrix
cm = confusion_matrix(y_test, y_pred)

# create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Not HOF Member", "HOF Member"], 
    columns=["Predicted Not HOF Member", "Predicted HOF Member"])

In [184]:
# RandomOverSample results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report_imbalanced(y_test, y_pred))

Confusion Matrix


Unnamed: 0,Predicted Not HOF Member,Predicted HOF Member
Not HOF Member,256,40
HOF Member,0,9


Accuracy Score : 0.9324324324324325
Classification Report
                   pre       rec       spe        f1       geo       iba       sup

          0       1.00      0.86      1.00      0.93      0.93      0.85       296
          1       0.18      1.00      0.86      0.31      0.93      0.88         9

avg / total       0.98      0.87      1.00      0.91      0.93      0.85       305



### SMOTE

In [185]:
# import SMOTE
from imblearn.over_sampling import SMOTE

In [186]:
# implement SMOTE resampling
X_resampled, y_resampled = SMOTE(random_state=1, sampling_strategy='auto').fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({0: 879, 1: 879})

In [187]:
# logistic regression using SMOTE data
model = LogisticRegression(solver='liblinear', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1, solver='liblinear')

In [189]:
# display the confusion matrix
y_pred = model.predict(X_test)

# calculated the balanced accuracy score
acc_score = balanced_accuracy_score(y_test, y_pred)

In [190]:
# display the confusion matrix
cm = confusion_matrix(y_test, y_pred)

# create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Not HOF Member", "HOF Member"], 
    columns=["Predicted Not HOF Member", "Predicted HOF Member"])

In [191]:
# SMOTE results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report_imbalanced(y_test, y_pred))

Confusion Matrix


Unnamed: 0,Predicted Not HOF Member,Predicted HOF Member
Not HOF Member,258,38
HOF Member,0,9


Accuracy Score : 0.9358108108108107
Classification Report
                   pre       rec       spe        f1       geo       iba       sup

          0       1.00      0.87      1.00      0.93      0.93      0.86       296
          1       0.19      1.00      0.87      0.32      0.93      0.88         9

avg / total       0.98      0.88      1.00      0.91      0.93      0.86       305



### SVM SMOTE

In [192]:
# import SVMSMOTE
from imblearn.over_sampling import SVMSMOTE

In [193]:
## implement SVM SMOTE resampling
X_resampled, y_resampled = SVMSMOTE(random_state=1, sampling_strategy='auto').fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({0: 879, 1: 596})

In [194]:
# logistic regression using SVM SMOTE data
model = LogisticRegression(solver='liblinear', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1, solver='liblinear')

In [195]:
# calculated the balanced accuracy score
y_pred = model.predict(X_test)
acc_score = balanced_accuracy_score(y_test, y_pred)

In [196]:
# display the confusion matrix
cm = confusion_matrix(y_test, y_pred)

# create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Not HOF Member", "HOF Member"], 
    columns=["Predicted Not HOF Member", "Predicted HOF Member"])

In [197]:
# SVMSMOTE results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report_imbalanced (y_test, y_pred))

Confusion Matrix


Unnamed: 0,Predicted Not HOF Member,Predicted HOF Member
Not HOF Member,273,23
HOF Member,0,9


Accuracy Score : 0.9611486486486487
Classification Report
                   pre       rec       spe        f1       geo       iba       sup

          0       1.00      0.92      1.00      0.96      0.96      0.92       296
          1       0.28      1.00      0.92      0.44      0.96      0.93         9

avg / total       0.98      0.92      1.00      0.94      0.96      0.92       305



## Decision Tree and Neural Net

### Scale the data

In [198]:
# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

## Decision Tree Model

In [199]:
# DT imports
from path import Path
from sklearn import tree
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [200]:
# create the decision tree classifier instance.
model = tree.DecisionTreeClassifier()

# fit the model.
model = model.fit(X_train_scaled, y_train)

In [201]:
# DT predictions
predictions = model.predict(X_test_scaled)

# calcuate the accuracy score.
acc_score = accuracy_score(y_test, predictions)

In [202]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Not HOF Member", "HOF Member"], 
    columns=["Predicted Not HOF Member", "Predicted HOF Member"])

In [203]:
# DT results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted Not HOF Member,Predicted HOF Member
Not HOF Member,291,5
HOF Member,7,2


Accuracy Score : 0.9606557377049181
Classification Report
              precision    recall  f1-score   support

           0       0.98      0.98      0.98       296
           1       0.29      0.22      0.25         9

    accuracy                           0.96       305
   macro avg       0.63      0.60      0.61       305
weighted avg       0.96      0.96      0.96       305



## Neural Net

In [204]:
# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [207]:
# Neural Net model
# Define the basic neural network model
nn_model = tf.keras.models.Sequential()
nn_model.add(tf.keras.layers.Dense(units=10, activation="relu", input_dim=20))
nn_model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Compile the Sequential model together and customize metrics
nn_model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Train the model
fit_model = nn_model.fit(X_train_scaled, y_train, epochs=100)

# Evaluate the model using the test data
model_loss, model_accuracy = nn_model.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
10/10 - 0s - loss: 0.0704 - accuracy: 0.9738 - 70ms/epoch - 7ms/step
Loss: 0.07041539996862411, Accuracy: 0.9737704992294312


## Deep Learning

In [206]:
# Define the model - deep neural net
number_input_features = len(X_train_scaled[0])
hidden_nodes_layer1 = 10
hidden_nodes_layer2 = 5

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(
    tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu")
)

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))


# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Compile the Sequential model together and customize metrics
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Train the model
fit_model = nn.fit(X_train_scaled, y_train, epochs=100)

# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
10/10 - 0s - loss: 0.0617 - accuracy: 0.9770 - 76ms/epoch - 8ms/step
Loss: 0.0617479532957077, Accuracy: 0.9770491719245911
