### Imports and Preprocessing

In [1]:
# imports
import pandas as pd
import sklearn as skl
import psycopg2
import config as c

# import for multiple output per cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
# connection to database
connection = psycopg2.connect(
    host = c.host,
    port = c.port,
    user = c.user,
    password = c.password,
    database = c.database
    )
cursor=connection.cursor()

In [3]:
# load in data from database as a dataframe
data_sql = """
SELECT *
FROM comprehensive_dataset;
"""

#  load in tables as dataframes
data_df = pd.read_sql(data_sql, con=connection)

In [4]:
# get_dummies to encode HallofFameClass column
data_df = pd.get_dummies(data_df, prefix='HOF', columns=['HallOfFameStatus'])
data_df.drop(['HOF_Not Inducted'], axis=1, inplace=True) # remove extra get_dummies column, drop_first no ideal

# drop features
data_df.drop(['GUID', 'Name', 'Team', 'College', # dropping object columns 
              
              'HallofFameClass', 'YearDrafted', 'TO_YEAR', 'Years_Played', 
              'HOF_Elgibility_Year', 'GP', 'MIN', 'AGE_ROOKIE_SEASON' # non performance stats
                ], axis=1, inplace=True)

# fill NaNs with 0
data_df = data_df.fillna(0)

data_df.head()
print(data_df.shape)

Unnamed: 0,Pick,PTS,FGM,FGA,FG%,3P_Made,3PA,3P%,FTM,FTA,...,OREB,DREB,REB,AST,STL,BLK,TOV,EFF,numberRound,HOF_Hall of Fame Member
0,0.0,2.6,1.1,3.1,34.8,0.0,0.1,50.0,0.4,0.8,...,0.6,0.7,1.3,0.3,0.2,0.0,0.6,1.4,0.0,0
1,25.0,3.1,1.3,2.7,47.4,0.0,0.0,0.0,0.6,1.0,...,0.6,1.4,2.1,0.3,0.1,0.3,0.5,3.5,1.0,0
2,3.0,14.1,6.2,15.1,41.3,0.4,1.5,24.0,1.3,1.5,...,0.5,1.3,1.8,3.1,0.8,0.1,1.6,9.1,1.0,0
3,11.0,6.4,2.4,6.1,40.3,0.1,0.3,21.1,1.4,2.1,...,0.7,1.2,2.0,0.9,0.6,0.2,1.1,4.6,1.0,0
4,3.0,18.7,6.9,15.2,45.3,0.1,0.3,25.9,4.8,6.5,...,2.7,4.2,6.9,2.2,1.0,1.0,2.8,17.0,1.0,0


(1217, 21)


### Optimization Attempt #1: Dropping Attempts and Percentages

In [5]:
# dropping attemtps and percentages
data_df.drop(['FGA', 'FG%', '3PA', '3P%', 'FTA', 'FT%'], axis=1, inplace=True)
data_df.head()
print(data_df.shape)

Unnamed: 0,Pick,PTS,FGM,3P_Made,FTM,OREB,DREB,REB,AST,STL,BLK,TOV,EFF,numberRound,HOF_Hall of Fame Member
0,0.0,2.6,1.1,0.0,0.4,0.6,0.7,1.3,0.3,0.2,0.0,0.6,1.4,0.0,0
1,25.0,3.1,1.3,0.0,0.6,0.6,1.4,2.1,0.3,0.1,0.3,0.5,3.5,1.0,0
2,3.0,14.1,6.2,0.4,1.3,0.5,1.3,1.8,3.1,0.8,0.1,1.6,9.1,1.0,0
3,11.0,6.4,2.4,0.1,1.4,0.7,1.2,2.0,0.9,0.6,0.2,1.1,4.6,1.0,0
4,3.0,18.7,6.9,0.1,4.8,2.7,4.2,6.9,2.2,1.0,1.0,2.8,17.0,1.0,0


(1217, 15)


### Machine Learning

In [6]:
# ML imports
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SVMSMOTE
from sklearn.metrics import confusion_matrix
from sklearn.metrics import balanced_accuracy_score
from imblearn.metrics import classification_report_imbalanced
import tensorflow as tf

In [7]:
# create our features
X = data_df.drop(['HOF_Hall of Fame Member'], axis=1)

# create our target
y = data_df['HOF_Hall of Fame Member']

In [8]:
# normal train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2)
Counter(y_train)

# implement SVMSMOTE resampling
X_resampled, y_resampled = SVMSMOTE(random_state=2, sampling_strategy='auto').fit_resample(X_train, y_train)
Counter(y_resampled)

# logistic regression using SVMSMOTE data
model = LogisticRegression(solver='liblinear', random_state=2)
model.fit(X_resampled, y_resampled)

# generate predictions on test set
y_pred_test = model.predict(X_test)

# calculate the accuracy scores
test_acc_score = balanced_accuracy_score(y_test, y_pred_test)
train_acc_score = model.score(X_train, y_train)

# define the confusion matrix
cm = confusion_matrix(y_test, y_pred_test)

# create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Not HOF Member", "HOF Member"], 
    columns=["Predicted Not HOF Member", "Predicted HOF Member"])

# print results
print('_______DROPPING ATTEMPTS AND PERCENTAGE FEATURES_______\n')
print("CONFUSION MATRIX")
display(cm_df)
print("ACCURACY SCORES\n")
print(f'Training Accuracy: {train_acc_score}\n')
print(f'Testing Accuracy: {test_acc_score}\n')
print("CLASSIFICATION REPORT\n")
print(classification_report_imbalanced(y_test, y_pred_test))

Counter({0: 882, 1: 30})

Counter({0: 882, 1: 882})

LogisticRegression(random_state=2, solver='liblinear')

_______DROPPING ATTEMPTS AND PERCENTAGE FEATURES_______

CONFUSION MATRIX


Unnamed: 0,Predicted Not HOF Member,Predicted HOF Member
Not HOF Member,269,24
HOF Member,5,7


ACCURACY SCORES

Training Accuracy: 0.9166666666666666

Testing Accuracy: 0.7507110352673493

CLASSIFICATION REPORT

                   pre       rec       spe        f1       geo       iba       sup

          0       0.98      0.92      0.58      0.95      0.73      0.55       293
          1       0.23      0.58      0.92      0.33      0.73      0.52        12

avg / total       0.95      0.90      0.60      0.92      0.73      0.55       305

