# Save Model

## Load data from database

In [1]:
# imports
import pandas as pd
import psycopg2
import config as c

# import for multiple output per cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
# connection to database
connection = psycopg2.connect(
    host = c.host,
    port = c.port,
    user = c.user,
    password = c.password,
    database = c.database
    )
cursor=connection.cursor()

In [3]:
data_sql = """
SELECT *
FROM comprehensive_dataset;
"""

#  load in tables as dataframes
data_df = pd.read_sql(data_sql, con=connection)
data_df.head()

Unnamed: 0,GUID,Name,HallOfFameStatus,HallofFameClass,YearDrafted,TO_YEAR,Years_Played,HOF_Elgibility_Year,Pick,Team,...,FTA,FT%,OREB,DREB,REB,AST,STL,BLK,TOV,EFF
0,MIKENILES1980,Mike Niles,Not Inducted,,1980,1980,1,1985,,,...,0.8,45.9,0.6,0.7,1.3,0.3,0.2,0.0,0.6,1.4
1,WAYNEROBINSON1980,Wayne Robinson,Not Inducted,,1980,1980,1,1985,31.0,LAL,...,3.0,72.9,1.4,2.2,3.6,1.4,0.6,0.3,1.8,7.8
2,BILLYREID1980,Billy Reid,Not Inducted,,1980,1980,1,1985,182.0,GSW,...,0.7,56.4,0.5,0.6,1.0,1.2,0.6,0.1,1.3,2.8
3,ALEXBRADLEY1981,Alex Bradley,Not Inducted,,1981,1981,1,1986,86.0,NYK,...,1.2,60.4,0.8,0.9,1.7,0.3,0.3,0.1,0.7,3.4
4,GARRYWITTS1981,Garry Witts,Not Inducted,,1981,1981,1,1986,103.0,WSB,...,0.9,82.5,0.6,0.7,1.3,0.8,0.4,0.1,0.8,3.8


## Preprocessing 

In [4]:
# get_dummies to encode HallofFameClass column
data_df = pd.get_dummies(data_df, prefix='HOF', columns=['HallOfFameStatus'])
data_df.drop(['HOF_Not Inducted'], axis=1, inplace=True) # remove extra get_dummies column, drop_first no ideal

# drop columns that will likely error/confuse the ML models
data_df.drop(['GUID', 'Name', 'Team', 'College', # dropping object columns 
              'HallofFameClass', 'YearDrafted', 'TO_YEAR', 'HOF_Elgibility_Year', 'GP', 'MIN' # non performance stats
                ], axis=1, inplace=True)

# fill NaNs with 0
data_df['Pick'] = data_df['Pick'].fillna(0)

data_df.head()
print(data_df.shape)

Unnamed: 0,Years_Played,Pick,PTS,FGM,FGA,FG%,3P_Made,3PA,3P%,FTM,...,FT%,OREB,DREB,REB,AST,STL,BLK,TOV,EFF,HOF_Hall of Fame Member
0,1,0.0,2.6,1.1,3.1,34.8,0.0,0.1,50.0,0.4,...,45.9,0.6,0.7,1.3,0.3,0.2,0.0,0.6,1.4,0
1,1,31.0,7.9,2.9,6.3,46.0,0.0,0.1,0.0,2.2,...,72.9,1.4,2.2,3.6,1.4,0.6,0.3,1.8,7.8,0
2,1,182.0,3.2,1.4,3.1,45.4,0.0,0.1,0.0,0.4,...,56.4,0.5,0.6,1.0,1.2,0.6,0.1,1.3,2.8,0
3,1,86.0,3.5,1.4,2.6,52.4,0.0,0.0,0.0,0.7,...,60.4,0.8,0.9,1.7,0.3,0.3,0.1,0.7,3.4,0
4,1,103.0,2.9,1.1,1.8,58.3,0.0,0.0,50.0,0.7,...,82.5,0.6,0.7,1.3,0.8,0.4,0.1,0.8,3.8,0


(1217, 21)


In [5]:
# save processed data to database for calling later
#from sqlalchemy import create_engine

# create the engine to connect
#engine = create_engine(f'{c.user}ql+psycopg2://{c.host}:{c.password}@{c.host}:{c.port}/{c.database}')

# save data_df to db
#data_df.to_sql('encoded_dataset', engine)

## Machine Learning Models

### LogReg w/ Oversampling

In [6]:
# ML imports
from collections import Counter
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SVMSMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, balanced_accuracy_score
from imblearn.metrics import classification_report_imbalanced

In [7]:
# create our features
X = data_df.drop(['HOF_Hall of Fame Member'], axis=1)

# create our target
y = data_df['HOF_Hall of Fame Member']

In [8]:
# normal train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2)
Counter(y_train)

Counter({1: 28, 0: 884})

In [9]:
## implement SVM SMOTE resampling
X_resampled, y_resampled = SVMSMOTE(random_state=2, sampling_strategy='auto').fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({1: 431, 0: 884})

In [10]:
# logistic regression using SVM SMOTE data
model = LogisticRegression(solver='liblinear', random_state=2)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=2, solver='liblinear')

In [11]:
# generate predictions on test set
y_pred_test = model.predict(X_test)

# calculated the accuracy scores
test_acc_score = balanced_accuracy_score(y_test, y_pred_test)
train_acc_score = model.score(X_train, y_train)

In [12]:
# define confusion matrix
cm = confusion_matrix(y_test, y_pred_test)

# create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Not HOF Member", "HOF Member"], 
    columns=["Predicted Not HOF Member", "Predicted HOF Member"])

In [13]:
# SVMSMOTE results
print('SVMSMOTE RESULTS\n')
print("CONFUSION MATRIX")
display(cm_df)
print("ACCURACY SCORES\n")
print(f'Training Accuracy: {train_acc_score}\n')
print(f'Testing Accuracy: {test_acc_score}\n')
print("CLASSIFICATION REPORT\n")
print(classification_report_imbalanced(y_test, y_pred_test))

SVMSMOTE RESULTS

CONFUSION MATRIX


Unnamed: 0,Predicted Not HOF Member,Predicted HOF Member
Not HOF Member,273,18
HOF Member,2,12


ACCURACY SCORES

Training Accuracy: 0.9418859649122807

Testing Accuracy: 0.8976435935198821

CLASSIFICATION REPORT

                   pre       rec       spe        f1       geo       iba       sup

          0       0.99      0.94      0.86      0.96      0.90      0.81       291
          1       0.40      0.86      0.94      0.55      0.90      0.80        14

avg / total       0.97      0.93      0.86      0.95      0.90      0.81       305



In [14]:
# Classification Report Explained
# All these are metrics for measuring performance of imbalanced classes.

# pre stands for precision, ratio of TP to sum of TP and FP
# rec stands for recall/sensitivity, ratio of TP to sum of TP and FN
# spe stands for specificity, ratio of how many were correctly classified
# f1 stands for f1 measure,  harmonic mean of pre and rec
# geo stands for geometric mean, square root of the product of rec and spe
# iba stands for index balanced accuracy
# sup stands for support, the number of points per class

## Save Model

In [15]:
# import 
import pickle

In [18]:
# open file and load model
pickle.dump(model, open('ml_model.sav', 'wb'))