### Imports and Loading Data

In [1]:
# imports
import pandas as pd
import psycopg2
import config as c

# display the result of every top-level expression in a cell, not only the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
# Open the PostgreSQL connection; all connection settings are read from the
# local `config` module (imported as `c`).
connection = psycopg2.connect(host=c.host, port=c.port, user=c.user,
                              password=c.password, database=c.database)

# cursor bound to the connection opened above
cursor = connection.cursor()

In [3]:
# SQL query: select every column and every row of the final_dataset table
data_sql = """
SELECT *
FROM final_dataset;
"""

# run the query over the open database connection and load the result into a DataFrame
data_df = pd.read_sql(data_sql, con=connection)

### Preprocessing Based on Optimization Results

In [4]:
# One-hot encode the HallOfFameStatus column (dummy columns get the "HOF"
# prefix), then drop the columns that are excluded from modelling.
# Written as one chained expression without inplace=True so that data_df is
# only rebound once the whole transformation has succeeded.
data_df = (
    pd.get_dummies(data_df, prefix='HOF', columns=['HallOfFameStatus'])
    # remove the extra 'Not Inducted' dummy explicitly; drop_first is not ideal
    # here since it removes the first category rather than a chosen one
    .drop(columns=['HOF_Not Inducted'])
    # drop columns/features
    .drop(columns=['GUID',
                   'Name',
                   'HallofFameClass',
                   'YearDrafted',
                   'TO_YEAR',
                   'Years_Played',
                   'HOF_Elgibility_Year',
                   'Team',
                   'College',
                   'GP',
                   'MIN',
                   'idPlayer',
                   'BIRTHDATE',
                   'POSITION',
                   'AGE_ROOKIE_SEASON',
                   'Draft_Decade',
                   'numBallotsBeforeInduct'])
)

### Fit Model

In [5]:
# ML imports
from collections import Counter
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from sklearn.linear_model import LogisticRegression

In [6]:
# target vector: the dummy column generated for the 'Hall of Fame Member' status
y = data_df['HOF_Hall of Fame Member']

# feature matrix: every remaining column of data_df
X = data_df.drop(columns=['HOF_Hall of Fame Member'])

In [7]:
# plain train-test split with a fixed seed; no test_size is given, so
# scikit-learn's default split proportion applies
# NOTE(review): the split is not stratified even though the positive class is
# rare in the training labels; consider stratify=y — confirm before changing,
# since it alters the fitted model
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2)
# display the class counts of the training target
Counter(y_train)

Counter({0: 880, 1: 32})

In [9]:
## balance the classes by random oversampling (RandomOverSampler) of the training split only
X_resampled, y_resampled = RandomOverSampler(random_state=2, sampling_strategy='auto').fit_resample(X_train, y_train)
# display the class counts after resampling
Counter(y_resampled)

Counter({0: 880, 1: 880})

In [10]:
# logistic regression (liblinear solver, fixed seed) fitted on the randomly oversampled training data
model = LogisticRegression(solver='liblinear', random_state=2)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=2, solver='liblinear')

### Save Model

In [11]:
# import
import pickle

# Serialize the fitted model to 'ml_model.sav'. The context manager makes sure
# the file handle is flushed and closed even if pickling raises, instead of
# leaving an open handle behind.
with open('ml_model.sav', 'wb') as model_file:
    pickle.dump(model, model_file)