### Imports and Loading Data

In [1]:
# imports
import pandas as pd
import psycopg2
import config as c

In [2]:
# Open the database connection; every setting is read from the local
# `config` module (imported as `c`), so no credentials live in this notebook.
connection = psycopg2.connect(
    database=c.database,
    user=c.user,
    password=c.password,
    host=c.host,
    port=c.port,
)

# Cursor bound to the connection opened above.
cursor = connection.cursor()

In [None]:
# Query text: select every column of the final_dataset table.
data_sql = (
    "\n"
    "SELECT *\n"
    "FROM final_dataset;\n"
)

# Run the query over the open connection and collect the result as a DataFrame.
data_df = pd.read_sql(sql=data_sql, con=connection)

### Preprocessing Based on Optimization Results

In [None]:
# One-hot encode the HallOfFameStatus column (new columns get the 'HOF' prefix),
# then remove the 'HOF_Not Inducted' indicator by name. It is dropped explicitly
# because drop_first was judged not ideal here.
data_df = pd.get_dummies(
    data_df,
    columns=['HallOfFameStatus'],
    prefix='HOF',
).drop(columns=['HOF_Not Inducted'])

# Columns/features that are removed before modelling.
dropped_features = [
    'GUID',
    'Name',
    'HallofFameClass',
    'YearDrafted',
    'TO_YEAR',
    'Years_Played',
    'HOF_Elgibility_Year',
    'Team',
    'College',
    'GP',
    'MIN',
    'idPlayer',
    'BIRTHDATE',
    'POSITION',
    'AGE_ROOKIE_SEASON',
    'Draft_Decade',
    'numBallotsBeforeInduct',
]
data_df = data_df.drop(columns=dropped_features)

### Fit Model

In [None]:
# ML imports
from collections import Counter
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from sklearn.linear_model import LogisticRegression

In [None]:
# Name of the encoded column used as the prediction target.
target_column = 'HOF_Hall of Fame Member'

# Features: every remaining column except the target.
X = data_df.drop(columns=[target_column])

# Target: the encoded indicator column on its own.
y = data_df[target_column]

In [None]:
# Train-test split, stratified on the target.
# The training partition is oversampled in a later cell, so the classes are
# presumably imbalanced; stratify=y keeps the class proportions of `y` in both
# partitions, so the held-out set is not left with a skewed (or nearly empty)
# minority class by chance.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=2,
    stratify=y,
)

# Class counts in the training partition (shown as the cell output).
Counter(y_train)

In [None]:
# Random oversampling, applied to the training partition only
# (X_test / y_test are not passed to the sampler).
oversampler = RandomOverSampler(sampling_strategy='auto', random_state=2)
X_resampled, y_resampled = oversampler.fit_resample(X_train, y_train)

# Class counts after resampling (shown as the cell output).
Counter(y_resampled)

In [None]:
# Logistic regression (liblinear solver, fixed seed) trained on the
# RandomOverSampler output rather than on the raw training partition.
logreg_settings = {'solver': 'liblinear', 'random_state': 2}
model = LogisticRegression(**logreg_settings)
model.fit(X=X_resampled, y=y_resampled)

### ROC Plot

In [None]:
# import
import sklearn.metrics as metrics
import matplotlib.pyplot as plt

# definitions
# predict_proba returns one column per class; column index 1 is used as the
# score passed to roc_curve.
predictions = model.predict_proba(X_test)
preds = predictions[:, 1]
fpr, tpr, threshold = metrics.roc_curve(y_test, preds)
roc_auc = metrics.auc(fpr, tpr)

# generate plot
# An explicit Figure/Axes pair is used instead of the implicit pyplot state,
# so the cell always draws onto its own fresh figure.
fig, ax = plt.subplots()
ax.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
ax.plot([0, 1], [0, 1], 'r--')  # unlabeled diagonal reference line
ax.set(
    title='Receiver Operating Characteristic',
    xlabel='False Positive Rate',
    ylabel='True Positive Rate',
    xlim=(0, 1),
    ylim=(0, 1),
)
ax.legend(loc='lower right')

# Render the figure without echoing a matplotlib object repr as cell output.
plt.show()

# source
# https://stackoverflow.com/questions/25009284/how-to-plot-roc-curve-in-python