# Optimization Attempts

### Load data from database

In [1]:
# imports
import pandas as pd
import sklearn as skl
import psycopg2
import config as c

# import for multiple output per cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
# connection to database
connection = psycopg2.connect(
    host = c.host,
    port = c.port,
    user = c.user,
    password = c.password,
    database = c.database
    )
cursor=connection.cursor()

In [77]:
data_sql = """
SELECT *
FROM comprehensive_dataset;
"""

#  load in tables as dataframes
data_df = pd.read_sql(data_sql, con=connection)
data_df.head()

Unnamed: 0,GUID,Name,HallOfFameStatus,HallofFameClass,YearDrafted,TO_YEAR,Years_Played,HOF_Elgibility_Year,Pick,Team,...,FTA,FT%,OREB,DREB,REB,AST,STL,BLK,TOV,EFF
0,MIKENILES1980,Mike Niles,Not Inducted,,1980,1980,1,1985,,,...,0.8,45.9,0.6,0.7,1.3,0.3,0.2,0.0,0.6,1.4
1,WAYNEROBINSON1980,Wayne Robinson,Not Inducted,,1980,1980,1,1985,31.0,LAL,...,3.0,72.9,1.4,2.2,3.6,1.4,0.6,0.3,1.8,7.8
2,BILLYREID1980,Billy Reid,Not Inducted,,1980,1980,1,1985,182.0,GSW,...,0.7,56.4,0.5,0.6,1.0,1.2,0.6,0.1,1.3,2.8
3,ALEXBRADLEY1981,Alex Bradley,Not Inducted,,1981,1981,1,1986,86.0,NYK,...,1.2,60.4,0.8,0.9,1.7,0.3,0.3,0.1,0.7,3.4
4,GARRYWITTS1981,Garry Witts,Not Inducted,,1981,1981,1,1986,103.0,WSB,...,0.9,82.5,0.6,0.7,1.3,0.8,0.4,0.1,0.8,3.8


In [78]:
data_df.dtypes

GUID                    object
Name                    object
HallOfFameStatus        object
HallofFameClass        float64
YearDrafted              int64
TO_YEAR                  int64
Years_Played             int64
HOF_Elgibility_Year      int64
Pick                   float64
Team                    object
College                 object
GP                       int64
MIN                    float64
PTS                    float64
FGM                    float64
FGA                    float64
FG%                    float64
3P_Made                float64
3PA                    float64
3P%                    float64
FTM                    float64
FTA                    float64
FT%                    float64
OREB                   float64
DREB                   float64
REB                    float64
AST                    float64
STL                    float64
BLK                    float64
TOV                    float64
EFF                    float64
dtype: object

## Preprocessing 

In [79]:
# get_dummies to encode HallofFameClass column
data_df = pd.get_dummies(data_df, prefix='HOF', columns=['HallOfFameStatus'])
data_df.drop(['HOF_Not Inducted'], axis=1, inplace=True) # remove extra get_dummies column
data_df.head()

Unnamed: 0,GUID,Name,HallofFameClass,YearDrafted,TO_YEAR,Years_Played,HOF_Elgibility_Year,Pick,Team,College,...,FT%,OREB,DREB,REB,AST,STL,BLK,TOV,EFF,HOF_Hall of Fame Member
0,MIKENILES1980,Mike Niles,,1980,1980,1,1985,,,,...,45.9,0.6,0.7,1.3,0.3,0.2,0.0,0.6,1.4,0
1,WAYNEROBINSON1980,Wayne Robinson,,1980,1980,1,1985,31.0,LAL,Virginia Polytechnic Institute and State Unive...,...,72.9,1.4,2.2,3.6,1.4,0.6,0.3,1.8,7.8,0
2,BILLYREID1980,Billy Reid,,1980,1980,1,1985,182.0,GSW,University of San Francisco,...,56.4,0.5,0.6,1.0,1.2,0.6,0.1,1.3,2.8,0
3,ALEXBRADLEY1981,Alex Bradley,,1981,1981,1,1986,86.0,NYK,Villanova University,...,60.4,0.8,0.9,1.7,0.3,0.3,0.1,0.7,3.4,0
4,GARRYWITTS1981,Garry Witts,,1981,1981,1,1986,103.0,WSB,College of the Holy Cross,...,82.5,0.6,0.7,1.3,0.8,0.4,0.1,0.8,3.8,0


In [80]:
# confirm HOF members encoded
print(data_df.loc[data_df['GUID'] == 'MICHAELJORDAN1984'])
# This confirms HOF members = 1, non members = 0

                  GUID            Name  HallofFameClass  YearDrafted  TO_YEAR  \
597  MICHAELJORDAN1984  Michael Jordan           2009.0         1984     2002   

     Years_Played  HOF_Elgibility_Year  Pick Team  \
597            19                 2007   3.0  CHI   

                          College  ...   FT%  OREB  DREB  REB  AST  STL  BLK  \
597  University of North Carolina  ...  84.5   2.0   4.5  6.5  5.9  2.4  0.8   

     TOV   EFF  HOF_Hall of Fame Member  
597  3.5  29.2                        1  

[1 rows x 31 columns]


In [81]:
data_df.dtypes

GUID                        object
Name                        object
HallofFameClass            float64
YearDrafted                  int64
TO_YEAR                      int64
Years_Played                 int64
HOF_Elgibility_Year          int64
Pick                       float64
Team                        object
College                     object
GP                           int64
MIN                        float64
PTS                        float64
FGM                        float64
FGA                        float64
FG%                        float64
3P_Made                    float64
3PA                        float64
3P%                        float64
FTM                        float64
FTA                        float64
FT%                        float64
OREB                       float64
DREB                       float64
REB                        float64
AST                        float64
STL                        float64
BLK                        float64
TOV                 

In [82]:
# check for NaNs
data_df.isnull().values.any()

True

In [83]:
# where are the NaNs?
data_df.count()

GUID                       1217
Name                       1217
HallofFameClass              42
YearDrafted                1217
TO_YEAR                    1217
Years_Played               1217
HOF_Elgibility_Year        1217
Pick                        915
Team                        915
College                     870
GP                         1217
MIN                        1217
PTS                        1217
FGM                        1217
FGA                        1217
FG%                        1217
3P_Made                    1217
3PA                        1217
3P%                        1217
FTM                        1217
FTA                        1217
FT%                        1217
OREB                       1217
DREB                       1217
REB                        1217
AST                        1217
STL                        1217
BLK                        1217
TOV                        1217
EFF                        1217
HOF_Hall of Fame Member    1217
dtype: i

In [84]:
# fill NaNs with 0
data_df['Pick'] = data_df['Pick'].fillna(0)
data_df.count()

GUID                       1217
Name                       1217
HallofFameClass              42
YearDrafted                1217
TO_YEAR                    1217
Years_Played               1217
HOF_Elgibility_Year        1217
Pick                       1217
Team                        915
College                     870
GP                         1217
MIN                        1217
PTS                        1217
FGM                        1217
FGA                        1217
FG%                        1217
3P_Made                    1217
3PA                        1217
3P%                        1217
FTM                        1217
FTA                        1217
FT%                        1217
OREB                       1217
DREB                       1217
REB                        1217
AST                        1217
STL                        1217
BLK                        1217
TOV                        1217
EFF                        1217
HOF_Hall of Fame Member    1217
dtype: i

In [85]:
data_df.head()

Unnamed: 0,GUID,Name,HallofFameClass,YearDrafted,TO_YEAR,Years_Played,HOF_Elgibility_Year,Pick,Team,College,...,FT%,OREB,DREB,REB,AST,STL,BLK,TOV,EFF,HOF_Hall of Fame Member
0,MIKENILES1980,Mike Niles,,1980,1980,1,1985,0.0,,,...,45.9,0.6,0.7,1.3,0.3,0.2,0.0,0.6,1.4,0
1,WAYNEROBINSON1980,Wayne Robinson,,1980,1980,1,1985,31.0,LAL,Virginia Polytechnic Institute and State Unive...,...,72.9,1.4,2.2,3.6,1.4,0.6,0.3,1.8,7.8,0
2,BILLYREID1980,Billy Reid,,1980,1980,1,1985,182.0,GSW,University of San Francisco,...,56.4,0.5,0.6,1.0,1.2,0.6,0.1,1.3,2.8,0
3,ALEXBRADLEY1981,Alex Bradley,,1981,1981,1,1986,86.0,NYK,Villanova University,...,60.4,0.8,0.9,1.7,0.3,0.3,0.1,0.7,3.4,0
4,GARRYWITTS1981,Garry Witts,,1981,1981,1,1986,103.0,WSB,College of the Holy Cross,...,82.5,0.6,0.7,1.3,0.8,0.4,0.1,0.8,3.8,0


In [86]:
# drop columns that will likely error/confuse the ML models
data_df.drop(['GUID', 'Name', 'Team', 'College', # dropping object columns 
              'HallofFameClass', 'YearDrafted', 'TO_YEAR', 'HOF_Elgibility_Year', 'GP', 'MIN' # non performance stats
                ], axis=1, inplace=True)
data_df.head()

Unnamed: 0,Years_Played,Pick,PTS,FGM,FGA,FG%,3P_Made,3PA,3P%,FTM,...,FT%,OREB,DREB,REB,AST,STL,BLK,TOV,EFF,HOF_Hall of Fame Member
0,1,0.0,2.6,1.1,3.1,34.8,0.0,0.1,50.0,0.4,...,45.9,0.6,0.7,1.3,0.3,0.2,0.0,0.6,1.4,0
1,1,31.0,7.9,2.9,6.3,46.0,0.0,0.1,0.0,2.2,...,72.9,1.4,2.2,3.6,1.4,0.6,0.3,1.8,7.8,0
2,1,182.0,3.2,1.4,3.1,45.4,0.0,0.1,0.0,0.4,...,56.4,0.5,0.6,1.0,1.2,0.6,0.1,1.3,2.8,0
3,1,86.0,3.5,1.4,2.6,52.4,0.0,0.0,0.0,0.7,...,60.4,0.8,0.9,1.7,0.3,0.3,0.1,0.7,3.4,0
4,1,103.0,2.9,1.1,1.8,58.3,0.0,0.0,50.0,0.7,...,82.5,0.6,0.7,1.3,0.8,0.4,0.1,0.8,3.8,0


In [87]:
#save df for easy loading
data_df.to_csv('experiment.csv', index=False)

In [88]:
# run this cell again to bring in unaltered data
data_df = pd.read_csv('experiment.csv')
data_df.drop(['Unnamed: 0'], axis=1, inplace=True)
data_df.head()

KeyError: "['Unnamed: 0'] not found in axis"

# Machine Learning

In [89]:
# ML imports
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import balanced_accuracy_score
from imblearn.metrics import classification_report_imbalanced
import tensorflow as tf

In [90]:
# create our features
X = data_df.drop(['HOF_Hall of Fame Member'], axis=1)

# create our target
y = data_df['HOF_Hall of Fame Member']

In [91]:
# normal train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
Counter(y_train)

Counter({0: 879, 1: 33})

### SVM SMOTE

In [92]:
# import SVMSMOTE
from imblearn.over_sampling import SVMSMOTE

In [93]:
# running it all for easy copy/paste

# implement SVM SMOTE resampling
X_resampled, y_resampled = SVMSMOTE(random_state=1, sampling_strategy='auto').fit_resample(X_train, y_train)
Counter(y_resampled)

# logistic regression using SVM SMOTE data
model = LogisticRegression(solver='liblinear', random_state=1)
model.fit(X_resampled, y_resampled)

# calculated the balanced accuracy score
y_pred = model.predict(X_test)
acc_score = balanced_accuracy_score(y_test, y_pred)

# display the confusion matrix
cm = confusion_matrix(y_test, y_pred)

# create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Not HOF Member", "HOF Member"], 
    columns=["Predicted Not HOF Member", "Predicted HOF Member"])

# SVMSMOTE results
print('_______BASELINE RESULTS_______\n')
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report_imbalanced (y_test, y_pred))

Counter({0: 879, 1: 596})

LogisticRegression(random_state=1, solver='liblinear')

_______BASELINE RESULTS_______

Confusion Matrix


Unnamed: 0,Predicted Not HOF Member,Predicted HOF Member
Not HOF Member,273,23
HOF Member,0,9


Accuracy Score : 0.9611486486486487
Classification Report
                   pre       rec       spe        f1       geo       iba       sup

          0       1.00      0.92      1.00      0.96      0.96      0.92       296
          1       0.28      1.00      0.92      0.44      0.96      0.93         9

avg / total       0.98      0.92      1.00      0.94      0.96      0.92       305



In [94]:
# run this cell again to bring in unaltered data
data_df = pd.read_csv('experiment.csv')
data_df.drop(['Unnamed: 0'], axis=1, inplace=True)
data_df.head()

KeyError: "['Unnamed: 0'] not found in axis"

In [95]:
# experiemnting with dropping features
# dropping attemtps and percentages, keeping shots made only
data_df.drop(['FGA', 'FG%', '3PA', '3P%', 'FTA', 'FT%'], axis=1, inplace=True)
data_df.head()

Unnamed: 0,Years_Played,Pick,PTS,FGM,3P_Made,FTM,OREB,DREB,REB,AST,STL,BLK,TOV,EFF,HOF_Hall of Fame Member
0,1,0.0,2.6,1.1,0.0,0.4,0.6,0.7,1.3,0.3,0.2,0.0,0.6,1.4,0
1,1,31.0,7.9,2.9,0.0,2.2,1.4,2.2,3.6,1.4,0.6,0.3,1.8,7.8,0
2,1,182.0,3.2,1.4,0.0,0.4,0.5,0.6,1.0,1.2,0.6,0.1,1.3,2.8,0
3,1,86.0,3.5,1.4,0.0,0.7,0.8,0.9,1.7,0.3,0.3,0.1,0.7,3.4,0
4,1,103.0,2.9,1.1,0.0,0.7,0.6,0.7,1.3,0.8,0.4,0.1,0.8,3.8,0


In [96]:
# create our features
X = data_df.drop(['HOF_Hall of Fame Member'], axis=1)

# create our target
y = data_df['HOF_Hall of Fame Member']

# normal train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
Counter(y_train)

# implement SVM SMOTE resampling
X_resampled, y_resampled = SVMSMOTE(random_state=1, sampling_strategy='auto').fit_resample(X_train, y_train)
Counter(y_resampled)

# logistic regression using SVM SMOTE data
model = LogisticRegression(solver='liblinear', random_state=1)
model.fit(X_resampled, y_resampled)

# calculated the balanced accuracy score
y_pred = model.predict(X_test)
acc_score = balanced_accuracy_score(y_test, y_pred)

# display the confusion matrix
cm = confusion_matrix(y_test, y_pred)

# create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Not HOF Member", "HOF Member"], 
    columns=["Predicted Not HOF Member", "Predicted HOF Member"])

# SVMSMOTE results
print('_______DROPPING ATTEMPTS AND PERCENTAGE RESULTS_______\n')
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report_imbalanced (y_test, y_pred))

Counter({0: 879, 1: 33})

Counter({0: 879, 1: 879})

LogisticRegression(random_state=1, solver='liblinear')

_______DROPPING ATTEMPTS AND PERCENTAGE RESULTS_______

Confusion Matrix


Unnamed: 0,Predicted Not HOF Member,Predicted HOF Member
Not HOF Member,266,30
HOF Member,0,9


Accuracy Score : 0.9493243243243243
Classification Report
                   pre       rec       spe        f1       geo       iba       sup

          0       1.00      0.90      1.00      0.95      0.95      0.89       296
          1       0.23      1.00      0.90      0.38      0.95      0.91         9

avg / total       0.98      0.90      1.00      0.93      0.95      0.89       305



In [97]:
# run this cell again to bring in unaltered data
data_df = pd.read_csv('experiment.csv')
data_df.drop(['Unnamed: 0'], axis=1, inplace=True)

KeyError: "['Unnamed: 0'] not found in axis"

In [98]:
# experiemnting with dropping features
# dropping percentages, keeping shots made and attempts
data_df.drop(['FG%', '3P%', 'FT%'], axis=1, inplace=True)

In [99]:
# create our features
X = data_df.drop(['HOF_Hall of Fame Member'], axis=1)

# create our target
y = data_df['HOF_Hall of Fame Member']

# normal train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
Counter(y_train)

# implement SVM SMOTE resampling
X_resampled, y_resampled = SVMSMOTE(random_state=1, sampling_strategy='auto').fit_resample(X_train, y_train)
Counter(y_resampled)

# logistic regression using SVM SMOTE data
model = LogisticRegression(solver='liblinear', random_state=1)
model.fit(X_resampled, y_resampled)

# calculated the balanced accuracy score
y_pred = model.predict(X_test)
acc_score = balanced_accuracy_score(y_test, y_pred)

# display the confusion matrix
cm = confusion_matrix(y_test, y_pred)

# create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Not HOF Member", "HOF Member"], 
    columns=["Predicted Not HOF Member", "Predicted HOF Member"])

# SVMSMOTE results
print('_______DROPPING PERCENTAGE FEATURES_______\n')
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report_imbalanced (y_test, y_pred))

Counter({0: 879, 1: 33})

Counter({0: 879, 1: 879})

LogisticRegression(random_state=1, solver='liblinear')

_______DROPPING PERCENTAGE FEATURES_______

Confusion Matrix


Unnamed: 0,Predicted Not HOF Member,Predicted HOF Member
Not HOF Member,264,32
HOF Member,0,9


Accuracy Score : 0.9459459459459459
Classification Report
                   pre       rec       spe        f1       geo       iba       sup

          0       1.00      0.89      1.00      0.94      0.94      0.88       296
          1       0.22      1.00      0.89      0.36      0.94      0.90         9

avg / total       0.98      0.90      1.00      0.93      0.94      0.88       305



In [100]:
data_sql = """
SELECT *
FROM comprehensive_dataset;
"""

#  load in tables as dataframes
data_df = pd.read_sql(data_sql, con=connection)
data_df.head()

Unnamed: 0,GUID,Name,HallOfFameStatus,HallofFameClass,YearDrafted,TO_YEAR,Years_Played,HOF_Elgibility_Year,Pick,Team,...,FTA,FT%,OREB,DREB,REB,AST,STL,BLK,TOV,EFF
0,MIKENILES1980,Mike Niles,Not Inducted,,1980,1980,1,1985,,,...,0.8,45.9,0.6,0.7,1.3,0.3,0.2,0.0,0.6,1.4
1,WAYNEROBINSON1980,Wayne Robinson,Not Inducted,,1980,1980,1,1985,31.0,LAL,...,3.0,72.9,1.4,2.2,3.6,1.4,0.6,0.3,1.8,7.8
2,BILLYREID1980,Billy Reid,Not Inducted,,1980,1980,1,1985,182.0,GSW,...,0.7,56.4,0.5,0.6,1.0,1.2,0.6,0.1,1.3,2.8
3,ALEXBRADLEY1981,Alex Bradley,Not Inducted,,1981,1981,1,1986,86.0,NYK,...,1.2,60.4,0.8,0.9,1.7,0.3,0.3,0.1,0.7,3.4
4,GARRYWITTS1981,Garry Witts,Not Inducted,,1981,1981,1,1986,103.0,WSB,...,0.9,82.5,0.6,0.7,1.3,0.8,0.4,0.1,0.8,3.8


In [101]:
data_df.dtypes

GUID                    object
Name                    object
HallOfFameStatus        object
HallofFameClass        float64
YearDrafted              int64
TO_YEAR                  int64
Years_Played             int64
HOF_Elgibility_Year      int64
Pick                   float64
Team                    object
College                 object
GP                       int64
MIN                    float64
PTS                    float64
FGM                    float64
FGA                    float64
FG%                    float64
3P_Made                float64
3PA                    float64
3P%                    float64
FTM                    float64
FTA                    float64
FT%                    float64
OREB                   float64
DREB                   float64
REB                    float64
AST                    float64
STL                    float64
BLK                    float64
TOV                    float64
EFF                    float64
dtype: object

In [102]:
# get_dummies to encode HallofFameClass column
data_df = pd.get_dummies(data_df, prefix='HOF', columns=['HallOfFameStatus'])
data_df.drop(['HOF_Not Inducted'], axis=1, inplace=True) # remove extra get_dummies column
data_df.head()

Unnamed: 0,GUID,Name,HallofFameClass,YearDrafted,TO_YEAR,Years_Played,HOF_Elgibility_Year,Pick,Team,College,...,FT%,OREB,DREB,REB,AST,STL,BLK,TOV,EFF,HOF_Hall of Fame Member
0,MIKENILES1980,Mike Niles,,1980,1980,1,1985,,,,...,45.9,0.6,0.7,1.3,0.3,0.2,0.0,0.6,1.4,0
1,WAYNEROBINSON1980,Wayne Robinson,,1980,1980,1,1985,31.0,LAL,Virginia Polytechnic Institute and State Unive...,...,72.9,1.4,2.2,3.6,1.4,0.6,0.3,1.8,7.8,0
2,BILLYREID1980,Billy Reid,,1980,1980,1,1985,182.0,GSW,University of San Francisco,...,56.4,0.5,0.6,1.0,1.2,0.6,0.1,1.3,2.8,0
3,ALEXBRADLEY1981,Alex Bradley,,1981,1981,1,1986,86.0,NYK,Villanova University,...,60.4,0.8,0.9,1.7,0.3,0.3,0.1,0.7,3.4,0
4,GARRYWITTS1981,Garry Witts,,1981,1981,1,1986,103.0,WSB,College of the Holy Cross,...,82.5,0.6,0.7,1.3,0.8,0.4,0.1,0.8,3.8,0


In [103]:
# fill NaNs with 0
data_df['Pick'] = data_df['Pick'].fillna(0)

In [104]:
# drop columns that will likely error/confuse the ML models
# try keeping GP and MIN
data_df.drop(['GUID', 'Name', 'Team', 'College', # dropping object columns 
              'HallofFameClass', 'YearDrafted', 'TO_YEAR', 'HOF_Elgibility_Year' # non performance stats
                ], axis=1, inplace=True)
data_df.head()

Unnamed: 0,Years_Played,Pick,GP,MIN,PTS,FGM,FGA,FG%,3P_Made,3PA,...,FT%,OREB,DREB,REB,AST,STL,BLK,TOV,EFF,HOF_Hall of Fame Member
0,1,0.0,44,5.3,2.6,1.1,3.1,34.8,0.0,0.1,...,45.9,0.6,0.7,1.3,0.3,0.2,0.0,0.6,1.4,0
1,1,31.0,81,19.7,7.9,2.9,6.3,46.0,0.0,0.1,...,72.9,1.4,2.2,3.6,1.4,0.6,0.3,1.8,7.8,0
2,1,182.0,59,10.1,3.2,1.4,3.1,45.4,0.0,0.1,...,56.4,0.5,0.6,1.0,1.2,0.6,0.1,1.3,2.8,0
3,1,86.0,39,8.5,3.5,1.4,2.6,52.4,0.0,0.0,...,60.4,0.8,0.9,1.7,0.3,0.3,0.1,0.7,3.4,0
4,1,103.0,46,10.7,2.9,1.1,1.8,58.3,0.0,0.0,...,82.5,0.6,0.7,1.3,0.8,0.4,0.1,0.8,3.8,0


In [105]:
# create our features
X = data_df.drop(['HOF_Hall of Fame Member'], axis=1)

# create our target
y = data_df['HOF_Hall of Fame Member']

# normal train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
Counter(y_train)

# implement SVM SMOTE resampling
X_resampled, y_resampled = SVMSMOTE(random_state=1, sampling_strategy='auto').fit_resample(X_train, y_train)
Counter(y_resampled)

# logistic regression using SVM SMOTE data
model = LogisticRegression(solver='liblinear', random_state=1)
model.fit(X_resampled, y_resampled)

# calculated the balanced accuracy score
y_pred = model.predict(X_test)
acc_score = balanced_accuracy_score(y_test, y_pred)

# display the confusion matrix
cm = confusion_matrix(y_test, y_pred)

# create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Not HOF Member", "HOF Member"], 
    columns=["Predicted Not HOF Member", "Predicted HOF Member"])

# SVMSMOTE results
print('_______KEEPING GP AND MIN_______\n')
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report_imbalanced (y_test, y_pred))
# negative impact

Counter({0: 879, 1: 33})

Counter({0: 879, 1: 596})

LogisticRegression(random_state=1, solver='liblinear')

_______KEEPING GP AND MIN_______

Confusion Matrix


Unnamed: 0,Predicted Not HOF Member,Predicted HOF Member
Not HOF Member,268,28
HOF Member,0,9


Accuracy Score : 0.9527027027027026
Classification Report
                   pre       rec       spe        f1       geo       iba       sup

          0       1.00      0.91      1.00      0.95      0.95      0.90       296
          1       0.24      1.00      0.91      0.39      0.95      0.91         9

avg / total       0.98      0.91      1.00      0.93      0.95      0.90       305



In [106]:
# run this cell again to bring in unaltered data
data_df = pd.read_csv('experiment.csv')
data_df.drop(['Unnamed: 0'], axis=1, inplace=True)

KeyError: "['Unnamed: 0'] not found in axis"

In [107]:
# experiemnting with dropping features
# dropping OREB and DREB, keeping REBS
data_df.drop(['OREB', 'DREB'], axis=1, inplace=True)

In [108]:
# create our features
X = data_df.drop(['HOF_Hall of Fame Member'], axis=1)

# create our target
y = data_df['HOF_Hall of Fame Member']

# normal train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
Counter(y_train)

# implement SVM SMOTE resampling
X_resampled, y_resampled = SVMSMOTE(random_state=1, sampling_strategy='auto').fit_resample(X_train, y_train)
Counter(y_resampled)

# logistic regression using SVM SMOTE data
model = LogisticRegression(solver='liblinear', random_state=1)
model.fit(X_resampled, y_resampled)

# calculated the balanced accuracy score
y_pred = model.predict(X_test)
acc_score = balanced_accuracy_score(y_test, y_pred)

# display the confusion matrix
cm = confusion_matrix(y_test, y_pred)

# create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Not HOF Member", "HOF Member"], 
    columns=["Predicted Not HOF Member", "Predicted HOF Member"])

# SVMSMOTE results
print('_______DROPPING DREB AND OREB_______\n')
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report_imbalanced (y_test, y_pred))
# little to no impact, but still lower compared to baseline

Counter({0: 879, 1: 33})

Counter({0: 879, 1: 596})

LogisticRegression(random_state=1, solver='liblinear')

_______DROPPING DREB AND OREB_______

Confusion Matrix


Unnamed: 0,Predicted Not HOF Member,Predicted HOF Member
Not HOF Member,272,24
HOF Member,0,9


Accuracy Score : 0.9594594594594594
Classification Report
                   pre       rec       spe        f1       geo       iba       sup

          0       1.00      0.92      1.00      0.96      0.96      0.91       296
          1       0.27      1.00      0.92      0.43      0.96      0.93         9

avg / total       0.98      0.92      1.00      0.94      0.96      0.91       305

