In [19]:
from collections import Counter

import pandas as pd
from imblearn.metrics import classification_report_imbalanced
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE, SVMSMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split


In [20]:
# Read the cleaned rookie CSV and index it by player name. Chaining
# set_index('name') moves the column into the index in one step, so no
# in-place mutation or separate drop is needed.
rookies_df = pd.read_csv('../data/cleaned_nba_hof_rookies.csv').set_index('name')
rookies_df.head()

Unnamed: 0_level_0,hof_class,year_drafted,GP,MIN,PTS,FGM,FGA,FG%,3PM,3PA,...,FTA,FT%,OREB,DREB,REB,AST,STL,BLK,TOV,EFF
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A.C. Green,0.0,1985,82,18.8,6.4,2.5,4.7,53.9,0.0,0.1,...,2.0,61.1,2.0,2.7,4.6,0.7,0.6,0.6,1.2,8.7
A.J. English,0.0,1990,70,20.6,8.8,3.6,8.2,43.9,0.0,0.4,...,2.2,70.7,0.9,1.2,2.1,2.5,0.4,0.2,1.6,7.1
A.J. Price,0.0,2009,56,15.4,7.3,2.6,6.3,41.0,1.1,3.1,...,1.3,80.0,0.2,1.4,1.6,1.9,0.6,0.1,1.1,6.4
Aaron Brooks,0.0,2007,51,11.9,5.2,1.8,4.4,41.3,0.7,2.1,...,1.0,85.7,0.3,0.8,1.1,1.7,0.3,0.1,0.9,4.7
Aaron Gordon,0.0,2014,47,17.0,5.2,2.0,4.4,44.7,0.3,1.0,...,1.3,72.1,1.0,2.6,3.6,0.7,0.5,0.5,0.8,6.8


In [22]:
# Target: the hof_class label column.
y = rookies_df['hof_class']

# Features: every column except the target.
X = rookies_df.drop(columns='hof_class')

In [23]:
# Plain (non-stratified) train/test split. test_size=0.25 is
# train_test_split's default, spelled out here; the fixed seed keeps the
# split reproducible.
# NOTE(review): the positive class is rare, so the number of positives in
# the test split depends on the seed -- consider stratify=y and re-running
# the cells below.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=1
)
# Class counts in the training split.
Counter(y_train)

Counter({0.0: 1121, 1.0: 31})

# RandomOverSampler

In [24]:
# Balance the training split with random oversampling. sampling_strategy
# is left at its default ('auto'), written out to match the other
# resampler cells in this notebook.
ros = RandomOverSampler(sampling_strategy='auto', random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

# Class counts after resampling.
Counter(y_resampled)

Counter({0.0: 1121, 1.0: 1121})

In [25]:
# Fit a liblinear logistic regression on the randomly oversampled
# training data; the fitted estimator is the cell's displayed value.
model = LogisticRegression(random_state=1, solver='liblinear')
model.fit(X=X_resampled, y=y_resampled)

LogisticRegression(random_state=1, solver='liblinear')

In [26]:
# Predict labels for the held-out test split with the current `model` and
# tabulate them against the true labels (rows = actual class, columns =
# predicted class). y_pred is reused by the metric cells that follow.
y_pred = model.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[333,  40],
       [  2,   9]], dtype=int64)

In [27]:
# Balanced accuracy (the mean of the per-class recalls) for the
# predictions currently held in y_pred.
balanced_accuracy_score(y_test, y_pred)

0.8554716061418475

In [28]:
# Per-class report (precision, recall, specificity, F1, geometric mean,
# IBA, support) for the predictions currently held in y_pred. The report
# is printed so that its line breaks render as a table.
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

        0.0       0.99      0.89      0.82      0.94      0.85      0.74       373
        1.0       0.18      0.82      0.89      0.30      0.85      0.72        11

avg / total       0.97      0.89      0.82      0.92      0.85      0.74       384



# SMOTE

In [32]:
# Oversample the training split with SMOTE.
# NOTE: this rebinds X_resampled / y_resampled, replacing the
# RandomOverSampler output produced earlier in the notebook.
X_resampled, y_resampled = SMOTE(
    random_state=1, sampling_strategy='auto'
).fit_resample(X_train, y_train)

# Class counts after resampling.
Counter(y_resampled)

Counter({0.0: 1121, 1.0: 1121})

In [33]:
# Fit a fresh liblinear logistic regression on whatever X_resampled /
# y_resampled currently hold (the SMOTE output when the notebook is run
# top to bottom). This rebinds `model`.
model = LogisticRegression(random_state=1, solver='liblinear')
model.fit(X=X_resampled, y=y_resampled)

LogisticRegression(random_state=1, solver='liblinear')

In [34]:
# Calculate the balanced accuracy score of the current model on the test
# split; y_pred is refreshed here and reused by the cells that follow.
y_pred = model.predict(X_test)
balanced_accuracy_score(y_true=y_test, y_pred=y_pred)

0.8207409212771144

In [35]:
# Confusion matrix for the predictions currently held in y_pred
# (rows = actual class, columns = predicted class).
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[341,  32],
       [  3,   8]], dtype=int64)

In [36]:
# Imbalanced classification report for the predictions currently held in
# y_pred, printed so that its line breaks render as a table.
print(classification_report_imbalanced(y_true=y_test, y_pred=y_pred))

                   pre       rec       spe        f1       geo       iba       sup

        0.0       0.99      0.91      0.73      0.95      0.82      0.68       373
        1.0       0.20      0.73      0.91      0.31      0.82      0.65        11

avg / total       0.97      0.91      0.73      0.93      0.82      0.68       384



# SVM SMOTE

In [42]:
# Oversample the training split with SVMSMOTE.
# NOTE: this rebinds X_resampled / y_resampled, replacing the output of
# the previous resampler.
X_resampled, y_resampled = SVMSMOTE(
    random_state=1, sampling_strategy='auto'
).fit_resample(X_train, y_train)

# Class counts after resampling.
Counter(y_resampled)

Counter({0.0: 1121, 1.0: 1121})

In [43]:
# Fit a fresh liblinear logistic regression on whatever X_resampled /
# y_resampled currently hold (the SVMSMOTE output when the notebook is
# run top to bottom). This rebinds `model`.
model = LogisticRegression(random_state=1, solver='liblinear')
model.fit(X=X_resampled, y=y_resampled)

LogisticRegression(random_state=1, solver='liblinear')

In [44]:
# Calculate the balanced accuracy score of the current model on the test
# split; y_pred is refreshed here and reused by the cells that follow.
y_pred = model.predict(X_test)
balanced_accuracy_score(y_true=y_test, y_pred=y_pred)

0.6068730197416524

In [45]:
# Confusion matrix for the predictions currently held in y_pred
# (rows = actual class, columns = predicted class).
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[351,  22],
       [  8,   3]], dtype=int64)

In [46]:
# Imbalanced classification report for the predictions currently held in
# y_pred, printed so that its line breaks render as a table.
print(classification_report_imbalanced(y_true=y_test, y_pred=y_pred))

                   pre       rec       spe        f1       geo       iba       sup

        0.0       0.98      0.94      0.27      0.96      0.51      0.27       373
        1.0       0.12      0.27      0.94      0.17      0.51      0.24        11

avg / total       0.95      0.92      0.29      0.94      0.51      0.27       384

