# Module 20 First Segment Project Deliverable

## Load data from database

In [1]:
# imports
import psycopg2
import config as c
import pandas as pd

In [2]:
# Open the PostgreSQL connection. Every connection setting is read from the
# `config` module (imported as `c`), so no credential is written in this cell.
connection = psycopg2.connect(host=c.host, port=c.port, user=c.user,
                              password=c.password, database=c.database)
# DB-API cursor on the same connection.
cursor = connection.cursor()

In [3]:
# Read the whole rookies_df table into a DataFrame and preview its first rows.
sql = """
SELECT *
FROM rookies_df
"""

rookies_df = pd.read_sql(sql=sql, con=connection)
rookies_df.head(n=5)

Unnamed: 0,index,name,halloffameclass,year_drafted,gp,min,pts,fgm,fga,FG%,...,fta,FT%,oreb,dreb,reb,ast,stl,blk,tov,eff
0,0,A.C. Green,Not Inducted,1985,82,18.8,6.4,2.5,4.7,53.9,...,2.0,61.1,2.0,2.7,4.6,0.7,0.6,0.6,1.2,8.7
1,1,A.J. English,Not Inducted,1990,70,20.6,8.8,3.6,8.2,43.9,...,2.2,70.7,0.9,1.2,2.1,2.5,0.4,0.2,1.6,7.1
2,2,A.J. Price,Not Inducted,2009,56,15.4,7.3,2.6,6.3,41.0,...,1.3,80.0,0.2,1.4,1.6,1.9,0.6,0.1,1.1,6.4
3,3,Aaron Brooks,Not Inducted,2007,51,11.9,5.2,1.8,4.4,41.3,...,1.0,85.7,0.3,0.8,1.1,1.7,0.3,0.1,0.9,4.7
4,4,Aaron Gordon,Not Inducted,2014,47,17.0,5.2,2.0,4.4,44.7,...,1.3,72.1,1.0,2.6,3.6,0.7,0.5,0.5,0.8,6.8


In [4]:
# Show the dtype pandas assigned to every column of rookies_df, to confirm which
# columns are numeric and which are object (string) columns.
rookies_df.dtypes

index                int64
name                object
halloffameclass     object
year_drafted         int64
gp                   int64
min                float64
pts                float64
fgm                float64
fga                float64
FG%                float64
3PMade             float64
3PA                float64
3P%                float64
ftm                float64
fta                float64
FT%                float64
oreb               float64
dreb               float64
reb                float64
ast                float64
stl                float64
blk                float64
tov                float64
eff                float64
dtype: object

## Preprocessing 

In [5]:
# One-hot encode the halloffameclass label: get_dummies replaces that column
# with one indicator column per distinct class value.
rookies_df = pd.get_dummies(data=rookies_df, columns=['halloffameclass'])
rookies_df.head(n=5)

Unnamed: 0,index,name,year_drafted,gp,min,pts,fgm,fga,FG%,3PMade,...,oreb,dreb,reb,ast,stl,blk,tov,eff,halloffameclass_Hall of Fame Member,halloffameclass_Not Inducted
0,0,A.C. Green,1985,82,18.8,6.4,2.5,4.7,53.9,0.0,...,2.0,2.7,4.6,0.7,0.6,0.6,1.2,8.7,0,1
1,1,A.J. English,1990,70,20.6,8.8,3.6,8.2,43.9,0.0,...,0.9,1.2,2.1,2.5,0.4,0.2,1.6,7.1,0,1
2,2,A.J. Price,2009,56,15.4,7.3,2.6,6.3,41.0,1.1,...,0.2,1.4,1.6,1.9,0.6,0.1,1.1,6.4,0,1
3,3,Aaron Brooks,2007,51,11.9,5.2,1.8,4.4,41.3,0.7,...,0.3,0.8,1.1,1.7,0.3,0.1,0.9,4.7,0,1
4,4,Aaron Gordon,2014,47,17.0,5.2,2.0,4.4,44.7,0.3,...,1.0,2.6,3.6,0.7,0.5,0.5,0.8,6.8,0,1


In [6]:
# Give the two one-hot indicator columns short, code-friendly names.
# Only these two labels need changing: the table already uses 'name',
# 'year_drafted' and '3PMade', so mapping entries keyed on 'Name',
# 'Year Drafted' or '3P Made' would match no column and be silently ignored
# by rename() (its default is errors='ignore').
# rename() returns a new frame, so rebind the name instead of using inplace.
rookies_df = rookies_df.rename(columns={
    'halloffameclass_Hall of Fame Member': 'inducted',
    'halloffameclass_Not Inducted': 'not_inducted',
})
rookies_df.head()

Unnamed: 0,index,name,year_drafted,gp,min,pts,fgm,fga,FG%,3PMade,...,oreb,dreb,reb,ast,stl,blk,tov,eff,inducted,not_inducted
0,0,A.C. Green,1985,82,18.8,6.4,2.5,4.7,53.9,0.0,...,2.0,2.7,4.6,0.7,0.6,0.6,1.2,8.7,0,1
1,1,A.J. English,1990,70,20.6,8.8,3.6,8.2,43.9,0.0,...,0.9,1.2,2.1,2.5,0.4,0.2,1.6,7.1,0,1
2,2,A.J. Price,2009,56,15.4,7.3,2.6,6.3,41.0,1.1,...,0.2,1.4,1.6,1.9,0.6,0.1,1.1,6.4,0,1
3,3,Aaron Brooks,2007,51,11.9,5.2,1.8,4.4,41.3,0.7,...,0.3,0.8,1.1,1.7,0.3,0.1,0.9,4.7,0,1
4,4,Aaron Gordon,2014,47,17.0,5.2,2.0,4.4,44.7,0.3,...,1.0,2.6,3.6,0.7,0.5,0.5,0.8,6.8,0,1


In [7]:
# Index the frame by player name and discard the leftover 'index' column.
# set_index('name') moves the column into the index, so 'name' needs no
# separate drop afterwards.
rookies_df.set_index('name', inplace=True)
rookies_df.drop(columns=['index'], inplace=True)
rookies_df.head(n=5)

Unnamed: 0_level_0,year_drafted,gp,min,pts,fgm,fga,FG%,3PMade,3PA,3P%,...,oreb,dreb,reb,ast,stl,blk,tov,eff,inducted,not_inducted
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A.C. Green,1985,82,18.8,6.4,2.5,4.7,53.9,0.0,0.1,16.7,...,2.0,2.7,4.6,0.7,0.6,0.6,1.2,8.7,0,1
A.J. English,1990,70,20.6,8.8,3.6,8.2,43.9,0.0,0.4,9.7,...,0.9,1.2,2.1,2.5,0.4,0.2,1.6,7.1,0,1
A.J. Price,2009,56,15.4,7.3,2.6,6.3,41.0,1.1,3.1,34.5,...,0.2,1.4,1.6,1.9,0.6,0.1,1.1,6.4,0,1
Aaron Brooks,2007,51,11.9,5.2,1.8,4.4,41.3,0.7,2.1,33.0,...,0.3,0.8,1.1,1.7,0.3,0.1,0.9,4.7,0,1
Aaron Gordon,2014,47,17.0,5.2,2.0,4.4,44.7,0.3,1.0,27.1,...,1.0,2.6,3.6,0.7,0.5,0.5,0.8,6.8,0,1


In [9]:
# Remove year_drafted and gp, columns that will likely confuse the ML models.
rookies_df.drop(columns=['year_drafted', 'gp'], inplace=True)
rookies_df.head(n=5)

Unnamed: 0_level_0,min,pts,fgm,fga,FG%,3PMade,3PA,3P%,ftm,fta,...,oreb,dreb,reb,ast,stl,blk,tov,eff,inducted,not_inducted
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A.C. Green,18.8,6.4,2.5,4.7,53.9,0.0,0.1,16.7,1.2,2.0,...,2.0,2.7,4.6,0.7,0.6,0.6,1.2,8.7,0,1
A.J. English,20.6,8.8,3.6,8.2,43.9,0.0,0.4,9.7,1.6,2.2,...,0.9,1.2,2.1,2.5,0.4,0.2,1.6,7.1,0,1
A.J. Price,15.4,7.3,2.6,6.3,41.0,1.1,3.1,34.5,1.1,1.3,...,0.2,1.4,1.6,1.9,0.6,0.1,1.1,6.4,0,1
Aaron Brooks,11.9,5.2,1.8,4.4,41.3,0.7,2.1,33.0,0.8,1.0,...,0.3,0.8,1.1,1.7,0.3,0.1,0.9,4.7,0,1
Aaron Gordon,17.0,5.2,2.0,4.4,44.7,0.3,1.0,27.1,0.9,1.3,...,1.0,2.6,3.6,0.7,0.5,0.5,0.8,6.8,0,1


## Machine Learning Models

In [10]:
# ML imports
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import balanced_accuracy_score
from imblearn.metrics import classification_report_imbalanced

In [11]:
# Target vector: the 'inducted' indicator column.
y = rookies_df['inducted']

# Feature matrix: every column except the two class-indicator columns.
X = rookies_df.drop(columns=['inducted', 'not_inducted'])

In [12]:
# Split into train and test sets using scikit-learn's default proportions; the
# fixed random_state keeps the split reproducible. Counter then shows the class
# balance of the training labels.
# NOTE(review): the split is not stratified. With so few positive labels,
# stratify=y may be worth considering, but it would change every result that
# follows, so it is not applied here.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
)
Counter(y_train)

Counter({0: 1119, 1: 33})

### RandomOverSampler

In [13]:
# import RandomOverSampler
from imblearn.over_sampling import RandomOverSampler

In [14]:
# Randomly oversample the training set only (the test set is left untouched),
# then show the class counts after resampling.
X_resampled, y_resampled = (
    RandomOverSampler(random_state=1)
    .fit_resample(X_train, y_train)
)
Counter(y_resampled)

Counter({0: 1119, 1: 1119})

In [15]:
# Fit a logistic regression classifier on the randomly oversampled training data.
model = LogisticRegression(random_state=1, solver='liblinear')
model.fit(X=X_resampled, y=y_resampled)

LogisticRegression(random_state=1, solver='liblinear')

In [16]:
# Predict on the test set and tabulate actual vs. predicted labels.
y_pred = model.predict(X=X_test)
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[338,  37],
       [  2,   8]], dtype=int64)

In [17]:
# Calculate the balanced accuracy score for the current predictions.
balanced_accuracy_score(y_true=y_test, y_pred=y_pred)

0.8506666666666667

In [18]:
# Print imbalanced-learn's per-class classification report for the test set.
print(classification_report_imbalanced(y_true=y_test, y_pred=y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.99      0.90      0.80      0.95      0.85      0.73       375
          1       0.18      0.80      0.90      0.29      0.85      0.71        10

avg / total       0.97      0.90      0.80      0.93      0.85      0.73       385



### SMOTE

In [19]:
# import SMOTE
from imblearn.over_sampling import SMOTE

In [20]:
# Oversample the training set with SMOTE (the test set is left untouched),
# then show the class counts after resampling.
X_resampled, y_resampled = (
    SMOTE(sampling_strategy='auto', random_state=1)
    .fit_resample(X_train, y_train)
)
Counter(y_resampled)

Counter({0: 1119, 1: 1119})

In [21]:
# Fit a fresh logistic regression classifier on the SMOTE-resampled training data.
model = LogisticRegression(random_state=1, solver='liblinear')
model.fit(X=X_resampled, y=y_resampled)

LogisticRegression(random_state=1, solver='liblinear')

In [22]:
# Predict on the test set and calculate the balanced accuracy score.
y_pred = model.predict(X=X_test)
balanced_accuracy_score(y_true=y_test, y_pred=y_pred)

0.9046666666666667

In [23]:
# Tabulate actual vs. predicted test labels for the SMOTE-trained model.
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[341,  34],
       [  1,   9]], dtype=int64)

In [24]:
# Print imbalanced-learn's per-class classification report for the test set.
print(classification_report_imbalanced(y_true=y_test, y_pred=y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       1.00      0.91      0.90      0.95      0.90      0.82       375
          1       0.21      0.90      0.91      0.34      0.90      0.82        10

avg / total       0.98      0.91      0.90      0.94      0.90      0.82       385



### SVM SMOTE

In [25]:
# import SVMSMOTE
from imblearn.over_sampling import SVMSMOTE

In [26]:
# Oversample the training set with SVMSMOTE (the test set is left untouched),
# then show the class counts after resampling.
X_resampled, y_resampled = (
    SVMSMOTE(sampling_strategy='auto', random_state=1)
    .fit_resample(X_train, y_train)
)
Counter(y_resampled)

Counter({0: 1119, 1: 755})

In [27]:
# Fit a fresh logistic regression classifier on the SVMSMOTE-resampled training data.
model = LogisticRegression(random_state=1, solver='liblinear')
model.fit(X=X_resampled, y=y_resampled)

LogisticRegression(random_state=1, solver='liblinear')

In [28]:
# Predict on the test set and calculate the balanced accuracy score.
y_pred = model.predict(X=X_test)
balanced_accuracy_score(y_true=y_test, y_pred=y_pred)

0.696

In [29]:
# Tabulate actual vs. predicted test labels for the SVMSMOTE-trained model.
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[372,   3],
       [  6,   4]], dtype=int64)

In [30]:
# Print imbalanced-learn's per-class classification report for the test set.
print(classification_report_imbalanced(y_true=y_test, y_pred=y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.98      0.99      0.40      0.99      0.63      0.42       375
          1       0.57      0.40      0.99      0.47      0.63      0.37        10

avg / total       0.97      0.98      0.42      0.97      0.63      0.42       385

