In [1]:
# imports
import pandas as pd
import sklearn as skl
import psycopg2
import config as c

# Display the value of every top-level expression in a cell, not only the last
# one. Later cells rely on this: e.g. a data_df.head() that is followed by a
# print() call in the same cell still renders its table.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
# Open the PostgreSQL connection; every setting is read from the local config module
db_settings = {
    "host": c.host,
    "port": c.port,
    "user": c.user,
    "password": c.password,
    "database": c.database,
}
connection = psycopg2.connect(**db_settings)

# Cursor bound to the connection opened above
cursor = connection.cursor()

In [3]:
# Query text: select every column of the final_dataset table
data_sql = """
SELECT *
FROM final_dataset;
"""

# Execute the query over the open connection and collect the rows in a DataFrame
data_df = pd.read_sql(sql=data_sql, con=connection)

# Preview the first five rows
data_df.head(n=5)

Unnamed: 0,GUID,Name,HallOfFameStatus,HallofFameClass,YearDrafted,TO_YEAR,Years_Played,HOF_Elgibility_Year,Pick,Team,...,BLK,TOV,EFF,idPlayer,numberRound,BIRTHDATE,POSITION,AGE_ROOKIE_SEASON,Draft_Decade,numBallotsBeforeInduct
0,MITCHMCGARY2014,Mitch McGary,Not Inducted,,2014,2015,2,2020,21,OKC,...,0.5,1.0,9.0,203956.0,1,1992-06-06,Forward,22.36,2010s,
1,PJHAIRSTON2014,PJ Hairston,Not Inducted,,2014,2015,2,2020,26,MIA,...,0.3,0.5,4.3,203798.0,1,1992-12-24,Forward,21.81,2010s,
2,KJMCDANIELS2014,KJ McDaniels,Not Inducted,,2014,2016,3,2021,32,PHI,...,1.1,1.7,7.6,203909.0,2,1993-02-09,Guard,21.68,2010s,
3,CLEANTHONYEARLY2014,Cleanthony Early,Not Inducted,,2014,2015,2,2020,34,NYK,...,0.3,1.0,4.8,203921.0,2,1991-04-17,Forward,23.5,2010s,
4,CORYJEFFERSON2014,Cory Jefferson,Not Inducted,,2014,2015,2,2020,60,SAS,...,0.4,0.4,4.8,203928.0,2,1990-12-26,Forward,23.8,2010s,


### Preprocessing 

In [4]:
# One-hot encode the HallOfFameStatus column; the new indicator columns get the
# "HOF_" prefix. dtype=int pins the indicators to 0/1 -- pandas >= 2.0 would
# otherwise emit booleans, changing the class labels seen by later cells.
data_df = pd.get_dummies(data_df, prefix='HOF', columns=['HallOfFameStatus'], dtype=int)

# Columns removed before modelling. 'HOF_Not Inducted' is the redundant
# indicator produced by get_dummies; it is dropped by name because drop_first
# would discard the first level in sort order rather than this one.
columns_to_drop = [
    'HOF_Not Inducted',
    'GUID',
    'Name',
    'HallofFameClass',
    'YearDrafted',
    'TO_YEAR',
    'Years_Played',
    'HOF_Elgibility_Year',
    'Team',
    'College',
    'GP',
    'MIN',
    'idPlayer',
    'BIRTHDATE',
    'POSITION',
    'AGE_ROOKIE_SEASON',
    'Draft_Decade',
    'numBallotsBeforeInduct',
]

# Single reassigning drop instead of two inplace=True mutations
data_df = data_df.drop(columns=columns_to_drop)

# Preview the remaining columns and report the resulting (rows, columns) shape
data_df.head()
print(data_df.shape)

Unnamed: 0,Pick,PTS,FGM,FGA,FG%,3P_Made,3PA,3P%,FTM,FTA,...,OREB,DREB,REB,AST,STL,BLK,TOV,EFF,numberRound,HOF_Hall of Fame Member
0,21,6.3,2.8,5.2,53.3,0.0,0.1,0.0,0.8,1.3,...,1.7,3.5,5.2,0.4,0.5,0.5,1.0,9.0,1,0
1,26,5.6,1.9,6.0,32.3,1.1,3.6,30.1,0.7,0.8,...,0.5,1.6,2.0,0.5,0.5,0.3,0.5,4.3,1,0
2,32,7.9,2.8,7.0,39.6,0.7,2.4,28.7,1.6,2.1,...,1.0,2.2,3.2,1.2,0.7,1.1,1.7,7.6,2,0
3,34,5.4,1.9,5.5,35.5,0.6,2.2,26.2,0.9,1.2,...,0.6,1.9,2.5,0.9,0.6,0.3,1.0,4.8,2,0
4,60,3.7,1.5,3.3,44.9,0.0,0.3,13.3,0.6,1.1,...,0.9,2.0,2.9,0.3,0.2,0.4,0.4,4.8,2,0


(1217, 21)


### Decision Tree Not Scaled

In [5]:
# Name of the binary target column produced during preprocessing
target_column = 'HOF_Hall of Fame Member'

# Target vector: the Hall of Fame indicator
y = data_df[target_column]

# Feature matrix: every remaining column
X = data_df.drop(columns=[target_column])

In [6]:
# ML imports
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import tree
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [7]:
# Plain (non-resampled) split with default proportions and a fixed seed
split_parts = train_test_split(X, y, random_state=2)
X_train, X_test, y_train, y_test = split_parts

# Class counts in the training target
Counter(y_train)

Counter({0: 880, 1: 32})

In [8]:
# Build and fit the decision tree classifier on the training split.
# random_state is fixed (same seed as the train/test split) because the tree
# permutes candidate features at each split; without a seed the fitted tree,
# and every metric reported below, can change from run to run.
# The fit result is assigned rather than left as a bare expression so the cell
# does not echo the estimator repr.
model = tree.DecisionTreeClassifier(random_state=2).fit(X_train, y_train)

In [9]:
# Predicted labels for the held-out split
predictions = model.predict(X_test)

# Accuracy on the data the tree was fitted on
train_acc_score = model.score(X_train, y_train)

# Accuracy of the held-out predictions
test_acc_score = accuracy_score(y_test, predictions)

In [10]:
# Raw confusion matrix: rows are true classes, columns are predicted classes
cm = confusion_matrix(y_test, predictions)

# Wrap the matrix in a labelled DataFrame for display
actual_labels = ["Not HOF Member", "HOF Member"]
predicted_labels = ["Predicted Not HOF Member", "Predicted HOF Member"]
cm_df = pd.DataFrame(cm, index=actual_labels, columns=predicted_labels)

In [11]:
# Report for the unscaled decision tree: title, confusion matrix, accuracies, per-class metrics
print('DECISION TREE NOT SCALED RESULTS\n')

# Labelled confusion matrix (rich display)
print("CONFUSION MATRIX")
display(cm_df)

# Train vs. test accuracy
print("ACCURACY SCORES\n")
print(f'Training Accuracy: {train_acc_score}\n')
print(f'Testing Accuracy: {test_acc_score}\n')

# Precision / recall / f1 per class on the held-out split
print("CLASSIFICATION REPORT\n")
report_text = classification_report(y_test, predictions)
print(report_text)

DECISION TREE NOT SCALED RESULTS

CONFUSION MATRIX


Unnamed: 0,Predicted Not HOF Member,Predicted HOF Member
Not HOF Member,276,19
HOF Member,7,3


ACCURACY SCORES

Training Accuracy: 1.0

Testing Accuracy: 0.9147540983606557

CLASSIFICATION REPORT

              precision    recall  f1-score   support

           0       0.98      0.94      0.96       295
           1       0.14      0.30      0.19        10

    accuracy                           0.91       305
   macro avg       0.56      0.62      0.57       305
weighted avg       0.95      0.91      0.93       305

