In [1]:
# imports
import pandas as pd
import sklearn as skl
import psycopg2
import config as c

# import for multiple output per cell
# setting ast_node_interactivity to "all" asks IPython to display the result of
# every expression statement in a cell, not only the final one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
# open the database connection; every connection setting is read from the
# local config module (imported as c) rather than written into the notebook
connection = psycopg2.connect(
    host=c.host,
    port=c.port,
    user=c.user,
    password=c.password,
    database=c.database,
)

# cursor on the same connection
cursor = connection.cursor()

In [3]:
# query selecting every column and row of the comprehensive_dataset table
data_sql = """
SELECT *
FROM comprehensive_dataset;
"""

# run the query over the open connection, keep the result as a DataFrame,
# and preview the first rows
data_df = pd.read_sql(sql=data_sql, con=connection)
data_df.head()

Unnamed: 0,GUID,Name,HallOfFameStatus,HallofFameClass,YearDrafted,TO_YEAR,Years_Played,HOF_Elgibility_Year,Pick,Team,...,OREB,DREB,REB,AST,STL,BLK,TOV,EFF,numberRound,AGE_ROOKIE_SEASON
0,MIKENILES1980,Mike Niles,Not Inducted,,1980,1980,1,1985,,,...,0.6,0.7,1.3,0.3,0.2,0.0,0.6,1.4,,
1,ALAAABDELNABY1990,Alaa Abdelnaby,Not Inducted,,1990,1994,5,1999,25.0,POR,...,0.6,1.4,2.1,0.3,0.1,0.3,0.5,3.5,1.0,22.0
2,MAHMOUDABDULRAUF1990,Mahmoud Abdul-Rauf,Not Inducted,,1990,2000,11,2005,3.0,DEN,...,0.5,1.3,1.8,3.1,0.8,0.1,1.6,9.1,1.0,21.0
3,TARIQABDULWAHAD1997,Tariq Abdul-Wahad,Not Inducted,,1997,2003,7,2008,11.0,SAC,...,0.7,1.2,2.0,0.9,0.6,0.2,1.1,4.6,1.0,22.0
4,SHAREEFABDURRAHIM1996,Shareef Abdur-Rahim,Not Inducted,,1996,2007,12,2012,3.0,VAN,...,2.7,4.2,6.9,2.2,1.0,1.0,2.8,17.0,1.0,19.0


## Preprocessing 

In [4]:
# Preprocessing pipeline, applied as one chain:
#   1. one-hot encode the HallOfFameStatus column with an 'HOF' prefix
#   2. remove the 'HOF_Not Inducted' indicator by name (drop_first is not ideal
#      here), leaving 'HOF_Hall of Fame Member' as the target flag
#   3. drop the object columns and the non-performance stats
#   4. fill NaNs with 0 -- note this also turns a missing Pick / numberRound
#      into 0 for rows where those values are absent
data_df = (
    pd.get_dummies(data_df, prefix='HOF', columns=['HallOfFameStatus'])
    .drop(columns=['HOF_Not Inducted'])
    .drop(columns=[
        # object columns
        'GUID', 'Name', 'Team', 'College',
        # non performance stats
        'HallofFameClass', 'YearDrafted', 'TO_YEAR', 'Years_Played',
        'HOF_Elgibility_Year', 'GP', 'MIN', 'AGE_ROOKIE_SEASON',
    ])
    .fillna(0)
)

# preview the cleaned frame and report its dimensions
data_df.head()
print(data_df.shape)

Unnamed: 0,Pick,PTS,FGM,FGA,FG%,3P_Made,3PA,3P%,FTM,FTA,...,OREB,DREB,REB,AST,STL,BLK,TOV,EFF,numberRound,HOF_Hall of Fame Member
0,0.0,2.6,1.1,3.1,34.8,0.0,0.1,50.0,0.4,0.8,...,0.6,0.7,1.3,0.3,0.2,0.0,0.6,1.4,0.0,0
1,25.0,3.1,1.3,2.7,47.4,0.0,0.0,0.0,0.6,1.0,...,0.6,1.4,2.1,0.3,0.1,0.3,0.5,3.5,1.0,0
2,3.0,14.1,6.2,15.1,41.3,0.4,1.5,24.0,1.3,1.5,...,0.5,1.3,1.8,3.1,0.8,0.1,1.6,9.1,1.0,0
3,11.0,6.4,2.4,6.1,40.3,0.1,0.3,21.1,1.4,2.1,...,0.7,1.2,2.0,0.9,0.6,0.2,1.1,4.6,1.0,0
4,3.0,18.7,6.9,15.2,45.3,0.1,0.3,25.9,4.8,6.5,...,2.7,4.2,6.9,2.2,1.0,1.0,2.8,17.0,1.0,0


(1217, 21)


### Decision Tree Model, Not Scaled

In [5]:
# target: the Hall of Fame membership indicator column
y = data_df['HOF_Hall of Fame Member']

# features: every remaining column once the target is removed
X = data_df.drop(columns=['HOF_Hall of Fame Member'])

In [6]:
# ML imports
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import tree
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [7]:
# plain (non-stratified) train/test split using scikit-learn's default
# proportions; the seed is fixed so the same rows land in each set on every run
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=2,
)

# class counts among the training labels
Counter(y_train)

Counter({0: 882, 1: 30})

In [8]:
# create the decision tree classifier instance.
# random_state is fixed (same seed as the train/test split) because scikit-learn
# randomly permutes the candidate features at each split; without a seed the
# fitted tree, and every metric derived from it, can change between runs.
model = tree.DecisionTreeClassifier(random_state=2)

# fit the model on the training data.
# fit() returns the estimator itself; assigning the result keeps the cell from
# displaying the estimator repr.
model = model.fit(X_train, y_train)

In [9]:
# class predictions for the held-out test rows
predictions = model.predict(X_test)

# calculate accuracy on the training data and on the test data
train_acc_score = accuracy_score(y_train, model.predict(X_train))
test_acc_score = accuracy_score(y_test, predictions)

In [10]:
# confusion matrix of the test-set predictions against the true labels
cm = confusion_matrix(y_true=y_test, y_pred=predictions)

# wrap the matrix in a labelled DataFrame:
# rows are the actual classes, columns are the predicted classes
cm_df = pd.DataFrame(
    data=cm,
    index=["Not HOF Member", "HOF Member"],
    columns=["Predicted Not HOF Member", "Predicted HOF Member"],
)

In [11]:
# report for the unscaled decision tree: heading first
print('DECISION TREE NOT SCALED RESULTS\n')

# labelled confusion matrix, rendered with IPython's rich display
print('CONFUSION MATRIX')
display(cm_df)

# accuracy on the training set and on the test set
print('ACCURACY SCORES\n')
print(f'Training Accuracy: {train_acc_score}\n')
print(f'Testing Accuracy: {test_acc_score}\n')

# per-class precision / recall / f1 on the test set
print('CLASSIFICATION REPORT\n')
print(classification_report(y_test, predictions))

DECISION TREE NOT SCALED RESULTS

CONFUSION MATRIX


Unnamed: 0,Predicted Not HOF Member,Predicted HOF Member
Not HOF Member,283,10
HOF Member,9,3


ACCURACY SCORES

Training Accuracy: 1.0

Testing Accuracy: 0.9377049180327869

CLASSIFICATION REPORT

              precision    recall  f1-score   support

           0       0.97      0.97      0.97       293
           1       0.23      0.25      0.24        12

    accuracy                           0.94       305
   macro avg       0.60      0.61      0.60       305
weighted avg       0.94      0.94      0.94       305

