In [42]:
import os
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [15]:
BASE_DIR = os.path.dirname(os.path.dirname(os.getcwd()))
raw_path = os.path.join(BASE_DIR, "data", "raw", "500hits.csv")
df = pd.read_csv(raw_path, encoding="latin1")

In [23]:
df.head()

Unnamed: 0,YRS,G,AB,R,H,2B,3B,HR,RBI,BB,SO,SB,BA,HOF
0,24,3035,11434,2246,4189,724,295,117,726,1249,357,892,0.366,1
1,22,3026,10972,1949,3630,725,177,475,1951,1599,696,78,0.331,1
2,22,2789,10195,1882,3514,792,222,117,724,1381,220,432,0.345,1
3,20,2747,11195,1923,3465,544,66,260,1311,1082,1840,358,0.31,1
4,21,2792,10430,1736,3430,640,252,101,0,963,327,722,0.329,1


In [24]:
X = df.iloc[:,0:13]
y = df.iloc[:,13]

In [60]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 17, test_size = 0.2)

In [61]:
dtc = DecisionTreeClassifier()
dtc.get_params()

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'random_state': None,
 'splitter': 'best'}

In [62]:
dtc.fit(X_train, y_train)

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [63]:
y_pred = dtc.predict(X_test)
y_pred

array([0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0,
       0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0,
       1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0])

In [64]:
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[53  8]
 [10 22]]
              precision    recall  f1-score   support

           0       0.84      0.87      0.85        61
           1       0.73      0.69      0.71        32

    accuracy                           0.81        93
   macro avg       0.79      0.78      0.78        93
weighted avg       0.80      0.81      0.80        93



In [65]:
dtc.feature_importances_  # tells each columns and their importances

array([0.04159581, 0.02742913, 0.03380506, 0.05593063, 0.38574252,
       0.05755522, 0.06136489, 0.        , 0.0935555 , 0.05944383,
       0.03607936, 0.03142058, 0.11607746])

In [69]:
X.columns

Index(['YRS', 'G', 'AB', 'R', 'H', '2B', '3B', 'HR', 'RBI', 'BB', 'SO', 'SB',
       'BA'],
      dtype='object')

In [66]:
features = pd.DataFrame(dtc.feature_importances_, index = X.columns)
features.head(20)

Unnamed: 0,0
YRS,0.041596
G,0.027429
AB,0.033805
R,0.055931
H,0.385743
2B,0.057555
3B,0.061365
HR,0.0
RBI,0.093556
BB,0.059444


In [67]:
dtc2 = DecisionTreeClassifier(criterion = 'entropy', ccp_alpha = 0.04) # ccp_alpha helps when the model is overfitting
dtc2.fit(X_train, y_train)

0,1,2
,criterion,'entropy'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [70]:
y_pred2 = dtc2.predict(X_test)
print(confusion_matrix(y_test, y_pred2))
print(classification_report(y_test, y_pred2))

[[50 11]
 [ 9 23]]
              precision    recall  f1-score   support

           0       0.85      0.82      0.83        61
           1       0.68      0.72      0.70        32

    accuracy                           0.78        93
   macro avg       0.76      0.77      0.77        93
weighted avg       0.79      0.78      0.79        93



In [75]:
features2 = pd.DataFrame(dtc2.feature_importances_, index = X.columns)
features2

Unnamed: 0,0
YRS,0.0
G,0.0
AB,0.0
R,0.0
H,0.837977
2B,0.0
3B,0.0
HR,0.0
RBI,0.0
BB,0.0
