Decision Tree on classification datasets 

In [None]:
import numpy as np 
import pandas as pd 
import seaborn as sb 
import matplotlib.pyplot as plt 

from sklearn import tree 
from sklearn.tree import DecisionTreeClassifier, plot_tree 
from sklearn.model_selection import train_test_split 
from sklearn.metrics import accuracy_score, confusion_matrix 

import nbformat 
from IPython import get_ipython

# from dtreeviz import dtreeviz, model  
# import graphviz 
# from IPython.display import Image 
# import pydotplus 

In [None]:
# Load the preprocessing notebook and execute, inside this kernel, only the code
# cells that mention the movie-classification frames, so that `mov_cls_cleaned`
# becomes available here.
# (Alternative kept for reference: %run "../Data_Preprocessing/data_preprocess_dtcls.ipynb")

with open("../Data_Preprocessing/data_preprocess_dtcls.ipynb", "r", encoding="utf-8") as f:
    ntb = nbformat.read(f, as_version=4)

ipython = get_ipython()
if ipython is None:
    # get_ipython() returns None outside an IPython session; fail with a clear
    # message instead of an AttributeError on the first run_cell call.
    raise RuntimeError("This cell must be run inside an IPython/Jupyter kernel.")

for cell in ntb.cells:
    if cell.cell_type != "code":
        continue

    print(cell.source)

    # "mov_cls" is a substring of "mov_cls_cleaned", so one test covers both names.
    if "mov_cls" in cell.source:
        result = ipython.run_cell(cell.source, silent=True)
        # ipython.run_cell_async(cell.source, silent=True)

        # run_cell does not raise on failure; report it so a missing variable
        # further down can be traced back to the preprocessing cell that broke.
        if not result.success:
            print(f"Preprocessing cell failed: {result.error_before_exec or result.error_in_exec!r}")

# Best-effort preview: the name only exists if the cells above defined it.
try:
    print("Movies Clean Data : ")
    print(mov_cls_cleaned.head())   # type: ignore

except NameError as e:
    print(f"Variable not found: {e}")

In [None]:
# Display the cleaned frame (expected to be defined by the preprocessing cells run earlier).
mov_cls_cleaned     # type: ignore 

In [None]:
# Pairwise correlation between the columns of the cleaned frame.
mov_cls_cleaned.corr()      # type: ignore 

Variable split (X,y) : 

In [None]:
# Feature matrix: every column except the target 'Start_Tech_Oscar'.
is_feature_col = mov_cls_cleaned.columns != 'Start_Tech_Oscar'      # type: ignore
X = mov_cls_cleaned.loc[:, is_feature_col]      # type: ignore
X

In [None]:
# Confirm the container type of the feature matrix.
print(type(X)) 

In [None]:
# Dimensions of the feature matrix.
X.shape 

In [None]:
# Target vector: the 'Start_Tech_Oscar' column that was excluded from X.
y = mov_cls_cleaned['Start_Tech_Oscar']      # type: ignore 
y 

In [None]:
# Confirm the container type of the target.
print(type(y)) 

In [None]:
# Dimensions of the target.
y.shape 

Test - Train Split : 

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.2)

print(f" X_train size : {X_train.shape} \n X_test size : {X_test.shape} \n y_train size : {y_train.shape} \n y_test size : {y_test.shape}")

# Dump each partition in turn: X_train, X_test, y_train, y_test.
for split_name, split_data in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"{split_name} :\n{split_data}")

In [None]:
# Re-wrap the training features as a DataFrame carrying X's column labels.
feature_labels = list(X.columns)
X_train = pd.DataFrame(X_train, columns=feature_labels)
X_train

In [None]:
# Re-wrap the training target as a pandas Series
# (presumably already a Series coming out of the split — TODO confirm).
y_train = pd.Series(y_train) 
y_train 

Training classification tree - 

In [None]:
# Fit a depth-limited Gini decision tree on the training split.
# Notes from earlier experiments: results were viable up to depth level 5;
# the default settings and criterion='entropy' were also tried at max_depth=5.
# random_state is pinned because scikit-learn permutes the features at every
# split, so ties between equally good splits can otherwise yield a different
# tree (and different scores) on each run.
clstree = DecisionTreeClassifier(criterion='gini', max_depth=5, random_state=0)
clstree.fit(X_train, y_train)

# Predictions on both splits, kept for the performance cells that follow.
y_train_prd = clstree.predict(X_train)
y_test_prd = clstree.predict(X_test)


In [None]:
# Labels predicted by clstree for the training split.
y_train_prd 

In [None]:
# Labels predicted by clstree for the test split.
y_test_prd 

Model performance 

In [None]:
# Accuracy of clstree on each split; a large train/test gap would point to overfitting.
acc_sc_trn = accuracy_score(y_true=y_train, y_pred=y_train_prd)
acc_sc_tst = accuracy_score(y_true=y_test, y_pred=y_test_prd)

print(f"Train Accuracy Score : {acc_sc_trn}")
print(f"Test Accuracy Score : {acc_sc_tst}")

In [None]:
# Confusion matrices for clstree (rows: true labels, columns: predicted labels).
conf_mtx_trn = confusion_matrix(y_true=y_train, y_pred=y_train_prd)
conf_mtx_tst = confusion_matrix(y_true=y_test, y_pred=y_test_prd)

print(f"Train Confusion Matrix : \n{conf_mtx_trn}")
print(f"Test Confusion Matrix : \n{conf_mtx_tst}")

Plotting decision Tree - 

In [None]:
# Draw the fitted tree on a wide canvas so node labels stay legible.
plt.figure(figsize=(20, 10))
# feature_names is passed as a plain list of column labels: that is the
# documented form, and it avoids the type-validation error a pandas Index
# can trigger in some scikit-learn releases.
plot_tree(clstree, filled=True, feature_names=list(X_train.columns), rounded=True)
plt.show()


Controlling Tree Growth - 

In [None]:
# Minimum observations at leaf node
# Every leaf must hold at least 30 training samples (min_samples_leaf=40 with
# default settings was also tried), with depth still capped at 5.
# random_state is pinned so ties between equally good splits resolve the same
# way on every run and the plotted tree is reproducible.
clstree1 = DecisionTreeClassifier(criterion='gini', min_samples_leaf=30, max_depth=5, random_state=0)
clstree1.fit(X_train, y_train)

plt.figure(figsize=(20, 10))
# A plain list of column labels is the documented type for feature_names.
plot_tree(clstree1, filled=True, feature_names=list(X_train.columns), rounded=True)
plt.show()

In [None]:
# Accuracy of the min_samples_leaf-constrained tree on each split.
acc_sc_trn = accuracy_score(y_true=y_train, y_pred=clstree1.predict(X_train))
acc_sc_tst = accuracy_score(y_true=y_test, y_pred=clstree1.predict(X_test))

print(f"Train Accuracy Score : {acc_sc_trn}")
print(f"Test Accuracy Score : {acc_sc_tst}")

In [None]:
# Minimum observations at internal node
# A node is only split when it holds at least 40 training samples. Other
# settings tried: min_samples_split=40 with defaults, and min_samples_split=50
# with both the entropy and gini criteria.
# random_state is pinned so ties between equally good splits resolve the same
# way on every run and the plotted tree is reproducible.
clstree2 = DecisionTreeClassifier(criterion='gini', min_samples_split=40, random_state=0)
clstree2.fit(X_train, y_train)

plt.figure(figsize=(20, 10))
# A plain list of column labels is the documented type for feature_names.
plot_tree(clstree2, filled=True, feature_names=list(X_train.columns), rounded=True)
plt.show()

In [None]:
# Accuracy of the min_samples_split-constrained tree on each split.
acc_sc_trn = accuracy_score(y_true=y_train, y_pred=clstree2.predict(X_train))
acc_sc_tst = accuracy_score(y_true=y_test, y_pred=clstree2.predict(X_test))

print(f"Train Accuracy Score : {acc_sc_trn}")
print(f"Test Accuracy Score : {acc_sc_tst}")