Decision Tree on regression datasets 

In [None]:
import numpy as np 
import pandas as pd 
import seaborn as sb 
import matplotlib.pyplot as plt 

from sklearn import tree 
from sklearn.tree import DecisionTreeRegressor, plot_tree 
from sklearn.model_selection import train_test_split 
from sklearn.metrics import mean_squared_error, r2_score 

import nbformat 
from IPython import get_ipython

# from dtreeviz import dtreeviz, model  
# import graphviz 
# from IPython.display import Image 
# import pydotplus 

In [None]:
# %run "../Data_Preprocessing/data_preprocess_dtreg.ipynb" 

# Execute the relevant code cells of the preprocessing notebook inside this
# kernel so that the names they define (notably `mov_reg_cleaned`) become
# available to the cells below.
with open("../Data_Preprocessing/data_preprocess_dtreg.ipynb", "r", encoding="utf-8") as f:
    ntb = nbformat.read(f, as_version = 4) 

ipython = get_ipython() 
if ipython is None:
    # get_ipython() returns None outside an IPython session; fail with a clear
    # message instead of an AttributeError on the first run_cell call.
    raise RuntimeError("This cell must be executed inside an IPython/Jupyter kernel.")

for cell in ntb.cells:
    # "mov_reg" is a substring of "mov_reg_cleaned", so a single membership
    # test selects exactly the same cells as testing for both names.
    if cell.cell_type == "code" and "mov_reg" in cell.source:
        result = ipython.run_cell(cell.source, silent=True) 
        # run_cell does not raise on failure; it reports it on the result object.
        if not result.success:
            failure = result.error_before_exec or result.error_in_exec
            print(f"Preprocessing cell failed ({failure!r}):\n{cell.source}")

try:
    print("Movies Clean Data : ")
    print(mov_reg_cleaned.head())   # type: ignore 

except NameError as e:
    # Best effort: report that the preprocessing notebook did not define the frame.
    print(f"Variable not found: {e}")

In [None]:
# Display the cleaned movie frame (expected to be defined by the preprocessing cells run earlier).
mov_reg_cleaned     # type: ignore 

In [None]:
# Pairwise correlation between the columns of the cleaned frame (pandas default method).
# NOTE(review): assumes every column is numeric - confirm against the preprocessing notebook.
mov_reg_cleaned.corr()      # type: ignore 

Variable split (X,y) : 

In [None]:
# Feature matrix: every column except the regression target 'Collection'.
is_feature_column = mov_reg_cleaned.columns != 'Collection'      # type: ignore 
X = mov_reg_cleaned.loc[:, is_feature_column]      # type: ignore 
X 

In [None]:
# Sanity check: show the Python type of the feature matrix.
print(type(X)) 

In [None]:
# Dimensions of the feature matrix.
X.shape 

In [None]:
# Regression target: the 'Collection' column of the cleaned frame.
y = mov_reg_cleaned.loc[:, 'Collection']      # type: ignore 
y 

In [None]:
# Sanity check: show the Python type of the target.
print(type(y)) 

In [None]:
# Dimensions of the target.
y.shape 

Test - Train Split : 

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# One-line summary of the four partition shapes.
print(f" X_train size : {X_train.shape} \n X_test size : {X_test.shape} \n y_train size : {y_train.shape} \n y_test size : {y_test.shape}") 

# Print each partition under its own label, in the same order as the summary.
for label, part in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"{label} :\n{part}")

In [None]:
# Rebuild the training features as a DataFrame carrying X's column labels.
feature_labels = X.columns.tolist()
X_train = pd.DataFrame(X_train, columns=feature_labels)
X_train

In [None]:
# Wrap the training target in a pandas Series and display it.
# NOTE(review): y_train presumably is already a Series here (y is a single DataFrame column) - confirm.
y_train = pd.Series(y_train) 
y_train 

Training regression tree - 

In [None]:
# Fit a depth-limited regression tree (viable results up to depth level 5).
# random_state is fixed so that ties between equally good splits are broken the
# same way on every run; without it the fitted tree can differ between runs.
regtree = tree.DecisionTreeRegressor(max_depth = 5, random_state = 0) 
regtree.fit(X_train, y_train) 

# Predictions on both partitions, used for the train/test metrics below.
y_train_prd = regtree.predict(X_train) 
y_test_prd = regtree.predict(X_test)  


In [None]:
# Predictions of regtree on the training features.
y_train_prd 

In [None]:
# Predictions of regtree on the held-out test features.
y_test_prd 

Model performance 

In [None]:
# Mean squared error on the training and held-out partitions (lower is better).
mse_trn = mean_squared_error(y_true=y_train, y_pred=y_train_prd)
mse_tst = mean_squared_error(y_true=y_test, y_pred=y_test_prd)

print(f"Train mean Squared Error : {mse_trn}")
print(f"Test mean Squared Error : {mse_tst}")

In [None]:
# Coefficient of determination on the training and held-out partitions.
r2_trn = r2_score(y_true=y_train, y_pred=y_train_prd)
r2_tst = r2_score(y_true=y_test, y_pred=y_test_prd)

print(f"Train r^2 Score : {r2_trn}")
print(f"Test r^2 Score : {r2_tst}")

Plotting decision Tree - 

In [None]:
# Draw the fitted tree. feature_names is passed as a plain list of column
# labels: some scikit-learn releases reject a pandas Index for this parameter.
plt.figure(figsize=(20, 10)) 
plot_tree(regtree, filled=True, feature_names=list(X_train.columns), rounded=True) 
plt.show() 


In [None]:
# pip install graphviz pydotplus dtreeviz 
#
# Disabled dtreeviz rendering, kept for reference. It is written as real
# comments rather than a bare triple-quoted string so the cell does not echo
# the text as its output.
#
# # viz = dtreeviz(regtree, X_train, y_train, target_name="Collection", feature_names=list(X_train.columns), title="Regression Decision Tree for movie collection") 
# viz = model(regtree, X_train, y_train, target_name="Collection", feature_names=list(X_train.columns)) 
# viz.view() 

In [None]:
# Disabled graphviz/pydotplus rendering, kept for reference. It is written as
# real comments rather than a bare triple-quoted string so the cell does not
# echo the text as its output.
#
# dot_data = tree.export_graphviz(regtree, out_file=None) 
# graph = pydotplus.graph_from_dot_data(dot_data) 
# Image(graph.create_png()) 


Controlling Tree Growth - 

In [None]:
# Controlling levels in tree 

# Cap the tree at 4 levels. random_state is fixed so the fitted tree is
# reproducible between runs.
regtree1 = DecisionTreeRegressor(max_depth = 4, random_state = 0) 
regtree1.fit(X_train, y_train) 

# feature_names is passed as a plain list of column labels: some scikit-learn
# releases reject a pandas Index for this parameter.
plt.figure(figsize=(20, 10)) 
plot_tree(regtree1, filled=True, feature_names=list(X_train.columns), rounded=True) 
plt.show() 

In [None]:
# Minimum observations at internal node 

# A node is only split if it holds at least 50 training samples.
# random_state is fixed so the fitted tree is reproducible between runs.
# regtree2 = DecisionTreeRegressor(min_samples_split = 40) 
regtree2 = DecisionTreeRegressor(min_samples_split = 50, random_state = 0) 
regtree2.fit(X_train, y_train) 

# feature_names is passed as a plain list of column labels: some scikit-learn
# releases reject a pandas Index for this parameter.
plt.figure(figsize=(20, 10)) 
plot_tree(regtree2, filled=True, feature_names=list(X_train.columns), rounded=True) 
plt.show() 

In [None]:
# Minimum observations at leaf node 

# Every leaf must keep at least 25 training samples, with depth capped at 4.
# random_state is fixed so the fitted tree is reproducible between runs.
# regtree3 = DecisionTreeRegressor(min_samples_leaf = 30)  
regtree3 = DecisionTreeRegressor(min_samples_leaf = 25, max_depth = 4, random_state = 0)   
regtree3.fit(X_train, y_train) 

# feature_names is passed as a plain list of column labels: some scikit-learn
# releases reject a pandas Index for this parameter.
plt.figure(figsize=(20, 10)) 
plot_tree(regtree3, filled=True, feature_names=list(X_train.columns), rounded=True) 
plt.show() 