In [2]:
import pandas as pd
import numpy as np

from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, root_mean_squared_error,classification_report
from sklearn.datasets import load_breast_cancer,load_diabetes

### Regression Tree Model

In [19]:
# load the sample dataset once and read features and target from the same Bunch
diabetes = load_diabetes(as_frame=True)
X = diabetes['data']                 # feature DataFrame
y = diabetes['target'].to_numpy()    # target as a plain NumPy array

#split the dataset
def split_data(X: pd.DataFrame, y: np.ndarray, stratify=None,
               test_size: float = 0.2, random_state: int = 23) -> tuple:
    """Split features and targets into train and test partitions.

    Parameters
    ----------
    X : pd.DataFrame
        Feature matrix, one row per sample.
    y : np.ndarray
        Target values aligned with the rows of ``X``.
    stratify : array-like, optional
        Labels forwarded to ``train_test_split`` as ``stratify``.
        ``None`` (the default) requests a split without stratification.
    test_size : float, default 0.2
        Forwarded to ``train_test_split`` as ``test_size``.
    random_state : int, default 23
        Forwarded to ``train_test_split`` as ``random_state`` so that the
        split is reproducible.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    # train_test_split accepts stratify=None directly, so a single call
    # covers both the stratified and the non-stratified case.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=test_size,
        random_state=random_state,
        stratify=stratify,
    )
    return X_train, X_test, y_train, y_test

# regression split: no stratification labels are supplied
X_train, X_test, y_train, y_test = split_data(X, y, stratify=None)

In [10]:
# train the model
# random_state is fixed so the fitted tree is reproducible: the tree builder
# permutes features at each split, so equally good splits could otherwise be
# chosen differently from one run to the next.
model = DecisionTreeRegressor(max_depth=5, criterion='squared_error',
                              random_state=23)
model.fit(X_train, y_train)
train_preds = model.predict(X_train)  # predictions on the data the tree was fitted on
test_preds = model.predict(X_test)    # predictions on the held-out split

# evaluation: RMSE on both splits; the gap between them shows how well the
# tree generalises beyond the training data
train_rmse = root_mean_squared_error(y_train, train_preds)
test_rmse = root_mean_squared_error(y_test, test_preds)

print(f'train_rmse: {train_rmse}')
print(f'test_rmse: {test_rmse}')

train_rmse: 43.81506564007509
test_rmse: 72.65870382003756


### Decision Tree Classifier

In [20]:
# load the dataset once and read features and labels from the same Bunch
breast_cancer = load_breast_cancer(as_frame=True)
X = breast_cancer['data']                 # feature DataFrame
y = breast_cancer['target'].to_numpy()    # class labels as a plain NumPy array

# pass the labels as `stratify` so the split is stratified on the class labels
X_train, X_test, y_train, y_test = split_data(X, y, stratify=y)

In [37]:
# model training: build the entropy-based tree and fit it in one step
# (fit returns the fitted estimator itself)
model = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=6,
    min_samples_split=5,
    random_state=23,
).fit(X_train, y_train)

# predictions for the held-out split and for the training split
test_preds = model.predict(X_test)
train_preds = model.predict(X_train)

# evaluate model performance with the F1 score on each split
train_score = f1_score(y_train, train_preds)
test_score = f1_score(y_test, test_preds)

print(f'Train F1 score: {train_score}')
print(f'Test F1 score: {test_score}')

Train F1 score: 0.9982425307557118
Test F1 score: 0.9370629370629371


In [38]:
# build both per-class reports first, then print each one under its banner
test_report = classification_report(y_test, test_preds)
train_report = classification_report(y_train, train_preds)

print('===========Test Report ====================', test_report, sep='\n')
print('===========Train Report ====================', train_report, sep='\n')

              precision    recall  f1-score   support

           0       0.88      0.90      0.89        42
           1       0.94      0.93      0.94        72

    accuracy                           0.92       114
   macro avg       0.91      0.92      0.92       114
weighted avg       0.92      0.92      0.92       114

              precision    recall  f1-score   support

           0       0.99      1.00      1.00       170
           1       1.00      1.00      1.00       285

    accuracy                           1.00       455
   macro avg       1.00      1.00      1.00       455
weighted avg       1.00      1.00      1.00       455



### Assignment

Write a very good article on the following decision tree concepts:
1. Entropy
2. Gini
3. Information Gain
4. Overfitting in Tree algorithms

Note: the article should be at least 100 words and at most 150 words.