## Decision Tree Model

In [42]:
import warnings
from pickle import dump

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier, plot_tree

In [43]:
# Read the train/test splits stored under data/clean.
train_data = pd.read_csv("data/clean/clean_train.csv")
test_data = pd.read_csv("data/clean/clean_test.csv")

# Split each frame into features (everything except "Outcome") and the
# "Outcome" label column.
# NOTE(review): the preview below shows an "Unnamed: 0" header — confirm whether
# that is just the rendered index or a saved index column being used as a feature.
x_train, y_train = train_data.drop(columns=["Outcome"]), train_data["Outcome"]
x_test, y_test = test_data.drop(columns=["Outcome"]), test_data["Outcome"]

# Preview the first ten rows of the training features.
x_train.head(10)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,1,130,70,13,105,25.9,0.472,22
1,2,175,88,0,0,22.9,0.326,22
2,0,161,50,0,0,21.9,0.254,65
3,8,107,80,0,0,24.6,0.856,34
4,7,81,78,40,48,46.7,0.261,42
5,8,120,0,0,0,30.0,0.183,38
6,1,71,62,0,0,21.8,0.416,26
7,7,102,74,40,105,37.2,0.204,45
8,2,100,66,20,90,32.9,0.867,28
9,0,167,0,0,0,32.3,0.839,30


In [44]:
# Baseline: a decision tree with library-default hyperparameters; the fixed
# random_state keeps the fitted tree reproducible between runs.
model = DecisionTreeClassifier(random_state=42)
model.fit(X=x_train, y=y_train)

In [45]:
# Make predictions on the test set
y_pred = model.predict(x_test)

# Evaluate the classifier
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Detailed classification report
report = classification_report(y_test, y_pred)
print(report)

# Save the baseline model. The context manager guarantees the file handle is
# flushed and closed even if pickling fails (a bare open() passed to dump()
# is never closed explicitly).
with open("models/tree_classifier_default_42.sav", "wb") as model_file:
    dump(model, model_file)

Accuracy: 0.61
              precision    recall  f1-score   support

           0       0.66      0.71      0.68        91
           1       0.52      0.45      0.48        62

    accuracy                           0.61       153
   macro avg       0.59      0.58      0.58       153
weighted avg       0.60      0.61      0.60       153



## Optimization

In [46]:
# Hyperparameter grid explored by the exhaustive search below.
hyperparams = {
    "criterion": ["gini", "entropy"],
    "max_depth": [None, 5, 10, 20],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4]
}

# 5-fold cross-validated grid search, ranking candidates by accuracy.
grid = GridSearchCV(model, hyperparams, scoring = "accuracy", cv = 5)

# Silence warnings only while the search runs. The context manager restores the
# previous warning filters on exit, instead of permanently replacing
# warnings.warn for the rest of the kernel session.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    grid.fit(x_train, y_train)

print(f"Best hyperparameters: {grid.best_params_}")

Best hyperparameters: {'criterion': 'gini', 'max_depth': 5, 'min_samples_leaf': 2, 'min_samples_split': 5}


In [47]:
# Rebuild the tree with the winning grid-search settings. best_params_ holds
# exactly the keys of the searched grid (criterion, max_depth,
# min_samples_leaf, min_samples_split), so it can be unpacked directly.
model = DecisionTreeClassifier(**grid.best_params_, random_state=42)
model.fit(X=x_train, y=y_train)

In [48]:
# Predict test-set labels with the tuned tree.
y_pred = model.predict(X=x_test)

# Overall accuracy, printed with two decimals.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Per-class precision / recall / F1 table.
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

Accuracy: 0.69
              precision    recall  f1-score   support

           0       0.70      0.81      0.76        91
           1       0.65      0.50      0.56        62

    accuracy                           0.69       153
   macro avg       0.68      0.66      0.66       153
weighted avg       0.68      0.69      0.68       153



In [49]:
# Encode the selected hyperparameters into a filename tag.
# The tag starts with "_", so the saved filename contains a double underscore
# after "tree_classifier"; this is kept unchanged so the path stays the same.
best_hyperparams_string = (
    f"_criterion-{grid.best_params_['criterion']}"
    f"_max_depth-{grid.best_params_['max_depth']}"
    f"_min_samples_leaf-{grid.best_params_['min_samples_leaf']}"
    f"_min_samples_split-{grid.best_params_['min_samples_split']}"
)

# Save the tuned model. The context manager closes the file handle
# deterministically (a bare open() passed to dump() is never closed explicitly).
with open(f"models/tree_classifier_{best_hyperparams_string}_42.sav", "wb") as model_file:
    dump(model, model_file)

print(best_hyperparams_string)

_criterion-gini_max_depth-5_min_samples_leaf-2_min_samples_split-5
