## Decision Tree 
## best score 0.86

### 1. Libraries


In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV

### 2. Data

In [2]:
# Read the three dataset splits from the Data/ directory.
train = pd.read_csv('Data/train_with_BMI.csv')
test = pd.read_csv('Data/test_with_BMI.csv')
val = pd.read_csv('Data/val_with_BMI.csv')

# For every split: features are all columns except the label column
# 'NObeyesdad'; the label column itself becomes the target vector.
X_train, y_train = train.drop(columns='NObeyesdad'), train['NObeyesdad']
X_test, y_test = test.drop(columns='NObeyesdad'), test['NObeyesdad']
X_val, y_val = val.drop(columns='NObeyesdad'), val['NObeyesdad']

### 3. Baseline (default parameters)

In [3]:
# Baseline: decision tree with default hyperparameters; only the seed is fixed.
dt = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Score the baseline on the validation split.
y_pred1 = dt.predict(X_val)
class_report1 = classification_report(y_val, y_pred1)
conf_matrix1 = confusion_matrix(y_val, y_pred1)

print("Classification report:\n", class_report1)
print("Confusion matrix:\n", conf_matrix1)

Classification report:
               precision    recall  f1-score   support

           0       0.88      0.83      0.85       525
           1       0.75      0.77      0.76       624
           2       0.76      0.74      0.75       604
           3       0.90      0.92      0.91       714
           4       0.99      1.00      0.99       837
           5       0.62      0.63      0.63       532
           6       0.64      0.65      0.65       523

    accuracy                           0.81      4359
   macro avg       0.79      0.79      0.79      4359
weighted avg       0.81      0.81      0.81      4359

Confusion matrix:
 [[434  79   1   0   0   9   2]
 [ 54 482   3   0   0  71  14]
 [  2   3 446  48   5  36  64]
 [  0   0  38 657   3   1  15]
 [  0   1   2   1 833   0   0]
 [  2  62  35   0   3 336  94]
 [  1  15  61  20   0  87 339]]


Validation accuracy (default parameters): 0.81

### 4. Hyperparameters tuning

In [4]:
# Fresh, unfitted tree used as the template estimator for the search.
dt = DecisionTreeClassifier(random_state=42)

# Search space: 3 * 6 * 5 * 5 = 450 hyperparameter combinations.
# NOTE(review): scikit-learn documents 'entropy' and 'log_loss' as the same
# split criterion, so those candidates are presumably duplicates - confirm.
param_grid = {
    'criterion': ['gini', 'entropy', 'log_loss'],
    'max_depth': [1, 5, 10, 50, 75, 100],
    'min_samples_split': [5, 10, 100, 250, 500],
    'max_leaf_nodes': [5, 10, 20, 50, 100],
}

# Exhaustive search over the grid, scored by accuracy with 3-fold
# cross-validation on the training split.
grid_search = GridSearchCV(
    estimator=dt,
    param_grid=param_grid,
    scoring="accuracy",
    cv=3,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

# Pull the winning configuration and the full results table off the search.
cv_results = grid_search.cv_results_
best_score = grid_search.best_score_
best_params = grid_search.best_params_

print("Best parameters: ", best_params)
print("Best score: ", best_score)
# Mean cross-validation test score read at the best candidate's index.
print("Best model cross-validation results: ", cv_results['mean_test_score'][grid_search.best_index_])

Fitting 3 folds for each of 450 candidates, totalling 1350 fits
Best parameters:  {'criterion': 'gini', 'max_depth': 10, 'max_leaf_nodes': 100, 'min_samples_split': 5}
Best score:  0.8585002323721816
Best model cross-validation results:  0.8585002323721816


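Best parameters:  {'criterion': 'gini', 'max_depth': 10, 'max_leaf_nodes': 100, 'min_samples_split': 5}  
Best score (mean 3-fold cross-validation accuracy):  0.8585002323721816  
Cross-validation results:  0.8585002323721816 (the same quantity as the best score: the mean test score of the best candidate)  
No overfitting (to be confirmed on the validation set in section 5; the two values above are identical by construction, so they cannot show it by themselves).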

### 5. Best model - validation

In [5]:
# Build the final tree from the hyperparameters selected by the grid search
# instead of retyping them: the previously hard-coded max_depth=50 did not
# match the max_depth=10 reported in the search's best parameters.
best = DecisionTreeClassifier(**grid_search.best_params_, random_state=42)
best.fit(X_train, y_train)

# Evaluate on the validation split (the test split is not used in this cell).
y_pred2 = best.predict(X_val)
class_report2 = classification_report(y_val, y_pred2)
print("Classification report:\n", class_report2)
conf_matrix2 = confusion_matrix(y_val, y_pred2)
print("Confusion matrix:\n", conf_matrix2)

Classification report:
               precision    recall  f1-score   support

           0       0.94      0.90      0.92       525
           1       0.82      0.87      0.84       624
           2       0.84      0.80      0.82       604
           3       0.93      0.94      0.94       714
           4       1.00      1.00      1.00       837
           5       0.69      0.70      0.69       532
           6       0.72      0.72      0.72       523

    accuracy                           0.86      4359
   macro avg       0.85      0.85      0.85      4359
weighted avg       0.86      0.86      0.86      4359

Confusion matrix:
 [[475  45   0   0   0   3   2]
 [ 27 540   1   0   0  49   7]
 [  2   1 482  41   3  27  48]
 [  0   0  33 672   1   0   8]
 [  1   0   2   1 833   0   0]
 [  1  64  15   0   0 370  82]
 [  1  11  42   9   0  86 374]]


No overfitting: the validation accuracy (0.86) matches the cross-validated accuracy from the grid search (≈0.86).