## Decision Tree 
## best score 0.87

### 1. Libraries


In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV

### 2. Data

In [4]:
train = pd.read_csv('Data/our_train.csv')
test = pd.read_csv('Data/our_test.csv')
val = pd.read_csv('Data/our_val.csv')
df = pd.read_csv('Data/train.csv')

X_train = train.drop('NObeyesdad', axis=1)
y_train = train['NObeyesdad']
X_test = test.drop('NObeyesdad', axis=1)
y_test = test['NObeyesdad']
X_val = val.drop('NObeyesdad', axis=1)
y_val = val['NObeyesdad']

CAEC_dict = {'no': 0, 'Sometimes': 0.33, 'Frequently': 0.66, 'Always': 1 }
CALC_dict = {'no': 0, 'Sometimes': 0.5, 'Frequently': 1}
X_train['CAEC'] = X_train['CAEC'].map(CAEC_dict)
X_train['CALC'] = X_train['CALC'].map(CALC_dict)
X_test['CAEC'] = X_test['CAEC'].map(CAEC_dict)
X_test['CALC'] = X_test['CALC'].map(CALC_dict)
X_val['CAEC'] = X_val['CAEC'].map(CAEC_dict)
X_val['CALC'] = X_val['CALC'].map(CALC_dict)
cat_cols2 = ['Gender', 'family_history_with_overweight', 'FAVC', 'SMOKE', 'SCC', 'MTRANS']

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
for col in cat_cols2:
    X_train[col] = le.fit_transform(X_train[col])
    X_test[col] = le.fit_transform(X_test[col])
    X_val[col] = le.fit_transform(X_val[col])

y_train = le.fit_transform(y_train)
y_test = le.fit_transform(y_test)
y_val = le.fit_transform(y_val)

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
X_val = scaler.transform(X_val)

### 3. Random parameters

In [5]:
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)
y_pred1 = dt.predict(X_val)
class_report1 = classification_report(y_val, y_pred1)
print("Classification report:\n", class_report1)

conf_matrix1 = confusion_matrix(y_val, y_pred1)
print("Confusion matrix:\n", conf_matrix1)

Classification report:
               precision    recall  f1-score   support

           0       0.90      0.88      0.89       525
           1       0.79      0.80      0.79       624
           2       0.81      0.77      0.79       604
           3       0.95      0.95      0.95       714
           4       0.98      1.00      0.99       837
           5       0.67      0.67      0.67       532
           6       0.68      0.71      0.70       523

    accuracy                           0.84      4359
   macro avg       0.83      0.83      0.83      4359
weighted avg       0.84      0.84      0.84      4359

Confusion matrix:
 [[460  56   2   0   0   5   2]
 [ 45 502   1   0   0  63  13]
 [  1   5 465  29   7  25  72]
 [  0   0  24 681   5   0   4]
 [  1   0   3   0 833   0   0]
 [  2  62  28   0   1 359  80]
 [  2  14  49   5   0  83 370]]


Score: 0.84

### 4. Hyperparameters tuning

In [15]:
dt = DecisionTreeClassifier(random_state=42)

param_grid = dict(criterion=['gini', 'entropy', 'log_loss'],
            max_depth=[1, 5, 10, 50, 75, 100],
            min_samples_split=[5, 10, 100, 250, 500],
            max_leaf_nodes=[5, 10, 20, 50, 100])

grid_search = GridSearchCV(estimator = dt, param_grid = param_grid, scoring="accuracy", n_jobs=-1, cv=3, verbose=2)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best parameters: ", best_params)
print("Best score: ", best_score)

Fitting 3 folds for each of 450 candidates, totalling 1350 fits


450 fits failed out of a total of 1350.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
450 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Gaspar\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Gaspar\anaconda3\lib\site-packages\sklearn\tree\_classes.py", line 937, in fit
    super().fit(
  File "c:\Users\Gaspar\anaconda3\lib\site-packages\sklearn\tree\_classes.py", line 352, in fit
    criterion = CRITERIA_CLF[self.criterion](
KeyError: 'log_loss'

 0.34473504 0.34473504 0.34473504 0.34473504 0.34473504 0.34473504
 0.34473504 0.34473504 0.34473504 0.34473504 0.34473504 0.34473504


Best parameters:  {'criterion': 'gini', 'max_depth': 10, 'max_leaf_nodes': 100, 'min_samples_split': 5}
Best score:  0.8755681211397643


Best parameters:  {'criterion': 'gini', 'max_depth': 10, 'max_leaf_nodes': 100, 'min_samples_split': 5}
Best score:  0.8755681211397643

### 5. Best model

In [16]:
best = DecisionTreeClassifier(criterion='gini', max_depth=10, min_samples_split=5, max_leaf_nodes=100, random_state=42)
best.fit(X_train, y_train)
y_pred2 = best.predict(X_val)
class_report2 = classification_report(y_val, y_pred2)
print("Classification report:\n", class_report2)
conf_matrix2 = confusion_matrix(y_val, y_pred2)
print("Confusion matrix:\n", conf_matrix2)

Classification report:
               precision    recall  f1-score   support

           0       0.93      0.91      0.92       525
           1       0.82      0.83      0.83       624
           2       0.85      0.84      0.85       604
           3       0.96      0.96      0.96       714
           4       0.99      1.00      1.00       837
           5       0.72      0.74      0.73       532
           6       0.76      0.76      0.76       523

    accuracy                           0.87      4359
   macro avg       0.86      0.86      0.86      4359
weighted avg       0.88      0.87      0.88      4359

Confusion matrix:
 [[478  43   0   0   0   3   1]
 [ 33 518   0   0   0  68   5]
 [  2   2 505  23   3  20  49]
 [  0   0  23 688   2   0   1]
 [  1   0   1   1 834   0   0]
 [  0  55  15   0   0 396  66]
 [  0  11  47   4   0  66 395]]
