# Data loading

In [None]:
import os
import pandas as pd
import numpy as np

In [None]:
!pip install ucimlrepo
from ucimlrepo import fetch_ucirepo


# fetch dataset
estimation_of_obesity_levels_based_on_eating_habits_and_physical_condition = fetch_ucirepo(id=544)

predict_students_dropout_and_academic_success = fetch_ucirepo(id=697) 
  

X = pd.DataFrame(estimation_of_obesity_levels_based_on_eating_habits_and_physical_condition.data.features)
y = pd.DataFrame(estimation_of_obesity_levels_based_on_eating_habits_and_physical_condition.data.targets)


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [None]:
# Preview the first five rows of the feature matrix.
X.head(n=5)

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS
0,Female,21.0,1.62,64.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,0.0,1.0,no,Public_Transportation
1,Female,21.0,1.52,56.0,yes,no,3.0,3.0,Sometimes,yes,3.0,yes,3.0,0.0,Sometimes,Public_Transportation
2,Male,23.0,1.8,77.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,2.0,1.0,Frequently,Public_Transportation
3,Male,27.0,1.8,87.0,no,no,3.0,3.0,Sometimes,no,2.0,no,2.0,0.0,Frequently,Walking
4,Male,22.0,1.78,89.8,no,no,2.0,1.0,Sometimes,no,2.0,no,0.0,0.0,Sometimes,Public_Transportation


In [None]:
# Preview the first five rows of the target frame.
y.head(n=5)

Unnamed: 0,NObeyesdad
0,Normal_Weight
1,Normal_Weight
2,Normal_Weight
3,Overweight_Level_I
4,Overweight_Level_II


In [None]:
import sklearn as sk

In [None]:
from sklearn.model_selection import train_test_split

# Columns holding a single distinct value carry no information for a
# classifier, so they are removed before splitting.
constant_columns = X.columns[X.nunique().eq(1)]
cols_to_drop = list(constant_columns)

X = X.drop(columns=cols_to_drop)

# 80/20 train/test split; the fixed seed makes the partition reproducible.
# NOTE(review): the split is not stratified on `y` - confirm that is intended
# for this multi-class target.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Display the shapes of the four partitions as a sanity check.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((1688, 16), (423, 16), (1688, 1), (423, 1))

In [None]:
# Flatten the single-column target frames into 1-D label arrays for the
# scikit-learn estimators used below.
# `np.ravel` accepts both a DataFrame and an ndarray, so this cell can be
# re-run safely; `.values.ravel()` would raise AttributeError on the second
# run because an ndarray has no `.values` attribute.
y_train = np.ravel(y_train)
y_test = np.ravel(y_test)

In [None]:
import time

from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, classification_report,confusion_matrix
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.svm import OneClassSVM
from sklearn.svm import SVC

# Training without preprocessing (one-hot encoding only)

In [None]:
# Baseline without feature scaling. The categorical (object) columns still have
# to be one-hot encoded, because the estimators below need numeric input.
categorical_columns = X_train.select_dtypes(include=['object']).columns

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))]), categorical_columns)
    ],
    # Forward every column not listed above (the numeric features) unchanged.
    # ColumnTransformer's default is remainder='drop', which would silently
    # discard all numeric features and train on the one-hot columns only.
    remainder='passthrough',
    # Always return a dense array; GaussianNB is expected to reject sparse input.
    sparse_threshold=0)

# Models: the same encoding step followed by each classifier under comparison.
models = {'SVM': Pipeline(steps=[('preprocessor', preprocessor),
                                 ('model', SVC(random_state=42))]),
          'Random Forest': Pipeline(steps=[('preprocessor', preprocessor),
                                           ('model', RandomForestClassifier(random_state=42))]),
          'Naive Bayes': Pipeline(steps=[('preprocessor', preprocessor),
                                         ('model', GaussianNB())])}


results_with_cv_without_preprocessing = {}  # "<model> CV" -> mean fold accuracy
results_holdout_without_preprocessing = {}  # "<model> Holdout" -> test-set accuracy
metrics_scores = {}  # "<model> <CV|Holdout> <metric>" -> value

# Cross-validation (5 folds, training data only).
for name, model in models.items():
    # Mean of the per-fold accuracies.
    cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
    results_with_cv_without_preprocessing[name + ' CV'] = cv_scores.mean()
    # Out-of-fold predictions, pooled over all folds, for the macro-averaged metrics.
    y_pred_cv = cross_val_predict(model, X_train, y_train, cv=5)
    metrics_scores[name + ' CV Accuracy'] = accuracy_score(y_train, y_pred_cv)
    metrics_scores[name + ' CV Recall'] = recall_score(y_train, y_pred_cv, average='macro')
    metrics_scores[name + ' CV Precision'] = precision_score(y_train, y_pred_cv, average='macro')
    metrics_scores[name + ' CV F1'] = f1_score(y_train, y_pred_cv, average='macro')

# Holdout: fit on the full training split, evaluate on the untouched test split.
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # Reuse the predictions instead of predicting a second time via model.score().
    holdout_score = accuracy_score(y_test, y_pred)
    results_holdout_without_preprocessing[name + ' Holdout'] = holdout_score
    metrics_scores[name + ' Holdout Accuracy'] = holdout_score
    metrics_scores[name + ' Holdout Recall'] = recall_score(y_test, y_pred, average='macro')
    metrics_scores[name + ' Holdout Precision'] = precision_score(y_test, y_pred, average='macro')
    metrics_scores[name + ' Holdout F1'] = f1_score(y_test, y_pred, average='macro')

print("Results with cross validation and without preprocessig", results_with_cv_without_preprocessing)
print("Results with holdout and without preprocessig", results_holdout_without_preprocessing)
print("Metrics scores", metrics_scores)

Results with cross validation and without preprocessig {'SVM CV': 0.5930170491457869, 'Random Forest CV': 0.5912366337155198, 'Naive Bayes CV': 0.47097080048461015}
Results with holdout and without preprocessig {'SVM Holdout': 0.6193853427895981, 'Random Forest Holdout': 0.6099290780141844, 'Naive Bayes Holdout': 0.46099290780141844}
Metrics scores {'SVM CV Accuracy': 0.5930094786729858, 'SVM CV Recall': 0.5905699672976091, 'SVM CV Precision': 0.6180464082318249, 'SVM CV F1': 0.5735138714740087, 'Random Forest CV Accuracy': 0.5912322274881516, 'Random Forest CV Recall': 0.5879856600704448, 'Random Forest CV Precision': 0.6149042454627845, 'Random Forest CV F1': 0.5735864694400924, 'Naive Bayes CV Accuracy': 0.4709715639810427, 'Naive Bayes CV Recall': 0.4678757839489129, 'Naive Bayes CV Precision': 0.501195014112384, 'Naive Bayes CV F1': 0.40250292941352817, 'SVM Holdout Accuracy': 0.6193853427895981, 'SVM Holdout Recall': 0.6144107849351771, 'SVM Holdout Precision': 0.6370342833258434

# Training with preprocessing

In [None]:
# Column groups: numeric features are standardised, text features are one-hot encoded.
numerical_columns = X_train.select_dtypes(include=['int64', 'float64']).columns
categorical_columns = X_train.select_dtypes(include=['object']).columns

# Preprocessing for numeric features.
numerical_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())  # Standardize features
])

# Preprocessing for categorical features.
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))  # One-hot encode categorical features
])

# Combine both branches. Columns in neither group are dropped (default remainder).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_columns),
        ('cat', categorical_transformer, categorical_columns)
    ],
    # Always return a dense array, matching the baseline section; GaussianNB is
    # expected to reject sparse input, and the default threshold only yields
    # dense output when the combined matrix happens to be dense enough.
    sparse_threshold=0
)

# Models: shared preprocessing followed by each classifier under comparison.
models = {'SVM':  Pipeline(steps=[('preprocessor', preprocessor),
                            ('model', SVC(random_state=42))]),
         'Random Forest': Pipeline(steps=[('preprocessor', preprocessor),
                           ('model', RandomForestClassifier(random_state=42))]),
         'Naive Bayes': Pipeline(steps=[('preprocessor', preprocessor),
                           ('model', GaussianNB())])}

results_with_cv_with_preprocessing = {}  # "<model> CV Accuracy|F1" -> mean over folds
results_holdout_with_preprocessing = {}  # "<model> Holdout Accuracy" -> test-set accuracy
f1_scores_with_preprocessing = {}        # "<model> Holdout F1" -> weighted F1 on the test set

training_times = {}  # "<model> CV|Holdout" -> elapsed wall-clock seconds

# Cross-validation (5 folds, training data only).
for name, model in models.items():
    start_time = time.perf_counter()
    # One pass computes both metrics on the same folds. Two separate
    # cross_val_score calls would fit every fold twice and double the time.
    cv_out = cross_validate(model, X_train, y_train, cv=5,
                            scoring=['accuracy', 'f1_weighted'])
    training_times[name + ' CV'] = time.perf_counter() - start_time
    results_with_cv_with_preprocessing[name + ' CV Accuracy'] = cv_out['test_accuracy'].mean()
    results_with_cv_with_preprocessing[name + ' CV F1'] = cv_out['test_f1_weighted'].mean()

# Holdout: fit on the full training split, evaluate on the untouched test split.
for name, model in models.items():
    start_time = time.perf_counter()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # The timed span covers fitting plus prediction on the test split.
    training_times[name + ' Holdout'] = time.perf_counter() - start_time
    # Reuse the predictions instead of predicting a second time via model.score().
    holdout_score = accuracy_score(y_test, y_pred)
    f1_score_holdout = f1_score(y_test, y_pred, average='weighted')
    results_holdout_with_preprocessing[name + ' Holdout Accuracy'] = holdout_score
    f1_scores_with_preprocessing[name + ' Holdout F1'] = f1_score_holdout

print("Results with cross validation and with preprocessig",results_with_cv_with_preprocessing)
print("Results with holdout and with preprocessig", results_holdout_with_preprocessing)
print("Training times", training_times)
print("F1 holdout Scores", f1_scores_with_preprocessing)

Results with cross validation and with preprocessig {'SVM CV Accuracy': 0.9093550822608115, 'SVM CV F1': 0.9098345575097013, 'Random Forest CV Accuracy': 0.9354274577283024, 'Random Forest CV F1': 0.9364959207988436, 'Naive Bayes CV Accuracy': 0.5177409442873949, 'Naive Bayes CV F1': 0.46218721569494486}
Results with holdout and with preprocessig {'SVM Holdout Accuracy': 0.9314420803782506, 'Random Forest Holdout Accuracy': 0.9314420803782506, 'Naive Bayes Holdout Accuracy': 0.5153664302600472}
Training times {'SVM CV': 2.4398317337036133, 'Random Forest CV': 5.364426612854004, 'Naive Bayes CV': 0.4008510112762451, 'SVM Holdout': 0.30995965003967285, 'Random Forest Holdout': 0.5888948440551758, 'Naive Bayes Holdout': 0.06816935539245605}
F1 Scores {'SVM Holdout F1': 0.9314659309250005, 'Random Forest Holdout F1': 0.9319148320996857, 'Naive Bayes Holdout F1': 0.46246109974533706}


# Using grid search to improve the models

In [None]:
# Hyperparameter tuning.
# Keys match the names in `models`; the `model__` prefix addresses the final
# pipeline step, which is named 'model'.
param_grids = {
    "Random Forest": {'model__n_estimators': [100, 300], 'model__max_depth': [None, 10]},
    "SVM": {'model__C': [ 1, 10], 'model__gamma': ['scale', 'auto']},
    'Naive Bayes': {'model__var_smoothing': np.logspace(0,-9, num=100)}
}
best_params = {}          # model name -> best parameter combination
results = {}              # model name -> holdout accuracy of the tuned model
performance_metrics = {}  # model name -> dict of holdout and CV metrics
cv_mean_accuracy = {}     # model name -> mean CV accuracy of the best combination
cv_mean_f1 = {}           # model name -> mean CV weighted F1 of the best combination

for name, model in models.items():
    # Score every candidate on both metrics in a single search. Selection and
    # the final refit are still driven by accuracy.
    grid_search = GridSearchCV(
        model,
        param_grids[name],
        cv=5,
        scoring=['accuracy', 'f1_weighted'],
        refit='accuracy',
    )
    grid_search.fit(X_train, y_train)
    best_model = grid_search.best_estimator_
    best_params[name] = grid_search.best_params_

    # Read the CV means of the winning combination from the search results
    # rather than re-running cross-validation on the same folds.
    # Note: these scores were used to pick the parameters, so they are an
    # optimistic estimate; the holdout metrics below are the unbiased ones.
    best_idx = grid_search.best_index_
    cv_mean_accuracy[name] = grid_search.cv_results_['mean_test_accuracy'][best_idx]
    cv_mean_f1[name] = grid_search.cv_results_['mean_test_f1_weighted'][best_idx]

    # Holdout evaluation of the refit best estimator.
    predictions = best_model.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    f1 = f1_score(y_test, predictions, average='weighted')
    recall = recall_score(y_test, predictions, average='weighted')
    precision = precision_score(y_test, predictions, average='weighted')
    results[name] = accuracy
    performance_metrics[name] = {
        'F1-Score': f1,
        'Recall': recall,
        'Precision': precision,
        'Accuracy': accuracy,
        'CV Mean Accuracy': cv_mean_accuracy[name],
        'CV Mean F1': cv_mean_f1[name]
    }

for name, metrics in performance_metrics.items():
    print(f"{name} Performance Metrics:")
    for metric_name, metric_value in metrics.items():
        print(f"{metric_name}: {metric_value}")

print("Best Hyperparameters:")
for name, params in best_params.items():
    print(f"{name}: {params}")

SVM Performance Metrics:
F1-Score: 0.9478358030786576
Recall: 0.9479905437352246
Precision: 0.9485349283526286
Accuracy: 0.9479905437352246
CV Mean Accuracy: 0.9526030235457308
CV Mean F1: 0.9525693570355841
Random Forest Performance Metrics:
F1-Score: 0.9390583974427303
Recall: 0.9385342789598109
Precision: 0.9402226846494868
Accuracy: 0.9385342789598109
CV Mean Accuracy: 0.9395764928976524
CV Mean F1: 0.9405140508522543
Naive Bayes Performance Metrics:
F1-Score: 0.5719245713034354
Recall: 0.5981087470449172
Precision: 0.6147558835919286
Accuracy: 0.5981087470449172
CV Mean Accuracy: 0.5847084438045406
CV Mean F1: 0.5590906077125369
Best Hyperparameters:
SVM: {'model__C': 10, 'model__gamma': 'auto'}
Random Forest: {'model__max_depth': None, 'model__n_estimators': 300}
Naive Bayes: {'model__var_smoothing': 0.08111308307896872}


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=ede9c305-40ce-4946-a04c-086638645822' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>