In [5]:
import pandas as pd
import numpy as np

In [26]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier

In [49]:
# Load the preprocessed training and test tables from the working directory.
modified_train_df = pd.read_csv(filepath_or_buffer='modified_train.csv')
modified_test_df = pd.read_csv(filepath_or_buffer='modified_test.csv')


In [50]:
# Preview the first five training rows to sanity-check the loaded columns.
modified_train_df.head(n=5)

Unnamed: 0,id,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad,BMI
0,0,1.0,24.443011,1.699998,81.66995,1.0,1.0,2.0,2.983297,1.0,0.0,2.763573,0.0,0.0,0.976473,1.0,0.0,Overweight_Level_II,28.259565
1,1,0.0,18.0,1.56,57.0,1.0,1.0,2.0,3.0,2.0,0.0,2.0,0.0,1.0,1.0,0.0,1.0,Normal_Weight,23.422091
2,2,0.0,18.0,1.71146,50.165754,1.0,1.0,1.880534,1.411685,1.0,0.0,1.910378,0.0,0.866045,1.673584,0.0,0.0,Insufficient_Weight,17.126706
3,3,0.0,20.952737,1.71073,131.274851,1.0,1.0,3.0,3.0,1.0,0.0,1.674061,0.0,1.467863,0.780199,1.0,0.0,Obesity_Type_III,44.855798
4,4,1.0,31.641081,1.914186,93.798055,1.0,1.0,2.679664,1.971472,1.0,0.0,1.979848,0.0,1.967973,0.931721,1.0,0.0,Overweight_Level_II,25.599151


In [51]:
# Separate the target ('NObeyesdad') from the predictors.
# 'Unnamed: 0' mirrors the row index in the preview above (it matches 'id'), so
# it is a leftover index column rather than a real feature; drop it wherever it
# is present so the models do not train on row order.
X_train = modified_train_df.drop(columns=['NObeyesdad', 'Unnamed: 0'], errors='ignore')
y_train = modified_train_df['NObeyesdad']
# drop() returns a new frame, so later edits to X_test never touch modified_test_df.
X_test = modified_test_df.drop(columns=['Unnamed: 0'], errors='ignore')

In [52]:
# Keep the row identifiers aside; they are not predictive features.
# They are read from the loaded frames (not from X_train / X_test) so this cell
# can be re-run after 'id' has already been removed from the feature frames.
train_id = modified_train_df['id']
test_id = modified_test_df['id']

# Rebind instead of dropping in place: the loaded frames stay untouched and a
# second execution of this cell is a no-op rather than a KeyError.
X_train = X_train.drop(columns=['id'], errors='ignore')
X_test = X_test.drop(columns=['id'], errors='ignore')

In [53]:
# Fit the scaler on the training features only, then apply the same fitted
# transformation to both the training and the test features.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [54]:
# Candidate models, keyed by the display name used in the evaluation printout.
# The estimators that draw random numbers during fitting are seeded with the
# same value as the train/validation split (42) so that the reported scores are
# reproducible after Restart & Run All.
classifiers = {
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Support Vector Machine": SVC(kernel='linear', decision_function_shape='ovr'),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Gaussian Naive Bayes": GaussianNB(),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
}

In [55]:
# Hold out 20% of the scaled training rows as a validation set (seeded split).
split_parts = train_test_split(
    X_train_scaled,
    y_train,
    test_size=0.2,
    random_state=42,
)
X_train_split, X_val, y_train_split, y_val = split_parts


In [56]:
# Fit each candidate on the training split and print its validation metrics.
for name, classifier in classifiers.items():
    print(f"Training {name}...")

    classifier.fit(X_train_split, y_train_split)
    y_pred_val = classifier.predict(X_val)

    accuracy_val = accuracy_score(y_val, y_pred_val)
    print(f"Accuracy of {name}: {accuracy_val:.4f}")

    report_text = classification_report(y_val, y_pred_val)
    print(f"Classification Report of {name} (Validation):\n{report_text}")

    matrix = confusion_matrix(y_val, y_pred_val)
    print(f"Confusion Matrix of {name} (Validation):\n{matrix}")

    # Estimators without a feature_importances_ attribute are skipped here.
    if hasattr(classifier, 'feature_importances_'):
        feature_importances = pd.DataFrame(
            {
                'Feature': X_train.columns,
                'Importance': classifier.feature_importances_,
            }
        ).sort_values(by='Importance', ascending=False)
        print(f"Feature Importances of {name}:\n{feature_importances}")

    print("--------------------------------------------------")

Training Decision Tree...
Accuracy of Decision Tree: 0.8389
Classification Report of Decision Tree (Validation):
                     precision    recall  f1-score   support

Insufficient_Weight       0.91      0.90      0.90       524
      Normal_Weight       0.80      0.80      0.80       626
     Obesity_Type_I       0.80      0.81      0.80       543
    Obesity_Type_II       0.95      0.93      0.94       657
   Obesity_Type_III       0.99      1.00      0.99       804
 Overweight_Level_I       0.63      0.64      0.64       484
Overweight_Level_II       0.69      0.68      0.69       514

           accuracy                           0.84      4152
          macro avg       0.82      0.82      0.82      4152
       weighted avg       0.84      0.84      0.84      4152

Confusion Matrix of Decision Tree (Validation):
[[470  48   0   0   0   6   0]
 [ 45 502   4   0   1  68   6]
 [  2   2 439  22   2  25  51]
 [  0   0  36 611   1   0   9]
 [  0   0   2   1 800   0   1]
 [  2  63 

The best model is the **Gradient Boosting Classifier**, with a validation accuracy of **`0.9017`**.