In [1]:
import pandas as pd
import numpy as np

In [7]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.metrics import accuracy_score

In [16]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

In [3]:
# Load the pre-processed train and test tables from the working directory.
modified_train_df, modified_test_df = (
    pd.read_csv(csv_name)
    for csv_name in ('modified_train.csv', 'modified_test.csv')
)

In [4]:
# Target column is 'NObeyesdad'; every other training column is a feature.
y_train = modified_train_df['NObeyesdad']
X_train = modified_train_df.drop('NObeyesdad', axis=1)

# The test table has no target column to remove. Note that this binds the
# same object as modified_test_df (no copy is made).
X_test = modified_test_df

In [5]:
# Keep the row identifiers, then remove them from the feature matrices.
#
# The ids are read from the loaded frames rather than from X_train / X_test,
# and the drop rebinds the names instead of mutating in place. This keeps the
# cell re-runnable (a second run no longer fails on a missing 'id' column) and
# stops the drop from silently altering modified_test_df, which X_test aliases.
train_id = modified_train_df['id']
test_id = modified_test_df['id']

# errors='ignore' makes the drop a no-op when 'id' has already been removed.
X_train = X_train.drop(columns=['id'], errors='ignore')
X_test = X_test.drop(columns=['id'], errors='ignore')

In [8]:
# Learn the scaling statistics from the training features only, then apply
# the same fitted transform to both the training and the test features.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [9]:
from functools import partial

# Keep the 10 features with the highest mutual information with the target.
# mutual_info_classif is randomized, so its random_state is pinned (42, the
# same seed used for the train/validation splits in this notebook) to make
# the selected feature set repeatable across kernel restarts.
selector = SelectKBest(partial(mutual_info_classif, random_state=42), k=10)
X_train_selected = selector.fit_transform(X_train_scaled, y_train)

In [10]:
# Map the selector's chosen column positions back to column names.
# X_train_scaled was produced from X_train, so positions line up with
# X_train.columns.
selected_feature_indices = selector.get_support(indices=True)
selected_features = X_train.columns.take(selected_feature_indices)

### Hyperparameter Optimization

In [25]:
# Search space for the random forest: 3 x 3 = 9 candidate settings.
param_grid_rf = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20],
)


In [26]:
# Search space for the support vector classifier: 3 x 3 x 2 = 18 settings.
param_grid_svm = dict(
    C=[0.1, 1, 10],
    kernel=['linear', 'poly', 'rbf'],
    gamma=['scale', 'auto'],
)


In [27]:
# Search space for gradient boosting: 3 x 3 = 9 candidate settings.
param_grid_gb = dict(
    n_estimators=[50, 100, 200],
    max_depth=[3, 4, 5],
)


In [28]:
# Search space for the decision tree: 2 x 2 x 3 = 12 candidate settings.
param_grid_dt = dict(
    criterion=['gini', 'entropy'],
    splitter=['best', 'random'],
    max_depth=[None, 10, 20],
)


In [29]:
# One 5-fold, accuracy-scored grid search per model family.
#
# The randomized estimators (random forest, gradient boosting, decision tree)
# get a fixed random_state so that cross-validation scores and the chosen
# best parameters are repeatable across kernel restarts. 42 matches the seed
# used for the train/validation splits in this notebook.
grid_search_rf = GridSearchCV(
    RandomForestClassifier(random_state=42), param_grid_rf, cv=5, scoring='accuracy'
)
grid_search_svm = GridSearchCV(SVC(), param_grid_svm, cv=5, scoring='accuracy')
grid_search_gb = GridSearchCV(
    GradientBoostingClassifier(random_state=42), param_grid_gb, cv=5, scoring='accuracy'
)
grid_search_dt = GridSearchCV(
    DecisionTreeClassifier(random_state=42), param_grid_dt, cv=5, scoring='accuracy'
)

In [30]:
import warnings

# Silence every FutureWarning for the rest of the session.
warnings.simplefilter("ignore", category=FutureWarning)


In [31]:
# Run the random-forest grid search on the selected training features.
grid_search_rf.fit(X=X_train_selected, y=y_train)

In [40]:
# Run the SVM grid search on the selected training features.
grid_search_svm.fit(X=X_train_selected, y=y_train)

In [32]:
# Run the gradient-boosting grid search on the selected training features.
grid_search_gb.fit(X=X_train_selected, y=y_train)

In [33]:
# Run the decision-tree grid search on the selected training features.
grid_search_dt.fit(X=X_train_selected, y=y_train)

In [48]:
# Pull the best refitted estimator out of each finished grid search.
best_rf = grid_search_rf.best_estimator_
best_svm = grid_search_svm.best_estimator_
best_gb = grid_search_gb.best_estimator_
# best_dt must be defined: the first VotingClassifier lists it as its 'dt'
# member, so leaving this line commented out raises NameError on a fresh run.
best_dt = grid_search_dt.best_estimator_


### Ensemble Model

In [79]:
# First ensemble: a vote over the four tuned estimators.
tuned_members = [
    ('rf', best_rf),
    ('svm', best_svm),
    ('gb', best_gb),
    ('dt', best_dt),
]
ensemble_model = VotingClassifier(estimators=tuned_members)


In [64]:
# Hold out 20% of the scaled training rows for validation (seeded split).
# This uses the full scaled feature matrix, not X_train_selected.
ensemble_split = train_test_split(
    X_train_scaled, y_train, test_size=0.2, random_state=42
)
X_train_ensemble, X_val_ensemble, y_train_ensemble, y_val_ensemble = ensemble_split


In [80]:
# Train the first ensemble on the training portion of the split.
ensemble_model.fit(X=X_train_ensemble, y=y_train_ensemble)

In [81]:
# Predict labels for the held-out validation rows.
y_pred_val_ensemble = ensemble_model.predict(X=X_val_ensemble)

In [82]:
# Fraction of validation rows the first ensemble labels correctly.
accuracy_val_ensemble = accuracy_score(
    y_true=y_val_ensemble,
    y_pred=y_pred_val_ensemble,
)
print(f"Accuracy of Ensemble: {accuracy_val_ensemble:.4f}")

Accuracy of Ensemble: 0.9005


### Ensemble Model 2

In [105]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.linear_model import SGDClassifier

In [108]:
# Base learners for the second ensemble, all with default hyperparameters.
#
# Randomized estimators get a fixed random_state so the reported validation
# score can be reproduced after a kernel restart. 42 matches the seed used
# for the train/validation splits in this notebook.
rf_clf2 = RandomForestClassifier(random_state=42)
svm_clf2 = SVC()
gb_clf2 = GradientBoostingClassifier(random_state=42)
dt_clf2 = DecisionTreeClassifier(random_state=42)
gnb_clf2 = GaussianNB()
kn_clf2 = KNeighborsClassifier()
xgb_clf2 = XGBClassifier(random_state=42)
# Name kept as 'sdg_clf2' because the ensemble definition refers to it.
sdg_clf2 = SGDClassifier(random_state=42)

In [124]:
# Second ensemble: hard (majority) vote over five default-configured models.
default_members = [
    ('rf2', rf_clf2),
    ('svm2', svm_clf2),
    ('gb2', gb_clf2),
    ('xgb2', xgb_clf2),
    ('sgdc2', sdg_clf2),
]
ensemble_model2 = VotingClassifier(estimators=default_members, voting='hard')

In [133]:
# Hold out 25% of the scaled training rows for validation (seeded split).
# NOTE: this rebinds the same four names used by the earlier 80/20 split, so
# the first ensemble's split is no longer available after this cell runs.
ensemble_split = train_test_split(
    X_train_scaled, y_train, test_size=0.25, random_state=42
)
X_train_ensemble, X_val_ensemble, y_train_ensemble, y_val_ensemble = ensemble_split


In [130]:
# Train the second ensemble on the training portion of the current split.
ensemble_model2.fit(X=X_train_ensemble, y=y_train_ensemble)

In [134]:
# Predict labels for the held-out validation rows with the second ensemble.
y_pred_val_ensemble2 = ensemble_model2.predict(X=X_val_ensemble)

In [135]:
# Fraction of validation rows the second ensemble labels correctly.
accuracy_val_ensemble2 = accuracy_score(
    y_true=y_val_ensemble,
    y_pred=y_pred_val_ensemble2,
)
print(f"Validation Accuracy of Ensemble: {accuracy_val_ensemble2:.4f}")

Validation Accuracy of Ensemble: 0.9171


This is the best-performing model so far, with a validation accuracy of **`0.9171`**.

In [137]:
from sklearn.metrics import classification_report, confusion_matrix


In [138]:
# Per-class precision / recall / F1 for the second ensemble on validation data.
report_text = classification_report(y_true=y_val_ensemble, y_pred=y_pred_val_ensemble2)
print("Classification Report:")
print(report_text)

Classification Report:
                     precision    recall  f1-score   support

Insufficient_Weight       0.94      0.95      0.94       640
      Normal_Weight       0.89      0.89      0.89       780
     Obesity_Type_I       0.90      0.90      0.90       685
    Obesity_Type_II       0.98      0.98      0.98       825
   Obesity_Type_III       1.00      1.00      1.00      1017
 Overweight_Level_I       0.81      0.82      0.82       611
Overweight_Level_II       0.84      0.82      0.83       632

           accuracy                           0.92      5190
          macro avg       0.91      0.91      0.91      5190
       weighted avg       0.92      0.92      0.92      5190



In [139]:

# Confusion matrix (true labels vs. second-ensemble predictions) on validation data.
confusion_counts = confusion_matrix(y_true=y_val_ensemble, y_pred=y_pred_val_ensemble2)
print("Confusion Matrix:")
print(confusion_counts)

Confusion Matrix:
[[ 605   32    0    0    0    2    1]
 [  35  696    0    0    0   46    3]
 [   2    1  615   15    1   10   41]
 [   0    0   12  810    1    0    2]
 [   0    0    1    1 1015    0    0]
 [   1   41   14    0    0  503   52]
 [   0   13   43    3    0   57  516]]
