In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import utils
from sklearn import svm, naive_bayes, neighbors, ensemble, linear_model, tree
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
from mlxtend.evaluate import bias_variance_decomp
from sklearn.linear_model import RidgeClassifier
%matplotlib inline

In [4]:
# Read the csv file and preprocess it: convert qualitative attributes to integers
train = pd.read_csv("train.csv").drop_duplicates()
X = train.agg(
    utils.transform_dataset
)  # utils.transform_dataset is a dicitionary which applies a transforming function on each column

X_train, X_test, Y_train, Y_test = train_test_split(
    X.drop(columns=["Body_Level"]), X["Body_Level"], test_size=0.2, random_state=42
)

display(X_train)


Unnamed: 0,Gender,Age,Height,Weight,H_Cal_Consump,Veg_Consump,Water_Consump,Alcohol_Consump,Smoking,Meal_Count,Food_Between_Meals,Fam_Hist,H_Cal_Burn,Phys_Act,Time_E_Dev,Transport
883,0,21.633056,1.754174,133.783955,1,3.000000,1.945950,1,0,3.000000,1,1,0,1.493018,0.906611,0
1056,0,22.518787,1.634342,82.414477,1,1.853314,2.135552,1,0,1.320768,1,1,0,0.248034,1.727828,0
258,0,22.000000,1.580000,58.000000,1,2.000000,1.000000,0,0,1.000000,1,1,0,0.000000,0.000000,0
1081,0,18.634286,1.669354,126.088301,1,3.000000,1.144539,1,0,3.000000,1,1,0,0.922014,0.899673,0
1206,0,23.803904,1.581527,78.089575,1,2.000000,2.000000,0,0,1.000000,1,1,0,0.057758,0.000000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1144,1,23.319635,1.846290,121.248035,1,3.000000,2.628816,1,0,2.695396,1,1,0,0.975384,0.000000,0
1308,0,25.964788,1.623938,109.984263,1,3.000000,2.471721,1,0,3.000000,1,1,0,0.000096,0.433463,0
874,0,20.520992,1.668642,124.704781,1,3.000000,1.156350,1,0,3.000000,1,1,0,0.786828,0.366385,0
1473,0,20.908785,1.700996,126.490236,1,3.000000,1.242832,1,0,3.000000,1,1,0,0.530925,0.575969,0


In [5]:
# show imbalance in the dataset classes using value_counts()
display(Y_train.value_counts())

4    540
3    318
2    157
1    155
Name: Body_Level, dtype: int64

In [6]:
# solve the imbalance problem using SMOTE
sm = SMOTE(random_state=42)
X_train_smote, Y_train_smote = sm.fit_resample(X_train, Y_train)
display(Y_train_smote.value_counts())

4    540
2    540
1    540
3    540
Name: Body_Level, dtype: int64

In [7]:
def svm_model(X_train, Y_train, X_test, Y_test):
    # Train the model
    clf = svm.SVC()
    clf.fit(X_train, Y_train)
    # classification report
    y_pred = clf.predict(X_test)
    print("svm: kernel=rbf, C=1.0, gamma='scale'")
    print(classification_report(Y_test, y_pred))
    bvd = bias_variance_decomp(clf, X_train.values, Y_train.values, X_test.values, Y_test.values, random_seed=42)
    print("svm_clf||                Bias: ", bvd[0], "Variance: ", bvd[1], "Error: ", bvd[2], "\n\n")
    return clf

def svm_model_grid_search(X_train, Y_train, X_test, Y_test, parameters):
    # trying grid search on svm
    svc = svm.SVC()
    clf = GridSearchCV(svc, parameters)
    print("svm: grid search")
    clf.fit(X_train, Y_train)
    print(clf.best_params_)
    y_pred = clf.predict(X_test)
    print(classification_report(Y_test, y_pred))
    bvd = bias_variance_decomp(clf.best_estimator_, X_train.values, Y_train.values, X_test.values, Y_test.values, random_seed=42)
    print("svm_clf_grid_search||    Bias: ", bvd[0], "Variance: ", bvd[1], "Error: ", bvd[2], "\n\n")
    return clf.best_estimator_

def random_forest_model(X_train, Y_train, X_test, Y_test):
    # trying random forest
    clf = ensemble.RandomForestClassifier()
    clf.fit(X_train, Y_train)
    y_pred = clf.predict(X_test)
    print("random forest: n_estimators=100, max_depth=None")
    print(classification_report(Y_test, y_pred))
    bvd = bias_variance_decomp(clf, X_train.values, Y_train.values, X_test.values, Y_test.values, random_seed=42)
    print("random_forest_clf||      Bias: ", bvd[0], "Variance: ", bvd[1], "Error: ", bvd[2], "\n\n")
    return clf

def naive_bayes_model(X_train, Y_train, X_test, Y_test):
    # trying naive bayes
    clf = naive_bayes.GaussianNB()
    clf.fit(X_train, Y_train)
    y_pred = clf.predict(X_test)
    print("naive bayes: GaussianNB")
    print(classification_report(Y_test, y_pred))
    bvd = bias_variance_decomp(clf, X_train.values, Y_train.values, X_test.values, Y_test.values, random_seed=42)
    print("naive_bayes_clf||        Bias: ", bvd[0], "Variance: ", bvd[1], "Error: ", bvd[2], "\n\n")
    return clf

def ridge_regression_model_grid_search(X_train, Y_train, X_test, Y_test,parameters):
    # trying ridge regression
    ridge = RidgeClassifier(copy_X=True, random_state=42)
    clf = GridSearchCV(ridge, parameters)
    print("ridge: grid search")
    clf.fit(X_train, Y_train)
    y_pred = clf.predict(X_test)
    print(clf.best_params_)
    print(classification_report(Y_test, y_pred))
    bvd = bias_variance_decomp(clf, X_train.values, Y_train.values, X_test.values, Y_test.values, random_seed=42)
    print("ridge_regression_clf||   Bias: ", bvd[0], "Variance: ", bvd[1], "Error: ", bvd[2], "\n\n")
    return clf


In [11]:
svm_parameters = {
    "kernel": ("linear", "rbf"),
    "C": [0.1, 1, 10],
    "gamma": [0.01, 0.1, 1],
}
ridge_parameters = {"alpha": [0.01, 0.05, 0.1, 0.5, 1, 10]}

In [9]:
# run the models on the original dataset
svm_clf = svm_model(X_train, Y_train, X_test, Y_test)

random_forest_clf = random_forest_model(X_train, Y_train, X_test, Y_test)

naive_bayes_clf = naive_bayes_model(X_train, Y_train, X_test, Y_test)


svm: kernel=rbf, C=1.0, gamma='scale'
              precision    recall  f1-score   support

           1       0.59      0.77      0.67        31
           2       0.39      0.29      0.33        42
           3       0.63      0.78      0.69        80
           4       0.97      0.84      0.90       140

    accuracy                           0.74       293
   macro avg       0.64      0.67      0.65       293
weighted avg       0.75      0.74      0.74       293

svm_clf||                Bias:  0.2540443686006826 Variance:  0.25597269624573377 Error:  0.03774744027303754 


random forest: n_estimators=100, max_depth=None
              precision    recall  f1-score   support

           1       1.00      0.97      0.98        31
           2       0.90      0.90      0.90        42
           3       0.94      0.96      0.95        80
           4       0.99      0.99      0.99       140

    accuracy                           0.97       293
   macro avg       0.96      0.96      0

In [12]:
# Trying grid search on svm
svm_clf_grid_search = svm_model_grid_search(X_train, Y_train, X_test, Y_test, svm_parameters)
ridge_grid_search = ridge_regression_model_grid_search(X_train, Y_train, X_test, Y_test, ridge_parameters)


svm: grid search
{'C': 10, 'gamma': 0.01, 'kernel': 'linear'}
              precision    recall  f1-score   support

           1       0.97      1.00      0.98        31
           2       0.95      0.95      0.95        42
           3       0.99      0.97      0.98        80
           4       1.00      1.00      1.00       140

    accuracy                           0.99       293
   macro avg       0.98      0.98      0.98       293
weighted avg       0.99      0.99      0.99       293

svm_clf_grid_search||    Bias:  0.03535836177474403 Variance:  0.023890784982935155 Error:  0.026860068259385667 


ridge: grid search
{'alpha': 0.01}
              precision    recall  f1-score   support

           1       0.61      0.81      0.69        31
           2       0.54      0.31      0.39        42
           3       0.70      0.62      0.66        80
           4       0.88      0.99      0.93       140

    accuracy                           0.77       293
   macro avg       0.68   

In [13]:
# run the models on the dataset after applying SMOTE
svm_model(X_train_smote, Y_train_smote, X_test, Y_test)
svm_model_grid_search(X_train_smote, Y_train_smote, X_test, Y_test, svm_parameters)
ridge_regression_model_grid_search(X_train_smote, Y_train_smote, X_test, Y_test, ridge_parameters)

random_forest_model(X_train_smote, Y_train_smote, X_test, Y_test)

naive_bayes_model(X_train_smote, Y_train_smote, X_test, Y_test)


svm: kernel=rbf, C=1.0, gamma='scale'
              precision    recall  f1-score   support

           1       0.63      0.77      0.70        31
           2       0.42      0.48      0.44        42
           3       0.62      0.70      0.65        80
           4       0.98      0.81      0.89       140

    accuracy                           0.73       293
   macro avg       0.66      0.69      0.67       293
weighted avg       0.76      0.73      0.74       293

svm_clf||                Bias:  0.27076791808873724 Variance:  0.2696245733788396 Error:  0.0061604095563139935 


svm: grid search
{'C': 10, 'gamma': 0.1, 'kernel': 'rbf'}
              precision    recall  f1-score   support

           1       0.94      1.00      0.97        31
           2       0.80      0.76      0.78        42
           3       0.85      0.84      0.84        80
           4       0.96      0.97      0.97       140

    accuracy                           0.91       293
   macro avg       0.89     