In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import utils
from sklearn import svm, naive_bayes, neighbors, ensemble, linear_model, tree
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
from mlxtend.evaluate import bias_variance_decomp
%matplotlib inline

In [2]:
# Read the csv file and preprocess it: convert qualitative attributes to integers
train = pd.read_csv("train.csv")
X = train.agg(utils.transform_dataset) # utils.transform_dataset is a dicitionary which applies a transforming function on each column

X_train = X.sample(frac=0.8, random_state=42)
X_test = X.drop(X_train.index)

Y_train = X_train["Body_Level"]
Y_test = X_test["Body_Level"]

X_train = X_train.drop("Body_Level", axis=1)
X_test = X_test.drop("Body_Level", axis=1)
display(X_train)
display(X_test)

Unnamed: 0,Gender,Age,Height,Weight,H_Cal_Consump,Veg_Consump,Water_Consump,Alcohol_Consump,Smoking,Meal_Count,Food_Between_Meals,Fam_Hist,H_Cal_Burn,Phys_Act,Time_E_Dev,Transport
1233,0,21.768834,1.733383,135.524857,1,3.000000,1.485736,1,0,3.000000,1,1,0,1.950374,0.869238,0
529,1,19.565496,1.705584,78.025625,0,1.936479,2.159987,1,0,2.837388,1,1,0,1.475772,0.646514,0
1177,1,18.000000,1.803527,108.251044,1,2.000000,2.530157,0,0,1.709546,1,1,0,1.000000,0.645400,0
432,0,21.012569,1.758628,78.370039,1,3.000000,2.000000,1,0,1.000000,1,1,0,2.971832,0.000000,0
363,0,23.000000,1.500000,55.000000,1,3.000000,2.000000,1,0,3.000000,1,1,0,1.000000,0.000000,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
476,1,23.000000,1.700740,81.322970,1,2.000000,1.519700,1,0,2.720642,1,1,0,0.279375,1.043435,0
157,0,18.288205,1.713564,50.000000,1,1.140615,1.000000,1,0,3.471536,1,0,0,2.000000,1.000000,0
16,0,19.948140,1.530884,39.371523,1,1.522001,1.981260,1,0,3.000000,1,0,0,2.306844,0.720454,0
1072,0,38.148845,1.557808,79.661693,1,2.000000,1.274774,1,0,3.000000,1,1,0,0.000000,0.000000,1


Unnamed: 0,Gender,Age,Height,Weight,H_Cal_Consump,Veg_Consump,Water_Consump,Alcohol_Consump,Smoking,Meal_Count,Food_Between_Meals,Fam_Hist,H_Cal_Burn,Phys_Act,Time_E_Dev,Transport
1,1,19.799054,1.743702,54.927529,1,2.000000,2.847264,1,0,3.289260,1,1,0,1.680844,2.000000,0
8,0,23.444286,1.596466,44.594588,0,2.594653,1.916812,0,0,1.882158,2,0,0,0.417119,0.000000,0
13,0,16.928791,1.710948,45.248627,1,2.910733,2.204263,1,0,3.125440,1,0,0,2.407906,1.403037,0
14,1,17.000000,1.800000,58.000000,1,2.000000,2.000000,0,0,3.000000,2,0,0,2.000000,1.000000,4
20,1,18.381382,1.722547,53.783977,1,2.000000,2.072194,1,0,3.131032,1,1,0,1.487987,2.000000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1450,0,23.000000,1.611058,84.191125,1,2.141280,2.519841,0,0,2.989112,1,1,0,2.111370,0.013483,0
1453,1,26.004294,1.844751,105.025808,1,3.000000,2.925029,1,0,3.000000,1,1,0,2.000000,1.758865,0
1459,0,25.653233,1.664940,110.922170,1,3.000000,1.604075,1,0,3.000000,1,1,0,0.029728,0.200122,0
1462,0,20.978166,1.721057,132.054793,1,3.000000,1.678791,1,0,3.000000,1,1,0,1.682490,0.818871,0


In [3]:
# show imbalance in the dataset classes using value_counts()
display(Y_train.value_counts())

4    533
3    326
2    166
1    157
Name: Body_Level, dtype: int64

In [4]:
# solve the imbalance problem using SMOTE
sm = SMOTE(random_state=42)
X_train_smote, Y_train_smote = sm.fit_resample(X_train, Y_train)
display(Y_train_smote.value_counts())

4    533
3    533
2    533
1    533
Name: Body_Level, dtype: int64

In [17]:
def svm_model(X_train, Y_train, X_test, Y_test):
    # Train the model
    clf = svm.SVC()
    clf.fit(X_train, Y_train)
    # classification report
    y_pred = clf.predict(X_test)
    print("svm: kernel=rbf, C=1.0, gamma='scale'")
    print(classification_report(Y_test, y_pred))
    bvd = bias_variance_decomp(svm_clf, X_train.values, Y_train.values, X_test.values, Y_test.values, random_seed=42)
    print("svm_clf||                Bias: ", bvd[0], "Variance: ", bvd[1], "Error: ", bvd[2], "\n\n")
    return clf

def svm_model_grid_search(X_train, Y_train, X_test, Y_test, parameters):
    # trying grid search on svm
    svc = svm.SVC()
    clf = GridSearchCV(svc, parameters)
    print("svm: grid search")
    clf.fit(X_train, Y_train)
    print(clf.best_params_)
    y_pred = clf.predict(X_test)
    print(classification_report(Y_test, y_pred))
    bvd = bias_variance_decomp(clf.best_estimator_, X_train.values, Y_train.values, X_test.values, Y_test.values, random_seed=42)
    print("svm_clf_grid_search||    Bias: ", bvd[0], "Variance: ", bvd[1], "Error: ", bvd[2], "\n\n")
    return clf.best_estimator_

def random_forest_model(X_train, Y_train, X_test, Y_test):
    # trying random forest
    clf = ensemble.RandomForestClassifier()
    clf.fit(X_train, Y_train)
    y_pred = clf.predict(X_test)
    print("random forest: n_estimators=100, max_depth=None")
    print(classification_report(Y_test, y_pred))
    bvd = bias_variance_decomp(clf, X_train.values, Y_train.values, X_test.values, Y_test.values, random_seed=42)
    print("random_forest_clf||      Bias: ", bvd[0], "Variance: ", bvd[1], "Error: ", bvd[2], "\n\n")
    return clf

def naive_bayes_model(X_train, Y_train, X_test, Y_test):
    # trying naive bayes
    clf = naive_bayes.GaussianNB()
    clf.fit(X_train, Y_train)
    y_pred = clf.predict(X_test)
    print("naive bayes: GaussianNB")
    print(classification_report(Y_test, y_pred))
    bvd = bias_variance_decomp(clf, X_train.values, Y_train.values, X_test.values, Y_test.values, random_seed=42)
    print("naive_bayes_clf||        Bias: ", bvd[0], "Variance: ", bvd[1], "Error: ", bvd[2], "\n\n")
    return clf


In [6]:
svm_parameters = {'kernel':('linear', 'rbf'), 'C':[0.1, 1, 10], 'gamma':[0.01, 0.1, 1]}


In [15]:
# run the models on the original dataset
svm_clf = svm_model(X_train, Y_train, X_test, Y_test)

random_forest_clf = random_forest_model(X_train, Y_train, X_test, Y_test)

naive_bayes_clf = naive_bayes_model(X_train, Y_train, X_test, Y_test)


svm: kernel=rbf, C=1.0, gamma='scale'
              precision    recall  f1-score   support

           1       0.77      0.82      0.79        33
           2       0.58      0.51      0.55        35
           3       0.65      0.82      0.73        80
           4       0.95      0.83      0.89       147

    accuracy                           0.79       295
   macro avg       0.74      0.75      0.74       295
weighted avg       0.81      0.79      0.79       295

svm_clf||                Bias:  0.2054915254237288 Variance:  0.20677966101694914 Error:  0.03155932203389831
random forest: n_estimators=100, max_depth=None
              precision    recall  f1-score   support

           1       1.00      0.88      0.94        33
           2       0.79      0.89      0.84        35
           3       0.95      0.94      0.94        80
           4       0.99      1.00      1.00       147

    accuracy                           0.96       295
   macro avg       0.93      0.93      0.93

In [16]:
# Trying grid search on svm
svm_clf_grid_search = svm_model_grid_search(X_train, Y_train, X_test, Y_test, svm_parameters)


svm: grid search
{'C': 10, 'gamma': 0.01, 'kernel': 'linear'}
              precision    recall  f1-score   support

           1       0.94      1.00      0.97        33
           2       1.00      0.91      0.96        35
           3       0.99      1.00      0.99        80
           4       1.00      1.00      1.00       147

    accuracy                           0.99       295
   macro avg       0.98      0.98      0.98       295
weighted avg       0.99      0.99      0.99       295

svm_clf_grid_search||    Bias:  0.022966101694915256 Variance:  0.010169491525423728 Error:  0.016999999999999998


KeyboardInterrupt: 

In [18]:
# run the models on the dataset after applying SMOTE
svm_model(X_train_smote, Y_train_smote, X_test, Y_test)
svm_model_grid_search(X_train_smote, Y_train_smote, X_test, Y_test, svm_parameters)

random_forest_model(X_train_smote, Y_train_smote, X_test, Y_test)

naive_bayes_model(X_train_smote, Y_train_smote, X_test, Y_test)


svm: kernel=rbf, C=1.0, gamma='scale'
              precision    recall  f1-score   support

           1       0.78      0.88      0.83        33
           2       0.48      0.63      0.54        35
           3       0.64      0.68      0.65        80
           4       0.96      0.83      0.89       147

    accuracy                           0.77       295
   macro avg       0.71      0.75      0.73       295
weighted avg       0.80      0.77      0.78       295

svm_clf||                Bias:  0.23198305084745763 Variance:  0.2305084745762712 Error:  0.011067796610169493 


svm: grid search
{'C': 10, 'gamma': 0.01, 'kernel': 'linear'}
              precision    recall  f1-score   support

           1       0.94      1.00      0.97        33
           2       1.00      0.94      0.97        35
           3       1.00      1.00      1.00        80
           4       1.00      1.00      1.00       147

    accuracy                           0.99       295
   macro avg       0.99  