In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import utils
from sklearn import svm, naive_bayes, neighbors, ensemble, linear_model, tree
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.metrics import classification_report, make_scorer, accuracy_score, precision_score, recall_score, f1_score
from imblearn.over_sampling import SMOTE
from mlxtend.evaluate import bias_variance_decomp
import pprint

%matplotlib inline

In [11]:
X["Is_Int"].value_counts()

2    390
6    362
1    250
3    199
4    146
0    107
5     23
Name: Is_Int, dtype: int64

In [7]:
# Read the csv file and preprocess it: convert qualitative attributes to integers
train = pd.read_csv("train.csv")
train.drop_duplicates(inplace=True)
X = train.agg(utils.transform_dataset) # utils.transform_dataset is a dicitionary which applies a transforming function on each column

# add a new column to the dataframe indicating whether some columns are integers or not
X["Is_Int"] = 0

X["Is_Int"] = (abs(round(X["Veg_Consump"]) - X["Veg_Consump"]) < 0.01).astype(int) \
+ (abs(round(X["Water_Consump"]) - X["Water_Consump"]) < 0.01).astype(int) \
+ (abs(round(X["Phys_Act"]) - X["Phys_Act"]) < 0.01).astype(int) \
+ (abs(round(X["Time_E_Dev"]) - X["Time_E_Dev"]) < 0.01).astype(int) \
+ (abs(round(X["Age"]) - X["Age"]) < 0.01).astype(int) \
+ (abs(round(X["Meal_Count"]) - X["Meal_Count"]) < 0.01).astype(int)

X["BMI"] = X["Weight"] / X["Height"] ** 2

X_all = X.drop(["Body_Level"], axis=1)
Y_all = X["Body_Level"]

display(X_all)

# drop the columns mentioned above
#X = X.drop(["Veg_Consump", "Water_Consump", "Phys_Act", "Time_E_Dev", "Age", "Meal_Count"], axis=1)


# # loop over the rows of the dataframe
# for i in range(len(X)):
#     # if the value of the column is an integer, change the value of the new column to 1
#     X.iloc[i]["Is_Int"] = \
#     int(abs(round(X.iloc[i]["Veg_Consump"]) - X.iloc[i]["Veg_Consump"]) < 0.01) \
#     + int(abs(round(X.iloc[i]["Water_Consump"]) - X.iloc[i]["Water_Consump"]) < 0.01) \
#     + int(abs(round(X.iloc[i]["Phys_Act"]) - X.iloc[i]["Phys_Act"]) < 0.01) \
#     + int(abs(round(X.iloc[i]["Time_E_Dev"]) - X.iloc[i]["Time_E_Dev"]) < 0.01) \
#     + int(abs(round(X.iloc[i]["Age"]) - X.iloc[i]["Age"]) < 0.01) \
#     + int(abs(round(X.iloc[i]["Meal_Count"]) - X.iloc[i]["Meal_Count"]) < 0.01)

X_train, X_test, Y_train, Y_test = train_test_split(
    X.drop(columns=['Body_Level']),
    X['Body_Level'], test_size=0.2, random_state=42)

display(X_train)
display(X_test)

Unnamed: 0,Gender,Age,Height,Weight,H_Cal_Consump,Veg_Consump,Water_Consump,Alcohol_Consump,Smoking,Meal_Count,Food_Between_Meals,Fam_Hist,H_Cal_Burn,Phys_Act,Time_E_Dev,Transport,Is_Int,BMI
0,0,23,1.722461,51.881263,1,2.663421,1.041110,0,0,3.000000,2,1,0,0.794402,1.391948,0,2,17.486856
1,1,20,1.743702,54.927529,1,2.000000,2.847264,1,0,3.289260,1,1,0,1.680844,2.000000,0,3,18.065315
2,0,18,1.708406,50.000000,1,1.642241,1.099231,1,0,3.452590,1,0,0,0.418875,1.000000,0,2,17.131202
3,0,19,1.690727,49.895716,1,1.212908,1.029703,1,0,3.207071,1,0,0,2.000000,1.000000,0,3,17.454857
4,1,20,1.793315,58.195150,1,2.508835,2.076933,0,0,3.435905,1,1,0,2.026668,1.443328,1,1,18.095627
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1472,1,27,1.755938,112.287678,1,1.428289,2.117733,1,0,3.000000,1,1,0,0.485322,0.696948,1,2,36.417803
1473,0,21,1.700996,126.490236,1,3.000000,1.242832,1,0,3.000000,1,1,0,0.530925,0.575969,0,3,43.717007
1474,0,26,1.629191,104.826776,1,3.000000,2.654702,1,0,3.000000,1,1,0,0.000000,0.555468,0,4,39.493737
1475,0,26,1.629225,104.838425,1,3.000000,2.556068,1,0,3.000000,1,1,0,0.016820,0.582840,0,3,39.496477


Unnamed: 0,Gender,Age,Height,Weight,H_Cal_Consump,Veg_Consump,Water_Consump,Alcohol_Consump,Smoking,Meal_Count,Food_Between_Meals,Fam_Hist,H_Cal_Burn,Phys_Act,Time_E_Dev,Transport,Is_Int,BMI
883,0,22,1.754174,133.783955,1,3.000000,1.945950,1,0,3.000000,1,1,0,1.493018,0.906611,0,3,43.476912
1056,0,23,1.634342,82.414477,1,1.853314,2.135552,1,0,1.320768,1,1,0,0.248034,1.727828,0,1,30.854437
258,0,22,1.580000,58.000000,1,2.000000,1.000000,0,0,1.000000,1,1,0,0.000000,0.000000,0,6,23.233456
1081,0,19,1.669354,126.088301,1,3.000000,1.144539,1,0,3.000000,1,1,0,0.922014,0.899673,0,3,45.245762
1206,0,24,1.581527,78.089575,1,2.000000,2.000000,0,0,1.000000,1,1,0,0.057758,0.000000,0,5,31.220499
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1144,1,23,1.846290,121.248035,1,3.000000,2.628816,1,0,2.695396,1,1,0,0.975384,0.000000,0,3,35.569264
1308,0,26,1.623938,109.984263,1,3.000000,2.471721,1,0,3.000000,1,1,0,0.000096,0.433463,0,4,41.705339
874,0,21,1.668642,124.704781,1,3.000000,1.156350,1,0,3.000000,1,1,0,0.786828,0.366385,0,3,44.787494
1473,0,21,1.700996,126.490236,1,3.000000,1.242832,1,0,3.000000,1,1,0,0.530925,0.575969,0,3,43.717007


Unnamed: 0,Gender,Age,Height,Weight,H_Cal_Consump,Veg_Consump,Water_Consump,Alcohol_Consump,Smoking,Meal_Count,Food_Between_Meals,Fam_Hist,H_Cal_Burn,Phys_Act,Time_E_Dev,Transport,Is_Int,BMI
1190,0,26,1.607128,103.026858,1,3.000000,1.077253,1,0,3.000000,1,1,0,0.162083,0.824607,0,3,39.888667
178,0,23,1.715597,50.000000,1,2.449267,1.266018,0,0,3.647154,2,1,0,0.866045,0.097234,0,1,16.987890
279,1,17,1.670000,60.000000,0,2.000000,2.000000,0,0,4.000000,3,0,0,0.000000,2.000000,0,6,21.513859
560,0,34,1.681021,77.392179,1,2.796060,1.921601,0,0,1.971472,1,1,0,0.935217,0.704637,1,1,27.387399
881,1,40,1.781032,96.303855,1,2.252698,2.923856,2,0,2.475228,1,1,0,2.165429,0.616045,1,1,30.359892
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1468,0,22,1.803677,160.639405,1,3.000000,2.404049,1,0,3.000000,1,1,0,0.427905,0.639894,0,3,49.378120
493,0,35,1.647514,73.916920,0,3.000000,1.622440,1,0,2.725012,1,1,0,0.902661,0.000000,1,3,27.232382
494,1,26,1.745950,80.018571,1,1.993101,2.364498,1,0,3.171082,1,1,0,1.224743,0.022245,0,2,26.249872
1064,1,23,1.740000,105.000000,1,3.000000,2.000000,1,0,3.000000,1,1,0,1.000000,0.000000,0,6,34.680935


In [8]:
# show imbalance in the dataset classes using value_counts()
display(Y_train.value_counts())

3    540
2    318
1    157
0    155
Name: Body_Level, dtype: int64

In [9]:
# solve the imbalance problem using SMOTE
sm = SMOTE(random_state=42)
X_train_smote, Y_train_smote = sm.fit_resample(X_train, Y_train)
X_smote, Y_smote = sm.fit_resample(X_all, Y_all)
display(Y_train_smote.value_counts())

3    540
1    540
0    540
2    540
Name: Body_Level, dtype: int64

In [11]:
def svm_model(X_train, Y_train, X_test, Y_test):
    # Train the model
    clf = svm.SVC()
    clf.fit(X_train, Y_train)
    # classification report
    y_pred = clf.predict(X_test)
    print("svm: kernel=rbf, C=1.0, gamma='scale'")
    print(classification_report(Y_test, y_pred))
    bvd = bias_variance_decomp(clf, X_train.values, Y_train.values, X_test.values, Y_test.values, random_seed=42)
    print("svm_clf||                Bias: ", bvd[0], "Variance: ", bvd[1], "Error: ", bvd[2], "\n\n")
    return clf

def svm_model_best_params(X_train, Y_train, X_test, Y_test):
    # Train the model
    clf = svm.SVC(kernel='linear', C=10, gamma=0.01)
    clf.fit(X_train, Y_train)
    # classification report
    y_pred = clf.predict(X_test)
    print("svm: kernel=linear C=10 gamma=0.01")
    print(classification_report(Y_test, y_pred))
    bvd = bias_variance_decomp(clf, X_train.values, Y_train.values, X_test.values, Y_test.values, random_seed=42)
    print("svm_clf||                Bias: ", bvd[0], "Variance: ", bvd[1], "Error: ", bvd[2], "\n\n")
    return clf

def svm_model_k_folding(X, Y, k = 10, kernel = 'linear', C = 1.0, gamma = 0.01):
    # svm with k-fold cross validation, k = 10
    clf = svm.SVC(kernel=kernel, C=C, gamma=gamma)
    print("svm: kernel=", kernel, ", C=", C, ", gamma=", gamma)
    scoring = {'accuracy' : make_scorer(accuracy_score), 
           'precision' : make_scorer(precision_score, average='weighted'),
           'recall' : make_scorer(recall_score, average='weighted'), 
           'f1_score' : make_scorer(f1_score, average='weighted')}

    results = cross_validate(clf, X,
                y=Y,
                scoring=scoring,
                cv=k,
                n_jobs=None,
                verbose=0)

    return pd.DataFrame(results)


def svm_model_grid_search(X_train, Y_train, X_test, Y_test, parameters):
    # trying grid search on svm
    svc = svm.SVC()
    clf = GridSearchCV(svc, parameters)
    print("svm: grid search")
    clf.fit(X_train, Y_train)
    print(clf.best_params_)
    y_pred = clf.predict(X_test)
    print(classification_report(Y_test, y_pred))
    bvd = bias_variance_decomp(clf.best_estimator_, X_train.values, Y_train.values, X_test.values, Y_test.values, random_seed=42)
    print("svm_clf_grid_search||    Bias: ", bvd[0], "Variance: ", bvd[1], "Error: ", bvd[2], "\n\n")
    return clf.best_estimator_

def random_forest_model(X_train, Y_train, X_test, Y_test):
    # trying random forest
    clf = ensemble.RandomForestClassifier()
    clf.fit(X_train, Y_train)
    y_pred = clf.predict(X_test)
    print("random forest: n_estimators=100, max_depth=None")
    print(classification_report(Y_test, y_pred))
    bvd = bias_variance_decomp(clf, X_train.values, Y_train.values, X_test.values, Y_test.values, random_seed=42)
    print("random_forest_clf||      Bias: ", bvd[0], "Variance: ", bvd[1], "Error: ", bvd[2], "\n\n")
    return clf

def random_forest_model_k_folding(X, Y, k = 10):
    # svm with k-fold cross validation, k = 10
    clf = ensemble.RandomForestClassifier()
    print("random forest: n_estimators=100, max_depth=None")
    scoring = {'accuracy' : make_scorer(accuracy_score), 
           'precision' : make_scorer(precision_score, average='weighted'),
           'recall' : make_scorer(recall_score, average='weighted'), 
           'f1_score' : make_scorer(f1_score, average='weighted')}

    results = cross_validate(clf, X,
                y=Y,
                scoring=scoring,
                cv=k,
                n_jobs=None,
                verbose=0)

    return pd.DataFrame(results)

def naive_bayes_model(X_train, Y_train, X_test, Y_test):
    # trying naive bayes
    clf = naive_bayes.GaussianNB()
    clf.fit(X_train, Y_train)
    y_pred = clf.predict(X_test)
    print("naive bayes: GaussianNB")
    print(classification_report(Y_test, y_pred))
    bvd = bias_variance_decomp(clf, X_train.values, Y_train.values, X_test.values, Y_test.values, random_seed=42)
    print("naive_bayes_clf||        Bias: ", bvd[0], "Variance: ", bvd[1], "Error: ", bvd[2], "\n\n")
    return clf

def naive_bayes_model_k_folding(X, Y, k = 10):
    # svm with k-fold cross validation, k = 10
    clf = naive_bayes.GaussianNB()
    print("naive bayes: GaussianNB")
    scoring = {'accuracy' : make_scorer(accuracy_score), 
           'precision' : make_scorer(precision_score, average='weighted'),
           'recall' : make_scorer(recall_score, average='weighted'), 
           'f1_score' : make_scorer(f1_score, average='weighted')}

    results = cross_validate(clf, X,
                y=Y,
                scoring=scoring,
                cv=k,
                n_jobs=None,
                verbose=0)

    return pd.DataFrame(results)



In [13]:
svm_parameters = {'kernel':('linear', 'rbf'), 'C':[0.1, 1, 10], 'gamma':[0.01, 0.1, 1]}


In [1]:
svm_model_k_folding(X_all, Y_all, k = 10, kernel = 'linear', C = 10, gamma = 0.01).head(10)

NameError: name 'svm_model_k_folding' is not defined

In [12]:
svm_model_k_folding(X_smote, Y_smote, k = 10, kernel = 'linear', C = 10, gamma = 0.01).head(10)

svm: kernel= linear , C= 10 , gamma= 0.01


Unnamed: 0,fit_time,score_time,test_accuracy,test_precision,test_recall,test_f1_score
0,0.030948,0.00695,0.988971,0.989234,0.988971,0.988969
1,0.023969,0.004987,0.996324,0.996377,0.996324,0.996323
2,0.022937,0.004986,1.0,1.0,1.0,1.0
3,0.022483,0.005021,1.0,1.0,1.0,1.0
4,0.022939,0.004987,0.992647,0.9927,0.992647,0.992647
5,0.02593,0.005984,0.992647,0.992754,0.992647,0.992619
6,0.023967,0.005495,1.0,1.0,1.0,1.0
7,0.026954,0.005985,1.0,1.0,1.0,1.0
8,0.022951,0.005979,1.0,1.0,1.0,1.0
9,0.023902,0.005018,1.0,1.0,1.0,1.0


In [None]:
random_forest_model_k_folding(X_smote, Y_smote, k = 10).head(10)

In [27]:
# run the models on the original dataset
svm_clf = svm_model(X_train, Y_train, X_test, Y_test)

random_forest_clf = random_forest_model(X_train, Y_train, X_test, Y_test)

naive_bayes_clf = naive_bayes_model(X_train, Y_train, X_test, Y_test)


svm: kernel=rbf, C=1.0, gamma='scale'
              precision    recall  f1-score   support

           1       0.85      0.93      0.89        30
           2       0.87      0.56      0.68        36
           3       0.64      0.90      0.75        80
           4       0.94      0.80      0.87       147

    accuracy                           0.81       293
   macro avg       0.83      0.80      0.80       293
weighted avg       0.84      0.81      0.81       293

svm_clf||                Bias:  0.19658703071672357 Variance:  0.19112627986348124 Error:  0.017064846416382253 


random forest: n_estimators=100, max_depth=None
              precision    recall  f1-score   support

           1       1.00      0.87      0.93        30
           2       0.83      0.94      0.88        36
           3       0.95      0.95      0.95        80
           4       0.99      0.99      0.99       147

    accuracy                           0.96       293
   macro avg       0.94      0.94     

In [28]:
# Trying grid search on svm
# svm_clf_grid_search = svm_model_grid_search(X_train, Y_train, X_test, Y_test, svm_parameters)

svm_clf_best_params = svm_model_best_params(X_train, Y_train, X_test, Y_test)

svm: kernel=linear C=10 gamma=0.01
              precision    recall  f1-score   support

           1       1.00      0.97      0.98        30
           2       0.87      0.92      0.89        36
           3       0.94      0.95      0.94        80
           4       0.99      0.97      0.98       147

    accuracy                           0.96       293
   macro avg       0.95      0.95      0.95       293
weighted avg       0.96      0.96      0.96       293

svm_clf||                Bias:  0.05225255972696246 Variance:  0.040955631399317405 Error:  0.030580204778157 




In [30]:
# run the models on the dataset after applying SMOTE
svm_model(X_train_smote, Y_train_smote, X_test, Y_test)
#svm_model_grid_search(X_train_smote, Y_train_smote, X_test, Y_test, svm_parameters)

svm_model_best_params(X_train_smote, Y_train_smote, X_test, Y_test)

random_forest_model(X_train_smote, Y_train_smote, X_test, Y_test)

naive_bayes_model(X_train_smote, Y_train_smote, X_test, Y_test)


svm: kernel=rbf, C=1.0, gamma='scale'
              precision    recall  f1-score   support

           1       0.97      0.93      0.95        30
           2       0.84      0.89      0.86        36
           3       0.68      0.90      0.77        80
           4       0.97      0.79      0.87       147

    accuracy                           0.85       293
   macro avg       0.86      0.88      0.86       293
weighted avg       0.87      0.85      0.85       293

svm_clf||                Bias:  0.15334470989761093 Variance:  0.15017064846416384 Error:  0.010136518771331059 


svm: kernel=linear C=10 gamma=0.01
              precision    recall  f1-score   support

           1       1.00      1.00      1.00        30
           2       0.90      1.00      0.95        36
           3       0.95      0.96      0.96        80
           4       0.99      0.96      0.98       147

    accuracy                           0.97       293
   macro avg       0.96      0.98      0.97       2

## Quick Statistics

-   1477 data samples.
-   16 attributes (+1 for classes).
-   4 output classes.

## Attributes Description

-   Gender: Male or female.
-   Age: Numeric value.
-   Height: Numeric value (in meters).
-   Weight: Numeric value (in kilograms).
-   Fam_Hist: Does the family have a history with obesity?
-   H_Cal_Consump: High caloric food consumption.
-   Veg_Consump: Frequency of vegetables consumption.
-   Meal_Count: Average number of meals per day.
-   Food_Between_Meals: Frequency of eating between meals.
-   Smoking: Is the person smoking?
-   Water_Consump: Frequency of water consumption.
-   H_Cal_Burn: Does the body have high calories burn rate?
-   Phys_Act: How often does the person do physical activities?
-   Time_E_Dev: How much time does person spend on electronic devices.
-   Alcohol_Consump: Frequency of alcohols consumption.
-   Transport: Which transports does the person usually use?
-   Body_Level: Class of human body level.

# Introduction
This project analyzes the data of human body level classification. The data is collected from 1477 people and contains 16 attributes. The goal is to predict the body level of a person based on the attributes and classify the person into one of the four classes: Body Level 1, Body Level 2, Body Level 3, and Body Level 4.

We try different concepts of the machine learning course on this data and compare the results to find the best model.

# Data Preprocessing
We implemented simple transformations on the data to make it ready for the machine learning algorithms. One example is:
- Transforming the categorical (nominal) data to numerical (ordinal) data.
Food_between_Meals_str_to_int = {
    "no": 0,
    "Sometimes": 1,
    "Frequently": 2,
    "Always": 3,
}
- One-hot encoding the categorical data that has no implicit order.
- Rounding Some of the numerical data to integers. This was done based on the insights we got from the data visualization. Which is explained in the next section.

Data Imbalance
The data is imbalanced. The number of samples in each class is not equal. This can affect the performance of the machine learning algorithms. We tried to solve this problem by using the SMOTE algorithm. 