In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import utils
from sklearn import svm, naive_bayes, neighbors, ensemble, linear_model, tree
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
from mlxtend.evaluate import bias_variance_decomp
from sklearn.ensemble import AdaBoostClassifier
import lightgbm as lgb

%matplotlib inline

In [2]:
# Read the csv file and preprocess it: convert qualitative attributes to integers
train = pd.read_csv("train.csv")
train.drop_duplicates(inplace=True)
# train = pd.concat([train, pd.get_dummies(train.loc[:,['Transport', 'Food_Between_Meals', 'Alcohol_Consump']], drop_first=True)], axis=1)
# train.drop(columns=['Transport', 'Alcohol_Consump', 'Food_Between_Meals'], inplace=True)
# utils.transform_dataset is a dicitionary which applies a transforming function on each column
X = train.agg(utils.transform_dataset)

# X_train, X_test, Y_train, Y_test = train_test_split(
#     X.drop(columns=['Body_Level']),
#     X['Body_Level'], test_size=0.2, random_state=42)

X_train = X.sample(frac=0.8, random_state=42)
X_test = X.drop(X_train.index)

Y_train = X_train["Body_Level"]
Y_test = X_test["Body_Level"]

X_train = X_train.drop("Body_Level", axis=1)
X_test = X_test.drop("Body_Level", axis=1)

display(X_train)
display(X_test)


Unnamed: 0,Gender,Age,Height,Weight,H_Cal_Consump,Veg_Consump,Water_Consump,Alcohol_Consump,Smoking,Meal_Count,Food_Between_Meals,Fam_Hist,H_Cal_Burn,Phys_Act,Time_E_Dev,Transport
1190,0,26,1.607128,103.026858,1,3.000000,1.077253,1,0,3.000000,1,1,0,0.162083,0.824607,0
178,0,23,1.715597,50.000000,1,2.449267,1.266018,0,0,3.647154,2,1,0,0.866045,0.097234,0
279,1,17,1.670000,60.000000,0,2.000000,2.000000,0,0,4.000000,3,0,0,0.000000,2.000000,0
560,0,34,1.681021,77.392179,1,2.796060,1.921601,0,0,1.971472,1,1,0,0.935217,0.704637,1
881,1,40,1.781032,96.303855,1,2.252698,2.923856,2,0,2.475228,1,1,0,2.165429,0.616045,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1083,1,20,1.836669,105.257543,1,2.000000,2.793505,0,0,3.000000,1,1,0,2.219390,0.202902,0
488,1,18,1.778447,80.273807,1,1.570089,2.000000,0,0,3.000000,1,1,0,2.707882,0.000000,0
159,1,18,1.850000,60.000000,1,3.000000,2.000000,1,0,4.000000,1,1,1,2.000000,0.000000,1
16,0,20,1.530884,39.371523,1,1.522001,1.981260,1,0,3.000000,1,0,0,2.306844,0.720454,0


Unnamed: 0,Gender,Age,Height,Weight,H_Cal_Consump,Veg_Consump,Water_Consump,Alcohol_Consump,Smoking,Meal_Count,Food_Between_Meals,Fam_Hist,H_Cal_Burn,Phys_Act,Time_E_Dev,Transport
1,1,20,1.743702,54.927529,1,2.000000,2.847264,1,0,3.289260,1,1,0,1.680844,2.000000,0
8,0,23,1.596466,44.594588,0,2.594653,1.916812,0,0,1.882158,2,0,0,0.417119,0.000000,0
13,0,17,1.710948,45.248627,1,2.910733,2.204263,1,0,3.125440,1,0,0,2.407906,1.403037,0
14,1,17,1.800000,58.000000,1,2.000000,2.000000,0,0,3.000000,2,0,0,2.000000,1.000000,4
20,1,18,1.722547,53.783977,1,2.000000,2.072194,1,0,3.131032,1,1,0,1.487987,2.000000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1451,1,18,1.811189,108.800964,1,2.000000,2.121176,0,0,1.250548,1,1,0,1.000000,0.741069,0
1453,1,26,1.844751,105.025808,1,3.000000,2.925029,1,0,3.000000,1,1,0,2.000000,1.758865,0
1462,0,21,1.721057,132.054793,1,3.000000,1.678791,1,0,3.000000,1,1,0,1.682490,0.818871,0
1473,0,21,1.700996,126.490236,1,3.000000,1.242832,1,0,3.000000,1,1,0,0.530925,0.575969,0


In [3]:
def adaboost_model(X_train, Y_train, X_test, Y_test):
    # trying random forest
    clf = ensemble.AdaBoostClassifier(n_estimators=100, random_state=42)
    clf.fit(X_train, Y_train)
    y_pred = clf.predict(X_test)
    print("adaboost: n_estimators=50")
    print(classification_report(Y_test, y_pred))
    bvd = bias_variance_decomp(clf, X_train.values, Y_train.values, X_test.values, Y_test.values, random_seed=42)
    print("adaboost_clf||      Bias: ", bvd[0], "Variance: ", bvd[1], "Error: ", bvd[2], "\n\n")
    return clf

def lightgbm_model(X_train, Y_train, X_test, Y_test):
    clf = lgb.LGBMClassifier(objective='multiclass', class_weight='balanced', learning_rate=0.05)
    clf = clf.fit(X_train, Y_train)
    y_pred = clf.predict(X_test)

    print("lightGBM: ")
    print(classification_report(Y_test, y_pred))
    bvd = bias_variance_decomp(clf, X_train.values, Y_train.values, X_test.values, Y_test.values, random_seed=42)
    print("lightgbm_clf||      Bias: ", bvd[0], "Variance: ", bvd[1], "Error: ", bvd[2], "\n\n")

    return clf

## Original Models

In [4]:
adaboost_clf = adaboost_model(X_train, Y_train, X_test, Y_test)
#  save the model
utils.save_model(adaboost_clf)

lightgbm_clf = lightgbm_model(X_train, Y_train, X_test, Y_test)
# save the model
utils.save_model(lightgbm_clf)


adaboost: n_estimators=50
              precision    recall  f1-score   support

           0       0.56      0.33      0.42        30
           1       0.51      0.67      0.58        36
           2       0.64      0.90      0.75        80
           3       0.96      0.75      0.84       147

    accuracy                           0.74       293
   macro avg       0.66      0.66      0.65       293
weighted avg       0.77      0.74      0.74       293

adaboost_clf||      Bias:  0.3847610921501706 Variance:  0.2354948805460751 Error:  0.24952218430034126 


lightGBM: 
              precision    recall  f1-score   support

           0       0.97      0.97      0.97        30
           1       0.94      0.92      0.93        36
           2       0.96      0.99      0.98        80
           3       1.00      0.99      1.00       147

    accuracy                           0.98       293
   macro avg       0.97      0.97      0.97       293
weighted avg       0.98      0.98      0.

## SMOTE for imbalance data

In [5]:
# solve the imbalance problem using SMOTE
sm = SMOTE(random_state=42)
X_train_smote, Y_train_smote = sm.fit_resample(X_train, Y_train)
display(Y_train_smote.value_counts())

Body_Level
3    533
0    533
1    533
2    533
Name: count, dtype: int64

In [6]:
adaboost_clf = adaboost_model(X_train_smote, Y_train_smote, X_test, Y_test)
#  save the model
utils.save_model(adaboost_clf)

lightgbm_clf = lightgbm_model(X_train_smote, Y_train_smote, X_test, Y_test)
# save the model
utils.save_model(lightgbm_clf)

adaboost: n_estimators=50
              precision    recall  f1-score   support

           0       0.88      0.23      0.37        30
           1       0.49      0.86      0.63        36
           2       0.62      0.82      0.71        80
           3       0.96      0.75      0.84       147

    accuracy                           0.73       293
   macro avg       0.74      0.67      0.64       293
weighted avg       0.80      0.73      0.73       293

adaboost_clf||      Bias:  0.37723549488054603 Variance:  0.2627986348122867 Error:  0.23882252559726966 


lightGBM: 
              precision    recall  f1-score   support

           0       0.97      0.93      0.95        30
           1       0.92      0.92      0.92        36
           2       0.98      0.99      0.98        80
           3       1.00      1.00      1.00       147

    accuracy                           0.98       293
   macro avg       0.96      0.96      0.96       293
weighted avg       0.98      0.98      0

## Feature Engineering

In [7]:
X["Is_Int"] = 0

X["Is_Int"] = (abs(round(X["Veg_Consump"]) - X["Veg_Consump"]) < 0.01).astype(int) \
    + (abs(round(X["Water_Consump"]) - X["Water_Consump"]) < 0.01).astype(int) \
    + (abs(round(X["Phys_Act"]) - X["Phys_Act"]) < 0.01).astype(int) \
    + (abs(round(X["Time_E_Dev"]) - X["Time_E_Dev"]) < 0.01).astype(int) \
    + (abs(round(X["Age"]) - X["Age"]) < 0.01).astype(int) \
    + (abs(round(X["Meal_Count"]) - X["Meal_Count"]) < 0.01).astype(int)


X['BMI'] = X['Weight'] / (X['Height']**2)

X_train = X.sample(frac=0.8, random_state=42)
X_test = X.drop(X_train.index)

Y_train = X_train["Body_Level"]
Y_test = X_test["Body_Level"]

X_train = X_train.drop("Body_Level", axis=1)
X_test = X_test.drop("Body_Level", axis=1)


In [10]:
adaboost_clf = adaboost_model(X_train, Y_train, X_test, Y_test)
#  save the model
utils.save_model(adaboost_clf)

lightgbm_clf = lightgbm_model(X_train, Y_train, X_test, Y_test)
# save the model
utils.save_model(lightgbm_clf)


adaboost: n_estimators=50
              precision    recall  f1-score   support

           0       1.00      0.87      0.93        30
           1       0.90      1.00      0.95        36
           2       0.89      0.99      0.93        80
           3       0.99      0.93      0.96       147

    accuracy                           0.95       293
   macro avg       0.95      0.95      0.94       293
weighted avg       0.95      0.95      0.95       293

adaboost_clf||      Bias:  0.1696928327645051 Variance:  0.006825938566552901 Error:  0.16593856655290104 


lightGBM: 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        30
           1       1.00      1.00      1.00        36
           2       1.00      1.00      1.00        80
           3       1.00      1.00      1.00       147

    accuracy                           1.00       293
   macro avg       1.00      1.00      1.00       293
weighted avg       1.00      1.00      

## Cross Validation

In [None]:
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
import pprint
scoring = {'accuracy' : make_scorer(accuracy_score), 
           'precision' : make_scorer(precision_score, average='weighted'),
           'recall' : make_scorer(recall_score, average='weighted'), 
           'f1_score' : make_scorer(f1_score, average='weighted')}

results = cross_validate(adaboost_clf, X.drop(columns=['Body_Level']),
                y=X['Body_Level'],
                scoring=scoring,
                cv=10,
                n_jobs=None, 
                verbose=0)

pd.DataFrame(results)
# type(results)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,fit_time,score_time,test_accuracy,test_precision,test_recall,test_f1_score
0,0.300124,0.116672,0.530612,0.365163,0.530612,0.407912
1,0.291986,0.019062,0.965986,0.967571,0.965986,0.966217
2,0.27537,0.01713,0.959184,0.961851,0.959184,0.959537
3,0.255606,0.016764,0.938356,0.949914,0.938356,0.93936
4,0.249084,0.016985,0.520548,0.3627,0.520548,0.400675
5,0.257061,0.01699,0.931507,0.938331,0.931507,0.931873
6,0.260999,0.022305,0.965753,0.969559,0.965753,0.966107
7,0.295496,0.020199,0.938356,0.947696,0.938356,0.937734
8,0.287881,0.017141,0.986301,0.986954,0.986301,0.986366
9,0.262186,0.016151,0.986301,0.986301,0.986301,0.986301
