In [250]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder, StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from xgboost import XGBRFClassifier
from sklearn.neighbors import KNeighborsClassifier

import warnings
warnings.filterwarnings('ignore')

In [251]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

## Explore Data

In [252]:
train.head()

Unnamed: 0,id,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,0,Male,24.443011,1.699998,81.66995,yes,yes,2.0,2.983297,Sometimes,no,2.763573,no,0.0,0.976473,Sometimes,Public_Transportation,Overweight_Level_II
1,1,Female,18.0,1.56,57.0,yes,yes,2.0,3.0,Frequently,no,2.0,no,1.0,1.0,no,Automobile,Normal_Weight
2,2,Female,18.0,1.71146,50.165754,yes,yes,1.880534,1.411685,Sometimes,no,1.910378,no,0.866045,1.673584,no,Public_Transportation,Insufficient_Weight
3,3,Female,20.952737,1.71073,131.274851,yes,yes,3.0,3.0,Sometimes,no,1.674061,no,1.467863,0.780199,Sometimes,Public_Transportation,Obesity_Type_III
4,4,Male,31.641081,1.914186,93.798055,yes,yes,2.679664,1.971472,Sometimes,no,1.979848,no,1.967973,0.931721,Sometimes,Public_Transportation,Overweight_Level_II


In [253]:
train['NObeyesdad'].unique()

array(['Overweight_Level_II', 'Normal_Weight', 'Insufficient_Weight',
       'Obesity_Type_III', 'Obesity_Type_II', 'Overweight_Level_I',
       'Obesity_Type_I'], dtype=object)

In [254]:
train.columns

Index(['id', 'Gender', 'Age', 'Height', 'Weight',
       'family_history_with_overweight', 'FAVC', 'FCVC', 'NCP', 'CAEC',
       'SMOKE', 'CH2O', 'SCC', 'FAF', 'TUE', 'CALC', 'MTRANS', 'NObeyesdad'],
      dtype='object')

In [255]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20758 entries, 0 to 20757
Data columns (total 18 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              20758 non-null  int64  
 1   Gender                          20758 non-null  object 
 2   Age                             20758 non-null  float64
 3   Height                          20758 non-null  float64
 4   Weight                          20758 non-null  float64
 5   family_history_with_overweight  20758 non-null  object 
 6   FAVC                            20758 non-null  object 
 7   FCVC                            20758 non-null  float64
 8   NCP                             20758 non-null  float64
 9   CAEC                            20758 non-null  object 
 10  SMOKE                           20758 non-null  object 
 11  CH2O                            20758 non-null  float64
 12  SCC                             

In [256]:
train.isnull().sum()

id                                0
Gender                            0
Age                               0
Height                            0
Weight                            0
family_history_with_overweight    0
FAVC                              0
FCVC                              0
NCP                               0
CAEC                              0
SMOKE                             0
CH2O                              0
SCC                               0
FAF                               0
TUE                               0
CALC                              0
MTRANS                            0
NObeyesdad                        0
dtype: int64

In [257]:
train.duplicated().sum()

0

In [258]:
train.describe()

Unnamed: 0,id,Age,Height,Weight,FCVC,NCP,CH2O,FAF,TUE
count,20758.0,20758.0,20758.0,20758.0,20758.0,20758.0,20758.0,20758.0,20758.0
mean,10378.5,23.841804,1.700245,87.887768,2.445908,2.761332,2.029418,0.981747,0.616756
std,5992.46278,5.688072,0.087312,26.379443,0.533218,0.705375,0.608467,0.838302,0.602113
min,0.0,14.0,1.45,39.0,1.0,1.0,1.0,0.0,0.0
25%,5189.25,20.0,1.631856,66.0,2.0,3.0,1.792022,0.008013,0.0
50%,10378.5,22.815416,1.7,84.064875,2.393837,3.0,2.0,1.0,0.573887
75%,15567.75,26.0,1.762887,111.600553,3.0,3.0,2.549617,1.587406,1.0
max,20757.0,61.0,1.975663,165.057269,3.0,4.0,3.0,3.0,2.0


## Transform Data

In [259]:
la = LabelEncoder()

In [260]:
train_obj = train.select_dtypes(include='object')
train_non_obj = train.select_dtypes(exclude='object')

test_obj = test.select_dtypes(include='object')
test_non_obj = test.select_dtypes(exclude='object')

In [261]:
# Tranform object data to numeric in train 
for i in range(0 , train_obj.shape[1]):
    train_obj.iloc[:,i]=la.fit_transform(train_obj.iloc[:,i])
train_obj = train_obj.astype('int')

# Tranform object data to numeric in test 
for i in range(0 , test_obj.shape[1]):
    test_obj.iloc[:,i]=la.fit_transform(test_obj.iloc[:,i])
test_obj = test_obj.astype('int')

In [262]:
train_obj.head()

Unnamed: 0,Gender,family_history_with_overweight,FAVC,CAEC,SMOKE,SCC,CALC,MTRANS,NObeyesdad
0,1,1,1,2,0,0,1,3,6
1,0,1,1,1,0,0,2,0,1
2,0,1,1,2,0,0,2,3,0
3,0,1,1,2,0,0,1,3,4
4,1,1,1,2,0,0,1,3,6


In [263]:
test_obj.head()

Unnamed: 0,Gender,family_history_with_overweight,FAVC,CAEC,SMOKE,SCC,CALC,MTRANS
0,1,1,1,2,0,0,2,3
1,0,1,1,2,0,0,2,3
2,0,1,1,2,0,0,2,3
3,1,1,1,2,0,0,2,3
4,0,1,1,2,0,0,2,3


In [264]:
train_obj.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20758 entries, 0 to 20757
Data columns (total 9 columns):
 #   Column                          Non-Null Count  Dtype
---  ------                          --------------  -----
 0   Gender                          20758 non-null  int32
 1   family_history_with_overweight  20758 non-null  int32
 2   FAVC                            20758 non-null  int32
 3   CAEC                            20758 non-null  int32
 4   SMOKE                           20758 non-null  int32
 5   SCC                             20758 non-null  int32
 6   CALC                            20758 non-null  int32
 7   MTRANS                          20758 non-null  int32
 8   NObeyesdad                      20758 non-null  int32
dtypes: int32(9)
memory usage: 729.9 KB


In [265]:
data =pd.concat([train_obj, train_non_obj], axis=1)

In [266]:
test =pd.concat([test_obj, test_non_obj], axis=1)

In [267]:
train.head()

Unnamed: 0,id,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,0,Male,24.443011,1.699998,81.66995,yes,yes,2.0,2.983297,Sometimes,no,2.763573,no,0.0,0.976473,Sometimes,Public_Transportation,Overweight_Level_II
1,1,Female,18.0,1.56,57.0,yes,yes,2.0,3.0,Frequently,no,2.0,no,1.0,1.0,no,Automobile,Normal_Weight
2,2,Female,18.0,1.71146,50.165754,yes,yes,1.880534,1.411685,Sometimes,no,1.910378,no,0.866045,1.673584,no,Public_Transportation,Insufficient_Weight
3,3,Female,20.952737,1.71073,131.274851,yes,yes,3.0,3.0,Sometimes,no,1.674061,no,1.467863,0.780199,Sometimes,Public_Transportation,Obesity_Type_III
4,4,Male,31.641081,1.914186,93.798055,yes,yes,2.679664,1.971472,Sometimes,no,1.979848,no,1.967973,0.931721,Sometimes,Public_Transportation,Overweight_Level_II


In [268]:
ss =StandardScaler()

In [269]:
data['Age'] = ss.fit_transform(data[['Age']]) 
data['Weight'] = ss.fit_transform(data[['Weight']])

test['Age'] = ss.fit_transform(test[['Age']])
test['Weight'] = ss.fit_transform(test[['Weight']])

In [270]:
data.head()

Unnamed: 0,Gender,family_history_with_overweight,FAVC,CAEC,SMOKE,SCC,CALC,MTRANS,NObeyesdad,id,Age,Height,Weight,FCVC,NCP,CH2O,FAF,TUE
0,1,1,1,2,0,0,1,3,6,0,0.105699,1.699998,-0.235713,2.0,2.983297,2.763573,0.0,0.976473
1,0,1,1,1,0,0,2,0,1,1,-1.027052,1.56,-1.170931,2.0,3.0,2.0,1.0,1.0
2,0,1,1,2,0,0,2,3,0,2,-1.027052,1.71146,-1.430012,1.880534,1.411685,1.910378,0.866045,1.673584
3,0,1,1,2,0,0,1,3,4,3,-0.507929,1.71073,1.64477,3.0,3.0,1.674061,1.467863,0.780199
4,1,1,1,2,0,0,1,3,6,4,1.371197,1.914186,0.224054,2.679664,1.971472,1.979848,1.967973,0.931721


In [271]:
test.head()

Unnamed: 0,Gender,family_history_with_overweight,FAVC,CAEC,SMOKE,SCC,CALC,MTRANS,id,Age,Height,Weight,FCVC,NCP,CH2O,FAF,TUE
0,1,1,1,2,0,0,2,3,20758,0.508163,1.848294,1.273786,2.938616,3.0,2.825629,0.8554,0.0
1,0,1,1,2,0,0,2,3,20759,-0.509128,1.6,-0.818988,2.0,1.0,3.0,1.0,0.0
2,0,1,1,2,0,0,2,3,20760,0.353,1.643355,0.927432,3.0,3.0,2.621877,0.0,0.250502
3,1,1,1,2,0,0,2,3,20761,-0.512705,1.553127,0.623672,2.0,2.977909,2.786417,0.094851,0.0
4,0,1,1,2,0,0,2,3,20762,0.353,1.627396,0.668336,3.0,3.0,2.653531,0.0,0.741069


## Modeling

In [272]:
x = data.drop(['NObeyesdad', 'id'], axis=1)
y = data['NObeyesdad']

In [273]:
testx = test.drop('id', axis=1)

In [274]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=21)

In [275]:
model_1 = LogisticRegression()
model_2 = RandomForestClassifier()
model_3 = GaussianNB()
model_4 = SVC()
model_5 = XGBRFClassifier(learning_rate = 0.1, max_depth = 7, n_estimators = 200)
model_6 = GradientBoostingClassifier(learning_rate = 0.1, max_depth = 3, n_estimators = 200)

In [276]:
def prediction(model):
    model.fit(x_train, y_train)
    pred = model.predict(x_test)
    print(classification_report(pred, y_test))

In [277]:
prediction(model_1)

              precision    recall  f1-score   support

           0       0.86      0.79      0.82       560
           1       0.72      0.71      0.71       642
           2       0.78      0.81      0.79       574
           3       0.96      0.92      0.94       634
           4       1.00      0.99      1.00       823
           5       0.58      0.67      0.62       418
           6       0.63      0.62      0.63       501

    accuracy                           0.81      4152
   macro avg       0.79      0.79      0.79      4152
weighted avg       0.81      0.81      0.81      4152



In [278]:
prediction(model_2)

              precision    recall  f1-score   support

           0       0.93      0.93      0.93       520
           1       0.89      0.85      0.87       664
           2       0.87      0.90      0.89       582
           3       0.97      0.96      0.97       612
           4       1.00      1.00      1.00       820
           5       0.74      0.83      0.78       434
           6       0.84      0.79      0.82       520

    accuracy                           0.90      4152
   macro avg       0.89      0.89      0.89      4152
weighted avg       0.90      0.90      0.90      4152



In [279]:
prediction(model_3)

              precision    recall  f1-score   support

           0       0.88      0.69      0.77       668
           1       0.47      0.64      0.54       459
           2       0.60      0.40      0.48       893
           3       0.93      0.69      0.79       826
           4       1.00      0.96      0.98       853
           5       0.27      0.61      0.38       217
           6       0.26      0.53      0.35       236

    accuracy                           0.66      4152
   macro avg       0.63      0.65      0.61      4152
weighted avg       0.74      0.66      0.69      4152



In [280]:
prediction(model_4)

              precision    recall  f1-score   support

           0       0.93      0.85      0.89       570
           1       0.78      0.81      0.79       610
           2       0.82      0.85      0.83       580
           3       0.97      0.95      0.96       618
           4       1.00      1.00      1.00       820
           5       0.63      0.72      0.67       422
           6       0.74      0.68      0.71       532

    accuracy                           0.85      4152
   macro avg       0.84      0.84      0.84      4152
weighted avg       0.86      0.85      0.85      4152



In [281]:
prediction(model_5)

              precision    recall  f1-score   support

           0       0.91      0.92      0.91       511
           1       0.90      0.81      0.85       703
           2       0.87      0.89      0.88       584
           3       0.97      0.96      0.97       613
           4       1.00      1.00      1.00       820
           5       0.73      0.83      0.78       432
           6       0.78      0.78      0.78       489

    accuracy                           0.89      4152
   macro avg       0.88      0.88      0.88      4152
weighted avg       0.89      0.89      0.89      4152



In [282]:
prediction(model_6)

              precision    recall  f1-score   support

           0       0.93      0.93      0.93       521
           1       0.89      0.87      0.88       647
           2       0.88      0.91      0.89       579
           3       0.98      0.96      0.97       615
           4       1.00      1.00      1.00       819
           5       0.78      0.82      0.80       458
           6       0.84      0.81      0.82       513

    accuracy                           0.91      4152
   macro avg       0.90      0.90      0.90      4152
weighted avg       0.91      0.91      0.91      4152



## Improve Results

In [283]:
prameters = {'n_estimators':[100, 200, 300], 
             'learning_rate': [0.1, 0.01, 0.001],
             'max_depth': [3, 5, 7]}
score = 'accuracy'

In [284]:
# m5 = GridSearchCV(model_5, prameters , scoring=score, n_jobs=-1)
# m5.fit(x_train, y_train)

# print(m5.best_params_)
# print(m5.best_score_)


In [285]:
# m6 = GridSearchCV(model_6, prameters , scoring=score, n_jobs=-1)
# m6.fit(x_train, y_train)

# print(m6.best_params_)
# print(m6.best_score_)

In [286]:
# Predict using the model
prediction = model_6.predict(testx)

In [287]:
submission = pd.DataFrame({'id': test['id'], 'NObeyesdad': prediction})

In [288]:
submission['NObeyesdad'].unique()

array([3, 5, 4, 2, 0, 1, 6])

In [294]:
submission['NObeyesdad'] = submission['NObeyesdad'].replace(to_replace=[0, 1, 2, 3, 4, 5, 6], value=[
    'Insufficient_Weight', 'Normal_Weight', 'Obesity_Type_I', 'Obesity_Type_II', 
    'Obesity_Type_III', 'Overweight_Level_I','Overweight_Level_II'])
submission


Unnamed: 0,id,NObeyesdad
4,20762,Obesity_Type_III


In [295]:
submission.to_csv('submission01.csv', index=False)