## Import Needed Libraries

In [46]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder , StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier

import warnings
warnings.filterwarnings("ignore")

In [47]:
train=pd.read_csv('train.csv')

## Explor Data

In [48]:
train.head()

Unnamed: 0,id,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,0,Male,24.443011,1.699998,81.66995,yes,yes,2.0,2.983297,Sometimes,no,2.763573,no,0.0,0.976473,Sometimes,Public_Transportation,Overweight_Level_II
1,1,Female,18.0,1.56,57.0,yes,yes,2.0,3.0,Frequently,no,2.0,no,1.0,1.0,no,Automobile,Normal_Weight
2,2,Female,18.0,1.71146,50.165754,yes,yes,1.880534,1.411685,Sometimes,no,1.910378,no,0.866045,1.673584,no,Public_Transportation,Insufficient_Weight
3,3,Female,20.952737,1.71073,131.274851,yes,yes,3.0,3.0,Sometimes,no,1.674061,no,1.467863,0.780199,Sometimes,Public_Transportation,Obesity_Type_III
4,4,Male,31.641081,1.914186,93.798055,yes,yes,2.679664,1.971472,Sometimes,no,1.979848,no,1.967973,0.931721,Sometimes,Public_Transportation,Overweight_Level_II


In [49]:
train.shape

(20758, 18)

In [50]:
train.columns

Index(['id', 'Gender', 'Age', 'Height', 'Weight',
       'family_history_with_overweight', 'FAVC', 'FCVC', 'NCP', 'CAEC',
       'SMOKE', 'CH2O', 'SCC', 'FAF', 'TUE', 'CALC', 'MTRANS', 'NObeyesdad'],
      dtype='object')

In [51]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20758 entries, 0 to 20757
Data columns (total 18 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              20758 non-null  int64  
 1   Gender                          20758 non-null  object 
 2   Age                             20758 non-null  float64
 3   Height                          20758 non-null  float64
 4   Weight                          20758 non-null  float64
 5   family_history_with_overweight  20758 non-null  object 
 6   FAVC                            20758 non-null  object 
 7   FCVC                            20758 non-null  float64
 8   NCP                             20758 non-null  float64
 9   CAEC                            20758 non-null  object 
 10  SMOKE                           20758 non-null  object 
 11  CH2O                            20758 non-null  float64
 12  SCC                             

In [52]:
train.isnull().sum()

id                                0
Gender                            0
Age                               0
Height                            0
Weight                            0
family_history_with_overweight    0
FAVC                              0
FCVC                              0
NCP                               0
CAEC                              0
SMOKE                             0
CH2O                              0
SCC                               0
FAF                               0
TUE                               0
CALC                              0
MTRANS                            0
NObeyesdad                        0
dtype: int64

In [53]:
train.describe()

Unnamed: 0,id,Age,Height,Weight,FCVC,NCP,CH2O,FAF,TUE
count,20758.0,20758.0,20758.0,20758.0,20758.0,20758.0,20758.0,20758.0,20758.0
mean,10378.5,23.841804,1.700245,87.887768,2.445908,2.761332,2.029418,0.981747,0.616756
std,5992.46278,5.688072,0.087312,26.379443,0.533218,0.705375,0.608467,0.838302,0.602113
min,0.0,14.0,1.45,39.0,1.0,1.0,1.0,0.0,0.0
25%,5189.25,20.0,1.631856,66.0,2.0,3.0,1.792022,0.008013,0.0
50%,10378.5,22.815416,1.7,84.064875,2.393837,3.0,2.0,1.0,0.573887
75%,15567.75,26.0,1.762887,111.600553,3.0,3.0,2.549617,1.587406,1.0
max,20757.0,61.0,1.975663,165.057269,3.0,4.0,3.0,3.0,2.0


## Transform Data & Pre-Processing

In [54]:
la=LabelEncoder()

In [55]:
obj=train.select_dtypes(include='object')
non_obj=train.select_dtypes(exclude='object')

In [56]:
for i in range(0,obj.shape[1]):
    obj.iloc[:,i]=la.fit_transform(obj.iloc[:,i])

In [57]:
obj=obj.astype("int")

In [58]:
obj.head()

Unnamed: 0,Gender,family_history_with_overweight,FAVC,CAEC,SMOKE,SCC,CALC,MTRANS,NObeyesdad
0,1,1,1,2,0,0,1,3,6
1,0,1,1,1,0,0,2,0,1
2,0,1,1,2,0,0,2,3,0
3,0,1,1,2,0,0,1,3,4
4,1,1,1,2,0,0,1,3,6


In [59]:
data=pd.concat([obj,non_obj],axis=1)

In [60]:
obj.head()

Unnamed: 0,Gender,family_history_with_overweight,FAVC,CAEC,SMOKE,SCC,CALC,MTRANS,NObeyesdad
0,1,1,1,2,0,0,1,3,6
1,0,1,1,1,0,0,2,0,1
2,0,1,1,2,0,0,2,3,0
3,0,1,1,2,0,0,1,3,4
4,1,1,1,2,0,0,1,3,6


In [61]:
test=pd.read_csv('test.csv')

In [62]:
obj=test.select_dtypes(include='object')
non_obj=test.select_dtypes(exclude='object')
for i in range(0,obj.shape[1]):
    obj.iloc[:,i]=la.fit_transform(obj.iloc[:,i])
obj=obj.astype("int")    
test=pd.concat([obj,non_obj],axis=1)

In [63]:
test.head()

Unnamed: 0,Gender,family_history_with_overweight,FAVC,CAEC,SMOKE,SCC,CALC,MTRANS,id,Age,Height,Weight,FCVC,NCP,CH2O,FAF,TUE
0,1,1,1,2,0,0,2,3,20758,26.899886,1.848294,120.644178,2.938616,3.0,2.825629,0.8554,0.0
1,0,1,1,2,0,0,2,3,20759,21.0,1.6,66.0,2.0,1.0,3.0,1.0,0.0
2,0,1,1,2,0,0,2,3,20760,26.0,1.643355,111.600553,3.0,3.0,2.621877,0.0,0.250502
3,1,1,1,2,0,0,2,3,20761,20.979254,1.553127,103.669116,2.0,2.977909,2.786417,0.094851,0.0
4,0,1,1,2,0,0,2,3,20762,26.0,1.627396,104.835346,3.0,3.0,2.653531,0.0,0.741069


In [64]:
sc=StandardScaler()

In [65]:
scal1=data[['Age']]
scal2=data[['Weight']]
data['Age']=sc.fit_transform(scal1)
data['Weight']=sc.fit_transform(scal2)

In [66]:
data

Unnamed: 0,Gender,family_history_with_overweight,FAVC,CAEC,SMOKE,SCC,CALC,MTRANS,NObeyesdad,id,Age,Height,Weight,FCVC,NCP,CH2O,FAF,TUE
0,1,1,1,2,0,0,1,3,6,0,0.105699,1.699998,-0.235713,2.000000,2.983297,2.763573,0.000000,0.976473
1,0,1,1,1,0,0,2,0,1,1,-1.027052,1.560000,-1.170931,2.000000,3.000000,2.000000,1.000000,1.000000
2,0,1,1,2,0,0,2,3,0,2,-1.027052,1.711460,-1.430012,1.880534,1.411685,1.910378,0.866045,1.673584
3,0,1,1,2,0,0,1,3,4,3,-0.507929,1.710730,1.644770,3.000000,3.000000,1.674061,1.467863,0.780199
4,1,1,1,2,0,0,1,3,6,4,1.371197,1.914186,0.224054,2.679664,1.971472,1.979848,1.967973,0.931721
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20753,1,1,1,2,0,0,1,3,3,20753,0.227725,1.766626,0.996987,2.919584,3.000000,2.151809,1.330519,0.196680
20754,1,0,1,1,0,0,1,3,0,20754,-1.027052,1.710000,-1.436296,3.000000,4.000000,1.000000,2.000000,1.000000
20755,1,1,1,2,0,0,2,3,3,20755,-0.657669,1.819557,0.670717,2.407817,3.000000,2.000000,1.158040,1.198439
20756,1,1,1,2,0,0,2,0,6,20756,1.760067,1.700000,-0.165574,2.671238,1.971472,2.144838,0.000000,0.973834


In [67]:
scal1=test[['Age']]
scal2=test[['Weight']]
test['Age']=sc.fit_transform(scal1)
test['Weight']=sc.fit_transform(scal2)

## Creat Model

In [68]:
x=data.drop(['NObeyesdad','id'],axis=1)
y=data['NObeyesdad']

In [69]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=21)

In [70]:
x_train

Unnamed: 0,Gender,family_history_with_overweight,FAVC,CAEC,SMOKE,SCC,CALC,MTRANS,Age,Height,Weight,FCVC,NCP,CH2O,FAF,TUE
7003,1,1,1,2,0,0,1,3,-0.147998,1.700216,-0.248866,2.0,2.110937,2.000000,2.000000,1.493716
8427,0,1,1,2,0,0,1,3,0.379434,1.641784,0.911801,3.0,3.000000,2.722276,0.000000,0.079334
9429,0,1,1,3,0,0,1,3,-0.499620,1.620000,-0.678111,2.0,1.000000,2.000000,1.000000,1.000000
15534,0,1,1,2,0,0,1,3,-0.399036,1.698346,-0.488565,2.0,3.000000,2.000000,1.577824,1.865851
411,0,1,1,2,0,0,1,3,0.326000,1.662978,0.834484,3.0,3.000000,1.553198,0.319156,0.802136
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16432,0,1,1,2,0,0,1,3,0.379434,1.622418,0.898933,3.0,3.000000,2.609188,0.000000,0.125235
8964,0,1,1,2,0,0,1,3,0.379434,1.641132,0.884088,3.0,3.000000,2.715572,0.000000,0.094213
5944,0,1,1,2,0,0,1,3,-0.499620,1.730000,-0.374838,2.0,1.000000,2.000000,1.000000,0.000000
5327,0,1,1,2,0,0,1,3,-0.713540,1.680762,1.746285,3.0,3.000000,2.835622,1.412357,0.442456


In [71]:
y_train

7003     6
8427     4
9429     2
15534    5
411      4
        ..
16432    4
8964     4
5944     5
5327     4
15305    1
Name: NObeyesdad, Length: 16606, dtype: int32

In [72]:
model1=LogisticRegression()
model2=RandomForestClassifier()
model3=GaussianNB()
model4=SVC()
model5=XGBClassifier(learning_rate=0.1, max_depth=3, n_estimators=300)
model6=GradientBoostingClassifier(learning_rate=0.1, max_depth=5, n_estimators=300)

In [73]:
def pred(model):
    model.fit(x_train,y_train)
    pre=model.predict(x_test)
    print(classification_report(pre,y_test))

In [74]:
pred(model1)

              precision    recall  f1-score   support

           0       0.86      0.79      0.82       560
           1       0.72      0.71      0.71       642
           2       0.78      0.81      0.79       574
           3       0.96      0.92      0.94       634
           4       1.00      0.99      1.00       823
           5       0.58      0.67      0.62       418
           6       0.63      0.62      0.63       501

    accuracy                           0.81      4152
   macro avg       0.79      0.79      0.79      4152
weighted avg       0.81      0.81      0.81      4152



In [75]:
pred(model2)

              precision    recall  f1-score   support

           0       0.94      0.92      0.93       528
           1       0.90      0.85      0.87       665
           2       0.88      0.91      0.89       581
           3       0.97      0.96      0.97       611
           4       1.00      1.00      1.00       820
           5       0.74      0.84      0.79       428
           6       0.84      0.79      0.81       519

    accuracy                           0.90      4152
   macro avg       0.89      0.90      0.89      4152
weighted avg       0.91      0.90      0.90      4152



In [76]:
pred(model3)

              precision    recall  f1-score   support

           0       0.88      0.69      0.77       668
           1       0.47      0.64      0.54       459
           2       0.60      0.40      0.48       893
           3       0.93      0.69      0.79       826
           4       1.00      0.96      0.98       853
           5       0.27      0.61      0.38       217
           6       0.26      0.53      0.35       236

    accuracy                           0.66      4152
   macro avg       0.63      0.65      0.61      4152
weighted avg       0.74      0.66      0.69      4152



In [77]:
pred(model4)

              precision    recall  f1-score   support

           0       0.93      0.85      0.89       570
           1       0.78      0.81      0.79       610
           2       0.82      0.85      0.83       580
           3       0.97      0.95      0.96       618
           4       1.00      1.00      1.00       820
           5       0.63      0.72      0.67       422
           6       0.74      0.68      0.71       532

    accuracy                           0.85      4152
   macro avg       0.84      0.84      0.84      4152
weighted avg       0.86      0.85      0.85      4152



In [78]:
pred(model5)

              precision    recall  f1-score   support

           0       0.94      0.92      0.93       530
           1       0.89      0.88      0.88       643
           2       0.87      0.91      0.89       575
           3       0.98      0.96      0.97       619
           4       1.00      1.00      1.00       821
           5       0.78      0.85      0.81       444
           6       0.86      0.81      0.83       520

    accuracy                           0.91      4152
   macro avg       0.90      0.90      0.90      4152
weighted avg       0.91      0.91      0.91      4152



In [79]:
pred(model6)

              precision    recall  f1-score   support

           0       0.93      0.92      0.93       525
           1       0.88      0.86      0.87       645
           2       0.88      0.91      0.90       584
           3       0.97      0.97      0.97       609
           4       1.00      1.00      1.00       820
           5       0.76      0.83      0.79       447
           6       0.85      0.80      0.83       522

    accuracy                           0.91      4152
   macro avg       0.90      0.90      0.90      4152
weighted avg       0.91      0.91      0.91      4152



In [80]:
prim_grid={'n_estimators':[100,200,300],
         'learning_rate':[0.1,0.01,0.001],
          'max_depth':[3,5,7]}
scorer="accuracy"

In [81]:
model_5=GridSearchCV(model5,prim_grid,scoring=scorer, n_jobs=-1)
model_5.fit(x_train,y_train)
print(model_5.best_params_)
print(model_5.best_score_)

{'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 300}
0.904913943179167


In [82]:
testx=test.drop('id',axis=1)

In [83]:
prex=model5.predict(testx)

In [84]:
submission=pd.DataFrame({"id":test['id'],"NObeyesdad":prex})

In [88]:
submission

Unnamed: 0,id,NObeyesdad
0,20758,3
1,20759,5
2,20760,4
3,20761,2
4,20762,4
...,...,...
13835,34593,6
13836,34594,1
13837,34595,0
13838,34596,1


In [89]:
submission.to_csv('submission.csv', index=False)