In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# preprocessing
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder, StandardScaler

# algorithms
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [5]:
train = pd.read_csv('train.csv')
train.head()

Unnamed: 0,id,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,0,Male,24.443011,1.699998,81.66995,yes,yes,2.0,2.983297,Sometimes,no,2.763573,no,0.0,0.976473,Sometimes,Public_Transportation,Overweight_Level_II
1,1,Female,18.0,1.56,57.0,yes,yes,2.0,3.0,Frequently,no,2.0,no,1.0,1.0,no,Automobile,Normal_Weight
2,2,Female,18.0,1.71146,50.165754,yes,yes,1.880534,1.411685,Sometimes,no,1.910378,no,0.866045,1.673584,no,Public_Transportation,Insufficient_Weight
3,3,Female,20.952737,1.71073,131.274851,yes,yes,3.0,3.0,Sometimes,no,1.674061,no,1.467863,0.780199,Sometimes,Public_Transportation,Obesity_Type_III
4,4,Male,31.641081,1.914186,93.798055,yes,yes,2.679664,1.971472,Sometimes,no,1.979848,no,1.967973,0.931721,Sometimes,Public_Transportation,Overweight_Level_II


## Exploring Data

In [6]:
train.shape

(20758, 18)

In [7]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20758 entries, 0 to 20757
Data columns (total 18 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              20758 non-null  int64  
 1   Gender                          20758 non-null  object 
 2   Age                             20758 non-null  float64
 3   Height                          20758 non-null  float64
 4   Weight                          20758 non-null  float64
 5   family_history_with_overweight  20758 non-null  object 
 6   FAVC                            20758 non-null  object 
 7   FCVC                            20758 non-null  float64
 8   NCP                             20758 non-null  float64
 9   CAEC                            20758 non-null  object 
 10  SMOKE                           20758 non-null  object 
 11  CH2O                            20758 non-null  float64
 12  SCC                             

In [8]:
train.describe()

Unnamed: 0,id,Age,Height,Weight,FCVC,NCP,CH2O,FAF,TUE
count,20758.0,20758.0,20758.0,20758.0,20758.0,20758.0,20758.0,20758.0,20758.0
mean,10378.5,23.841804,1.700245,87.887768,2.445908,2.761332,2.029418,0.981747,0.616756
std,5992.46278,5.688072,0.087312,26.379443,0.533218,0.705375,0.608467,0.838302,0.602113
min,0.0,14.0,1.45,39.0,1.0,1.0,1.0,0.0,0.0
25%,5189.25,20.0,1.631856,66.0,2.0,3.0,1.792022,0.008013,0.0
50%,10378.5,22.815416,1.7,84.064875,2.393837,3.0,2.0,1.0,0.573887
75%,15567.75,26.0,1.762887,111.600553,3.0,3.0,2.549617,1.587406,1.0
max,20757.0,61.0,1.975663,165.057269,3.0,4.0,3.0,3.0,2.0


In [9]:
train.isnull().sum()

id                                0
Gender                            0
Age                               0
Height                            0
Weight                            0
family_history_with_overweight    0
FAVC                              0
FCVC                              0
NCP                               0
CAEC                              0
SMOKE                             0
CH2O                              0
SCC                               0
FAF                               0
TUE                               0
CALC                              0
MTRANS                            0
NObeyesdad                        0
dtype: int64

In [10]:
train.duplicated().sum()

np.int64(0)

## Data Tranformation

In [20]:
categorical_cols = [var for var in train.columns if train[var].dtype == 'O']
print(f"There are {len(categorical_cols)} categorical variables.")
print(f"Categorical Variables are:  {categorical_cols}")

There are 9 categorical variables.
Categorical Variables are:  ['Gender', 'family_history_with_overweight', 'FAVC', 'CAEC', 'SMOKE', 'SCC', 'CALC', 'MTRANS', 'NObeyesdad']


In [21]:
numerical_cols = [var for var in train.columns if train[var].dtype != 'O']
print(f"There are {len(numerical_cols)} numerical variables.")
print(f"Numerical Variables are:  {numerical_cols}")

There are 9 numerical variables.
Numerical Variables are:  ['id', 'Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE']


In [22]:
## Another way to Divide "Objects" and "Non-objects" data
obj = train.select_dtypes(include='object')
non_obj = train.select_dtypes(exclude='object')

In [29]:
obj.columns.values

array(['Gender', 'family_history_with_overweight', 'FAVC', 'CAEC',
       'SMOKE', 'SCC', 'CALC', 'MTRANS', 'NObeyesdad'], dtype=object)

In [28]:
non_obj.columns.values

array(['id', 'Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF',
       'TUE'], dtype=object)

In [31]:
label_encoder = LabelEncoder()

In [33]:
for i in range(0, obj.shape[1]): # we can also say: range(0, len(categorical_cols))
    obj.iloc[:, i] = label_encoder.fit_transform(obj.iloc[:, i])

In [34]:
## Data after transformation
obj

Unnamed: 0,Gender,family_history_with_overweight,FAVC,CAEC,SMOKE,SCC,CALC,MTRANS,NObeyesdad
0,1,1,1,2,0,0,1,3,6
1,0,1,1,1,0,0,2,0,1
2,0,1,1,2,0,0,2,3,0
3,0,1,1,2,0,0,1,3,4
4,1,1,1,2,0,0,1,3,6
...,...,...,...,...,...,...,...,...,...
20753,1,1,1,2,0,0,1,3,3
20754,1,0,1,1,0,0,1,3,0
20755,1,1,1,2,0,0,2,3,3
20756,1,1,1,2,0,0,2,0,6


In [37]:
## Convert the "Obj" from an "Object" to "int"
obj = obj.astype('int')

In [41]:
## Then we concate the "Object" wint "Non-object" Data to get the all Train data
train_data = pd.concat([obj, non_obj], axis=1)# to put the "Two dataframe" next to each other we make "axis=1"

In [44]:
train_data.head()

Unnamed: 0,Gender,family_history_with_overweight,FAVC,CAEC,SMOKE,SCC,CALC,MTRANS,NObeyesdad,id,Age,Height,Weight,FCVC,NCP,CH2O,FAF,TUE
0,1,1,1,2,0,0,1,3,6,0,24.443011,1.699998,81.66995,2.0,2.983297,2.763573,0.0,0.976473
1,0,1,1,1,0,0,2,0,1,1,18.0,1.56,57.0,2.0,3.0,2.0,1.0,1.0
2,0,1,1,2,0,0,2,3,0,2,18.0,1.71146,50.165754,1.880534,1.411685,1.910378,0.866045,1.673584
3,0,1,1,2,0,0,1,3,4,3,20.952737,1.71073,131.274851,3.0,3.0,1.674061,1.467863,0.780199
4,1,1,1,2,0,0,1,3,6,4,31.641081,1.914186,93.798055,2.679664,1.971472,1.979848,1.967973,0.931721


In [42]:
# Apply the same steps on the "Test" data
test = pd.read_csv('test.csv')
obj = test.select_dtypes(include='object')
non_obj = test.select_dtypes(exclude='object')

for i in range(0, obj.shape[1]): 
    obj.iloc[:, i] = label_encoder.fit_transform(obj.iloc[:, i])

obj = obj.astype('int')
test_data = pd.concat([obj, non_obj], axis=1)

In [87]:
test_data.head()

Unnamed: 0,Gender,family_history_with_overweight,FAVC,CAEC,SMOKE,SCC,CALC,MTRANS,id,Age,Height,Weight,FCVC,NCP,CH2O,FAF,TUE
0,1,1,1,2,0,0,2,3,20758,0.508163,1.848294,1.273786,2.938616,3.0,2.825629,0.8554,0.0
1,0,1,1,2,0,0,2,3,20759,-0.509128,1.6,-0.818988,2.0,1.0,3.0,1.0,0.0
2,0,1,1,2,0,0,2,3,20760,0.353,1.643355,0.927432,3.0,3.0,2.621877,0.0,0.250502
3,1,1,1,2,0,0,2,3,20761,-0.512705,1.553127,0.623672,2.0,2.977909,2.786417,0.094851,0.0
4,0,1,1,2,0,0,2,3,20762,0.353,1.627396,0.668336,3.0,3.0,2.653531,0.0,0.741069


In [46]:
## We can notice that each value in "Age" & "Weight" columns has different scale, so to make the Model understand this different we make Standardization
scaler = StandardScaler()

train_data['Age'] = scaler.fit_transform(train_data[['Age']])
train_data['Weight'] = scaler.fit_transform(train_data[['Weight']])

In [47]:
train_data.head()

Unnamed: 0,Gender,family_history_with_overweight,FAVC,CAEC,SMOKE,SCC,CALC,MTRANS,NObeyesdad,id,Age,Height,Weight,FCVC,NCP,CH2O,FAF,TUE
0,1,1,1,2,0,0,1,3,6,0,0.105699,1.699998,-0.235713,2.0,2.983297,2.763573,0.0,0.976473
1,0,1,1,1,0,0,2,0,1,1,-1.027052,1.56,-1.170931,2.0,3.0,2.0,1.0,1.0
2,0,1,1,2,0,0,2,3,0,2,-1.027052,1.71146,-1.430012,1.880534,1.411685,1.910378,0.866045,1.673584
3,0,1,1,2,0,0,1,3,4,3,-0.507929,1.71073,1.64477,3.0,3.0,1.674061,1.467863,0.780199
4,1,1,1,2,0,0,1,3,6,4,1.371197,1.914186,0.224054,2.679664,1.971472,1.979848,1.967973,0.931721


In [48]:
## Apply the same Standardization on Test Data
test_data['Age'] = scaler.fit_transform(test_data[['Age']])
test_data['Weight'] = scaler.fit_transform(test_data[['Weight']])

In [49]:
test_data.head()

Unnamed: 0,Gender,family_history_with_overweight,FAVC,CAEC,SMOKE,SCC,CALC,MTRANS,id,Age,Height,Weight,FCVC,NCP,CH2O,FAF,TUE
0,1,1,1,2,0,0,2,3,20758,0.508163,1.848294,1.273786,2.938616,3.0,2.825629,0.8554,0.0
1,0,1,1,2,0,0,2,3,20759,-0.509128,1.6,-0.818988,2.0,1.0,3.0,1.0,0.0
2,0,1,1,2,0,0,2,3,20760,0.353,1.643355,0.927432,3.0,3.0,2.621877,0.0,0.250502
3,1,1,1,2,0,0,2,3,20761,-0.512705,1.553127,0.623672,2.0,2.977909,2.786417,0.094851,0.0
4,0,1,1,2,0,0,2,3,20762,0.353,1.627396,0.668336,3.0,3.0,2.653531,0.0,0.741069


## Model Creation

In [50]:
x = train_data.drop(['NObeyesdad', 'id'], axis=1)
y = train_data['NObeyesdad']

In [51]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [56]:
## Initialize The Models
logs = LogisticRegression()
naive = GaussianNB()
rf = RandomForestClassifier()
gb = GradientBoostingClassifier()
svc = SVC()
xg = XGBClassifier()

In [54]:
def model_prediction(model):
    model.fit(x_train, y_train)
    pre = model.predict(x_test)
    print(classification_report(pre, y_test))

In [57]:
model_prediction(logs)

              precision    recall  f1-score   support

           0       0.87      0.78      0.82       586
           1       0.67      0.70      0.69       600
           2       0.80      0.77      0.78       565
           3       0.97      0.95      0.96       668
           4       1.00      0.99      1.00       807
           5       0.59      0.64      0.61       445
           6       0.60      0.64      0.62       481

    accuracy                           0.81      4152
   macro avg       0.79      0.78      0.78      4152
weighted avg       0.81      0.81      0.81      4152



In [58]:
model_prediction(naive)

              precision    recall  f1-score   support

           0       0.85      0.70      0.77       641
           1       0.47      0.65      0.54       450
           2       0.61      0.37      0.46       897
           3       0.93      0.71      0.81       858
           4       1.00      0.96      0.98       832
           5       0.30      0.61      0.40       239
           6       0.24      0.53      0.33       235

    accuracy                           0.66      4152
   macro avg       0.63      0.65      0.61      4152
weighted avg       0.74      0.66      0.68      4152



In [59]:
model_prediction(rf)

              precision    recall  f1-score   support

           0       0.92      0.94      0.93       511
           1       0.89      0.84      0.87       658
           2       0.86      0.88      0.87       529
           3       0.97      0.97      0.97       661
           4       1.00      1.00      1.00       804
           5       0.76      0.77      0.77       478
           6       0.79      0.80      0.79       511

    accuracy                           0.90      4152
   macro avg       0.88      0.89      0.89      4152
weighted avg       0.90      0.90      0.90      4152



In [60]:
model_prediction(svc)

              precision    recall  f1-score   support

           0       0.94      0.87      0.90       564
           1       0.77      0.81      0.79       593
           2       0.84      0.82      0.83       557
           3       0.97      0.97      0.97       659
           4       1.00      1.00      1.00       804
           5       0.66      0.69      0.67       463
           6       0.69      0.70      0.69       512

    accuracy                           0.85      4152
   macro avg       0.84      0.83      0.84      4152
weighted avg       0.85      0.85      0.85      4152



In [61]:
model_prediction(gb)

              precision    recall  f1-score   support

           0       0.94      0.95      0.94       520
           1       0.88      0.88      0.88       633
           2       0.87      0.88      0.88       537
           3       0.97      0.97      0.97       655
           4       1.00      1.00      1.00       805
           5       0.78      0.78      0.78       486
           6       0.80      0.80      0.80       516

    accuracy                           0.90      4152
   macro avg       0.89      0.89      0.89      4152
weighted avg       0.90      0.90      0.90      4152



In [62]:
model_prediction(xg)

              precision    recall  f1-score   support

           0       0.92      0.95      0.93       506
           1       0.89      0.87      0.88       639
           2       0.86      0.89      0.87       527
           3       0.97      0.97      0.97       656
           4       1.00      0.99      1.00       807
           5       0.79      0.77      0.78       494
           6       0.81      0.79      0.80       523

    accuracy                           0.90      4152
   macro avg       0.89      0.89      0.89      4152
weighted avg       0.90      0.90      0.90      4152



In [63]:
# We notice that the high accuracy we can find it from "GradientBoost" & "XGboost" algorithms so we can make Parameter tuning on it to find the best
prim_grid = {'n_estimators': [100, 200, 300],
             'learning_rate': [0.1, 0.01, 0.001],
             'max_depth': [3, 5, 7]}
scorer = 'accuracy'

In [64]:
gb_model = GridSearchCV(gb, prim_grid, scoring=scorer, n_jobs=-1)
gb_model.fit(x_train, y_train)
print(gb_model.best_params_)
print(gb_model.best_score_)

{'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 300}
0.905636979642256


In [65]:
xg_model = GridSearchCV(xg, prim_grid, scoring=scorer, n_jobs=-1)
xg_model.fit(x_train, y_train)
print(xg_model.best_params_)
print(xg_model.best_score_)

{'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200}
0.9082865301192982


In [66]:
## We will edit the "GradientBoostClassifier" & "XGBClassifier" algorithms with the new parameters
gb = GradientBoostingClassifier(learning_rate= 0.1, max_depth= 3, n_estimators= 300)
xg = XGBClassifier( learning_rate= 0.1, max_depth= 5, n_estimators= 200)

In [67]:
# We will choose GradientBoostingClassifier
model_prediction(gb)

              precision    recall  f1-score   support

           0       0.94      0.95      0.94       520
           1       0.89      0.88      0.89       632
           2       0.87      0.88      0.87       539
           3       0.97      0.98      0.97       654
           4       1.00      1.00      1.00       803
           5       0.78      0.78      0.78       483
           6       0.80      0.79      0.79       521

    accuracy                           0.90      4152
   macro avg       0.89      0.89      0.89      4152
weighted avg       0.90      0.90      0.90      4152



In [82]:
## Test data
test_x = test_data.drop('id', axis=1)
pre_x = gb.predict(test_x)

In [83]:
submission = pd.DataFrame({'Id': test_data['id'], 'NObeyesdad': pre_x})

In [84]:
submission

Unnamed: 0,Id,NObeyesdad
0,20758,3
1,20759,5
2,20760,4
3,20761,2
4,20762,4
...,...,...
13835,34593,6
13836,34594,1
13837,34595,0
13838,34596,1


In [88]:
submission['NObeyesdad'] = label_encoder.inverse_transform(y = pre_x)# return the data we converted to numerical to its Original text

ValueError: y contains previously unseen labels: [5 6]

In [None]:
submission

In [None]:
submission.to_csv('submission.csv', index=False)