In [1]:
import os
import pandas as pd

## Variables:

Gender: (female,male)

Age (years): numeric

Height (meters): numeric

Weight (kg): numeric

family_history_with_overweight: (no,yes)

FAVC (Frequent consumption of high caloric food): (no,yes)

FCVC (Frequency of consumption of vegetables): (Never,Sometimes,Always)

NCP (Number of main meals): (Between 1 & 2,Three,More than three)

CAEC (Consumption of food between meals): (no,Sometimes,Frequently,Always)

SMOKE: (no,yes)

CH2O (Consumption of water daily): (Less than a liter,Between 1 and 2 L,More than 2 L)

SCC (Calories consumption monitoring): (no,yes)

FAF (Physical activity frequency): (I do not have,1 or 2 days,2 or 4 days,4 or 5 days)

TUE (Time using technology devices): (0–2 hours,3–5 hours,More than 5 hours)

CALC (Consumption of alcohol): (no,Sometimes,Frequently,Always)

MTRANS (Transportation used): (Automobile,Motorbike,Bike,Public Transportation,Walking)

NObeyesdad: (Insufficient Weight, Normal Weight, Overweight Level I, Overweight Level II, Obesity Type I, Obesity Type II 
and Obesity Type III) --> (Insufficient,Normal,Overweight,Obese)

In [2]:
obesity_data = pd.read_csv('./data/original_data.csv')
obesity_df = pd.DataFrame(obesity_data)
obesity_df.head()

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,Female,21.0,1.62,64.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,0.0,1.0,no,Public_Transportation,Normal_Weight
1,Female,21.0,1.52,56.0,yes,no,3.0,3.0,Sometimes,yes,3.0,yes,3.0,0.0,Sometimes,Public_Transportation,Normal_Weight
2,Male,23.0,1.8,77.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,2.0,1.0,Frequently,Public_Transportation,Normal_Weight
3,Male,27.0,1.8,87.0,no,no,3.0,3.0,Sometimes,no,2.0,no,2.0,0.0,Frequently,Walking,Overweight_Level_I
4,Male,22.0,1.78,89.8,no,no,2.0,1.0,Sometimes,no,2.0,no,0.0,0.0,Sometimes,Public_Transportation,Overweight_Level_II


In [3]:
obesity_rounded = obesity_df.round({'Age':0,'Height':1,'Weight':1, 'FCVC':0, 'NCP':0, 'CH2O':0, 'FAF':0, 'TUE':0})
obesity_rounded.tail()

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
2106,Female,21.0,1.7,131.4,yes,yes,3.0,3.0,Sometimes,no,2.0,no,2.0,1.0,Sometimes,Public_Transportation,Obesity_Type_III
2107,Female,22.0,1.7,133.7,yes,yes,3.0,3.0,Sometimes,no,2.0,no,1.0,1.0,Sometimes,Public_Transportation,Obesity_Type_III
2108,Female,23.0,1.8,133.7,yes,yes,3.0,3.0,Sometimes,no,2.0,no,1.0,1.0,Sometimes,Public_Transportation,Obesity_Type_III
2109,Female,24.0,1.7,133.3,yes,yes,3.0,3.0,Sometimes,no,3.0,no,1.0,1.0,Sometimes,Public_Transportation,Obesity_Type_III
2110,Female,24.0,1.7,133.5,yes,yes,3.0,3.0,Sometimes,no,3.0,no,1.0,1.0,Sometimes,Public_Transportation,Obesity_Type_III


In [4]:
#remove types from weight type
obesity_rounded['NObeyesdad'] = obesity_rounded['NObeyesdad'].str.split('_').str[0]
obesity_rounded['NObeyesdad'] = obesity_rounded['NObeyesdad'].str.split(' ').str[0]
obesity_rounded['NObeyesdad'].replace(to_replace='Obesity',value='Obese',inplace=True)
obesity_rounded.head(15)

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,Female,21.0,1.6,64.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,0.0,1.0,no,Public_Transportation,Normal
1,Female,21.0,1.5,56.0,yes,no,3.0,3.0,Sometimes,yes,3.0,yes,3.0,0.0,Sometimes,Public_Transportation,Normal
2,Male,23.0,1.8,77.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,2.0,1.0,Frequently,Public_Transportation,Normal
3,Male,27.0,1.8,87.0,no,no,3.0,3.0,Sometimes,no,2.0,no,2.0,0.0,Frequently,Walking,Overweight
4,Male,22.0,1.8,89.8,no,no,2.0,1.0,Sometimes,no,2.0,no,0.0,0.0,Sometimes,Public_Transportation,Overweight
5,Male,29.0,1.6,53.0,no,yes,2.0,3.0,Sometimes,no,2.0,no,0.0,0.0,Sometimes,Automobile,Normal
6,Female,23.0,1.5,55.0,yes,yes,3.0,3.0,Sometimes,no,2.0,no,1.0,0.0,Sometimes,Motorbike,Normal
7,Male,22.0,1.6,53.0,no,no,2.0,3.0,Sometimes,no,2.0,no,3.0,0.0,Sometimes,Public_Transportation,Normal
8,Male,24.0,1.8,64.0,yes,yes,3.0,3.0,Sometimes,no,2.0,no,1.0,1.0,Frequently,Public_Transportation,Normal
9,Male,22.0,1.7,68.0,yes,yes,2.0,3.0,Sometimes,no,2.0,no,1.0,1.0,no,Public_Transportation,Normal


In [5]:
#make categorical data consistent and easy to understand prior to encoding
obesity_rounded['FCVC'].replace({1.0:'Never', 2.0:'Sometimes',3.0:'Always'},inplace=True)
obesity_rounded['NCP'].replace({1.0:'Between 1 & 2', 2.0:'Three',3.0:'More than three'},inplace=True)
obesity_rounded['CH2O'].replace({1.0:'Less than a liter', 2.0:'Between 1 and 2 L',3.0:'More than 2 L'},inplace=True)
obesity_rounded['FAF'].replace({0.0:'I do not have', 1.0:'1 or 2 days', 2.0:'2 or 4 days',3.0:'4 or 5 days'},inplace=True)
obesity_rounded['TUE'].replace({0.0:'0 to 2 hours', 1.0:'3 to 5 hours',2.0:'More than 5 hours'},inplace=True)
obesity_rounded.head()

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,Female,21.0,1.6,64.0,yes,no,Sometimes,More than three,Sometimes,no,Between 1 and 2 L,no,I do not have,3 to 5 hours,no,Public_Transportation,Normal
1,Female,21.0,1.5,56.0,yes,no,Always,More than three,Sometimes,yes,More than 2 L,yes,4 or 5 days,0 to 2 hours,Sometimes,Public_Transportation,Normal
2,Male,23.0,1.8,77.0,yes,no,Sometimes,More than three,Sometimes,no,Between 1 and 2 L,no,2 or 4 days,3 to 5 hours,Frequently,Public_Transportation,Normal
3,Male,27.0,1.8,87.0,no,no,Always,More than three,Sometimes,no,Between 1 and 2 L,no,2 or 4 days,0 to 2 hours,Frequently,Walking,Overweight
4,Male,22.0,1.8,89.8,no,no,Sometimes,Between 1 & 2,Sometimes,no,Between 1 and 2 L,no,I do not have,0 to 2 hours,Sometimes,Public_Transportation,Overweight


In [6]:
# obese_dummies = pd.get_dummies(obesity_rounded)
# obese_dummies.head()

Unnamed: 0,Age,Height,Weight,Gender_Female,Gender_Male,family_history_with_overweight_no,family_history_with_overweight_yes,FAVC_no,FAVC_yes,FCVC_Always,...,CALC_no,MTRANS_Automobile,MTRANS_Bike,MTRANS_Motorbike,MTRANS_Public_Transportation,MTRANS_Walking,NObeyesdad_Insufficient,NObeyesdad_Normal,NObeyesdad_Obese,NObeyesdad_Overweight
0,21.0,1.6,64.0,1,0,0,1,1,0,0,...,1,0,0,0,1,0,0,1,0,0
1,21.0,1.5,56.0,1,0,0,1,1,0,1,...,0,0,0,0,1,0,0,1,0,0
2,23.0,1.8,77.0,0,1,0,1,1,0,0,...,0,0,0,0,1,0,0,1,0,0
3,27.0,1.8,87.0,0,1,1,0,1,0,1,...,0,0,0,0,0,1,0,0,0,1
4,22.0,1.8,89.8,0,1,1,0,1,0,0,...,0,0,0,0,1,0,0,0,0,1


In [None]:
# obese_dummies.drop(columns=['Gender_Male','family_history_with_overweight_no',])

In [13]:
obesity_numeric = obesity_rounded
obesity_numeric.head()

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,Female,21.0,1.6,64.0,yes,no,Sometimes,More than three,Sometimes,no,Between 1 and 2 L,no,I do not have,3 to 5 hours,no,Public_Transportation,Normal
1,Female,21.0,1.5,56.0,yes,no,Always,More than three,Sometimes,yes,More than 2 L,yes,4 or 5 days,0 to 2 hours,Sometimes,Public_Transportation,Normal
2,Male,23.0,1.8,77.0,yes,no,Sometimes,More than three,Sometimes,no,Between 1 and 2 L,no,2 or 4 days,3 to 5 hours,Frequently,Public_Transportation,Normal
3,Male,27.0,1.8,87.0,no,no,Always,More than three,Sometimes,no,Between 1 and 2 L,no,2 or 4 days,0 to 2 hours,Frequently,Walking,Overweight
4,Male,22.0,1.8,89.8,no,no,Sometimes,Between 1 & 2,Sometimes,no,Between 1 and 2 L,no,I do not have,0 to 2 hours,Sometimes,Public_Transportation,Overweight


In [14]:
obesity_numeric['Gender'].replace({'Female':0,'Male':1},inplace=True)
obesity_numeric['family_history_with_overweight'].replace({'no':0,'yes':1},inplace=True)
obesity_numeric['FAVC'].replace({'no':0,'yes':1},inplace=True)
obesity_numeric['FCVC'].replace({'Never':0,'Sometimes':1,'Always':2},inplace=True)
obesity_numeric['NCP'].replace({'Between 1 & 2':0,'Three':1,'More than three':2},inplace=True)
obesity_numeric['CAEC'].replace({'no':0,'Sometimes':1,'Frequently':2,'Always':3},inplace=True)
obesity_numeric['SMOKE'].replace({'no':0,'yes':1},inplace=True)
obesity_numeric['CH2O'].replace({'Less than a liter':0, 'Between 1 and 2 L':1,'More than 2 L':2},inplace=True)
obesity_numeric['SCC'].replace({'no':0,'yes':1},inplace=True)
obesity_numeric['FAF'].replace({'I do not have':0, '1 or 2 days':1, '2 or 4 days':2,'4 or 5 days':3},inplace=True)
obesity_numeric['TUE'].replace({'0 to 2 hours':0, '3 to 5 hours':1,'More than 5 hours':2},inplace=True)
obesity_numeric['CALC'].replace({'no':0,'Sometimes':1,'Frequently':2,'Always':3},inplace=True)
obesity_numeric['MTRANS'].replace({'Automobile':0,'Motorbike':1,'Bike':2,'Public_Transportation':3,'Walking':4},inplace=True)
obesity_numeric['NObeyesdad'].replace({'Insufficient':0,'Normal':1,'Overweight':2,'Obese':3},inplace=True)

In [10]:
obesity_numeric.tail(10)

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
2101,0,26.0,1.6,107.2,1,1,2,2.0,1,0,1,0,0,0,1,3,3
2102,0,26.0,1.6,108.1,1,1,2,2.0,1,0,1,0,0,0,1,3,3
2103,0,21.0,1.7,133.0,1,1,2,2.0,1,0,1,0,2,1,1,3,3
2104,0,22.0,1.7,133.0,1,1,2,2.0,1,0,1,0,2,1,1,3,3
2105,0,21.0,1.7,131.3,1,1,2,2.0,1,0,1,0,2,1,1,3,3
2106,0,21.0,1.7,131.4,1,1,2,2.0,1,0,1,0,2,1,1,3,3
2107,0,22.0,1.7,133.7,1,1,2,2.0,1,0,1,0,1,1,1,3,3
2108,0,23.0,1.8,133.7,1,1,2,2.0,1,0,1,0,1,1,1,3,3
2109,0,24.0,1.7,133.3,1,1,2,2.0,1,0,2,0,1,1,1,3,3
2110,0,24.0,1.7,133.5,1,1,2,2.0,1,0,2,0,1,1,1,3,3


## BEGIN ML (consider feature scaling)

In [15]:
# Assign X (data) and y (target)
X = obesity_numeric[['Gender', 'Age', 'Height', 'Weight', 'family_history_with_overweight','FAVC','FCVC','NCP',
                    'CAEC','SMOKE','CH2O','SCC','FAF','TUE','CALC','MTRANS']]
y = obesity_numeric["NObeyesdad"].values.reshape(-1, 1)
print(X.shape, y.shape)

(2111, 16) (2111, 1)


In [16]:
# Split the data into training and testing

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)