In [10]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

Data Dictionary:


| Column Name                     | Description                          | Data Type  |
|---------------------------------|--------------------------------------|------------|
| Gender                          | Gender of the individual, Male = 0, Female = 1 | Categorical |
| Age                             | Age of the individual                 | Continuous |
| Height                          | Height of the individual              | Continuous |
| Weight                          | Weight of the individual              | Continuous |
| family_history_with_overweight  | Family history of overweight, no = 0, yes = 1| Binary     |
| FAVC                            | Frequent consumption of high-calorie food, no = 0, yes = 1| Binary |
| FCVC                            | Frequency of vegetable consumption    | Continuous |
| NCP                             | Number of main meals per day          | Continuous |
| CAEC                            | Consumption of food between meals, no = 0, Sometimes = 1, Frequently = 2, Always = 3 | Categorical |
| SMOKE                           | Smoking habit of the individual, no = 0, yes = 1| Binary     |
| CH2O                            | Daily water consumption               | Continuous |
| SCC                             | Monitoring daily caloric intake, no = 0, yes = 1| Binary     |
| FAF                             | Physical activity frequency           | Continuous |
| TUE                             | Time spent on sedentary activities    | Continuous |
| CALC                            | Consumption of alcohol, no = 0, Sometimes = 1, Frequently = 2, Always = 3 | Categorical |
| MTRANS                          | Mode of transportation,Walking = 0, Bike = 1, Motorbike = 2, Public_Transportation = 3, Automobile = 4 | Categorical |
| NObeyesdad                      | Obesity level of the individual (PREDICTOR)      | Categorical |


Goal is to group individuals into categories based on health indicators. Nobeyesdad will be used as the ground truth labels and determine the accuracy of our modeling appraoches

In [3]:
df = pd.read_csv('obesity_data.csv')
df.head()

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,Female,21.0,1.62,64.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,0.0,1.0,no,Public_Transportation,Normal_Weight
1,Female,21.0,1.52,56.0,yes,no,3.0,3.0,Sometimes,yes,3.0,yes,3.0,0.0,Sometimes,Public_Transportation,Normal_Weight
2,Male,23.0,1.8,77.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,2.0,1.0,Frequently,Public_Transportation,Normal_Weight
3,Male,27.0,1.8,87.0,no,no,3.0,3.0,Sometimes,no,2.0,no,2.0,0.0,Frequently,Walking,Overweight_Level_I
4,Male,22.0,1.78,89.8,no,no,2.0,1.0,Sometimes,no,2.0,no,0.0,0.0,Sometimes,Public_Transportation,Overweight_Level_II


In [4]:
# 17 columns to choose from. Dimensionality reduction will likely be needed before training a model
df.shape

(2111, 17)

In [6]:
# have some binary yes or no columns. Will need to convert those to 1 for yes and 0 for no or 1 for female and 0 for male. lets observe the unique values in each column
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2111 entries, 0 to 2110
Data columns (total 17 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Gender                          2111 non-null   object 
 1   Age                             2111 non-null   float64
 2   Height                          2111 non-null   float64
 3   Weight                          2111 non-null   float64
 4   family_history_with_overweight  2111 non-null   object 
 5   FAVC                            2111 non-null   object 
 6   FCVC                            2111 non-null   float64
 7   NCP                             2111 non-null   float64
 8   CAEC                            2111 non-null   object 
 9   SMOKE                           2111 non-null   object 
 10  CH2O                            2111 non-null   float64
 11  SCC                             2111 non-null   object 
 12  FAF                             21

In [7]:
# observe unqiue values for each columna dn determine outlier values
def unique_values(df):
    for col in df.columns:
        if df[col].dtype == 'object':
            print(f'{col}: {df[col].unique()}')

unique_values(df)

Gender: ['Female' 'Male']
family_history_with_overweight: ['yes' 'no']
FAVC: ['no' 'yes']
CAEC: ['Sometimes' 'Frequently' 'Always' 'no']
SMOKE: ['no' 'yes']
SCC: ['no' 'yes']
CALC: ['no' 'Sometimes' 'Frequently' 'Always']
MTRANS: ['Public_Transportation' 'Walking' 'Automobile' 'Motorbike' 'Bike']
NObeyesdad: ['Normal_Weight' 'Overweight_Level_I' 'Overweight_Level_II'
 'Obesity_Type_I' 'Insufficient_Weight' 'Obesity_Type_II'
 'Obesity_Type_III']


The following will need to be considered for this dataset:
* Binary variables will need to be mapped to integer type with 0 representing NO and 1 representing YES
* Survey response questions with categorical data types will need to be mapped to inetger type. Likert scaling will be stored in data dictionary

In [9]:
# convert binary columns to 1 for yes and 0 for no
binary_cols = ['Gender', 'FAVC', 'SMOKE', 'SCC', 'family_history_with_overweight']
for col in binary_cols:
    df[col] = df[col].apply(lambda x: 1 if x == 'yes' else 0)

df.head()

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,0,21.0,1.62,64.0,1,0,2.0,3.0,Sometimes,0,2.0,0,0.0,1.0,no,Public_Transportation,Normal_Weight
1,0,21.0,1.52,56.0,1,0,3.0,3.0,Sometimes,1,3.0,1,3.0,0.0,Sometimes,Public_Transportation,Normal_Weight
2,0,23.0,1.8,77.0,1,0,2.0,3.0,Sometimes,0,2.0,0,2.0,1.0,Frequently,Public_Transportation,Normal_Weight
3,0,27.0,1.8,87.0,0,0,3.0,3.0,Sometimes,0,2.0,0,2.0,0.0,Frequently,Walking,Overweight_Level_I
4,0,22.0,1.78,89.8,0,0,2.0,1.0,Sometimes,0,2.0,0,0.0,0.0,Sometimes,Public_Transportation,Overweight_Level_II


In [12]:
# convert likert columns
# no = 0, Sometimes = 1, Frequently = 2, Always = 3
likert_cols = ['CAEC', 'CALC']
for col in likert_cols:
    df[col] = np.where(df[col] == 'no', 0, np.where(df[col] == 'Sometimes', 1, np.where(df[col] == 'Frequently', 2, 3)))

df.head()

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,0,21.0,1.62,64.0,1,0,2.0,3.0,1,0,2.0,0,0.0,1.0,0,Public_Transportation,Normal_Weight
1,0,21.0,1.52,56.0,1,0,3.0,3.0,1,1,3.0,1,3.0,0.0,1,Public_Transportation,Normal_Weight
2,0,23.0,1.8,77.0,1,0,2.0,3.0,1,0,2.0,0,2.0,1.0,2,Public_Transportation,Normal_Weight
3,0,27.0,1.8,87.0,0,0,3.0,3.0,1,0,2.0,0,2.0,0.0,2,Walking,Overweight_Level_I
4,0,22.0,1.78,89.8,0,0,2.0,1.0,1,0,2.0,0,0.0,0.0,1,Public_Transportation,Overweight_Level_II
