# Estimation of Obesity Levels Based On Eating Habits and Physical Conditions
Key Features: gender, age, height, weight, eating habits, physical condition, lifestyle habits \
Source: https://archive.ics.uci.edu/dataset/544/estimation+of+obesity+levels+based+on+eating+habits+and+physical+condition \
The estimation of obesity levels in people from the countries of Mexico, Peru and Colombia, with ages between 14 and 61

In [7]:
### Verify that all libraries are installed on your machine first.

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np 

# Other libraries that may be useful in the future. 
# import duckdb # for using sql syntax to access a df 
# import pandasql # also for using sql to access a df - has slightly different features / advantages
# import PIL # Python Imaging Library (Pillow) for opening, manipulating and saving images

## Functions that may be useful. 
## %%sql

In [9]:
# Load the UCI obesity data set from the local DataSets folder and preview
# the first five rows.
df = pd.read_csv(filepath_or_buffer="DataSets/UCI_ObesityDataSet.csv")
df.head(n=5)


Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,Female,21.0,1.62,64.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,0.0,1.0,no,Public_Transportation,Normal_Weight
1,Female,21.0,1.52,56.0,yes,no,3.0,3.0,Sometimes,yes,3.0,yes,3.0,0.0,Sometimes,Public_Transportation,Normal_Weight
2,Male,23.0,1.8,77.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,2.0,1.0,Frequently,Public_Transportation,Normal_Weight
3,Male,27.0,1.8,87.0,no,no,3.0,3.0,Sometimes,no,2.0,no,2.0,0.0,Frequently,Walking,Overweight_Level_I
4,Male,22.0,1.78,89.8,no,no,2.0,1.0,Sometimes,no,2.0,no,0.0,0.0,Sometimes,Public_Transportation,Overweight_Level_II


In [10]:
# Replace the data set's abbreviated column codes with descriptive
# snake_case names so later cells can refer to columns intuitively.
column_renames = {
    'Gender': 'gender',
    'Age': 'age',
    'Height': 'height',
    # NOTE(review): 'Weight' maps to itself, so this column keeps its
    # capitalised name while the others are lowercased - confirm intent.
    'Weight': 'Weight',
    'FAVC': 'eat_high_calorie_frequently',
    'FCVC': 'eat_vegetables_usually',
    'NCP': 'daily_meal_count',
    'CAEC': 'snack_between_meals',
    'SMOKE': 'smoke',
    'CH2O': 'daily_water_intake',
    'SCC': 'monitor_calories',
    'FAF': 'activity_frequency',
    'TUE': 'time_on_technology',
    'CALC': 'alcohol_frequency',
    'MTRANS': 'transportation',
    'NObeyesdad': 'obesity_level',
}
df = df.rename(columns=column_renames)
df.head()

Unnamed: 0,gender,age,height,Weight,family_history_with_overweight,eat_high_calorie_frequently,eat_vegetables_usually,daily_meal_count,snack_between_meals,smoke,daily_water_intake,monitor_calories,activity_frequency,time_on_technology,alcohol_frequency,transportation,obesity_level
0,Female,21.0,1.62,64.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,0.0,1.0,no,Public_Transportation,Normal_Weight
1,Female,21.0,1.52,56.0,yes,no,3.0,3.0,Sometimes,yes,3.0,yes,3.0,0.0,Sometimes,Public_Transportation,Normal_Weight
2,Male,23.0,1.8,77.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,2.0,1.0,Frequently,Public_Transportation,Normal_Weight
3,Male,27.0,1.8,87.0,no,no,3.0,3.0,Sometimes,no,2.0,no,2.0,0.0,Frequently,Walking,Overweight_Level_I
4,Male,22.0,1.78,89.8,no,no,2.0,1.0,Sometimes,no,2.0,no,0.0,0.0,Sometimes,Public_Transportation,Overweight_Level_II


In [11]:
# Overall dimensions of the DataFrame: rows x columns, and total cells.
row_count, column_count = df.shape
cell_count = df.size

# One value per line, in the order: cells, rows, columns.
print(cell_count, row_count, column_count, sep="\n")

35887
2111
17


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,age,height,Weight,eat_vegetables_usually,daily_meal_count,daily_water_intake,activity_frequency,time_on_technology
count,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0
mean,24.3126,1.701677,86.586058,2.419043,2.685628,2.008011,1.010298,0.657866
std,6.345968,0.093305,26.191172,0.533927,0.778039,0.612953,0.850592,0.608927
min,14.0,1.45,39.0,1.0,1.0,1.0,0.0,0.0
25%,19.947192,1.63,65.473343,2.0,2.658738,1.584812,0.124505,0.0
50%,22.77789,1.700499,83.0,2.385502,3.0,2.0,1.0,0.62535
75%,26.0,1.768464,107.430682,3.0,3.0,2.47742,1.666678,1.0
max,61.0,1.98,173.0,3.0,4.0,3.0,3.0,2.0


# Relative Risk 
* Calculate RR of being obese if female vs male
* RR of being obese if counting calories vs not
* RR of being obese if drinking alcohol vs not
* RR of being obese if walking vs not

In [None]:
# Obesity classes that count as "obese" for the risk calculation.
target_obesity_levels = ['Obesity_Type_I', 'Obesity_Type_II', 'Obesity_Type_III']

# Boolean row masks, built once and combined below.
is_obese = df['obesity_level'].isin(target_obesity_levels)
is_male = df['gender'] == 'Male'
is_female = df['gender'] == 'Female'

# Per-gender subsets: every member of the group, and its obese members only.
totalMales = df[is_male]
totalFemales = df[is_female]
male_obese = df[is_male & is_obese]
female_obese = df[is_female & is_obese]

count_male_obese = len(male_obese)      # number of males with obesity
count_female_obese = len(female_obese)  # number of females with obesity
print(count_male_obese)
print(count_female_obese)

print(len(df))  # total number of individuals in the data set

# Risk = obese members of the group / all members of the group.
riskMale = count_male_obese / len(totalMales)
print(riskMale)

riskFemale = count_female_obese / len(totalFemales)
print(riskFemale)

# The risk of obesity is about the same among males and females.


491
481
2111
0.4597378277153558
0.46116970278044106


# Chi-Squared Tests
* Independence - can check independence for: (gender, obesity), (water, obesity), (water, meal_count), (alcohol, obesity)
* Goodness of fit ? probably not 
* Homogeneity ? probably not 

# Make Correlation Table to find residuals