In [2]:
# EDA
import pandas as pd
import statsmodels.api as sm
import plotly.express as px
import plotly.figure_factory as ff
import matplotlib.pyplot as plt
# import sweetviz as sv

# ML
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, recall_score

# Hyperparameter optimization
import optuna

In [3]:
# Data load

df_obesity = pd.read_csv("./datasets/ObesityDataSet_raw_and_data_sinthetic.csv")

In [39]:
df_obesity.head(50)

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad,age_group,age_group_string
0,Female,21,1.62,64.0,yes,no,2,3,Sometimes,no,2,no,0,1,no,Public_Transportation,Normal_Weight,1,20-30
1,Female,21,1.52,56.0,yes,no,3,3,Sometimes,yes,3,yes,3,0,Sometimes,Public_Transportation,Normal_Weight,1,20-30
2,Male,23,1.8,77.0,yes,no,2,3,Sometimes,no,2,no,2,1,Frequently,Public_Transportation,Normal_Weight,1,20-30
3,Male,27,1.8,87.0,no,no,3,3,Sometimes,no,2,no,2,0,Frequently,Walking,Overweight_Level_I,1,20-30
4,Male,22,1.78,89.8,no,no,2,1,Sometimes,no,2,no,0,0,Sometimes,Public_Transportation,Overweight_Level_II,1,20-30
5,Male,29,1.62,53.0,no,yes,2,3,Sometimes,no,2,no,0,0,Sometimes,Automobile,Normal_Weight,1,20-30
6,Female,23,1.5,55.0,yes,yes,3,3,Sometimes,no,2,no,1,0,Sometimes,Motorbike,Normal_Weight,1,20-30
7,Male,22,1.64,53.0,no,no,2,3,Sometimes,no,2,no,3,0,Sometimes,Public_Transportation,Normal_Weight,1,20-30
8,Male,24,1.78,64.0,yes,yes,3,3,Sometimes,no,2,no,1,1,Frequently,Public_Transportation,Normal_Weight,1,20-30
9,Male,22,1.72,68.0,yes,yes,2,3,Sometimes,no,2,no,1,1,no,Public_Transportation,Normal_Weight,1,20-30


In [6]:
df_obesity.describe()

Unnamed: 0,Age,Height,Weight,FCVC,NCP,CH2O,FAF,TUE
count,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0
mean,24.3126,1.701677,86.586058,2.419043,2.685628,2.008011,1.010298,0.657866
std,6.345968,0.093305,26.191172,0.533927,0.778039,0.612953,0.850592,0.608927
min,14.0,1.45,39.0,1.0,1.0,1.0,0.0,0.0
25%,19.947192,1.63,65.473343,2.0,2.658738,1.584812,0.124505,0.0
50%,22.77789,1.700499,83.0,2.385502,3.0,2.0,1.0,0.62535
75%,26.0,1.768464,107.430682,3.0,3.0,2.47742,1.666678,1.0
max,61.0,1.98,173.0,3.0,4.0,3.0,3.0,2.0


In [37]:
df_obesity.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2111 entries, 0 to 2110
Data columns (total 19 columns):
 #   Column                          Non-Null Count  Dtype   
---  ------                          --------------  -----   
 0   Gender                          2111 non-null   object  
 1   Age                             2111 non-null   int64   
 2   Height                          2111 non-null   float64 
 3   Weight                          2111 non-null   float64 
 4   family_history_with_overweight  2111 non-null   object  
 5   FAVC                            2111 non-null   object  
 6   FCVC                            2111 non-null   int64   
 7   NCP                             2111 non-null   int64   
 8   CAEC                            2111 non-null   object  
 9   SMOKE                           2111 non-null   object  
 10  CH2O                            2111 non-null   int64   
 11  SCC                             2111 non-null   object  
 12  FAF                 

In [14]:
# Change coluns to type int

coluns = ["Age", "FCVC", "NCP", "CH2O", "FAF", "TUE"]

df_obesity[coluns] = df_obesity[coluns].astype(int)

In [18]:
df_obesity.NObeyesdad.value_counts()

NObeyesdad
Obesity_Type_I         351
Obesity_Type_III       324
Obesity_Type_II        297
Overweight_Level_I     290
Overweight_Level_II    290
Normal_Weight          287
Insufficient_Weight    272
Name: count, dtype: int64

### EDA

In [20]:
# Distribution of the target variable

px.bar(df_obesity.value_counts("NObeyesdad"))

In [21]:
px.box(df_obesity, y="Age")

In [24]:
# univariate analysis

px.bar(df_obesity.value_counts("Gender"))

In [28]:
df_obesity.Age.describe()

count    2111.000000
mean       23.972525
std         6.308664
min        14.000000
25%        19.000000
50%        22.000000
75%        26.000000
max        61.000000
Name: Age, dtype: float64

In [31]:
# Age bucketing 

bins = [10, 20, 30, 40, 50, 60, 70]
bins_ordinal = [0, 1, 2, 3, 4, 5]
age_group_labels = ['10-20', '20-30', '30-40', '40-50', '50-60', '60-70']
df_obesity['age_group_string'] = pd.cut(x = df_obesity["Age"], bins = bins, labels=age_group_labels, include_lowest = True)
df_obesity['age_group'] = pd.cut(x = df_obesity["Age"], bins = bins, labels=bins_ordinal, include_lowest = True)

In [35]:
# Contingenny table
age_group_contigency_table = sm.stats.Table.from_data(df_obesity[["NObeyesdad", "age_group"]])

<statsmodels.stats.contingency_tables.Table at 0x172a71d6810>

In [40]:
age_group_contigency_table.table_orig

age_group,0,1,2,3,4,5
NObeyesdad,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Insufficient_Weight,198,71,3,0,0,0
Normal_Weight,132,137,16,0,1,1
Obesity_Type_I,92,177,66,15,1,0
Obesity_Type_II,8,224,61,4,0,0
Obesity_Type_III,76,248,0,0,0,0
Overweight_Level_I,99,151,37,2,1,0
Overweight_Level_II,69,130,75,11,5,0


In [41]:
df_obesity.corr()

ValueError: could not convert string to float: 'Female'

In [None]:
# A fazer: onehotenconding da variavel obesidade, mudar genero e outras variaveis categoricas para binario.