In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

# Load the dataset
try:
    df = pd.read_csv('../raw/ObesityDataSet_raw_and_data_sinthetic.csv')
    print("Dataset loaded successfully!")
except FileNotFoundError:
    print("Error: The file 'ObesityDataSet_raw_and_data_sinthetic.csv' was not found.")
    print("Please make sure the file is in the same directory as your notebook or provide the full path.")
    exit()

Dataset loaded successfully!


In [2]:
# 1. Initial Data Exploration 
print("\nOriginal Data Info:")
df.info()

print("\nFirst 5 rows of the original dataset:")
print(df.head())

# Check for missing values
print("\nMissing values in each column:")
print(df.isnull().sum())


Original Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2111 entries, 0 to 2110
Data columns (total 17 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Gender                          2111 non-null   object 
 1   Age                             2111 non-null   float64
 2   Height                          2111 non-null   float64
 3   Weight                          2111 non-null   float64
 4   family_history_with_overweight  2111 non-null   object 
 5   FAVC                            2111 non-null   object 
 6   FCVC                            2111 non-null   float64
 7   NCP                             2111 non-null   float64
 8   CAEC                            2111 non-null   object 
 9   SMOKE                           2111 non-null   object 
 10  CH2O                            2111 non-null   float64
 11  SCC                             2111 non-null   object 
 12  FAF          

In [3]:
# 2. Age Categorization 
print("\nAge distribution summary:")
print(df['Age'].describe())

# Create bins for age categories
# Bins: 0-18 (Adolescent), 19-35 (Young Adult), 36-55 (Adult), 56+ (Senior)
bins = [0, 18, 35, 55, df['Age'].max()]
labels = ['Adolescent', 'Young Adult', 'Adult', 'Senior']
df['Age_Category'] = pd.cut(df['Age'], bins=bins, labels=labels, right=False)

print("\n'Age' column categorized into 'Age_Category':")
print(df[['Age', 'Age_Category']].head())


Age distribution summary:
count    2111.000000
mean       24.312600
std         6.345968
min        14.000000
25%        19.947192
50%        22.777890
75%        26.000000
max        61.000000
Name: Age, dtype: float64

'Age' column categorized into 'Age_Category':
    Age Age_Category
0  21.0  Young Adult
1  21.0  Young Adult
2  23.0  Young Adult
3  27.0  Young Adult
4  22.0  Young Adult


In [4]:
# 3. Encoding Categorical Features

# Create a copy of the dataframe for preprocessing
df_processed = df.copy()

# Label Encoding for the target variable 'NObeyesdad'
le = LabelEncoder()
df_processed['NObeyesdad'] = le.fit_transform(df_processed['NObeyesdad'])
print("\nEncoded 'NObeyesdad' classes:")
print(dict(zip(le.classes_, le.transform(le.classes_))))


# Encoding binary categorical features
binary_features = ['family_history_with_overweight', 'FAVC', 'SMOKE', 'SCC']
for feature in binary_features:
    df_processed[feature] = df_processed[feature].map({'yes': 1, 'no': 0})

print("\nBinary features encoded (first 5 rows):")
print(df_processed[binary_features].head())


# One-Hot Encoding for other categorical features
# Drop the original 'Age' column and use 'Age_Category' for one-hot encoding.
categorical_features = ['Gender', 'CAEC', 'CALC', 'MTRANS', 'Age_Category']
df_processed = pd.get_dummies(df_processed, columns=categorical_features, drop_first=True)

print("\nShape of dataframe after one-hot encoding:", df_processed.shape)
print("Columns after one-hot encoding (first 5 rows):")
print(df_processed.head())

# We can drop the original 'Age' column as we have the age category now.
df_processed.drop('Age', axis=1, inplace=True)


Encoded 'NObeyesdad' classes:
{'Insufficient_Weight': 0, 'Normal_Weight': 1, 'Obesity_Type_I': 2, 'Obesity_Type_II': 3, 'Obesity_Type_III': 4, 'Overweight_Level_I': 5, 'Overweight_Level_II': 6}

Binary features encoded (first 5 rows):
   family_history_with_overweight  FAVC  SMOKE  SCC
0                               1     0      0    0
1                               1     0      1    1
2                               1     0      0    0
3                               0     0      0    0
4                               0     0      0    0

Shape of dataframe after one-hot encoding: (2111, 27)
Columns after one-hot encoding (first 5 rows):
    Age  Height  Weight  family_history_with_overweight  FAVC  FCVC  NCP  \
0  21.0    1.62    64.0                               1     0   2.0  3.0   
1  21.0    1.52    56.0                               1     0   3.0  3.0   
2  23.0    1.80    77.0                               1     0   2.0  3.0   
3  27.0    1.80    87.0                       

In [5]:
# 4. Scaling Numerical Features 

# Identify numerical columns for scaling
# Exclude the binary features we already encoded and the target variable.
numerical_features = ['Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE']

# Initialize MinMaxScaler
scaler = MinMaxScaler(feature_range=(1, 5))

# Scale the numerical features
df_processed[numerical_features] = scaler.fit_transform(df_processed[numerical_features])

print("\nNumerical features scaled to a range of 1-5 (first 5 rows):")
print(df_processed[numerical_features].head())


Numerical features scaled to a range of 1-5 (first 5 rows):
     Height    Weight  FCVC       NCP  CH2O       FAF  TUE
0  2.283019  1.746269   3.0  3.666667   3.0  1.000000  3.0
1  1.528302  1.507463   5.0  3.666667   5.0  5.000000  1.0
2  3.641509  2.134328   3.0  3.666667   3.0  3.666667  3.0
3  3.641509  2.432836   5.0  3.666667   3.0  3.666667  1.0
4  3.490566  2.516418   3.0  1.000000   3.0  1.000000  1.0


In [6]:
# 5. Final Preprocessed DataFrame 
print("\nFinal Preprocessed DataFrame")
print("\nFinal data info:")
df_processed.info()

print("\nFirst 5 rows of the preprocessed dataset:")
print(df_processed.head())

# Save the preprocessed data to a new CSV file
df_processed.to_csv('obesity_preprocessed.csv', index=False)
print("\nPreprocessed data saved to 'obesity_preprocessed.csv'")


Final Preprocessed DataFrame

Final data info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2111 entries, 0 to 2110
Data columns (total 26 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Height                          2111 non-null   float64
 1   Weight                          2111 non-null   float64
 2   family_history_with_overweight  2111 non-null   int64  
 3   FAVC                            2111 non-null   int64  
 4   FCVC                            2111 non-null   float64
 5   NCP                             2111 non-null   float64
 6   SMOKE                           2111 non-null   int64  
 7   CH2O                            2111 non-null   float64
 8   SCC                             2111 non-null   int64  
 9   FAF                             2111 non-null   float64
 10  TUE                             2111 non-null   float64
 11  NObeyesdad                      2111 non-null  