In [1]:
# PROJECT PROGRESS 2: DATA WRANGLING
# Requirements: Import, Clean, Format, Binning, Normalization, Indicator Variables

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

In [3]:
# 1. IMPORT DATA
# Read the raw obesity survey CSV into the working DataFrame, then show its
# dimensions and the first few rows as a quick sanity check.
csv_path = 'C:\\Python\\ObesityDataSet_raw_and_data_sinthetic.csv'
df = pd.read_csv(csv_path)

print(f"Original Data Shape: {df.shape}")
print(df.head())

Original Data Shape: (2111, 17)
    Age  Gender  Height  Weight        CALC FAVC  FCVC  NCP  SCC SMOKE  CH2O  \
0  21.0  Female    1.62    64.0          no   no   2.0  3.0   no    no   2.0   
1  21.0  Female    1.52    56.0   Sometimes   no   3.0  3.0  yes   yes   3.0   
2  23.0    Male    1.80    77.0  Frequently   no   2.0  3.0   no    no   2.0   
3  27.0    Male    1.80    87.0  Frequently   no   3.0  3.0   no    no   2.0   
4  22.0    Male    1.78    89.8   Sometimes   no   2.0  1.0   no    no   2.0   

  family_history_with_overweight  FAF  TUE       CAEC                 MTRANS  \
0                            yes  0.0  1.0  Sometimes  Public_Transportation   
1                            yes  3.0  0.0  Sometimes  Public_Transportation   
2                            yes  2.0  1.0  Sometimes  Public_Transportation   
3                             no  2.0  0.0  Sometimes                Walking   
4                             no  0.0  0.0  Sometimes  Public_Transportation   

      

In [4]:
# 2. HANDLING MISSING VALUES
# Count nulls column by column and show the breakdown.
nulls_per_column = df.isnull().sum()
print(nulls_per_column)

# Collapse the per-column counts into a single total for the whole frame.
missing_count = nulls_per_column.sum()

if missing_count == 0:
    print("No missing values found. Dataset is clean.")
else:
    # Drop every row that contains at least one null.
    df = df.dropna()
    print(f"Removed {missing_count} missing values.")

Age                               0
Gender                            0
Height                            0
Weight                            0
CALC                              0
FAVC                              0
FCVC                              0
NCP                               0
SCC                               0
SMOKE                             0
CH2O                              0
family_history_with_overweight    0
FAF                               0
TUE                               0
CAEC                              0
MTRANS                            0
NObeyesdad                        0
dtype: int64
No missing values found. Dataset is clean.


In [5]:
# 3. DATA FORMATTING (Renaming for clarity)
# Replace two abbreviated column codes with descriptive names.
readable_names = {
    'FAVC': 'HighCaloricFood',
    'NObeyesdad': 'ObesityLevel',
}
df.rename(columns=readable_names, inplace=True)

In [6]:
# 4. BINNING (Requirement: Grouping Continuous Data)
def bin_column(values, edges, labels):
    """Bin a numeric Series into labelled intervals using pd.cut.

    pd.cut builds right-closed intervals by default, so edges [a, b, c]
    yield the bins (a, b] and (b, c].

    Parameters:
        values: numeric pandas Series to bin.
        edges: ascending list of bin edges.
        labels: one label per interval (len(edges) - 1 items).

    Returns:
        A categorical Series aligned with `values`.

    Raises:
        ValueError: if any non-null value lies outside the outermost edges.
            pd.cut would otherwise silently turn such values into NaN.
    """
    binned = pd.cut(values, bins=edges, labels=labels)
    out_of_range = values.notna() & binned.isna()
    if out_of_range.any():
        raise ValueError(
            f"{int(out_of_range.sum())} value(s) in '{values.name}' fall outside "
            f"the bin edges {edges}"
        )
    return binned


# Binning Physical Activity (FAF)
# The first edge is -1 rather than 0 because the left edge of the first
# interval is exclusive; starting at 0 would leave FAF == 0.0 unbinned.
# Resulting intervals: (-1, 0.9] Sedentary, (0.9, 1.9] Moderate, (1.9, 3.0] Active.
bins_faf = [-1, 0.9, 1.9, 3.0]
labels_faf = ['Sedentary', 'Moderate', 'Active']
df['Activity_Level'] = bin_column(df['FAF'], bins_faf, labels_faf)
print("Created 'Activity_Level' (Sedentary, Moderate, Active)")

# Binning Technology Usage (TUE)
# Resulting intervals: (-1, 0.9] Low_Tech_Use, (0.9, 3.0] High_Tech_Use.
bins_tue = [-1, 0.9, 3.0]
labels_tue = ['Low_Tech_Use', 'High_Tech_Use']
df['Tech_Usage'] = bin_column(df['TUE'], bins_tue, labels_tue)
print("Created 'Tech_Usage' (Low_Tech_Use, High_Tech_Use)")

# Check the result
print(df[['FAF', 'Activity_Level', 'TUE', 'Tech_Usage']].head())

Created 'Activity_Level' (Sedentary, Moderate, Active)
Created 'Tech_Usage' (Low_Tech_Use, High_Tech_Use)
   FAF Activity_Level  TUE     Tech_Usage
0  0.0      Sedentary  1.0  High_Tech_Use
1  3.0         Active  0.0   Low_Tech_Use
2  2.0         Active  1.0  High_Tech_Use
3  2.0         Active  0.0   Low_Tech_Use
4  0.0      Sedentary  0.0   Low_Tech_Use


In [7]:
# NORMALIZATION (Requirement: Centering and Scaling)
# Min-max scale the numeric body-measurement columns. Results go into new
# '<name>_Scaled' columns, so the original values stay available.
cols_to_scale = ['Age', 'Height', 'Weight']
scaled_col_names = [f'{col}_Scaled' for col in cols_to_scale]
scaler = MinMaxScaler()

df[scaled_col_names] = scaler.fit_transform(df[cols_to_scale])

print("Scaled Height, Weight, and Age to 0-1 range. Example:")
preview_cols = ['Age', 'Age_Scaled', 'Weight', 'Weight_Scaled']
print(df[preview_cols].head())

Scaled Height, Weight, and Age to 0-1 range. Example:
    Age  Age_Scaled  Weight  Weight_Scaled
0  21.0    0.148936    64.0       0.186567
1  21.0    0.148936    56.0       0.126866
2  23.0    0.191489    77.0       0.283582
3  27.0    0.276596    87.0       0.358209
4  22.0    0.170213    89.8       0.379104


In [8]:
# INDICATOR VARIABLES (Requirement: Encoding Text to Numbers)
# Models need numeric input, so every text/categorical column is replaced by
# integer codes (e.g. "Female"/"Male" become 0/1).
# NOTE: LabelEncoder numbers classes in sorted order, so for multi-class
# columns the integer codes do not reflect any natural ordering.

# Create a copy for training later
df_processed = df.copy()
categorical_cols = df_processed.select_dtypes(include=['object', 'category']).columns

# Fit a separate encoder per column and keep each one. Refitting a single
# shared encoder would discard every mapping except the last column's, making
# it impossible to decode codes (e.g. predicted ObesityLevel) back to labels.
# Decode later with: label_encoders['ObesityLevel'].inverse_transform(codes)
label_encoders = {}
le = LabelEncoder()  # keeps `le` defined even if there are no categorical columns
for col in categorical_cols:
    le = LabelEncoder()
    df_processed[col] = le.fit_transform(df_processed[col].astype(str))
    label_encoders[col] = le

print("Converted all text columns to numbers.")
print(df_processed[['Gender', 'ObesityLevel']].head())

Converted all text columns to numbers.
   Gender  ObesityLevel
0       0             1
1       0             1
2       1             1
3       1             5
4       1             6
