In [27]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
import os
from datetime import datetime

# 1. Load Data and Perform Initial Preprocessing
# The raw CSV path is relative to the notebook's working directory.
input_csv_path = os.path.join('..', 'data', 'raw', 'ObesityDataSet_raw_and_data_sinthetic.csv')

try:
    df = pd.read_csv(input_csv_path)
except FileNotFoundError as err:
    # Raise instead of calling exit(): exit() ends the interpreter/kernel session
    # without a traceback, whereas raising stops this cell (and any "Run All")
    # with a clear error and leaves the session usable.
    raise FileNotFoundError(f"ERROR: The file was not found at '{input_csv_path}'") from err

print(f"Dataset loaded successfully from: {input_csv_path}")


Dataset loaded successfully from: ../data/raw/ObesityDataSet_raw_and_data_sinthetic.csv


In [28]:
# Work on an independent (deep) copy so the raw frame `df` stays untouched
df_processed = df.copy(deep=True)

In [29]:
# 2. Preprocessing
# CAEC and CALC are treated as ordered features, so each level is replaced by
# its rank (0..3) instead of being one-hot encoded.
frequency_levels = ['no', 'Sometimes', 'Frequently', 'Always']
caec_mapping = {level: rank for rank, level in enumerate(frequency_levels)}
calc_mapping = dict(caec_mapping)  # same scale, kept as a separate dict

# Series.map leaves NaN for any value that is not a key of the mapping.
for column, mapping in (('CAEC', caec_mapping), ('CALC', calc_mapping)):
    df_processed[column] = df_processed[column].map(mapping)

In [30]:
# Age Categorization
# Half-open bins (right=False): [0, 18), [18, 35), [35, 55), [55, inf).
# The upper edge is deliberately open-ended. Using the observed maximum age as
# the last edge would exclude rows at exactly that age (pd.cut returns NaN for
# them), and after one-hot encoding with drop_first=True such rows would be
# indistinguishable from the dropped baseline category. It would also raise
# if the maximum age were not greater than 55 (bin edges must increase).
bins = [0, 18, 35, 55, float('inf')]
labels = ['Adolescent', 'Young Adult', 'Adult', 'Senior']
df_processed['Age_Category'] = pd.cut(df_processed['Age'], bins=bins, labels=labels, right=False)

In [31]:
# Label Encoding for the target variable 'NObeyesdad'
# The fitted encoder stays available as `le`, so the integer codes can later
# be translated back to the original class names.
le = LabelEncoder()
le.fit(df_processed['NObeyesdad'])
df_processed['NObeyesdad'] = le.transform(df_processed['NObeyesdad'])

In [32]:
# Encoding binary categorical features
# Each yes/no column becomes 1/0; any other value is turned into NaN by Series.map.
yes_no_codes = {'yes': 1, 'no': 0}
binary_features = ['family_history_with_overweight', 'FAVC', 'SMOKE', 'SCC']
for feature in binary_features:
    encoded_column = df_processed[feature].map(yes_no_codes)
    df_processed[feature] = encoded_column

In [33]:
# One-Hot Encode remaining nominal features
# 'CAEC' and 'CALC' are not listed because they were already ordinally encoded.
# drop_first=True omits the first level of every feature from the dummy columns.
categorical_features = ['Gender', 'MTRANS', 'Age_Category']
df_processed = (
    pd.get_dummies(df_processed, columns=categorical_features, drop_first=True)
    # The raw 'Age' column is removed; 'Age_Category' dummies stand in for it.
    .drop(columns=['Age'])
)

print("\nInitial encoding and feature engineering complete.")
print("Preprocessed data shape:", df_processed.shape)


Initial encoding and feature engineering complete.
Preprocessed data shape: (2111, 22)


In [34]:
# 3. Split Data into Features (X) and Target (y)
# y is the encoded 'NObeyesdad' column; X is every other column.
target_column = 'NObeyesdad'
y = df_processed[target_column]
X = df_processed.drop(columns=[target_column])

In [35]:
# 4. Split Data into Training and Testing Sets
# 80/20 split, stratified on the target, with a fixed seed for reproducibility.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts

n_train, n_test = len(X_train), len(X_test)
print(f"\nData split into training ({n_train} rows) and testing ({n_test} rows) sets.")


Data split into training (1688 rows) and testing (423 rows) sets.


In [36]:
# 5. Create a Unique Directory for Output Files
# The folder name carries a second-resolution timestamp so each run writes to
# its own directory.
timestamp = f"{datetime.now():%Y%m%d_%H%M%S}"
run_folder = "preprocessed_data_" + timestamp
output_dir = os.path.join('..', 'data', 'preprocessed', run_folder)

# exist_ok=True: no error if the directory is already there
os.makedirs(output_dir, exist_ok=True)
print(f"Created directory for output files: '{output_dir}/'")

Created directory for output files: '../data/preprocessed/preprocessed_data_20250720_131842/'


In [37]:
# 6. Save the Baseline (Unscaled) Datasets
# File name -> object to write; all are written without the index column.
baseline_outputs = {
    'X_train_baseline.csv': X_train,
    'X_test_baseline.csv': X_test,
    'y_train_baseline.csv': y_train,
    'y_test_baseline.csv': y_test,
}
for file_name, dataset in baseline_outputs.items():
    dataset.to_csv(os.path.join(output_dir, file_name), index=False)

print("\nSaved baseline (unscaled) training and testing sets.")


Saved baseline (unscaled) training and testing sets.


In [40]:
# 7. Apply Scaling

# MinMaxScaler configured to map each fitted column onto the range [1, 5]
scaler = MinMaxScaler(feature_range=(1, 5))

print(f"\nUsing scaler: {type(scaler).__name__}")

# Columns to scale: the continuous measurements plus the ordinally encoded
# CAEC / CALC. No 'BMI' column is listed (none is built in the cells shown).
numerical_features = ['Weight', 'Height', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE', 'CAEC', 'CALC']

# Scale copies so the baseline X_train / X_test frames stay unscaled
X_train_scaled, X_test_scaled = X_train.copy(), X_test.copy()

# The scaler learns its min/max from the training rows only; the test rows are
# transformed with those training statistics (so they may fall outside [1, 5]).
X_train_scaled[numerical_features] = scaler.fit_transform(X_train[numerical_features])
X_test_scaled[numerical_features] = scaler.transform(X_test[numerical_features])

print("Scaling applied successfully.")


Using scaler: MinMaxScaler
Scaling applied successfully.


In [41]:
# 8. Save the Final Scaled Datasets
# The targets are not scaled; they are written again under the *_scaled names
# so the scaled feature files have matching target files alongside them.
scaled_outputs = (
    ('X_train_scaled.csv', X_train_scaled),
    ('X_test_scaled.csv', X_test_scaled),
    ('y_train_scaled.csv', y_train),
    ('y_test_scaled.csv', y_test),
)
for file_name, dataset in scaled_outputs:
    destination = os.path.join(output_dir, file_name)
    dataset.to_csv(destination, index=False)

print("Saved final scaled training and testing sets.")
print(f"\nAll files saved in: {output_dir}")

Saved final scaled training and testing sets.

All files saved in: ../data/preprocessed/preprocessed_data_20250720_131842
