# Data Preprocessing

This notebook handles missing values, encodes categorical features, scales numerical features (`Age`), and engineers new interaction features. The result is saved to `data/preprocessed_data.csv`.

### Step 1: Load Dataset from Previous Notebook

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

# Load the dataset from its CSV file into a DataFrame.
df = pd.read_csv('data/dataset_loaded.csv')

# Report the frame's dimensions, then preview its leading rows.
n_rows, n_cols = df.shape
print(f'Dataset loaded: {n_rows} rows × {n_cols} columns')
print('\nFirst few records:')
print(df.head())

### Step 2: Handle Missing Values

In [None]:
print('═' * 60)
print('HANDLING MISSING VALUES')
print('═' * 60)

# Work on a copy so the frame loaded in Step 1 is left untouched.
df_processed = df.copy()

# For each of these columns a missing entry becomes its own 'None' category
# instead of being imputed or dropped.
#
# The filled Series is assigned back to the column rather than calling
# `df_processed[col].fillna('None', inplace=True)`: that form mutates an
# intermediate Series, is deprecated in pandas 2.x, and does not update the
# DataFrame at all once Copy-on-Write is enabled.
for column in ('Alcohol Consumption', 'Medical Conditions', 'Medications'):
    print(f'Filling {column} with "None"...')
    df_processed[column] = df_processed[column].fillna('None')

print('\n✓ All missing values handled!')
# Counts NaNs across the whole frame, not only the three columns above.
print(f'Total missing values remaining: {df_processed.isnull().sum().sum()}')

### Step 3: Binary Feature Encoding

In [None]:
print('═' * 60)
print('BINARY FEATURE ENCODING')
print('═' * 60)

df_encoded = df_processed.copy()

# Column name -> {label: integer code}. Columns are encoded in the order
# listed; within each mapping the first label becomes 0 and the second 1.
binary_mappings = {
    'Gender': {'Male': 0, 'Female': 1},
    'Hormonal Changes': {'Normal': 0, 'Postmenopausal': 1},
    'Body Weight': {'Normal': 0, 'Underweight': 1},
    'Calcium Intake': {'Adequate': 0, 'Low': 1},
    'Vitamin D Intake': {'Sufficient': 0, 'Insufficient': 1},
    'Physical Activity': {'Active': 0, 'Sedentary': 1},
    'Smoking': {'No': 0, 'Yes': 1},
    'Prior Fractures': {'No': 0, 'Yes': 1},
    'Family History': {'No': 0, 'Yes': 1},
    'Alcohol Consumption': {'None': 0, 'Moderate': 1},
    'Medications': {'None': 0, 'Corticosteroids': 1},
}

for column, mapping in binary_mappings.items():
    raw_values = df_encoded[column]
    encoded_values = raw_values.map(mapping)

    # Series.map() converts any label that is not a key of `mapping` into NaN
    # without raising. Fail fast on such labels so an unexpected category
    # cannot silently turn into missing data. Entries that were already
    # missing before mapping are not treated as errors here.
    unmapped = raw_values[encoded_values.isna() & raw_values.notna()].unique()
    if len(unmapped) > 0:
        raise ValueError(
            f'Column {column!r} contains labels with no binary encoding: '
            f'{sorted(map(str, unmapped))}'
        )

    df_encoded[column] = encoded_values
    legend = ', '.join(f'{label}={code}' for label, code in mapping.items())
    print(f'{column}: {legend}')

print('\n✓ Binary features encoded successfully!')

### Step 4: Multi-Category Feature Encoding (One-Hot Encoding)

In [None]:
print('═' * 60)
print('MULTI-CATEGORY FEATURE ENCODING')
print('═' * 60)

# Race/Ethnicity: swap the source column for one indicator column per
# category, named with the 'Race' prefix and appended at the end of the frame.
print('\nEncoding Race/Ethnicity...')
race_dummies = pd.get_dummies(df_encoded['Race/Ethnicity'], prefix='Race')
df_encoded = pd.concat(
    [df_encoded.drop(columns='Race/Ethnicity'), race_dummies],
    axis=1,
)
print(f'Created features: {race_dummies.columns.tolist()}')

# Medical Conditions: same treatment, using the 'Condition' prefix.
print('\nEncoding Medical Conditions...')
conditions_dummies = pd.get_dummies(df_encoded['Medical Conditions'], prefix='Condition')
df_encoded = pd.concat(
    [df_encoded.drop(columns='Medical Conditions'), conditions_dummies],
    axis=1,
)
print(f'Created features: {conditions_dummies.columns.tolist()}')

print('\n✓ Multi-category features encoded successfully!')
# Two columns (ID and target) are subtracted from the column count.
n_features_after_encoding = df_encoded.shape[1] - 2
print(f'Total features after encoding: {n_features_after_encoding}')

### Step 5: Feature Scaling

In [None]:
print('═' * 60)
print('FEATURE SCALING')
print('═' * 60)

scaler = StandardScaler()

# NOTE(review): the scaler is fitted on every row of df_encoded. No train/test
# split is visible in this notebook; if one is made later, held-out rows have
# already contributed to the fitted mean/scale. Confirm this is acceptable.

# Standardise the Age column (zero mean, unit variance).
print('\nScaling Age feature using StandardScaler...')
original_age_mean = df_encoded['Age'].mean()
# StandardScaler uses the population standard deviation (ddof=0), whereas
# pandas' Series.std() defaults to the sample estimate (ddof=1). Using ddof=0
# for the before/after report keeps both figures comparable and lets the
# scaled std print as 1.
original_age_std = df_encoded['Age'].std(ddof=0)

df_encoded['Age'] = scaler.fit_transform(df_encoded[['Age']])

print(f'Original Age - Mean: {original_age_mean:.2f}, Std: {original_age_std:.2f}')
print(f'Scaled Age - Mean: {df_encoded["Age"].mean():.6f}, Std: {df_encoded["Age"].std(ddof=0):.6f}')
print('\n✓ Feature scaling complete!')

# Save the fitted scaler for later use.
import os
import joblib

# joblib.dump() does not create parent directories; make sure the target
# directory exists so a fresh checkout does not fail with FileNotFoundError.
os.makedirs('models', exist_ok=True)
joblib.dump(scaler, 'models/age_scaler.pkl')
print('✓ Scaler saved for model deployment!')

### Step 6: Feature Engineering

In [None]:
print('═' * 60)
print('FEATURE ENGINEERING')
print('═' * 60)

print('\nCreating interaction features...')

# Each entry: (new column, left factor, right factor, progress message).
# Every new column is the element-wise product of its two factor columns.
interaction_specs = [
    # Age × Hormonal Changes (critical for bone health)
    ('Age_x_Hormonal', 'Age', 'Hormonal Changes',
     'Created: Age × Hormonal Changes'),
    # Calcium × Vitamin D (synergistic effect)
    ('Calcium_x_VitaminD', 'Calcium Intake', 'Vitamin D Intake',
     'Created: Calcium Intake × Vitamin D Intake'),
    # Physical Activity × Smoking (lifestyle interaction)
    ('Activity_x_Smoking', 'Physical Activity', 'Smoking',
     'Created: Physical Activity × Smoking'),
]

for new_column, left_factor, right_factor, message in interaction_specs:
    df_encoded[new_column] = df_encoded[left_factor] * df_encoded[right_factor]
    print(message)

print('\n✓ Feature engineering complete!')
# Same "minus two" convention as the feature count after encoding.
n_features_after_engineering = df_encoded.shape[1] - 2
print(f'Total features after engineering: {n_features_after_engineering}')

### Step 7: Display Preprocessed Features

In [None]:
print('═' * 60)
print('PREPROCESSED FEATURE LIST')
print('═' * 60)

# Every column except 'Id' and 'Osteoporosis' is listed as a feature, in the
# frame's current column order.
non_feature_columns = ('Id', 'Osteoporosis')
feature_cols = [
    name for name in df_encoded.columns
    if name not in non_feature_columns
]

# Numbered listing, starting at 1, with the number right-aligned to width 2.
for position, name in enumerate(feature_cols, start=1):
    print(f'{position:2d}. {name}')

n_listed = len(feature_cols)
print(f'\nTotal Features: {n_listed}')

### Step 8: Save Preprocessed Data

In [None]:
# Write the preprocessed frame to CSV; index=False leaves the row index out
# of the file.
df_encoded.to_csv('data/preprocessed_data.csv', index=False)
print('✓ Preprocessed data saved!')

# Closing banner: a blank line, then the message framed by two rules.
print(
    '\n' + '═' * 60,
    'Data preprocessing complete!',
    'Ready to proceed to Model Training notebook.',
    '═' * 60,
    sep='\n',
)

### Next Notebook

Proceed to **04_Model_Training.ipynb** to train gender-specific XGBoost models.