# Feature Engineering for Bhutan Maternal Health Data
## Creating Predictive Features and Risk Indicators

In [None]:
import sys
sys.path.append('..')

from src.data_loader import load_maternal_health_data
from src.preprocessing import handle_missing_values, create_year_categories
from src.utils import save_processed_data

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [None]:
# Read the maternal health CSV through the project loader, then pass it to
# handle_missing_values with the 'interpolate' strategy.
raw_csv = '../data/raw/cleaned_maternal_health_data.csv'
df = load_maternal_health_data(filepath=raw_csv)
df_clean = handle_missing_values(df, strategy='interpolate')

# Last expression of the cell: notebook preview of the cleaned frame.
df_clean.head()

## 1. Create Time-Based Features

In [None]:
# Start the feature table from the output of create_year_categories
# (presumably the source of the 'Period' column shown below; verify in
# src.preprocessing).
df_features = create_year_categories(df_clean)

year_col = df_features['Year']

# Offset of each year from the 2000 baseline.
df_features['Years_Since_2000'] = year_col.sub(2000)

# Decade the year falls in, e.g. 2013 -> 2010.
df_features['Decade'] = year_col.floordiv(10).mul(10)

df_features[['Year', 'Period', 'Years_Since_2000', 'Decade']].head()

## 2. Create Health Risk Indicators

In [None]:
# Maternal Health Composite Score (higher is better)
# Normalize key indicators and create weighted average

def normalize_0_100(series):
    """Min-max scale a numeric series onto the 0-100 range.

    Args:
        series: pandas Series of numeric values. Missing values are skipped
            by ``min()``/``max()`` and stay missing in the result.

    Returns:
        A Series with the same index in which the smallest value maps to 0
        and the largest to 100. A series with no spread (all values equal)
        maps to 0.0 everywhere rather than NaN, since dividing 0 by 0 would
        otherwise invalidate every score derived from it.
    """
    lowest = series.min()
    spread = series.max() - lowest
    if spread == 0:
        # Constant input: skip the 0/0 division. Entries that were missing in
        # the input are still missing after the subtraction.
        return (series - lowest).astype(float)
    return (series - lowest) / spread * 100

# Indicators where a higher raw value is better: scale to 0-100 as-is.
positive_indicators = {
    'Skilled_Birth_Norm': 'Births attended by skilled health personnel (%)',
    'Facility_Birth_Norm': 'Proportion of births delivered in a health facility (Facility births) (%)',
    'ANC_Coverage_Norm': 'Antenatal care coverage - at least four visits (%)',
}
for feature_name, source_column in positive_indicators.items():
    df_features[feature_name] = normalize_0_100(df_features[source_column])

# Indicators where a lower raw value is better: scale, then flip so that
# 100 always means "best" across every component of the score.
negative_indicators = {
    'Adolescent_Birth_Inv': 'Adolescent birth rate (per 1000 women)',
    'Anaemia_Inv': 'Prevalence of anaemia in women of reproductive age (aged 15-49) (%)',
}
for feature_name, source_column in negative_indicators.items():
    df_features[feature_name] = 100 - normalize_0_100(df_features[source_column])

# Composite Health Score: weighted average of the five 0-100 components.
# The weights sum to 1.0, so the score stays on the 0-100 scale.
score_weights = {
    'Skilled_Birth_Norm': 0.25,
    'Facility_Birth_Norm': 0.25,
    'ANC_Coverage_Norm': 0.20,
    'Adolescent_Birth_Inv': 0.15,
    'Anaemia_Inv': 0.15,
}
df_features['Maternal_Health_Score'] = sum(
    df_features[component] * weight for component, weight in score_weights.items()
)

# Risk categories. include_lowest=True keeps a score of exactly 0 (every
# indicator at its worst in the same row) in 'High Risk'; without it the
# right-closed bins exclude 0 and that row's Risk_Level would be NaN.
df_features['Risk_Level'] = pd.cut(
    df_features['Maternal_Health_Score'],
    bins=[0, 40, 70, 100],
    labels=['High Risk', 'Medium Risk', 'Low Risk'],
    include_lowest=True,
)

print("Maternal Health Score Statistics:")
print(df_features['Maternal_Health_Score'].describe())
print("\nRisk Level Distribution:")
print(df_features['Risk_Level'].value_counts())

## 3. Create Growth Rate Features

In [None]:
# Indicators whose row-to-row percentage change is tracked.
key_indicators = [
    'Births attended by skilled health personnel (%)',
    'Antenatal care coverage - at least four visits (%)',
    'Adolescent birth rate (per 1000 women)',
    'Prevalence of anaemia in women of reproductive age (aged 15-49) (%)'
]

# Single-pass label cleanup used to build column names: spaces become
# underscores, parentheses are dropped, '%' becomes 'pct'.
label_cleanup = str.maketrans({' ': '_', '(': None, ')': None, '%': 'pct'})

# Percentage change relative to the previous row.
# NOTE(review): assumes rows are consecutive years in ascending order - confirm.
for source_column in key_indicators:
    stem = source_column.translate(label_cleanup)
    df_features[f'{stem}_YoY_Change'] = df_features[source_column].pct_change().mul(100)

# 3-row rolling means; min_periods=1 lets the first rows average over
# whatever history is available instead of producing NaN.
rolling_targets = {
    'Skilled_Birth_MA3': 'Births attended by skilled health personnel (%)',
    'Anaemia_MA3': 'Prevalence of anaemia in women of reproductive age (aged 15-49) (%)',
}
for ma_column, source_column in rolling_targets.items():
    df_features[ma_column] = df_features[source_column].rolling(window=3, min_periods=1).mean()

df_features[['Year', 'Skilled_Birth_MA3', 'Anaemia_MA3']].tail(10)

## 4. Create Lagged Features for Forecasting

In [None]:
# Columns that get one- and two-row lagged copies for forecasting.
lag_columns = [
    'Births attended by skilled health personnel (%)',
    'Prevalence of anaemia in women of reproductive age (aged 15-49) (%)'
]

# Per-character rewrite for column-name stems: space -> '_', parentheses
# removed, '%' -> 'pct'; every other character is kept unchanged.
char_rewrites = {' ': '_', '(': '', ')': '', '%': 'pct'}

for source_column in lag_columns:
    stem = ''.join(char_rewrites.get(ch, ch) for ch in source_column)
    # shift(k) moves values down k rows, so each row sees the value from
    # k rows earlier; the first k rows become NaN.
    for steps_back in (1, 2):
        df_features[f'{stem}_Lag{steps_back}'] = df_features[source_column].shift(steps_back)

df_features.tail(10)

## 5. Interaction Features

In [None]:
# Healthcare access: plain average of the skilled-attendance and
# facility-birth percentages.
skilled_pct = df_features['Births attended by skilled health personnel (%)']
facility_pct = df_features['Proportion of births delivered in a health facility (Facility births) (%)']
df_features['Healthcare_Access_Index'] = (skilled_pct + facility_pct) / 2

# Maternal risk: average of the two "lower is better" indicators after each
# is scaled to 0-100 (not inverted, so a higher index means higher risk).
adolescent_scaled = normalize_0_100(df_features['Adolescent birth rate (per 1000 women)'])
anaemia_scaled = normalize_0_100(
    df_features['Prevalence of anaemia in women of reproductive age (aged 15-49) (%)']
)
df_features['Maternal_Risk_Index'] = (adolescent_scaled + anaemia_scaled) / 2

# Summary statistics for both indices.
index_summaries = (
    ("Healthcare Access Index:", 'Healthcare_Access_Index'),
    ("\nMaternal Risk Index:", 'Maternal_Risk_Index'),
)
for heading, index_column in index_summaries:
    print(heading)
    print(df_features[index_column].describe())

## 6. Visualize Engineered Features

In [None]:
# Line chart of the composite score against year.
fig, ax = plt.subplots(figsize=(14, 6))
ax.plot(
    df_features['Year'],
    df_features['Maternal_Health_Score'],
    color='green',
    marker='o',
    markersize=6,
    linewidth=2,
)
ax.set_xlabel('Year', fontsize=12)
ax.set_ylabel('Maternal Health Score', fontsize=12)
ax.set_title('Composite Maternal Health Score Over Time (2000-2023)', fontsize=14, fontweight='bold')
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

In [None]:
# Scatter of access index (x) against risk index (y); point colour encodes
# the year, so the colour gradient shows movement over time.
fig, ax = plt.subplots(figsize=(10, 6))
scatter = ax.scatter(
    df_features['Healthcare_Access_Index'],
    df_features['Maternal_Risk_Index'],
    c=df_features['Year'],
    cmap='viridis',
    s=100,
    alpha=0.6,
)
fig.colorbar(scatter, ax=ax, label='Year')
ax.set_xlabel('Healthcare Access Index', fontsize=12)
ax.set_ylabel('Maternal Risk Index', fontsize=12)
ax.set_title('Healthcare Access vs Maternal Risk Over Time', fontsize=14, fontweight='bold')
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

## 7. Save Engineered Features

In [None]:
# Report the width of the final table and list every column in it.
# The count covers all columns of df_features, not only the engineered ones.
n_columns = df_features.shape[1]
print("Total features created:", n_columns)
print("\nFeature columns:")
for column_label in df_features.columns:
    print(f"  - {column_label}")

In [None]:
# Write the engineered feature table to the processed-data folder.
import os

# Create the target folder if needed; exist_ok avoids an error on re-runs.
os.makedirs('../data/processed', exist_ok=True)

# index=False: the row index is not written as a column.
df_features.to_csv('../data/processed/maternal_health_with_features.csv', index=False)

# The status emoji was mis-decoded (UTF-8 bytes read as cp1252); restored.
print("\n✅ Feature engineering complete!")
print("Saved to: data/processed/maternal_health_with_features.csv")