# Machine Learning Model - Maternal Health Risk Prediction
## Predicting Future Health Indicators and Risk Levels

In [1]:
import sys
sys.path.append('..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import classification_report, confusion_matrix, mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder
import joblib
import os

%matplotlib inline

In [2]:
# Load the engineered-feature table that every model below is trained on.
# The path is relative to the kernel's working directory, so a missing
# upstream export or a kernel started elsewhere fails right here. Check up
# front and raise an actionable error that shows the resolved location
# instead of the bare "No such file or directory".
DATA_PATH = '../data/processed/maternal_health_with_features.csv'
if not os.path.exists(DATA_PATH):
    raise FileNotFoundError(
        f"Engineered feature file not found: {os.path.abspath(DATA_PATH)}. "
        "Run the upstream step that writes this file, and start the notebook "
        "from its own directory so the relative path resolves."
    )

df = pd.read_csv(DATA_PATH)
print(f"Dataset shape: {df.shape}")
df.head()

FileNotFoundError: [Errno 2] No such file or directory: '../data/processed/maternal_health_with_features.csv'

## Model 1: Risk Level Classification

In [None]:
# Build the design matrix (X) and target (y) for the risk-level classifier.

# Predictor columns, in the order the fitted model will see them.
# NOTE(review): if Risk_Level was derived from Maternal_Risk_Index (or from
# the other engineered indices), keeping them here leaks the target into the
# features — confirm against the feature-engineering code.
feature_cols = [
    'Years_Since_2000',
    'Births attended by skilled health personnel (%)',
    'Antenatal care coverage - at least four visits (%)',
    'Adolescent birth rate (per 1000 women)',
    'Prevalence of anaemia in women of reproductive age (aged 15-49) (%)',
    'Healthcare_Access_Index',
    'Maternal_Risk_Index',
    'Skilled_Birth_MA3',
    'Anaemia_MA3',
]

# Keep only rows where the target and every predictor are present; a single
# pass over the combined column list is equivalent to filtering on the
# target first and the predictors second.
df_class = df.dropna(subset=['Risk_Level', *feature_cols]).copy()

X = df_class.loc[:, feature_cols]
y = df_class.loc[:, 'Risk_Level']

print(f"Features shape: {X.shape}")
print(f"Target distribution:\n{y.value_counts()}")

In [None]:
# Hold out 20% of the rows for evaluation. Stratifying on the target keeps
# the risk-level mix comparable between the two partitions, and the fixed
# seed makes the partition repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")

In [None]:
# Fit the risk-level classifier on the training partition.
# class_weight='balanced' asks the forest to compensate for uneven class
# frequencies; random_state pins the bootstrap/feature sampling so the fit
# is repeatable. fit() returns the estimator itself, so it can be chained.
rf_classifier = RandomForestClassifier(
    class_weight='balanced',
    random_state=42,
    max_depth=10,
    n_estimators=100,
).fit(X_train, y_train)

# Score the held-out partition and summarise per-class quality.
y_pred = rf_classifier.predict(X_test)

print("Classification Report:", classification_report(y_test, y_pred), sep="\n")

In [None]:
# Confusion matrix for the held-out predictions.
# Pin the row/column order to the classifier's own class list. Without
# `labels`, confusion_matrix orders (and sizes) the matrix by whichever
# labels appear in y_test / y_pred, so a class that is absent from both
# would shrink the matrix and put it out of step with the tick labels
# taken from rf_classifier.classes_.
class_labels = rf_classifier.classes_
cm = confusion_matrix(y_test, y_pred, labels=class_labels)

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_labels,
            yticklabels=class_labels,
            ax=ax)
ax.set_title('Confusion Matrix - Risk Level Prediction', fontsize=14, fontweight='bold')
# Rows are the true labels and columns the predicted ones.
ax.set_ylabel('Actual')
ax.set_xlabel('Predicted')
fig.tight_layout()
plt.show()

In [None]:
# Rank the classifier's predictors by impurity-based importance.
# feature_cols is the column order X was built with, so it lines up
# one-to-one with rf_classifier.feature_importances_.
feature_importance = pd.DataFrame({
    'Feature': feature_cols,
    'Importance': rf_classifier.feature_importances_
}).sort_values('Importance', ascending=False)

fig, ax = plt.subplots(figsize=(10, 6))
# seaborn deprecated passing `palette` without `hue` (v0.13, removal planned
# for v0.14). Mapping hue to the same column as y keeps the one-colour-per-bar
# look; dodge=False keeps each bar centred at full width.
sns.barplot(data=feature_importance, x='Importance', y='Feature',
            hue='Feature', dodge=False, palette='viridis', ax=ax)
# The hue legend would only repeat the y tick labels. Depending on the
# seaborn version it may or may not have been drawn, so remove it only if
# it exists.
hue_legend = ax.get_legend()
if hue_legend is not None:
    hue_legend.remove()
ax.set_title('Feature Importance - Risk Classification', fontsize=14, fontweight='bold')
fig.tight_layout()
plt.show()

print("\nFeature Importance:")
print(feature_importance)

## Model 2: Skilled Birth Attendance Prediction (Regression)

In [None]:
# Build the design matrix and target for the skilled-birth-attendance
# regressor.

# Predictor columns, in the order the fitted model will see them.
# NOTE(review): Healthcare_Access_Index may itself be computed from skilled
# birth attendance; if so, it leaks the target into the features — confirm
# against the feature-engineering code.
regression_features = [
    'Years_Since_2000',
    'Antenatal care coverage - at least four visits (%)',
    'Adolescent birth rate (per 1000 women)',
    'Prevalence of anaemia in women of reproductive age (aged 15-49) (%)',
    'Healthcare_Access_Index',
]

# Keep only rows where the target and every predictor are present; one pass
# over the combined column list is equivalent to two successive filters.
df_reg = df.dropna(
    subset=['Births attended by skilled health personnel (%)', *regression_features]
).copy()

X_reg = df_reg.loc[:, regression_features]
y_reg = df_reg.loc[:, 'Births attended by skilled health personnel (%)']

print(f"Regression dataset shape: {X_reg.shape}")

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the
# partition repeatable.
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X_reg,
    y_reg,
    random_state=42,
    test_size=0.2,
)

# Fit the regressor on the training partition. fit() returns the estimator
# itself, so construction and training can be chained.
rf_regressor = RandomForestRegressor(
    random_state=42,
    max_depth=10,
    n_estimators=100,
).fit(X_train_reg, y_train_reg)

# Predict the held-out partition.
y_pred_reg = rf_regressor.predict(X_test_reg)

# Error metrics on the held-out partition: RMSE is in the target's own
# units (percentage points), R² is the share of variance explained.
mse = mean_squared_error(y_test_reg, y_pred_reg)
r2 = r2_score(y_test_reg, y_pred_reg)
rmse = np.sqrt(mse)

print(f"Root Mean Squared Error: {rmse:.2f}")
print(f"R² Score: {r2:.4f}")

In [None]:
# Scatter the held-out predictions against the observed values.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test_reg, y_pred_reg, alpha=0.6, s=100)

# Identity line across the observed range: points on it are exact
# predictions, points above/below are over-/under-estimates.
diagonal_ends = [y_test_reg.min(), y_test_reg.max()]
ax.plot(diagonal_ends, diagonal_ends,
        'r--', linewidth=2, label='Perfect Prediction')

ax.set_xlabel('Actual Skilled Birth Attendance (%)', fontsize=12)
ax.set_ylabel('Predicted Skilled Birth Attendance (%)', fontsize=12)
ax.set_title('Actual vs Predicted - Skilled Birth Attendance', fontsize=14, fontweight='bold')
ax.legend()
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

## Save Models

In [None]:
# Persist both fitted estimators together with the feature lists they were
# trained on, so a consumer can rebuild the input columns in the same order.
os.makedirs('../models', exist_ok=True)

# File name -> object to serialise; dict order is the write and report order.
artifacts = {
    'risk_classifier.pkl': rf_classifier,
    'skilled_birth_regressor.pkl': rf_regressor,
    'classification_features.pkl': feature_cols,
    'regression_features.pkl': regression_features,
}

for artifact_name, artifact in artifacts.items():
    joblib.dump(artifact, f'../models/{artifact_name}')

print("✅ Models saved successfully!")
for artifact_name in artifacts:
    print(f"  - models/{artifact_name}")

## Test Model Loading

In [None]:
# Round-trip check: read both estimators back from disk and report the
# types that were restored.
loaded_classifier, loaded_regressor = (
    joblib.load(f'../models/{pickle_name}')
    for pickle_name in ('risk_classifier.pkl', 'skilled_birth_regressor.pkl')
)

print("✅ Models loaded successfully!")
print(f"Classifier type: {type(loaded_classifier)}")
print(f"Regressor type: {type(loaded_regressor)}")