In [1]:
import pandas as pd
import numpy as np

# Location of the raw DHS Stata file read by the extraction cell.
dhs_path = "data_private/dhs/india/raw/IAKR7EFL.DTA"

# Friendly column name -> original DHS variable name.
# Key order is preserved, so it also fixes the order of the extracted columns.
FEATURE_COLUMNS = dict(
    fever="h22",              # Fever in last 2 weeks
    age_months="b19",         # Child age in months
    state="v024",             # State
    residence_type="v025",    # Urban / Rural
    slept_under_net="ml0",    # ITN usage
    anemia_level="hw57",      # Anemia severity (proxy risk)
    interview_month="v006",   # Seasonality signal
)

# Number of rows read from the file per chunk.
chunk_size = 50000

In [2]:
# Extract the required DHS columns chunk by chunk and build `symptom_df`.
chunks = []

# Use only required columns to save memory
required_cols = list(FEATURE_COLUMNS.values())
rename_map = {v: k for k, v in FEATURE_COLUMNS.items()}

# Rows are dropped only when ALL of these symptom indicators are missing.
# Fever is the only symptom left (current_fever and convulsions were removed).
symptoms = ["fever"]

print(f"Reading {dhs_path} in chunks of {chunk_size}...")

try:
    with pd.read_stata(
        dhs_path,
        columns=required_cols,
        convert_categoricals=False,
        iterator=True,
        chunksize=chunk_size
    ) as reader:

        for i, chunk in enumerate(reader):
            # Rename to friendly names, then drop rows with no symptom information
            chunk = chunk.rename(columns=rename_map).dropna(subset=symptoms, how='all')

            chunks.append(chunk)
            print(f"Processed Chunk {i+1}: {chunk.shape[0]} rows kept")

except FileNotFoundError:
    # Print the friendly hint, then re-raise: every later cell needs `symptom_df`,
    # so continuing would only fail further down with a misleading NameError.
    print(f"Error: The file {dhs_path} was not found.")
    raise
except ValueError as ve:
    print(f"ValueError: {ve}")
    raise

if not chunks:
    raise ValueError(f"No data could be read from {dhs_path}")

# Concatenate all chunks
symptom_df = pd.concat(chunks, ignore_index=True)

print("\n--- Processing Complete ---")
print(f"Final Data Shape: {symptom_df.shape}")

print("\n% Missing Values per Column:")
print((symptom_df.isnull().sum() / len(symptom_df) * 100).round(2))

print("\nValue Counts: Fever (Last 2 Weeks)")
print(symptom_df['fever'].value_counts(dropna=False))

In [3]:
# Build rule-based malaria risk labels (0 = Low, 1 = Medium, 2 = High) on a copy
# of the extracted data, so the original extraction stays untouched.
risk_df = symptom_df.copy()

# 1. Clean DHS Keys (8 = Don't Know -> NaN)
risk_df['fever'] = risk_df['fever'].replace(8, np.nan)
risk_df['slept_under_net'] = risk_df['slept_under_net'].replace(8, np.nan)

# 2. Rule-Based Labeling
# Default Risk = 0 (Low)
risk_df['malaria_risk'] = 0

# Helper masks
has_fever = (risk_df['fever'] == 1)
used_net = (risk_df['slept_under_net'] == 1)
# Everything that is not a confirmed net user counts as "no net": code 0, missing
# values, and any other code. Previously only 0/NaN matched, so feverish rows with
# another code fell through both net rules and silently stayed at Low risk.
# NOTE(review): confirm the ml0 coding; if other codes also mean "slept under a
# treated net", widen `used_net` instead.
no_net = ~used_net

# IMPORTANT: DHS Anemia Levels (hw57)
# 1 = Severe, 2 = Moderate, 3 = Mild, 4 = Not Anemic
# We target Moderate/Severe (1 or 2)
severe_anemia = risk_df['anemia_level'].isin([1, 2])

# Rule: Fever == 1 AND Net == 1 -> Medium Risk (1)
risk_df.loc[has_fever & used_net, 'malaria_risk'] = 1

# Rule: Fever == 1 AND no confirmed net use -> High Risk (2)
risk_df.loc[has_fever & no_net, 'malaria_risk'] = 2

# Rule: Fever == 1 AND Moderate/Severe Anemia -> Force High Risk (2)
# Applied last so it overrides the Medium label for net users.
risk_df.loc[has_fever & severe_anemia, 'malaria_risk'] = 2

# Invariant: the fever rules are exhaustive, so no feverish row may remain Low.
assert (risk_df.loc[has_fever, 'malaria_risk'] > 0).all(), "feverish rows left at Low risk"

# 3. Validation Output
print("Malaria Risk Distribution:")
print(risk_df['malaria_risk'].value_counts(dropna=False).sort_index())

print("\nCross-tab: Fever vs Malaria Risk")
print(pd.crosstab(risk_df['fever'].fillna("Missing"), risk_df['malaria_risk']))

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer

# Train and evaluate a RandomForest on the rule-based risk labels.
# NOTE(review): `malaria_risk` is assigned by rules over fever, slept_under_net and
# anemia_level, and all three are also model inputs. The scores below therefore
# mostly show how well the forest re-learns those rules, not predictive skill.

# --- 1. Data Cleaning & Encoding ---
model_df = risk_df.copy()

# Safeguard: treat code 8 (Don't Know) in anemia_level as missing, matching the
# cleaning already applied to fever and slept_under_net.
model_df['anemia_level'] = model_df['anemia_level'].replace(8, np.nan)

# Impute Missing Values
# Missing values become the placeholder -1 so the model sees them as their own value.
# Dropping rows instead could lose 'High Risk' cases labelled despite missing features.
imputer = SimpleImputer(strategy='constant', fill_value=-1)
cols_to_impute = ['fever', 'age_months', 'slept_under_net', 'anemia_level', 'interview_month']
model_df[cols_to_impute] = imputer.fit_transform(model_df[cols_to_impute])

# Encode Categoricals (State, Residence)
# Values are cast to str first so each encoder sees one consistent type.
le_state = LabelEncoder()
model_df['state'] = le_state.fit_transform(model_df['state'].astype(str))

le_res = LabelEncoder()
model_df['residence_type'] = le_res.fit_transform(model_df['residence_type'].astype(str))

# --- 2. Split Data ---
feature_names = ['fever', 'age_months', 'state', 'residence_type', 'slept_under_net', 'anemia_level', 'interview_month']
X = model_df[feature_names]
y = model_df['malaria_risk']

# Stratified 80/20 split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# --- 3. Train Model ---
print("Training RandomForestClassifier...")
rf_model = RandomForestClassifier(n_estimators=50, random_state=42, n_jobs=-1)
rf_model.fit(X_train, y_train)

# --- 4. Evaluate ---
y_pred = rf_model.predict(X_test)

# Pin the label set explicitly. Without `labels`, classification_report raises when
# a risk class is absent from y_test/y_pred (three target_names but fewer classes),
# and the confusion matrix would change shape from run to run.
risk_labels = [0, 1, 2]
risk_names = ['Low', 'Medium', 'High']

print("\nClassification Report:")
print(classification_report(
    y_test,
    y_pred,
    labels=risk_labels,
    target_names=risk_names,
    zero_division=0,  # a class with no samples reports 0 instead of warning
))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred, labels=risk_labels))

# --- 5. Feature Importance ---
print("\nFeature Importance Ranking:")
importance_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': rf_model.feature_importances_
}).sort_values(by='Importance', ascending=False)

print(importance_df)

In [5]:
import joblib
import os

# Persist the trained model together with the fitted preprocessors so the
# inference app can load everything from a single file.
model_dir = "apps/inference/models"
os.makedirs(model_dir, exist_ok=True)  # no-op when the directory already exists

# Everything needed to reproduce the training-time preprocessing at inference.
model_bundle = dict(
    model=rf_model,
    imputer=imputer,
    le_state=le_state,
    le_res=le_res,
    features=[
        'fever',
        'age_months',
        'state',
        'residence_type',
        'slept_under_net',
        'anemia_level',
        'interview_month',
    ],
    cols_to_impute=cols_to_impute,
)

save_path = f"{model_dir}/malaria_symptoms_dhs.pkl"
joblib.dump(model_bundle, save_path)

print(f"✅ Model and preprocessors saved to {save_path}")

✅ Model and preprocessors saved to apps/inference/models/malaria_symptoms_dhs.pkl
