In [99]:
# %%
# Preprocess the balanced synthetic risk dataset
# - Data cleaning & clipping
# - One-hot encoding: weather_condition
# - Label encoding: risk_label (LOW→0, MODERATE→1, HIGH→2, CRITICAL→3)
# - Train/validation/test split (70/15/15, stratified)
# - Impute + scale numerics
# - Save artifacts

import os
from typing import Dict, Tuple, List, Any

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

In [100]:
# ---------- Paths ----------
# Location of the input dataset, resolved relative to the working directory.
# Set the RISK_DATASET_CSV environment variable to load another file without
# editing this cell; when it is unset the original default file name is used.
INPUT_CSV: str = os.environ.get("RISK_DATASET_CSV", "synthetic_risk_dataset_balanced_10k.csv")

In [101]:
# ---------- Schema ----------
# Numeric feature columns, listed in the order they are selected from the frame.
NUMERIC_COLS: List[str] = (
    ["hr_bpm", "spo2_pct", "skin_temp", "bloodpressure_systolic", "bp_diastolic"]  # vitals
    + ["altitude", "latitude", "longitude"]                                       # location
    + ["steps", "past_incident_flag"]                                             # activity / history; flag stays numeric (0/1)
)
# The single categorical feature and the label column.
CAT_COL: str = "weather_condition"
TARGET_COL: str = "risk_label"

In [102]:

# Ordinal encoding of the risk classes: position in the tuple is the integer code
# (LOW -> 0, MODERATE -> 1, HIGH -> 2, CRITICAL -> 3).
RISK_ORDER: Dict[str, int] = {
    label: rank
    for rank, label in enumerate(("LOW", "MODERATE", "HIGH", "CRITICAL"))
}

In [103]:
# Plausibility limits per column as (lower, upper).
# Values outside a range are clipped to the nearest edge, not dropped.
BOUNDS: Dict[str, Tuple[float, float]] = dict(
    hr_bpm=(30, 220),
    spo2_pct=(60, 100),
    skin_temp=(30, 45),
    bloodpressure_systolic=(70, 250),
    bp_diastolic=(40, 150),
    altitude=(-430, 9000),
    latitude=(-90, 90),
    longitude=(-180, 180),
    steps=(0, 100000),
    past_incident_flag=(0, 1),
)

In [104]:
# ---------- Cleaning helpers ----------
def normalize_column_names(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` whose column labels have surrounding whitespace removed.

    The input frame is left untouched. Every label must provide ``.strip()``
    (i.e. be a string).
    """
    cleaned = df.copy()
    stripped_labels = []
    for label in cleaned.columns:
        stripped_labels.append(label.strip())
    cleaned.columns = stripped_labels
    return cleaned

def basic_type_cast(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with the schema columns coerced to their working types.

    * Every column listed in ``NUMERIC_COLS`` is converted with ``pd.to_numeric``;
      unparseable entries become NaN.
    * ``CAT_COL`` is converted to whitespace-stripped text.
    * ``TARGET_COL`` is converted to whitespace-stripped, upper-cased text.

    Missing values in the two text columns are kept as missing. A plain
    ``astype(str)`` would turn them into the literal text ``"nan"``, which hides
    them from any later missing-value handling and makes ``"nan"`` look like a
    genuine category or label.

    Columns that are absent from ``df`` are skipped.
    """
    out = df.copy()

    for c in NUMERIC_COLS:
        if c in out.columns:
            out[c] = pd.to_numeric(out[c], errors="coerce")

    # (column, upper-case?) for the two text columns
    for col, to_upper in ((CAT_COL, False), (TARGET_COL, True)):
        if col not in out.columns:
            continue
        was_missing = out[col].isna()  # remember gaps before stringification
        text = out[col].astype(str).str.strip()
        if to_upper:
            text = text.str.upper()
        # Put NaN back wherever the source value was missing.
        out[col] = text.mask(was_missing)

    return out

def clip_sanity_bounds(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with each bounded column clipped to its ``BOUNDS`` range.

    Only columns that appear both in ``BOUNDS`` and in ``df`` are touched;
    values below the lower limit or above the upper limit are replaced by
    that limit. The input frame is not modified.
    """
    clipped = df.copy()
    bounded_present = [name for name in BOUNDS if name in clipped.columns]
    for name in bounded_present:
        low, high = BOUNDS[name]
        clipped[name] = clipped[name].clip(lower=low, upper=high)
    return clipped

def norm_lab(x: Any) -> str:
    """Map a raw label onto its canonical risk class name.

    The value is stringified, stripped and upper-cased, then looked up among
    the known synonyms of each class. Anything unrecognized is returned in its
    stripped, upper-cased form.
    """
    synonyms = {
        "LOW": ("LOW", "NORMAL", "SAFE"),
        "MODERATE": ("MODERATE", "MEDIUM"),
        "HIGH": ("HIGH", "RISKY"),
        "CRITICAL": ("CRITICAL", "SEVERE", "VERY HIGH"),
    }
    token = str(x).strip().upper()
    for canonical, aliases in synonyms.items():
        if token in aliases:
            return canonical
    return token

In [105]:
# ---------- Load & clean ----------
# Read -> tidy headers -> coerce dtypes -> de-duplicate -> clip to sanity bounds.
df = (
    pd.read_csv(INPUT_CSV)
    .pipe(normalize_column_names)
    .pipe(basic_type_cast)
    .drop_duplicates()
    .reset_index(drop=True)
    .pipe(clip_sanity_bounds)
)
# Collapse label synonyms onto the canonical class names.
df[TARGET_COL] = df[TARGET_COL].map(norm_lab)
print(df.head())

   user_id  hr_bpm  spo2_pct  skin_temp  bloodpressure_systolic  bp_diastolic  \
0     6253     108      86.5       39.3                     158           115   
1     4685     108      92.6       38.0                     124            88   
2     1732      68     100.0       34.4                     118            92   
3     4743      97      92.0       34.8                     134            89   
4     4522      93      96.3       35.9                     145            82   

   altitude  latitude  longitude  steps  past_incident_flag weather_condition  \
0      3522  28.41182   85.67296   1234                   0              Rain   
1       528  28.97789   87.50397   5824                   0               Hot   
2       543  26.51370   80.27189  10350                   0              Cold   
3      2171  29.70053   85.87051   5980                   0              Rain   
4      1330  28.55247   80.35179   6773                   0              Rain   

  risk_label  
0       HIG

In [106]:
# Keep only rows whose normalized target is neither NaN nor an empty string
has_target = df[TARGET_COL].notna() & df[TARGET_COL].ne("")
df = df.loc[has_target].reset_index(drop=True)

print("Head after cleaning:")
display(df.head())
print("NA counts:")
print(df.isna().sum())

Head after cleaning:


Unnamed: 0,user_id,hr_bpm,spo2_pct,skin_temp,bloodpressure_systolic,bp_diastolic,altitude,latitude,longitude,steps,past_incident_flag,weather_condition,risk_label
0,6253,108,86.5,39.3,158,115,3522,28.41182,85.67296,1234,0,Rain,HIGH
1,4685,108,92.6,38.0,124,88,528,28.97789,87.50397,5824,0,Hot,MODERATE
2,1732,68,100.0,34.4,118,92,543,26.5137,80.27189,10350,0,Cold,LOW
3,4743,97,92.0,34.8,134,89,2171,29.70053,85.87051,5980,0,Rain,MODERATE
4,4522,93,96.3,35.9,145,82,1330,28.55247,80.35179,6773,0,Rain,MODERATE


NA counts:
user_id                   0
hr_bpm                    0
spo2_pct                  0
skin_temp                 0
bloodpressure_systolic    0
bp_diastolic              0
altitude                  0
latitude                  0
longitude                 0
steps                     0
past_incident_flag        0
weather_condition         0
risk_label                0
dtype: int64


In [107]:
# ---------- Encode target (label encoding) ----------
# Labels that are not keys of RISK_ORDER map to NaN; those rows are removed
# from both the frame and the label vector so the two stay aligned.
encoded = df[TARGET_COL].map(RISK_ORDER)
valid = encoded.notna()
df = df.loc[valid].reset_index(drop=True)
y = encoded.loc[valid].astype(int).to_numpy()

print("\nLabel counts (0=LOW,1=MODERATE,2=HIGH,3=CRITICAL):")
label_counts = pd.Series(y).value_counts().sort_index()
print(label_counts)


Label counts (0=LOW,1=MODERATE,2=HIGH,3=CRITICAL):
0    2500
1    2500
2    2500
3    2500
Name: count, dtype: int64


In [108]:

# ---------- Feature matrix ----------
# Numeric columns first, then the categorical one; .copy() detaches X from df
X = df.loc[:, [*NUMERIC_COLS, CAT_COL]].copy()
print(X.head())

   hr_bpm  spo2_pct  skin_temp  bloodpressure_systolic  bp_diastolic  \
0     108      86.5       39.3                     158           115   
1     108      92.6       38.0                     124            88   
2      68     100.0       34.4                     118            92   
3      97      92.0       34.8                     134            89   
4      93      96.3       35.9                     145            82   

   altitude  latitude  longitude  steps  past_incident_flag weather_condition  
0      3522  28.41182   85.67296   1234                   0              Rain  
1       528  28.97789   87.50397   5824                   0               Hot  
2       543  26.51370   80.27189  10350                   0              Cold  
3      2171  29.70053   85.87051   5980                   0              Rain  
4      1330  28.55247   80.35179   6773                   0              Rain  


In [109]:
# ---------- Preprocessing pipeline ----------
# Numeric branch: fill gaps with the column median, then standardize.
num_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),  # drop this step to skip scaling
])

# Categorical branch: fill gaps with the most frequent value, then one-hot encode.
# Categories not seen during fit are ignored at transform time.
cat_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Route each column group to its branch; columns not listed are discarded and
# output feature names are not prefixed with the branch name.
pre = ColumnTransformer(
    [
        ("num", num_pipe, NUMERIC_COLS),
        ("cat", cat_pipe, [CAT_COL]),
    ],
    remainder="drop",
    verbose_feature_names_out=False,
)

In [110]:

# Stage 1: keep 70% of the rows for training and hold out 30% (X_temp), stratified by class
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Stage 2: halve the hold-out into validation and test (15% of all rows each)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)

print("Shapes:")
for split_name, X_part, y_part in (
    ("Train", X_train, y_train),
    ("Val  ", X_val, y_val),
    ("Test ", X_test, y_test),
):
    print(f"  {split_name}: {X_part.shape}, {y_part.shape}")

print(X_train.head())


Shapes:
  Train: (7000, 11), (7000,)
  Val  : (1500, 11), (1500,)
  Test : (1500, 11), (1500,)
      hr_bpm  spo2_pct  skin_temp  bloodpressure_systolic  bp_diastolic  \
377       89      93.0       37.6                     136            78   
6987     100      98.2       36.8                     130            89   
8462      94      96.9       36.7                     145            87   
9946      82     100.0       33.8                     135            70   
1485     152      86.5       40.6                     170           125   

      altitude  latitude  longitude  steps  past_incident_flag  \
377        651  29.79325   81.55139   4881                   0   
6987      1951  27.67594   88.05201   6514                   0   
8462      1549  27.05450   87.20646   5861                   0   
9946       257  30.24298   87.66276   9299                   0   
1485      3589  27.33248   88.16569   2329                   1   

     weather_condition  
377                Fog  
6987   

In [111]:
# %%
# === Fit preprocessor on training set and transform all ===
# The preprocessor is fitted on the training split only; validation and test
# are transformed with the fitted state so nothing leaks from them.
X_train_proc = pre.fit_transform(X_train)
X_val_proc   = pre.transform(X_val)
X_test_proc  = pre.transform(X_test)

print("Processed shapes:")
print(f"  X_train_proc: {X_train_proc.shape}")
print(f"  X_val_proc  : {X_val_proc.shape}")
print(f"  X_test_proc : {X_test_proc.shape}")

# Get feature names (best effort). On failure keep going with None, but report
# the reason instead of swallowing it, so a later cell that needs the names
# does not fail without explanation.
try:
    feature_names = pre.get_feature_names_out()
except Exception as exc:
    feature_names = None
    print(f"Warning: could not obtain feature names ({type(exc).__name__}: {exc})")

if feature_names is not None:
    print("First 20 feature names:", list(feature_names[:20]))
    



Processed shapes:
  X_train_proc: (7000, 18)
  X_val_proc  : (1500, 18)
  X_test_proc : (1500, 18)
First 20 feature names: ['hr_bpm', 'spo2_pct', 'skin_temp', 'bloodpressure_systolic', 'bp_diastolic', 'altitude', 'latitude', 'longitude', 'steps', 'past_incident_flag', 'weather_condition_Clear', 'weather_condition_Cold', 'weather_condition_Fog', 'weather_condition_Hot', 'weather_condition_Rain', 'weather_condition_Snow', 'weather_condition_Storm', 'weather_condition_Windy']


In [112]:
# Show the first five preprocessed training rows
print(X_train_proc[:5])

[[-6.20035573e-01  3.94192207e-01  3.41630285e-01 -4.55226972e-01
  -9.24055194e-01 -1.07762643e+00  1.13255925e+00 -1.08137048e+00
  -5.36618355e-02 -6.23609564e-01  0.00000000e+00  0.00000000e+00
   1.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00]
 [-2.47688304e-01  1.30070752e+00 -5.49932493e-04 -6.68836331e-01
  -2.80905896e-01 -2.63249290e-01 -5.98845894e-01  1.63834939e+00
   4.06608291e-01 -6.23609564e-01  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  1.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00]
 [-4.50786814e-01  1.07407869e+00 -4.33224597e-02 -1.34812935e-01
  -3.97842132e-01 -5.15079760e-01 -1.10702108e+00  1.28458942e+00
   2.22556611e-01 -6.23609564e-01  0.00000000e+00  0.00000000e+00
   1.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00]
 [-8.56983834e-01  1.61450128e+00 -1.28372575e+00 -4.90828532e-01
  -1.39180014e+00 -1.32444535e+00  1.

In [113]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers, regularizers, callbacks, models

# ---------- 1) Build a feature-weights vector to up-weight key features ----------
# feature_names should come from pre.get_feature_names_out().
# Numeric names are expected as-is (hr_bpm, spo2_pct, ...) and one-hot names
# like 'weather_condition_Rain'.
if feature_names is None:
    # Fail with a clear message instead of an IndexError on a 0-d array below.
    raise ValueError(
        "feature_names is None: the preprocessor's output feature names are "
        "required to build the feature-weight vector."
    )

fn = np.asarray(feature_names, dtype=str)  # shape: (n_features,)
n_features = fn.shape[0]

feat_w = np.ones(n_features, dtype="float32")

def set_weight(col_name_substr, w=1.0):
    """Assign weight ``w`` to every feature whose name contains ``col_name_substr``.

    Matching is by substring, so a later call with a broader substring
    overwrites weights set by an earlier, more specific call. Does nothing
    when no feature name matches. Mutates the module-level ``feat_w`` in place.
    """
    idx = np.where(np.char.find(fn, col_name_substr) >= 0)[0]
    if idx.size > 0:
        feat_w[idx] = w

# All weather one-hot columns → moderate boost.
# This must run BEFORE the Storm-specific call below: the substring also matches
# the Storm column, so applying it afterwards would reset Storm from 2.0 to 1.3.
set_weight(f"{CAT_COL}", 1.3)   # matches any weather_condition* column

# Key vitals → higher weights
set_weight("hr_bpm", 1.8)
set_weight("spo2_pct", 2.0)
set_weight("bloodpressure_systolic", 1.6)
set_weight("bp_diastolic", 1.6)

# Storm overrides the generic weather boost
set_weight("weather_condition_Storm", 2.0)

# (Optional) cap weights
feat_w = np.clip(feat_w, 1.0, 2.0).astype("float32")


In [114]:
# ---------- 3) Define model (feature-weighting layer + regularized MLP) ----------
tf.random.set_seed(42)

inp = layers.Input(shape=(n_features,), name="input")

# Scale every input column by its fixed weight from feat_w
w_const = tf.constant(feat_w, dtype=tf.float32)
x = layers.Lambda(lambda t: t * w_const, name="feature_weighting")(inp)

# NOTE(review): batch normalization directly after a fixed per-feature scaling
# presumably cancels most of that scaling — confirm the weighting has the intended effect.
x = layers.BatchNormalization()(x)

# Hidden stack: (units, dropout rate) per stage, every Dense layer L2-regularized
for units, drop_rate in ((128, 0.35), (64, 0.35), (32, 0.25)):
    x = layers.Dense(units, activation="relu", kernel_regularizer=regularizers.l2(1e-4))(x)
    x = layers.Dropout(drop_rate)(x)

# One softmax output per risk class
out = layers.Dense(4, activation="softmax", name="softmax")(x)

model = models.Model(inp, out)
model.compile(
    loss="sparse_categorical_crossentropy",  # integer class labels
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
    metrics=["accuracy"],
)

model.summary()

In [115]:
# ---------- 4) Callbacks for generalization ----------
# Stop once val_loss has not improved for 12 epochs and restore the best weights
early_stop = callbacks.EarlyStopping(
    monitor="val_loss", patience=12, restore_best_weights=True
)
# Halve the learning rate after 5 epochs without val_loss improvement, floor at 1e-5
lr_decay = callbacks.ReduceLROnPlateau(
    monitor="val_loss", factor=0.5, patience=5, min_lr=1e-5, verbose=1
)
cbs = [early_stop, lr_decay]

In [116]:
# ---------- 5) Train ----------
# `train_sample_weights` is not defined by any cell visible in this notebook
# (presumably it came from a removed step "2)"), so referencing it directly
# raises NameError on a fresh kernel. Look it up defensively: use it when an
# earlier cell provided it, otherwise train unweighted.
# NOTE(review): the recorded training log may have been produced with sample
# weights; an unweighted run can report different training-loss values.
train_sample_weights = globals().get("train_sample_weights")

fit_kwargs = dict(
    x=X_train_proc,
    y=y_train,
    validation_data=(X_val_proc, y_val),
    epochs=100,
    batch_size=128,
    callbacks=cbs,
    verbose=1
)

# Only pass sample_weight when weights are actually available
if train_sample_weights is not None:
    fit_kwargs["sample_weight"] = train_sample_weights

hist = model.fit(**fit_kwargs)

Epoch 1/100
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 11ms/step - accuracy: 0.5199 - loss: 3.5210 - val_accuracy: 0.8187 - val_loss: 0.3939 - learning_rate: 0.0010
Epoch 2/100
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.8488 - loss: 1.3041 - val_accuracy: 0.9453 - val_loss: 0.1773 - learning_rate: 0.0010
Epoch 3/100
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.9438 - loss: 0.6311 - val_accuracy: 0.9787 - val_loss: 0.0813 - learning_rate: 0.0010
Epoch 4/100
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.9624 - loss: 0.4144 - val_accuracy: 0.9913 - val_loss: 0.0467 - learning_rate: 0.0010
Epoch 5/100
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.9726 - loss: 0.3349 - val_accuracy: 0.9960 - val_loss: 0.0341 - learning_rate: 0.0010
Epoch 6/100
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0

In [117]:
# ---------- 6) Evaluate ----------
# Score the validation split first, then the test split; each call yields (loss, accuracy)
eval_sets = {"val": (X_val_proc, y_val), "test": (X_test_proc, y_test)}
scores = {
    split: model.evaluate(features, labels, verbose=0)
    for split, (features, labels) in eval_sets.items()
}
(val_loss, val_acc), (test_loss, test_acc) = scores["val"], scores["test"]
print(f"Validation  Acc: {val_acc:.4f}  |  Test Acc: {test_acc:.4f}")


Validation  Acc: 0.9987  |  Test Acc: 0.9993


In [119]:
import numpy as np
import pandas as pd

# Example input for a single prediction (replace the values with your own).
# NOTE(review): despite the variable name, the recorded run classifies this
# record as LOW; the name is kept so any later cell that refers to it still works.
sample_critical = pd.DataFrame([{
    "hr_bpm": 90,
    "spo2_pct": 98.8,
    "skin_temp": 33.7,
    "bloodpressure_systolic": 120,
    "bp_diastolic": 78,
    "altitude": 300,
    "latitude": 27.71,
    "longitude": 85.33,
    "steps": 10000,
    "past_incident_flag": 1,
    "weather_condition": "Storm"
}])

# Run the sample through the fitted preprocessor (same steps as training)
X_sample_proc = pre.transform(sample_critical)

# Class names ordered by their encoded integer, derived from RISK_ORDER so the
# decoding cannot drift from the encoding used for training.
class_labels = sorted(RISK_ORDER, key=RISK_ORDER.get)

# Predict class probabilities and pick the most likely label
probs = model.predict(X_sample_proc)
pred_class = np.argmax(probs, axis=1)[0]
pred_label = class_labels[pred_class]

print(f"Predicted Risk Level: {pred_label}")
print("\nClass probabilities:")
for lab, p in zip(class_labels, probs[0]):
    print(f"{lab:10s}: {p:.4f}")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
Predicted Risk Level: LOW

Class probabilities:
LOW       : 0.9999
MODERATE  : 0.0001
HIGH      : 0.0000
CRITICAL  : 0.0000
