In [76]:
import numpy as np
import pandas as pd

df_history = pd.read_csv(r'D:\AnacondaProj\KKKaGGLe\Tri\triagegeist\data\patient_history.csv')

df_train = pd.read_csv(r'D:\AnacondaProj\KKKaGGLe\Tri\triagegeist\data\train.csv')

df_chief = pd.read_csv(r'D:\AnacondaProj\KKKaGGLe\Tri\triagegeist\data\chief_complaints.csv')





df_full = df_train.merge(df_history, on="patient_id", how="left")
df_full = df_full.merge(
    df_chief[["patient_id", "chief_complaint_raw"]],
    on="patient_id",
    how="left"
)

df_full.shape

(80000, 66)

In [77]:
df = df_full.copy()

In [78]:
# Leakage = future information.
# Drop Leakage Features


df = df.drop(columns=[
    "disposition",
    "ed_los_hours",
    "triage_nurse_id",
    "patient_id"
])

In [79]:
# Fix pain_score
df["pain_missing"] = (df["pain_score"] == -1).astype(int)
df.loc[df["pain_score"] == -1, "pain_score"] = 0

In [80]:
# Handle Missing Vitals
# Create missing indicators first:
vitals_missing_cols = [
    "systolic_bp",
    "diastolic_bp",
    "respiratory_rate",
    "temperature_c",
    "shock_index"
]

for col in vitals_missing_cols:
    df[col + "_missing"] = df[col].isna().astype(int)
    df[col] = df[col].fillna(df[col].median())


In [81]:
# Create total_hx
# We won’t use all 25 hx features initially.
hx_cols = [col for col in df.columns if col.startswith("hx_")]
df["total_hx"] = df[hx_cols].sum(axis=1)

In [82]:
# Simple keyword flags

df_full["has_severe"] = df_full["chief_complaint_raw"].str.contains("severe", case=False, na=False).astype(int)
df_full["has_acute"] = df_full["chief_complaint_raw"].str.contains("acute", case=False, na=False).astype(int)
df_full["has_massive"] = df_full["chief_complaint_raw"].str.contains("massive", case=False, na=False).astype(int)
df_full["has_rigors"] = df_full["chief_complaint_raw"].str.contains("rigors", case=False, na=False).astype(int)

df_full["cc_length"] = df_full["chief_complaint_raw"].str.len()

In [83]:
# Select Structured Features
features = [
    "shock_index",
    "gcs_total",
    "respiratory_rate",
    "spo2",
    "heart_rate",
    "pain_score",
    "num_comorbidities",
    "num_prior_ed_visits_12m",
    "num_prior_admissions_12m",

    # Text additions
    "has_severe",
    "has_acute",
    "has_massive",
    "has_rigors",
    "cc_length"
]

# features = ["news2_score"]

categorical_cols = [
    "mental_status_triage",
    "sex",
    "age_group"
]


# One-hot encode:
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)


# Update features:
features += [col for col in df.columns if col.startswith("mental_status_triage_")]
features += [col for col in df.columns if col.startswith("sex_")]
features += [col for col in df.columns if col.startswith("age_group_")]

In [84]:
# Define X and y
X = df[features]
y = df["triage_acuity"]

KeyError: "['has_severe', 'has_acute', 'has_massive', 'has_rigors', 'cc_length'] not in index"

In [None]:
# Train/Validation Split
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [None]:
# import sys
# !{sys.executable} -m pip install lightgbm

In [None]:
import lightgbm as lgb

model = lgb.LGBMClassifier(
    objective="multiclass",
    num_class=5,
    class_weight="balanced",
    n_estimators=300,
    learning_rate=0.05,
    max_depth=-1,
    random_state=42
)

model.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001604 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1025
[LightGBM] [Info] Number of data points in the train set: 64000, number of used features: 18
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438


In [None]:


# Define cost matrix again
classes = [1,2,3,4,5]

cost_matrix = pd.DataFrame(
    np.zeros((5,5)),
    index=classes,
    columns=classes
)

for true in classes:
    for pred in classes:
        diff = pred - true
        
        if diff > 0:  # undertriage (predict less urgent than true)
            cost_matrix.loc[true, pred] = abs(diff) * 2
        elif diff < 0:  # overtriage
            cost_matrix.loc[true, pred] = abs(diff) * 1
        else:
            cost_matrix.loc[true, pred] = 0


# Define cost function
def compute_cost(y_true, y_pred, cost_matrix):
    total_cost = 0
    for t, p in zip(y_true, y_pred):
        total_cost += cost_matrix.loc[t, p]
    return total_cost / len(y_true)

In [None]:
# predict 
y_pred = model.predict(X_val)

# Evaluate
val_cost = compute_cost(y_val.values, y_pred, cost_matrix)
print("Validation Cost:", val_cost)

Validation Cost: 0.328125


In [None]:
from sklearn.metrics import cohen_kappa_score, classification_report

print("QWK:", cohen_kappa_score(y_val, y_pred, weights="quadratic"))
print(classification_report(y_val, y_pred))

QWK: 0.9093840451752442
              precision    recall  f1-score   support

           1       0.93      0.95      0.94       644
           2       0.97      0.96      0.97      2688
           3       0.89      0.83      0.86      5784
           4       0.74      0.66      0.70      4604
           5       0.63      0.88      0.74      2280

    accuracy                           0.82     16000
   macro avg       0.83      0.86      0.84     16000
weighted avg       0.83      0.82      0.82     16000



Our structured model significantly outperformed NEWS2-only baselines, suggesting that nonlinear interactions between raw vital signs provide additional triage signal beyond fixed threshold early warning systems.

In [91]:
# ===== STEP 1: SIMPLE TEXT FEATURES =====

df_full["has_severe"] = df_full["chief_complaint_raw"].str.contains(
    "severe", case=False, na=False
).astype(int)

df_full["has_acute"] = df_full["chief_complaint_raw"].str.contains(
    "acute", case=False, na=False
).astype(int)

df_full["has_massive"] = df_full["chief_complaint_raw"].str.contains(
    "massive", case=False, na=False
).astype(int)

df_full["has_rigors"] = df_full["chief_complaint_raw"].str.contains(
    "rigors", case=False, na=False
).astype(int)

df_full["cc_length"] = df_full["chief_complaint_raw"].str.len()

In [92]:
# ===== STEP 2: FEATURE LIST =====

features = [
    # Structured vitals
    "shock_index",
    "gcs_total",
    "respiratory_rate",
    "spo2",
    "heart_rate",
    "pain_score",

    # History
    "num_comorbidities",
    "num_prior_ed_visits_12m",
    "num_prior_admissions_12m",

    # Text additions
    "has_severe",
    "has_acute",
    "has_massive",
    "has_rigors",
    "cc_length"
]

X = df_full[features]
y = df_full["triage_acuity"]

In [93]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42
)

In [94]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(
    n_estimators=300,
    class_weight={
        1: 5,
        2: 3,
        3: 1,
        4: 1,
        5: 1
    },
    random_state=42,
    n_jobs=-1
)

model.fit(X_train, y_train)

y_pred = model.predict(X_val)

In [95]:
import numpy as np
import pandas as pd

classes = [1,2,3,4,5]

cost_matrix = pd.DataFrame(
    np.zeros((5,5)),
    index=classes,
    columns=classes
)

for true in classes:
    for pred in classes:
        diff = pred - true
        if diff > 0:      # undertriage
            cost_matrix.loc[true, pred] = abs(diff) * 2
        elif diff < 0:    # overtriage
            cost_matrix.loc[true, pred] = abs(diff)
        else:
            cost_matrix.loc[true, pred] = 0

def compute_cost(y_true, y_pred, cost_matrix):
    total_cost = 0
    for t, p in zip(y_true, y_pred):
        total_cost += cost_matrix.loc[t, p]
    return total_cost / len(y_true)

In [96]:
from sklearn.metrics import cohen_kappa_score, classification_report

val_cost = compute_cost(y_val.values, y_pred, cost_matrix)

print("Validation Cost:", val_cost)
print("QWK:", cohen_kappa_score(y_val, y_pred, weights="quadratic"))
print(classification_report(y_val, y_pred))

Validation Cost: 0.2795625
QWK: 0.9113628409019522
              precision    recall  f1-score   support

           1       0.97      0.90      0.93       605
           2       0.97      0.95      0.96      2685
           3       0.86      0.87      0.87      5823
           4       0.72      0.71      0.72      4551
           5       0.72      0.74      0.73      2336

    accuracy                           0.82     16000
   macro avg       0.85      0.84      0.84     16000
weighted avg       0.82      0.82      0.82     16000



Current Best Model (Structured + Text + Class Weighting)

Cost: 0.2796

QWK: 0.9114

ESI-1 Recall: 0.90

Accuracy: 0.82

Compared to:

NEWS2 Only

QWK ≈ 0.816

Cost ≈ 0.63

This is a massive improvement.