Kunaal Agarwal (aad5ha), Daivik Siddhi (awr7mj), Shaurya Bedi (wvr4fe)

In [141]:
import pandas as pd
import numpy as np
import random
import json
import re

from transformers import pipeline

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sentence_transformers import SentenceTransformer
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

# Auto-triage Proof of Concept simulation

### Emergency Department Entry Survey

As patients enter the Emergency Room (ER) they often complete a brief health questionnaire. The questionnaire includes the relevant medical information that clinicans quickly sift through before assessing the patient themselves. We've simulated the results of this questionnaire and included a plethora of initial features. The core features revolve around symptomology, with an associated severity score, patient demographics, vitals, and other key components faciltating high-quality medical care. 

In [2]:
# Symptom severity labels: {0: mild, 1: moderate, 2: severe}
symptom_templates = [
    ("Chest pain and trouble breathing", 2),
    ("Sudden weakness on one side of body", 2),
    ("Severe headache with vision changes", 2),
    ("Pounding headache with neck stiffness", 2),
    ("Severe allergic reaction with hives", 2),
    ("Repeated episodes of chest tightness", 2),
    ("Confusion and slurred speech", 2),
    ("Abdominal pain and vomiting", 1),
    ("Sharp abdominal cramps", 1),
    ("High fever and body aches", 1),
    ("Moderate shortness of breath when walking", 1),
    ("Chronic cough worsening over weeks", 1),
    ("Persistent diarrhea and dehydration", 1),
    ("Back pain after lifting heavy object", 1),
    ("Nausea and lightheadedness", 1),
    ("Multiple falls, leg pain", 1),
    ("Intermittent palpitations", 1),
    ("Cut on hand with mild bleeding", 0),
    ("Twisted ankle, slight swelling", 0),
    ("Mild rash on arms", 0),
    ("Sore throat and cough", 0)
]

genders = ['Male', 'Female']
ethnicities = ['White', 'Black', 'Hispanic', 'Asian', 'Other']
insurance_options = ['Private', 'Medicare', 'Medicaid', 'Uninsured']
chronic_diseases = ['None', 'Hypertension', 'Diabetes', 'Asthma', 'COPD', 'Heart Disease', 'Chronic Kidney Disease']
medications = ['Aspirin', 'Metformin', 'Lisinopril', 'Albuterol', 'Atorvastatin', 'Insulin', 'Warfarin']
allergies = ['None', 'Penicillin', 'Peanuts', 'Latex', 'NSAIDs']
languages = ['English', 'Spanish', 'Other']
arrival_modes = ['Ambulance', 'Walk-in', 'Referral']
times_of_day = ['Morning', 'Afternoon', 'Evening', 'Night']

In [93]:
random.seed(42)
np.random.seed(42)

n_samples = 100
records = []
id = 1


for _ in range(n_samples):
    gender = random.choice(genders)
    ethnicity = random.choice(ethnicities)
    insurance = random.choice(insurance_options)
    age = np.random.randint(18, 90)
    pain_level = np.random.randint(0, 11)
    duration = round(np.random.exponential(scale=6), 1)
    
    noise = np.random.rand()
    if noise < 0.05:
        label = min(label + 1, 2)
    elif noise > 0.95:
        label = max(label - 1, 0)
    
    chronic = random.choice(chronic_diseases)
    if chronic == 'None':
        meds = ['None']
    else:
        meds = random.sample(medications, k=np.random.randint(1, 3))
    
    heart_rate = np.random.randint(50, 140)
    systolic_bp = np.random.randint(90, 181)
    diastolic_bp = np.random.randint(60, 101)
    respiratory_rate = np.random.randint(12, 31)
    temperature_c = round(np.random.normal(loc=37, scale=1), 1)  # around normal
    oxygen_sat = np.random.randint(85, 101)

    arrival = random.choice(arrival_modes)
    time_day = random.choice(times_of_day)
    
    allergy = random.choice(allergies)
    language = random.choice(languages)
    
    height_cm = np.random.randint(150, 201)
    weight_kg = np.random.randint(50, 121)
    bmi = round(weight_kg / ((height_cm / 100) ** 2), 1)
    
    records.append({
        'patient_id': id,
        'age': age,
        'gender': gender,
        'ethnicity': ethnicity,
        'insurance_status': insurance,
        'pain_level': pain_level,
        'symptom_duration_hrs': duration,
        'symptom_description': desc,
        'chronic_disease': chronic,
        'current_medications': ", ".join(meds),
        'heart_rate': heart_rate,
        'systolic_bp': systolic_bp,
        'diastolic_bp': diastolic_bp,
        'respiratory_rate': respiratory_rate,
        'temperature_c': temperature_c,
        'oxygen_saturation': oxygen_sat,
        'arrival_mode': arrival,
        'time_of_day': time_day,
        'known_allergies': allergy,
        'language_proficiency': language,
        'height_cm': height_cm,
        'weight_kg': weight_kg,
        'bmi': bmi,
    })
    id += 1

df_enhanced = pd.DataFrame(records)
df_enhanced.to_csv('data/synthetic_patient_survey_data.csv', index=False)
df_enhanced.head()

Unnamed: 0,patient_id,age,gender,ethnicity,insurance_status,pain_level,symptom_duration_hrs,symptom_description,chronic_disease,current_medications,...,respiratory_rate,temperature_c,oxygen_saturation,arrival_mode,time_of_day,known_allergies,language_proficiency,height_cm,weight_kg,bmi
0,1,69,Male,White,Medicaid,10,9.1,Chest pain and trouble breathing,Hypertension,"Metformin, Warfarin",...,22,37.3,92,Referral,Morning,NSAIDs,English,152,71,30.7
1,2,70,Female,White,Private,1,7.7,Chest pain and trouble breathing,,,...,12,38.0,96,Ambulance,Afternoon,NSAIDs,Other,171,98,33.5
2,3,76,Male,Other,Medicare,9,0.3,Chest pain and trouble breathing,Heart Disease,Insulin,...,16,35.8,91,Referral,Night,Penicillin,Spanish,167,53,19.0
3,4,77,Female,White,Medicare,1,2.9,Chest pain and trouble breathing,Heart Disease,"Albuterol, Lisinopril",...,23,37.4,97,Walk-in,Afternoon,Penicillin,Spanish,157,96,38.9
4,5,52,Male,White,Uninsured,0,1.8,Chest pain and trouble breathing,,,...,29,37.2,100,Walk-in,Evening,NSAIDs,Spanish,164,111,41.3


### Conversational agent interfrace simulation

After the clinicians review the health questionnaires they conduct an initial interview to learn more about the symptomology. This is the main process that we intend to simplify with Auto-triage by deploying an automated agent to conduct these interviews simulataneously. This saves the clinicians a significant amount of time and allows them to focus on procedures and deep diagnostic thinking.

Rather than building out a complete conversational interface we've opted to simulate this via using a LLM API to both create custom questions and answer on behalf of the patient. 

After testing we found that the outputs of most models were simply not up to par. To avoid paying API related fees, we directly prompted openAI-o3 and provided it our synthetic dataset. We were able to generate the most realistic prompts through this method. 

In [98]:
survey_data = pd.read_csv('data/synthetic_patient_survey_data.csv')

generator = pipeline(
    "text2text-generation",
    model="google/flan-t5-large", # attempted smaller models, distillgpt2, flan-t5-base, flan-t5-small, but they were not able to generate the required output
    device_map="auto", 
    max_new_tokens=200,
)

def extract_numbered(text, n):

    lines = re.findall(r'^\s*\d+\.\s*(.*)$', text, flags=re.MULTILINE)
    return lines[:n]

def generate_questions(patient):
    prompt = (
        "You are a nurse reviewing a patient. Generate 5 medical questions for the PATIENT. "
        "To learn more about their ILLNESS, output EXACTLY five concise questions as a numbered list "
        "(1. … 2. … 3. … 4. … 5.).\n\n"
        "Patient profile:\n"
        f"- Pain level: {patient['pain_level']}/10\n"
        f"- Symptom duration: {patient['symptom_duration_hrs']} hours\n"
        f"- Chief complaint: “{patient['symptom_description']}”"
    )

    out = generator(prompt)[0]['generated_text']
    print(out)
    questions = re.findall(r'^\s*\d+\.\s*(.*)$', out, flags=re.MULTILINE)
    return questions[:5]

def generate_responses(patient, questions):
    prompt_lines = [
        "You are the patient. Based on your profile, answer each numbered follow‑up question in 1–2 sentences,",
        "preserving the numbering exactly.",
        "",
        "Patient profile:",
        f"- Age: {patient['age']}",
        f"- Gender: {patient['gender']}",
        f"- Chronic disease: {patient['chronic_disease']}",
        f"- Current medications: {patient['current_medications']}",
        f"- Pain level: {patient['pain_level']}/10",
        f"- Chief complaint: “{patient['symptom_description']}”",
        "",
        "Follow‑up questions:"
    ]
    prompt = "\n".join(prompt_lines + [f"{i+1}. {q}" for i, q in enumerate(questions)])
    out = generator(prompt)[0]['generated_text']
    return extract_numbered(out, len(questions))

Some parameters are on the meta device because they were offloaded to the disk and cpu.
Device set to use cpu


In [None]:
results = []
for idx, row in survey_data.iterrows():
    if idx == 0: continue
    patient = row.to_dict()
    qs = generate_questions(patient)
    rs = generate_responses(qs)
    entry = {'patient_id': idx}
    for i, (q, r) in enumerate(zip(qs, rs), start=1):
        entry[f'question{i}'] = json.dumps({'question': q, 'response': r})
    results.append(entry)
    break

output_df = pd.DataFrame(results).set_index('patient_id')
output_df.to_csv('patient_q_and_responses.csv')

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [80]:
conversation_data = pd.read_csv('data/patient_followup_questions.csv')
conversation_data.head()

Unnamed: 0,patient_id,Q1,Q2,Q3,Q1_response,Q2_response,Q3_response
0,1,Where exactly in your abdomen do you feel the ...,Have you had any changes in your bowel movemen...,Have you eaten anything unusual within the pas...,"It's a sharp, stabbing pain in the lower right...",Yes—I've been a bit constipated since last nig...,I had some street tacos from a new food truck ...
1,2,When did the confusion or speech difficulty fi...,Have you had any weakness or numbness on one s...,Did you experience any recent head injury or f...,The confusion and trouble speaking started abo...,"Yes, my left arm feels weak and a little numb.",No head injuries or recent falls.
2,3,"Can you describe the chest pain—is it sharp, p...","Does the pain radiate to your arm, jaw, or back?","When did the pain start, and has it been const...",It feels like a heavy pressure right in the ce...,Sometimes the pressure shoots down my left arm...,It came on about 30 minutes ago and hasn't let...
3,4,Is this headache different from ones you've ha...,Do bright lights or loud sounds make the pain ...,Have you experienced nausea or vomiting with t...,Yes—this headache is the worst I've ever had a...,Bright lights and loud sounds definitely make ...,I've been nauseous and actually vomited once e...
4,5,When did you last receive a tetanus shot?,Have you thoroughly cleaned the cut since it h...,"Is there any numbness, tingling, or decreased ...",I think my last tetanus shot was about seven y...,I rinsed the cut with soap and water right aft...,"No numbness or tingling, just a bit of stiffness."


### Predictive Model to generate a risk-severity score

Now with both the synthetic survey data and conversational follow-ups we are going to train a classifier to predict a severity score (1 to 10). This will be a supervised learning task as we will be manually labeling the severity for each patient. We will use multiple models, starting with Naive Bayes Classifiers, and increase complexity as we move forward.

In [105]:
combined_data = survey_data.merge(conversation_data, how='left', left_on='patient_id', right_on='patient_id')
combined_data.to_csv('data/combined_patient_data.csv', index=False)

data = pd.read_csv('data/combined_patient_data_labeled.csv')
data = data.rename(columns={'Severity': 'severity'})
data.head(1)

Unnamed: 0,patient_id,age,gender,ethnicity,insurance_status,pain_level,symptom_duration_hrs,symptom_description,chronic_disease,current_medications,...,height_cm,weight_kg,bmi,Q1,Q2,Q3,Q1_response,Q2_response,Q3_response,severity
0,1,69,Male,White,Medicaid,10,9.1,Abdominal pain and vomiting,Hypertension,"Metformin, Insulin",...,152,71,30.7,Where exactly in your abdomen do you feel the ...,Have you had any changes in your bowel movemen...,Have you eaten anything unusual within the pas...,"It's a sharp, stabbing pain in the lower right...",Yes—I've been a bit constipated since last nig...,I had some street tacos from a new food truck ...,7


In [116]:
print(data['severity'].value_counts())
print(data.info())

severity
7    20
5    20
8    16
2    12
6    11
9    10
3     7
4     4
Name: count, dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 31 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   patient_id            100 non-null    int64  
 1   age                   100 non-null    int64  
 2   gender                100 non-null    object 
 3   ethnicity             100 non-null    object 
 4   insurance_status      100 non-null    object 
 5   pain_level            100 non-null    int64  
 6   symptom_duration_hrs  100 non-null    float64
 7   symptom_description   100 non-null    object 
 8   chronic_disease       100 non-null    object 
 9   current_medications   100 non-null    object 
 10  heart_rate            100 non-null    int64  
 11  systolic_bp           100 non-null    int64  
 12  diastolic_bp          100 non-null    int64  
 13  respiratory_rate      100 n

In [112]:
def combine_text(row):
    parts = [row['symptom_description']]
    # Q1–Q3 and their responses
    for i in range(1, 4):
        parts.append(row[f'Q{i}'])
        parts.append(row[f'Q{i}_response'])
    return " ".join(parts)

data['text_features'] = data.apply(combine_text, axis=1)

data['chronic_disease'] = data['chronic_disease'].fillna('None')
data['current_medications'] = data['current_medications'].fillna('None')
data['known_allergies'] = data['known_allergies'].fillna('None')

numeric_cols = [
    'age', 'pain_level', 'symptom_duration_hrs',
    'heart_rate', 'systolic_bp', 'diastolic_bp',
    'respiratory_rate', 'temperature_c', 'oxygen_saturation',
    'height_cm', 'weight_kg', 'bmi'
]
X_num = data[numeric_cols].values
scaler = StandardScaler()
X_num_scaled = scaler.fit_transform(X_num)

cat_cols = [
    'gender', 'ethnicity', 'insurance_status',
    'arrival_mode', 'time_of_day', 'language_proficiency',
    'chronic_disease'
]
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
X_cat = ohe.fit_transform(data[cat_cols])

embedder = SentenceTransformer('all-MiniLM-L6-v2')
X_text = embedder.encode(data['text_features'].tolist(), show_progress_bar=True)

X = np.hstack([X_num_scaled, X_cat, X_text])
y = data['severity'].astype(int).values

X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

print("Training set shape:", X_train.shape)
print("Test set shape:", X_test.shape)

Batches: 100%|██████████| 4/4 [00:00<00:00,  6.42it/s]

Training set shape: (80, 424)
Test set shape: (20, 424)





### Naive Bayes (Gaussian) Model

In [126]:
nb = GaussianNB()

nb.fit(X_train, y_train)
y_pred_nb = nb.predict(X_test)

acc = accuracy_score(y_test, y_pred_nb)
print(f"Naive Bayes Accuracy: {acc*100}%")
print(classification_report(y_test, y_pred_nb, digits=3, zero_division=0))

Naive Bayes Accuracy: 65.0%
              precision    recall  f1-score   support

           2      1.000     0.667     0.800         3
           3      0.000     0.000     0.000         1
           4      0.000     0.000     0.000         1
           5      0.364     1.000     0.533         4
           6      0.000     0.000     0.000         2
           7      1.000     0.750     0.857         4
           8      1.000     1.000     1.000         3
           9      1.000     0.500     0.667         2

    accuracy                          0.650        20
   macro avg      0.545     0.490     0.482        20
weighted avg      0.673     0.650     0.615        20



### Random Forest Classifier

In [130]:
rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=None,
    class_weight='balanced', 
    random_state=42
)

rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

acc_rf = accuracy_score(y_test, y_pred_rf)
print(f"Random Forest Accuracy: {acc_rf:.3f}\n")

print("Classification Report:")
print(classification_report(y_test, y_pred_rf, digits=3))

print("Confusion Matrix (rows=true, columns=predicted):")
print(confusion_matrix(y_test, y_pred_rf))


Random Forest Accuracy: 1.000

Classification Report:
              precision    recall  f1-score   support

           2      1.000     1.000     1.000         3
           3      1.000     1.000     1.000         1
           4      1.000     1.000     1.000         1
           5      1.000     1.000     1.000         4
           6      1.000     1.000     1.000         2
           7      1.000     1.000     1.000         4
           8      1.000     1.000     1.000         3
           9      1.000     1.000     1.000         2

    accuracy                          1.000        20
   macro avg      1.000     1.000     1.000        20
weighted avg      1.000     1.000     1.000        20

Confusion Matrix (rows=true, columns=predicted):
[[3 0 0 0 0 0 0 0]
 [0 1 0 0 0 0 0 0]
 [0 0 1 0 0 0 0 0]
 [0 0 0 4 0 0 0 0]
 [0 0 0 0 2 0 0 0]
 [0 0 0 0 0 4 0 0]
 [0 0 0 0 0 0 3 0]
 [0 0 0 0 0 0 0 2]]


In [133]:
rf_tuned = RandomForestClassifier(
    n_estimators=200,
    max_depth=10,           
    min_samples_leaf=5,      
    class_weight='balanced',
    random_state=42
)
cv_scores_tuned = cross_val_score(rf_tuned, X, y, cv=5, scoring='accuracy', n_jobs=-1)
print("Tuned 5‑fold CV accuracies:", cv_scores_tuned)
print("Mean tuned accuracy:    ", cv_scores_tuned.mean())



Tuned 5‑fold CV accuracies: [1.  1.  1.  1.  0.9]
Mean tuned accuracy:     0.9800000000000001


## PCA Analysis for dimensionality reduction.

In [142]:
pipe = Pipeline([
    ("pca", PCA(n_components=0.95, random_state=42)),
    ("clf", RandomForestClassifier(
        n_estimators=200,
        max_depth=10,
        min_samples_leaf=5,
        class_weight='balanced',
        random_state=42
    ))
])

scores = cross_val_score(pipe, np.hstack([X_num_scaled, X_cat, X_text]), y,
                         cv=5, scoring='accuracy', n_jobs=-1)
print("PCA(95%) + RF CV accuracies:", scores)
print("Mean accuracy:", scores.mean())



PCA(95%) + RF CV accuracies: [0.15 0.15 0.15 0.15 0.2 ]
Mean accuracy: 0.16
