In [None]:
!pip install openai --upgrade



In [None]:
import pandas as pd
import numpy as np
import json
import time
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, average_precision_score, RocCurveDisplay
from torch.utils.data import Dataset, DataLoader
import openai

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
openai.api_key = 'deleted'

In [None]:
# Load discharge notes
discharge_notes = pd.read_csv(
    "/content/drive/My Drive/mimic-iii-clinical-database-1.4/NOTEEVENTS.csv.gz",
    compression='gzip',
    low_memory=False
)
discharge_notes = discharge_notes[discharge_notes["CATEGORY"] == "Discharge summary"]

# Load diagnoses and ICD descriptions
diagnoses = pd.read_csv(
    "/content/drive/My Drive/mimic-iii-clinical-database-1.4/DIAGNOSES_ICD.csv.gz",
    compression='gzip',
    low_memory=False
)
d_icd = pd.read_csv(
    "/content/drive/My Drive/mimic-iii-clinical-database-1.4/D_ICD_DIAGNOSES.csv.gz",
    compression='gzip',
    low_memory=False
)

In [None]:
# Merge diagnoses with descriptions
diagnoses = diagnoses.merge(d_icd, on="ICD9_CODE", how="left")

# Join discharge notes with diagnoses on HADM_ID
joined = discharge_notes.merge(diagnoses, on="HADM_ID", how="inner")

# Group multiple diagnoses per note
grouped = (
    joined.groupby(["HADM_ID", "TEXT"])
    .agg({"SHORT_TITLE": lambda x: list(set(x.dropna()))})
    .reset_index()
)

# Optional preview
grouped.head()

Unnamed: 0,HADM_ID,TEXT,SHORT_TITLE
0,100001.0,Admission Date: [**2117-9-11**] ...,"[Chr kidney dis stage III, DMI ketoacd uncontr..."
1,100003.0,Admission Date: [**2150-4-17**] ...,"[Cirrhosis of liver NOS, Edema, Ac posthemorrh..."
2,100006.0,Admission Date: [**2108-4-6**] Discharg...,"[Hyposmolality, Chronic obst asthma NOS, Adjus..."
3,100006.0,"Name: [**Known lastname 470**], [**Known firs...","[Hyposmolality, Chronic obst asthma NOS, Adjus..."
4,100007.0,Admission Date: [**2145-3-31**] ...,"[Hypertension NOS, Pneumonia, organism NOS, Ac..."


In [None]:
from openai import OpenAI
import pandas as pd
from tqdm import tqdm

# Initialize OpenAI client
api_key = 'deleted'
client = OpenAI(api_key=api_key)

# --- Prompt Builder ---
def build_heart_condition_prompt(note_text):
    return f"""
You are a clinical reasoning assistant.

Question: Does the following hospital discharge summary indicate a heart-related condition such as congestive heart failure (CHF), myocardial infarction (MI), atrial fibrillation (AFib), cardiomyopathy, or coronary artery disease (CAD)?

Discharge Summary:
\"\"\"{note_text.strip()}\"\"\"

Answer with a single word: Yes or No.
""".strip()

# --- GPT Function ---
def classify_heart_condition(note_text, model="gpt-3.5-turbo", verbose=False):
    prompt = build_heart_condition_prompt(note_text)
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=0
    )
    answer = response.choices[0].message.content.strip().lower()

    if verbose:
        print(f"Prompt:\n{prompt}\n---\nAnswer: {answer}\n")

    return answer

# --- Filter to Narrow Heart Conditions ---
heart_keywords = [
    "congestive heart failure", "chf", "myocardial infarction", "mi",
    "atrial fibrillation", "afib", "coronary artery disease", "cad", "cardiomyopathy"
]

def is_specific_heart_condition(diagnoses):
    return any(any(k in d.lower() for k in heart_keywords) for d in diagnoses)

# Apply filter
grouped["label_heart_condition"] = grouped["SHORT_TITLE"].apply(is_specific_heart_condition)

In [None]:
# Sample evaluation set: 15 positive, 15 negative
pos_samples = grouped[grouped["label_heart_condition"]].sample(15, random_state=42)
neg_samples = grouped[~grouped["label_heart_condition"]].sample(15, random_state=42)
eval_df = pd.concat([pos_samples, neg_samples]).sample(frac=1, random_state=42).reset_index(drop=True)

# Run GPT classification
tqdm.pandas()
eval_df["gpt_answer"] = eval_df["TEXT"].progress_apply(classify_heart_condition)
eval_df["gpt_label"] = eval_df["gpt_answer"].apply(lambda x: 1 if "yes" in x else 0)

# Evaluate
accuracy = (eval_df["gpt_label"] == eval_df["label_heart_condition"]).mean()
print(f"\n GPT Binary Heart Condition Classification Accuracy: {accuracy:.2%}")

# Print a few sample results
for i, row in eval_df.head(5).iterrows():
    print(f"\n--- Sample #{i+1} ---")
    print(f"Ground truth: {'Yes' if row['label_heart_condition'] else 'No'}")
    print(f"GPT answer: {row['gpt_answer']}")
    print(f"Match: {'Yes' if row['gpt_label'] == row['label_heart_condition'] else 'No'}")


100%|██████████| 30/30 [00:20<00:00,  1.44it/s]


 GPT Binary Heart Condition Classification Accuracy: 60.00%

--- Sample #1 ---
Ground truth: No
GPT answer: no
Match: Yes

--- Sample #2 ---
Ground truth: No
GPT answer: no
Match: Yes

--- Sample #3 ---
Ground truth: No
GPT answer: yes
Match: No

--- Sample #4 ---
Ground truth: No
GPT answer: no
Match: Yes

--- Sample #5 ---
Ground truth: Yes
GPT answer: no
Match: No





In [None]:
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(grouped, test_size=0.2, random_state=42)

In [None]:
# Sample 30 of each class
pos_samples = grouped[grouped["label_heart_condition"]].sample(30, random_state=42)
neg_samples = grouped[~grouped["label_heart_condition"]].sample(30, random_state=42)

# Combine and shuffle
eval_df = pd.concat([pos_samples, neg_samples]).sample(frac=1, random_state=42).reset_index(drop=True)

from tqdm import tqdm
tqdm.pandas()

eval_df["gpt_answer"] = eval_df["TEXT"].progress_apply(classify_heart_condition)
eval_df["gpt_label"] = eval_df["gpt_answer"].apply(lambda x: 1 if "yes" in x else 0)

100%|██████████| 60/60 [00:40<00:00,  1.47it/s]


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

y_true = eval_df["label_heart_condition"]
y_pred = eval_df["gpt_label"]

acc = accuracy_score(y_true, y_pred)
prec = precision_score(y_true, y_pred)
rec = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)
tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()

print("\nGPT Classification Stats (Heart Condition - 60 Cases):")
print(f"Accuracy       : {acc:.2%}")
print(f"Precision      : {prec:.2%}")
print(f"Recall         : {rec:.2%}")
print(f"F1 Score       : {f1:.2%}")
print(f"Confusion Matrix: TP={tp}, FP={fp}, FN={fn}, TN={tn}")


GPT Classification Stats (Heart Condition - 60 Cases):
Accuracy       : 63.33%
Precision      : 75.00%
Recall         : 40.00%
F1 Score       : 52.17%
Confusion Matrix: TP=12, FP=4, FN=18, TN=26


In [None]:
def build_deep_reasoning_prompt(note_text):
    return f"""
You are a clinical reasoning assistant. Your task is to determine whether this discharge summary describes a heart-related condition. Think step-by-step like a physician.

Step 1: Identify signs and symptoms relevant to cardiac conditions (e.g. chest pain, dyspnea, syncope).
Step 2: Identify relevant lab or imaging results (e.g. troponin levels, ECG changes).
Step 3: Identify any heart-related diagnoses explicitly stated.
Step 4: Based on the above, decide if this case involves a heart-related condition like congestive heart failure (CHF), myocardial infarction (MI), atrial fibrillation (AFib), cardiomyopathy, or coronary artery disease (CAD).

Discharge Summary:
\"\"\"{note_text.strip()}\"\"\"

Let’s reason through this carefully. At the end, answer with one word: Yes or No.
""".strip()

def classify_heart_condition_reasoning(note_text, model="gpt-3.5-turbo"):
    prompt = build_deep_reasoning_prompt(note_text)
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=0
    )
    return response.choices[0].message.content.strip().lower()

In [None]:
eval_df["gpt_reasoning_answer"] = eval_df["TEXT"].progress_apply(classify_heart_condition_reasoning)
eval_df["gpt_reasoning_label"] = eval_df["gpt_reasoning_answer"].apply(lambda x: 1 if "yes" in x else 0)

100%|██████████| 60/60 [02:20<00:00,  2.34s/it]


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

y_true = eval_df["label_heart_condition"]
y_reasoning = eval_df["gpt_reasoning_label"]

acc = accuracy_score(y_true, y_reasoning)
prec = precision_score(y_true, y_reasoning)
rec = recall_score(y_true, y_reasoning)
f1 = f1_score(y_true, y_reasoning)
tn, fp, fn, tp = confusion_matrix(y_true, y_reasoning).ravel()

print("\nGPT + Reasoning Classification Stats:")
print(f"Accuracy       : {acc:.2%}")
print(f"Precision      : {prec:.2%}")
print(f"Recall         : {rec:.2%}")
print(f"F1 Score       : {f1:.2%}")
print(f"Confusion Matrix: TP={tp}, FP={fp}, FN={fn}, TN={tn}")


GPT + Reasoning Classification Stats:
Accuracy       : 70.00%
Precision      : 77.27%
Recall         : 56.67%
F1 Score       : 65.38%
Confusion Matrix: TP=17, FP=5, FN=13, TN=25


In [None]:
len(train_df)

47506

In [None]:
from sklearn.neighbors import NearestNeighbors

# Filter long notes
train_df_small = train_df[train_df["TEXT"].str.len() < 30000].copy().head(100)

# Embed
def get_embedding(text, model="text-embedding-ada-002"):
    safe_text = text.strip()[:32000]
    response = client.embeddings.create(model=model, input=[safe_text])
    return response.data[0].embedding

train_df_small["embedding"] = train_df_small["TEXT"].progress_apply(get_embedding)

# Fit NearestNeighbors
X_train = np.vstack(train_df_small["embedding"].values)
nn_model = NearestNeighbors(n_neighbors=1, metric="cosine").fit(X_train)

100%|██████████| 100/100 [00:56<00:00,  1.78it/s]


In [None]:
def build_rag_prompt(test_note_text, similar_examples):
    prompt = "You are a clinical reasoning assistant. Given similar patient cases, determine whether the new discharge summary describes a heart-related condition.\n\n"

    for i, text in enumerate(similar_examples):
        prompt += f"Example {i+1}:\n\"\"\"{text.strip()}\"\"\"\n"

    prompt += """
Now read the new case carefully and follow these steps:
Step 1: Identify any symptoms or complaints suggestive of a heart condition (e.g., chest pain, dyspnea, palpitations).
Step 2: Look for diagnostic findings (e.g., ECG, troponin, echocardiogram).
Step 3: Determine if a heart-related diagnosis (e.g., CHF, MI, AFib, CAD) is stated or strongly implied.
Step 4: Conclude: Is this discharge summary consistent with a heart-related condition?

New Case:
\"\"\"""" + test_note_text.strip() + """\"\"\"

Final answer with one word: Yes or No."""

    return prompt

def classify_heart_condition_rag(note_text, train_df, nn_model, model="gpt-3.5-turbo"):
    safe_text = note_text.strip()[:32000]
    test_emb = get_embedding(safe_text)
    distances, indices = nn_model.kneighbors([test_emb])
    similar_notes = train_df.iloc[indices[0]]["TEXT"].tolist()
    prompt = build_rag_prompt(safe_text, similar_notes)
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=0
    )
    return response.choices[0].message.content.strip().lower()

In [None]:
eval_df["gpt_rag_answer"] = eval_df["TEXT"].progress_apply(
    lambda x: classify_heart_condition_rag(x, train_df_small, nn_model)
)
eval_df["gpt_rag_label"] = eval_df["gpt_rag_answer"].apply(lambda x: 1 if "yes" in x else 0)

100%|██████████| 60/60 [01:27<00:00,  1.46s/it]


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

y_pred = eval_df["gpt_rag_label"]
y_true = eval_df["label_heart_condition"]

acc = accuracy_score(y_true, y_pred)
prec = precision_score(y_true, y_pred)
rec = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)
tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()

print("\n GPT Classification with Embedding-Augmented Prompting (RAG):")
print(f"Accuracy       : {acc:.2%}")
print(f"Precision      : {prec:.2%}")
print(f"Recall         : {rec:.2%}")
print(f"F1 Score       : {f1:.2%}")
print(f"Confusion Matrix: TP={tp}, FP={fp}, FN={fn}, TN={tn}")


 GPT Classification with Embedding-Augmented Prompting (RAG):
Accuracy       : 60.00%
Precision      : 66.67%
Recall         : 40.00%
F1 Score       : 50.00%
Confusion Matrix: TP=12, FP=6, FN=18, TN=24
