In [1]:
import argparse
import itertools
import logging
import os
import random
import re
import traceback
import datetime
from collections import Counter
from pathlib import Path
import math

from scipy import spatial
from scipy.io.arff import loadarff
from sklearn.linear_model import LogisticRegression

import pandas as pd
import numpy as np
from sklearn.model_selection import ParameterGrid, train_test_split
import datasets
from datasets import DatasetDict, Dataset, concatenate_datasets, load_dataset
from transformers import (
    set_seed, AutoTokenizer, AutoModelForCausalLM
)
from transformers import AutoModelForSeq2SeqLM

from helper.note_generator import NoteGenerator
from helper.note_template import NoteTemplate
from helper.external_datasets_variables import *
from helper.preprocess import preprocess


# Module-level logger, named after the current module via __name__.
logger = logging.getLogger(__name__)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the raw mortality table; the path is relative to the notebook's working directory.
dataset = pd.read_csv('Data/mortality.csv')

In [3]:
# Preview the raw frame as loaded (pandas truncates the rendering).
dataset

Unnamed: 0,group,ID,outcome,age,gendera,BMI,hypertensive,atrialfibrillation,CHD with no MI,diabetes,...,Blood sodium,Blood calcium,Chloride,Anion gap,Magnesium ion,PH,Bicarbonate,Lactic acid,PCO2,EF
0,1,125047,0.0,72,1,37.588179,0,0,0,1,...,138.750000,7.463636,109.166667,13.166667,2.618182,7.230,21.166667,0.5,40.0,55
1,1,139812,0.0,75,2,,0,0,0,0,...,138.888889,8.162500,98.444444,11.444444,1.887500,7.225,33.444444,0.5,78.0,55
2,1,109787,0.0,83,2,26.572634,0,0,0,0,...,140.714286,8.266667,105.857143,10.000000,2.157143,7.268,30.571429,0.5,71.5,35
3,1,130587,0.0,43,2,83.264629,0,0,0,0,...,138.500000,9.476923,92.071429,12.357143,1.942857,7.370,38.571429,0.6,75.0,55
4,1,138290,0.0,75,2,31.824842,1,0,0,0,...,136.666667,8.733333,104.500000,15.166667,1.650000,7.250,22.000000,0.6,50.0,55
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1172,2,171130,0.0,62,1,25.516870,1,1,0,1,...,136.714286,10.271429,94.428571,20.142857,2.714286,,27.714286,,,40
1173,2,101659,0.0,78,1,25.822710,0,1,0,1,...,135.680000,10.523529,101.720000,18.160000,2.012500,,20.480000,,,30
1174,2,162069,0.0,85,2,23.891779,1,1,0,1,...,136.000000,8.466667,97.285714,14.000000,2.028571,,28.857143,,,55
1175,2,120967,0.0,79,2,35.288554,0,0,1,1,...,140.000000,8.183333,104.000000,15.750000,2.090000,,24.375000,,,25


In [4]:
# Raw column names: several contain spaces and one contains a hyphen ('NT-proBNP').
dataset.columns

Index(['group', 'ID', 'outcome', 'age', 'gendera', 'BMI', 'hypertensive',
       'atrialfibrillation', 'CHD with no MI', 'diabetes', 'deficiencyanemias',
       'depression', 'Hyperlipemia', 'Renal failure', 'COPD', 'heart rate',
       'Systolic blood pressure', 'Diastolic blood pressure',
       'Respiratory rate', 'temperature', 'SP O2', 'Urine output',
       'hematocrit', 'RBC', 'MCH', 'MCHC', 'MCV', 'RDW', 'Leucocyte',
       'Platelets', 'Neutrophils', 'Basophils', 'Lymphocyte', 'PT', 'INR',
       'NT-proBNP', 'Creatine kinase', 'Creatinine', 'Urea nitrogen',
       'glucose', 'Blood potassium', 'Blood sodium', 'Blood calcium',
       'Chloride', 'Anion gap', 'Magnesium ion', 'PH', 'Bicarbonate',
       'Lactic acid', 'PCO2', 'EF'],
      dtype='object')

In [5]:
# Make every column name usable as a ${placeholder} in the prompt template:
# spaces become underscores, and the hyphen in 'NT-proBNP' is dropped.
column_mapping = {col: col.replace(' ', '_') for col in dataset.columns}
dataset = (
    dataset
    .rename(columns=column_mapping)
    .rename(columns={'NT-proBNP': 'NTproBNP'})
)

In [6]:
# Check the column names after the rename step.
dataset.columns

Index(['group', 'ID', 'outcome', 'age', 'gendera', 'BMI', 'hypertensive',
       'atrialfibrillation', 'CHD_with_no_MI', 'diabetes', 'deficiencyanemias',
       'depression', 'Hyperlipemia', 'Renal_failure', 'COPD', 'heart_rate',
       'Systolic_blood_pressure', 'Diastolic_blood_pressure',
       'Respiratory_rate', 'temperature', 'SP_O2', 'Urine_output',
       'hematocrit', 'RBC', 'MCH', 'MCHC', 'MCV', 'RDW', 'Leucocyte',
       'Platelets', 'Neutrophils', 'Basophils', 'Lymphocyte', 'PT', 'INR',
       'NTproBNP', 'Creatine_kinase', 'Creatinine', 'Urea_nitrogen', 'glucose',
       'Blood_potassium', 'Blood_sodium', 'Blood_calcium', 'Chloride',
       'Anion_gap', 'Magnesium_ion', 'PH', 'Bicarbonate', 'Lactic_acid',
       'PCO2', 'EF'],
      dtype='object')

In [7]:
# Relative frequency of each value of the 'diabetes' flag.
dataset['diabetes'].value_counts(normalize=True)

diabetes
0    0.57859
1    0.42141
Name: proportion, dtype: float64

In [8]:
def train_validation_test_split(data):
    """Cut ``data`` into train / validation / test parts in its existing row order.

    The first split holds out 20% of the rows; that hold-out is then halved into
    the validation and test parts.

    Returns:
        A ``(train, validation, test)`` tuple.
    """
    # shuffle=False on purpose: shuffling is done later with the right seed so the
    # result stays identical with the external evaluation.
    train_part, holdout_part = train_test_split(data, test_size=0.20, shuffle=False)
    valid_part, test_part = train_test_split(holdout_part, test_size=0.50, shuffle=False)
    return train_part, valid_part, test_part

def byte_to_string_columns(data):
    """Decode bytes values found in object-dtype columns to UTF-8 strings.

    Only values that really are ``bytes``/``bytearray`` are decoded. Anything else
    in an object column (already-decoded strings, ``None``/NaN for missing values)
    is left untouched instead of raising ``AttributeError`` on ``.decode``.

    Args:
        data: DataFrame to process. It is modified in place.

    Returns:
        The same DataFrame object, for convenient re-assignment by the caller.

    Raises:
        UnicodeDecodeError: if a bytes value is not valid UTF-8.
    """
    def _decode(value):
        # An object column may mix bytes with str or missing values.
        if isinstance(value, (bytes, bytearray)):
            return value.decode("utf-8")
        return value

    for col, dtype in data.dtypes.items():
        if dtype == object:  # Numeric columns can never hold bytes; skip them.
            data[col] = data[col].apply(_decode)
    return data

In [9]:
# Run the bytes -> str pass over object-dtype columns, then cut the frame into
# train / validation / test parts in its existing row order (no shuffling).
dataset = byte_to_string_columns(dataset)
dataset_train, dataset_val, dataset_test = train_validation_test_split(dataset)

In [10]:
# Preview the training part of the split.
dataset_train

Unnamed: 0,group,ID,outcome,age,gendera,BMI,hypertensive,atrialfibrillation,CHD_with_no_MI,diabetes,...,Blood_sodium,Blood_calcium,Chloride,Anion_gap,Magnesium_ion,PH,Bicarbonate,Lactic_acid,PCO2,EF
0,1,125047,0.0,72,1,37.588179,0,0,0,1,...,138.750000,7.463636,109.166667,13.166667,2.618182,7.230000,21.166667,0.500000,40.0,55
1,1,139812,0.0,75,2,,0,0,0,0,...,138.888889,8.162500,98.444444,11.444444,1.887500,7.225000,33.444444,0.500000,78.0,55
2,1,109787,0.0,83,2,26.572634,0,0,0,0,...,140.714286,8.266667,105.857143,10.000000,2.157143,7.268000,30.571429,0.500000,71.5,35
3,1,130587,0.0,43,2,83.264629,0,0,0,0,...,138.500000,9.476923,92.071429,12.357143,1.942857,7.370000,38.571429,0.600000,75.0,55
4,1,138290,0.0,75,2,31.824842,1,0,0,0,...,136.666667,8.733333,104.500000,15.166667,1.650000,7.250000,22.000000,0.600000,50.0,55
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
936,2,157911,0.0,79,2,20.838931,0,1,0,0,...,145.000000,8.414286,102.444444,10.777778,2.042857,7.314000,36.333333,1.500000,74.4,35
937,2,199912,1.0,89,2,25.080360,1,0,0,1,...,135.666667,7.300000,104.666667,20.333333,1.450000,,15.666667,1.500000,,30
938,2,155024,0.0,76,2,22.160754,0,1,0,0,...,141.153846,7.854545,104.846154,15.076923,2.108333,7.460000,25.153846,1.500000,33.0,30
939,2,119703,0.0,60,2,28.321950,0,0,0,0,...,148.000000,7.966667,109.250000,12.250000,2.228571,7.350000,30.333333,1.500000,49.0,55


In [11]:
# Relative frequency of each value of the 'deficiencyanemias' flag.
dataset['deficiencyanemias'].value_counts(normalize=True)

deficiencyanemias
0    0.661003
1    0.338997
Name: proportion, dtype: float64

In [12]:
# Prompt template for one patient note. Each ${name} placeholder is spelled like a
# column of the renamed dataset; the values are filled in later through
# note_generator.substitute(row).
# NOTE(review): the ${...} syntax looks like string.Template-style substitution —
# confirm against helper.note_template.
# The text inside the triple quotes is runtime data: do not reformat it.
template = """
The information of the patient is:
    The age is ${age},
    The gender is ${gendera},
    The BMI index is ${BMI},
    The patient is hypertensive: ${hypertensive},
    The patient has atrial fibrillation: ${atrialfibrillation},
    The patient has Ischaemic heart disease: ${CHD_with_no_MI},
    The patient has diabetes: ${diabetes},
    The patient has Hypoferric anaemia: ${deficiencyanemias},
    The patient has depression: ${depression},
    The patient has Hyperlipemia: ${Hyperlipemia},
    The patient has Chronic renal insufficiency: ${Renal_failure},
    The patient has Chronic obstructive pulmonary disease: ${COPD},
    The heart rate is ${heart_rate},
    The Systolic blood pressure is ${Systolic_blood_pressure},
    The Diastolic blood pressure is ${Diastolic_blood_pressure},
    The Respiratory rate is ${Respiratory_rate},
    The temperature is ${temperature},
    The saturation pulse oxygen is ${SP_O2},
    The Urine output in the first 24 hours is ${Urine_output},
    The hematocrit is ${hematocrit},
    The red blood cell is ${RBC},
    The mean corpuscular hemoglobin is ${MCH},
    The mean corpuscular hemoglobin concentration is ${MCHC},
    The mean corpuscular volume is ${MCV},
    The red cell distribution width is ${RDW},
    The Leucocyte is ${Leucocyte},
    The Platelets are ${Platelets},
    The Neutrophils are ${Neutrophils},
    The Basophils are ${Basophils},
    The Lymphocyte is ${Lymphocyte},
    The Prothrombin time is ${PT},
    The International Normalized Ratio is ${INR},
    The aminoterminal pro B-type natriuretic peptide is ${NTproBNP},
    The Creatine kinase is ${Creatine_kinase},
    The Creatinine is ${Creatinine},
    The Urea nitrogen is ${Urea_nitrogen},
    The glucose is ${glucose},
    The Blood potassium is ${Blood_potassium},
    The Blood sodium is ${Blood_sodium},
    The Blood calcium is ${Blood_calcium},
    The Chloride is ${Chloride},
    The Anion gap is ${Anion_gap},
    The Magnesium ion is ${Magnesium_ion},
    The PH is ${PH},
    The Bicarbonate is ${Bicarbonate},
    The Lactic acid is ${Lactic_acid},
    The partial pressure of carbon dioxide is ${PCO2},
    The ejection fraction is ${EF}.

    From the given information, predict the outcome {Alive,Dead} after the patient is admitted in ICU. Answer: 
"""


In [13]:
# Column names again, for cross-checking against the ${...} placeholders in the template.
dataset.columns

Index(['group', 'ID', 'outcome', 'age', 'gendera', 'BMI', 'hypertensive',
       'atrialfibrillation', 'CHD_with_no_MI', 'diabetes', 'deficiencyanemias',
       'depression', 'Hyperlipemia', 'Renal_failure', 'COPD', 'heart_rate',
       'Systolic_blood_pressure', 'Diastolic_blood_pressure',
       'Respiratory_rate', 'temperature', 'SP_O2', 'Urine_output',
       'hematocrit', 'RBC', 'MCH', 'MCHC', 'MCV', 'RDW', 'Leucocyte',
       'Platelets', 'Neutrophils', 'Basophils', 'Lymphocyte', 'PT', 'INR',
       'NTproBNP', 'Creatine_kinase', 'Creatinine', 'Urea_nitrogen', 'glucose',
       'Blood_potassium', 'Blood_sodium', 'Blood_calcium', 'Chloride',
       'Anion_gap', 'Magnesium_ion', 'PH', 'Bicarbonate', 'Lactic_acid',
       'PCO2', 'EF'],
      dtype='object')

In [14]:
def identify_binary_columns(df):
    """List the columns of ``df`` that take exactly two distinct non-missing values.

    Missing values are dropped before counting, so a column holding two values plus
    NaN still qualifies. Columns are returned in their original order.
    """
    def _has_two_values(name):
        distinct = df[name].dropna().unique()
        return len(distinct) == 2

    return [name for name in df.columns if _has_two_values(name)]

# Two-valued columns of the full dataset; these are the candidates for a
# code -> label lookup when rendering notes.
binary_columns = identify_binary_columns(dataset)
print("Binary columns:", binary_columns)

Binary columns: ['group', 'outcome', 'gendera', 'hypertensive', 'atrialfibrillation', 'CHD_with_no_MI', 'diabetes', 'deficiencyanemias', 'depression', 'Hyperlipemia', 'Renal_failure', 'COPD']


In [15]:
# Class counts of the target. value_counts() leaves out missing values by default:
# the recorded counts add up to 1176 while the frame has 1177 rows, so one row has
# no outcome recorded.
dataset['outcome'].value_counts()

outcome
0.0    1017
1.0     159
Name: count, dtype: int64

In [16]:
# Spot-check one renamed numeric column.
dataset['Systolic_blood_pressure']

0       155.866667
1       140.000000
2       135.333333
3       126.400000
4       156.560000
           ...    
1172    142.545455
1173    101.222222
1174    137.791667
1175     94.416667
1176    131.523077
Name: Systolic_blood_pressure, Length: 1177, dtype: float64

In [17]:
# Code tables for the categorical columns; keys are the raw codes stored in the data.
gender_dict = {1: 'Male', 2: 'Female'}
bmi_dict = {'NA': 'Unknown'}
hypertensive_dict = {1: 'Yes', 0: 'No'}
atrialfibrillation_dict = {0: 'No', 1: 'Yes'}
CHD_dict = {0: 'No', 1: 'Yes'}
diabetes_dict = {0: 'No', 1: 'Yes'}
anaemia_dict = {0: 'No', 1: 'Yes'}
depression_dict = {0: 'No', 1: 'Yes'}
Hyperlipemia_dict = {0: 'No', 1: 'Yes'}
renal_dict = {0: 'No', 1: 'Yes'}
COPD_dict = {0: 'No', 1: 'Yes'}


def _decode_with(code_table):
    """Build a formatter that looks a raw code up in ``code_table``."""
    return lambda x: code_table[x]


def _two_decimals(x):
    """Render a number with exactly two digits after the decimal point."""
    return f"{x:.2f}"


# Yes/No comorbidity flags, in the order they appear in the formatter mapping.
_YES_NO_TABLES = {
    'hypertensive': hypertensive_dict,
    'atrialfibrillation': atrialfibrillation_dict,
    'CHD_with_no_MI': CHD_dict,
    'diabetes': diabetes_dict,
    'deficiencyanemias': anaemia_dict,
    'depression': depression_dict,
    'Hyperlipemia': Hyperlipemia_dict,
    'Renal_failure': renal_dict,
    'COPD': COPD_dict,
}

# Measurements rendered with two decimals.
_TWO_DECIMAL_COLUMNS = (
    'heart_rate',
    'Systolic_blood_pressure',
    'Diastolic_blood_pressure',
    'Respiratory_rate',
    'temperature',
    'SP_O2',
    'Urine_output',
    'hematocrit',
    'Creatinine',
    'glucose',
    'Blood_potassium',
    'Blood_calcium',
    'Chloride',
    'Anion_gap',
    'Magnesium_ion',
    'Bicarbonate',
)

# Per-column formatters, keyed by column name, handed to NoteTemplate as 'pre'.
_pre_formatters = {
    'age': lambda x: f"{int(x)}",
    'gendera': _decode_with(gender_dict),
    # A missing BMI (NaN or the literal 'NA') is spelled out rather than formatted.
    'BMI': lambda x: 'unknown' if (pd.isna(x) or x == 'NA') else f"{x:.2f}",
}
_pre_formatters.update(
    (column, _decode_with(table)) for column, table in _YES_NO_TABLES.items()
)
_pre_formatters.update((column, _two_decimals) for column in _TWO_DECIMAL_COLUMNS)

template_config = {'pre': _pre_formatters}

In [18]:
# Build the template object from the prompt text; template_config is unpacked, so
# its per-column formatters arrive as the keyword argument 'pre'.
note_generator = NoteTemplate(template, **template_config)

In [19]:
# Fill the template once per training row and pass each result through
# NoteGenerator.clean_note.
notes = [
    NoteGenerator.clean_note(note_generator.substitute(patient_row))
    for _row_label, patient_row in dataset_train.iterrows()
]
old_size_notes = len(notes)
# Optional window over the notes; as written it spans the whole list.
start, end = 0, old_size_notes
notes = notes[start:end]
print(f"Only consider dataset range between {start} and {end} (total: {old_size_notes})")

Only consider dataset range between 0 and 941 (total: 941)


In [20]:
# Show the rendered training notes (the full list: expect a very long output).
notes

['\nThe information of the patient is:\nThe age is 72,\nThe gender is Male,\nThe BMI index is 37.59,\nThe patient is hypertensive: No,\nThe patient has atrial fibrillation: No,\nThe patient has Ischaemic heart disease: No,\nThe patient has diabetes: Yes,\nThe patient has Hypoferric anaemia: Yes,\nThe patient has depression: No,\nThe patient has Hyperlipemia: Yes,\nThe patient has Chronic renal insufficiency: Yes,\nThe patient has Chronic obstructive pulmonary disease: No,\nThe heart rate is 68.84,\nThe Systolic blood pressure is 155.87,\nThe Diastolic blood pressure is 68.33,\nThe Respiratory rate is 16.62,\nThe temperature is 36.71,\nThe saturation pulse oxygen is 98.39,\nThe Urine output in the first 24 hours is 2155.00,\nThe hematocrit is 26.27,\nThe red blood cell is 2.96,\nThe mean corpuscular hemoglobin is 28.25,\nThe mean corpuscular hemoglobin concentration is 31.52,\nThe mean corpuscular volume is 89.9,\nThe red cell distribution width is 16.22,\nThe Leucocyte is 7.65,\nThe Pl

In [21]:
# Labels stay numeric: the 'outcome' column is used as-is. An earlier variant mapped
# 0 to "Alive" and every other value to "Dead".
labels = dataset_train['outcome']

In [22]:
# Inspect the training labels.
labels

0      0.0
1      0.0
2      0.0
3      0.0
4      0.0
      ... 
936    0.0
937    1.0
938    0.0
939    0.0
940    0.0
Name: outcome, Length: 941, dtype: float64

In [23]:
# Pair each rendered note with its label: one row per training patient.
train = dict(Text=notes, label=labels)
train_df = pd.DataFrame(train)

In [24]:
# Preview the text/label training frame.
train_df

Unnamed: 0,Text,label
0,\nThe information of the patient is:\nThe age ...,0.0
1,\nThe information of the patient is:\nThe age ...,0.0
2,\nThe information of the patient is:\nThe age ...,0.0
3,\nThe information of the patient is:\nThe age ...,0.0
4,\nThe information of the patient is:\nThe age ...,0.0
...,...,...
936,\nThe information of the patient is:\nThe age ...,0.0
937,\nThe information of the patient is:\nThe age ...,1.0
938,\nThe information of the patient is:\nThe age ...,0.0
939,\nThe information of the patient is:\nThe age ...,0.0


In [25]:
# Persist the training split; index=False leaves the row index out of the file.
train_df.to_csv('Data/train.csv', index = False)

In [26]:
# Same rendering as for the training split, applied to the validation rows.
notes = [
    NoteGenerator.clean_note(note_generator.substitute(patient_row))
    for _row_label, patient_row in dataset_val.iterrows()
]
old_size_notes = len(notes)
# Optional window over the notes; as written it spans the whole list.
start, end = 0, old_size_notes
notes = notes[start:end]
print(f"Only consider dataset range between {start} and {end} (total: {old_size_notes})")

# Numeric outcome column as the label, paired with the notes.
labels = dataset_val['outcome']
val = dict(Text=notes, label=labels)
val_df = pd.DataFrame(val)

Only consider dataset range between 0 and 118 (total: 118)


In [27]:
# Persist the validation split; index=False leaves the row index out of the file.
val_df.to_csv('Data/valid.csv', index = False)

In [28]:
# Same rendering as for the training split, applied to the test rows.
notes = [
    NoteGenerator.clean_note(note_generator.substitute(patient_row))
    for _row_label, patient_row in dataset_test.iterrows()
]
old_size_notes = len(notes)
# Optional window over the notes; as written it spans the whole list.
start, end = 0, old_size_notes
notes = notes[start:end]
print(f"Only consider dataset range between {start} and {end} (total: {old_size_notes})")

# Numeric outcome column as the label, paired with the notes.
labels = dataset_test['outcome']
test = dict(Text=notes, label=labels)
test_df = pd.DataFrame(test)

Only consider dataset range between 0 and 118 (total: 118)


In [29]:
# Persist the test split; index=False leaves the row index out of the file.
test_df.to_csv('Data/test.csv', index = False)

In [None]:
import json
import openai
import requests
from tenacity import retry, wait_random_exponential, stop_after_attempt
from termcolor import colored

# From the IPython.display package, import display and Markdown
from IPython.display import display, Markdown

# Name of the OpenAI chat model this notebook targets.
model="gpt-3.5-turbo"

In [None]:
# Look at the rendered note of one training example (positional row 16).
train_df.iloc[16]['Text']

In [None]:
# Label of the same training example. train_df is built with the columns
# 'Text' and 'label'; there is no 'Labels' column, so that key raised KeyError.
train_df.iloc[16]['label']

In [None]:
import os
from openai import OpenAI

# The API key is read from the environment so that it never ends up in the saved
# notebook. Export OPENAI_API_KEY before starting the kernel; a missing variable
# fails right here with a KeyError.
client = OpenAI(
    api_key=os.environ["OPENAI_API_KEY"],
)

context = 'The dataset is based on a study that aimed to develop and validate a prediction model for in-hospital mortality among ICU-admitted heart failure patients using the Medical Information Mart for Intensive Care (MIMIC-III) database. The dataset includes comprehensive information such as demographics, vital signs, comorbidities, and laboratory test results for heart failure patients. The primary outcome is the in-hospital mortality of these patients.'

# Task instruction for the mortality prediction. Both label sets must read
# {Alive,Dead}: the answer is parsed from the last token of the reply.
prompt = "From the given information, predict the outcome {Alive,Dead} after the patient is in ICU. Give reasoning for your prediction and the last token should be the prediction {Alive,Dead}"

# Patient note used for this single-example trial run
user_text = train_df.iloc[16]['Text']

# Assemble background, patient note and task into one user message
input_text = f" <Context> {context} <Patient Information> {user_text} <Prediction Task> {prompt} "

chat_completion = client.chat.completions.create(
    messages=[
        {"role": "system", "content": "You are a medical diagnosis assistant."},
        {"role": "user", "content": input_text},
    ],
    model=model,
)

response_content = chat_completion.choices[0].message.content

# Take the last whitespace-separated token and drop punctuation around it, so a
# reply ending in "Dead." yields "Dead". An empty or missing reply yields "".
reply_tokens = (response_content or "").split()
last_token = re.sub(r"^\W+|\W+$", "", reply_tokens[-1]) if reply_tokens else ""

print("Response:", response_content)
print("Last token:", last_token)

In [None]:
def get_prediction(row, model="gpt-3.5-turbo"):
    """Ask the chat model for an Alive/Dead prediction for one patient note.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) whose ``'Text'`` entry
            holds the rendered patient note.
        model: Name of the chat model to query. Defaults to the model the
            notebook used originally.

    Returns:
        The last whitespace-separated token of the reply with punctuation around
        it removed (so ``"Dead."`` or ``"**Dead**"`` become ``"Dead"``), or ``""``
        when the reply is empty or missing.

    Relies on the module-level ``client`` created in an earlier cell.
    """
    user_text = row['Text']
    # Both label sets must read {Alive,Dead}; the second one used to say "Alice".
    prompt = (
        "From the given information, predict the outcome {Alive,Dead} after the patient is in ICU. "
        "Give reasoning for your prediction and the last token should be the prediction {Alive,Dead}"
    )

    input_text = f"{prompt} {user_text}"

    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": input_text,
            }
        ],
        model=model,
    )
    response_content = chat_completion.choices[0].message.content

    # The content may be None or blank; indexing [-1] on an empty split would raise.
    reply_tokens = (response_content or "").split()
    if not reply_tokens:
        return ""
    # Models often end the sentence with punctuation; strip it so that downstream
    # equality checks against 'Dead' / 'Alive' work.
    return re.sub(r"^\W+|\W+$", "", reply_tokens[-1])

In [None]:
# One API call per test row; the returned token is stored as the raw prediction.
test_df['prediction'] = test_df.apply(get_prediction, axis=1)

In [None]:
# Turn the free-text answer into a binary label: 1 for "Dead", 0 for anything else.
# The comparison ignores case and any punctuation or whitespace around the word,
# so answers such as "Dead." or "dead" are no longer silently counted as 0.
test_df['pred'] = (
    test_df['prediction']
    .astype(str)
    .str.replace(r'^\W+|\W+$', '', regex=True)
    .str.lower()
    .eq('dead')
    .astype(int)
)

In [None]:
# Preview the test frame with the prediction columns added.
test_df

In [None]:
# Distribution of the raw predicted tokens; tokens other than Alive/Dead show up here.
test_df['prediction'].value_counts()

In [None]:
# Distribution of the true labels. test_df is built with the columns 'Text' and
# 'label'; there is no 'Labels' column, so that key raised KeyError.
test_df['label'].value_counts()