In [1]:
import sqlite3
import pandas as pd
import re

## Sqlite3 Setup

In [2]:
#relative path to where the mimic3.db file is
db_path = 'F:/mimic-iii-clinical-database-1.4/mimic3.db'
#connection object to db
sqliteConnection = sqlite3.connect(db_path)
#cursor/pointer
mimiciii = sqliteConnection.cursor()

## Helper Functions

In [3]:
def get_col_names(cursor, table_name):
    '''
    Retrieves the column names for a table in a sqlite3 db.
    ------
    cursor: sqliteConnection cursor object
    table_name: table_name to get column names for
    '''
    cursor.execute(f"""
    SELECT sql FROM sqlite_master WHERE name='{table_name}';
    """)
    
    res = mimiciii.fetchall()
    cols = re.findall(r'\"\w+\"', res[0][0])
    return [x[1:-1] for x in cols]

In [4]:
def get_df_from_table_from_db(cursor, table_name, num_rows='*', skip_cols=[]):
    '''
    Retreives table from sqlite3 db in form of df
    ------
    cursor: sqliteConnection cursor object
    table_name: name of table to get from cursor db
    num_rows: number of rows to retrieve (or '*' for all rows)
    skip_cols: list of columns to skip in the retrieval
    '''
    col_names = get_col_names(cursor, table_name)
    
    use_cols = [col for col in col_names if col not in skip_cols]
    
    if num_rows == '*':
        query = f'''select {', '.join(use_cols)} from {table_name};'''
    else:
        query = f'''select {', '.join(use_cols)} from {table_name} limit {num_rows};'''
        
    cursor.execute(query)
    rows = cursor.fetchall()
    
    return pd.DataFrame(rows, columns=use_cols)

In [5]:
def get_tables_list_from_db(cursor):
    cursor.execute("""
    select name from sqlite-master where type='table';
    """)
    table_names = [table[0] for table in cursor.fetchall()]
    return table_names

In [6]:
def to_int(x):
    if not x or np.isnan(x):
        return 0
    return int(x)

## Demo

In [7]:
#get admission table as df
admission = get_df_from_table_from_db(mimiciii, 'admissions')
admission = admission.apply(lambda x: x.astype(str).str.upper())
admission.HADM_ID = admission.HADM_ID.astype('int64')
admission.SUBJECT_ID = admission.SUBJECT_ID.astype('int64')

admission.head()

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ADMITTIME,DISCHTIME,DEATHTIME,ADMISSION_TYPE,ADMISSION_LOCATION,DISCHARGE_LOCATION,INSURANCE,LANGUAGE,RELIGION,MARITAL_STATUS,ETHNICITY,EDREGTIME,EDOUTTIME,DIAGNOSIS,HOSPITAL_EXPIRE_FLAG,HAS_CHARTEVENTS_DATA
0,21,22,165315,2196-04-09 12:26:00,2196-04-10 15:54:00,NONE,EMERGENCY,EMERGENCY ROOM ADMIT,DISC-TRAN CANCER/CHLDRN H,PRIVATE,NONE,UNOBTAINABLE,MARRIED,WHITE,2196-04-09 10:06:00,2196-04-09 13:24:00,BENZODIAZEPINE OVERDOSE,0,1
1,22,23,152223,2153-09-03 07:15:00,2153-09-08 19:10:00,NONE,ELECTIVE,PHYS REFERRAL/NORMAL DELI,HOME HEALTH CARE,MEDICARE,NONE,CATHOLIC,MARRIED,WHITE,NONE,NONE,CORONARY ARTERY DISEASE\CORONARY ARTERY BYPASS...,0,1
2,23,23,124321,2157-10-18 19:34:00,2157-10-25 14:00:00,NONE,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,HOME HEALTH CARE,MEDICARE,ENGL,CATHOLIC,MARRIED,WHITE,NONE,NONE,BRAIN MASS,0,1
3,24,24,161859,2139-06-06 16:14:00,2139-06-09 12:48:00,NONE,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,HOME,PRIVATE,NONE,PROTESTANT QUAKER,SINGLE,WHITE,NONE,NONE,INTERIOR MYOCARDIAL INFARCTION,0,1
4,25,25,129635,2160-11-02 02:06:00,2160-11-05 14:55:00,NONE,EMERGENCY,EMERGENCY ROOM ADMIT,HOME,PRIVATE,NONE,UNOBTAINABLE,MARRIED,WHITE,2160-11-02 01:01:00,2160-11-02 04:27:00,ACUTE CORONARY SYNDROME,0,1


In [8]:
septic_adm = admission[admission.DIAGNOSIS.str.contains('SEPTICEMIA') | admission.DIAGNOSIS.str.contains('SEPSIS')]

septic_adm['SUBJECT_ID'] = septic_adm['SUBJECT_ID'].astype(int)
septic_adm.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  septic_adm['SUBJECT_ID'] = septic_adm['SUBJECT_ID'].astype(int)


Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ADMITTIME,DISCHTIME,DEATHTIME,ADMISSION_TYPE,ADMISSION_LOCATION,DISCHARGE_LOCATION,INSURANCE,LANGUAGE,RELIGION,MARITAL_STATUS,ETHNICITY,EDREGTIME,EDOUTTIME,DIAGNOSIS,HOSPITAL_EXPIRE_FLAG,HAS_CHARTEVENTS_DATA
11,32,33,176176,2116-12-23 22:30:00,2116-12-27 12:05:00,NONE,EMERGENCY,EMERGENCY ROOM ADMIT,HOME,MEDICARE,NONE,PROTESTANT QUAKER,MARRIED,UNKNOWN/NOT SPECIFIED,2116-12-23 14:42:00,2116-12-23 23:41:00,SEPSIS;TELEMETRY,0,1
19,40,38,185910,2166-08-10 00:28:00,2166-09-04 11:30:00,NONE,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,LONG TERM CARE HOSPITAL,MEDICARE,NONE,CATHOLIC,WIDOWED,WHITE,NONE,NONE,ACUTE MYOCARDIAL INFARCTION-SEPSIS,0,1
24,458,357,122609,2198-11-01 22:36:00,2198-11-14 14:20:00,NONE,EMERGENCY,EMERGENCY ROOM ADMIT,REHAB/DISTINCT PART HOSP,PRIVATE,ENGL,NOT SPECIFIED,MARRIED,WHITE,2198-11-01 18:01:00,2198-11-01 23:06:00,SEPSIS,0,1
37,471,366,134462,2164-11-18 20:27:00,2164-11-22 15:18:00,NONE,EMERGENCY,EMERGENCY ROOM ADMIT,HOME HEALTH CARE,MEDICARE,ENGL,CATHOLIC,SINGLE,HISPANIC OR LATINO,2164-11-18 10:52:00,2164-11-18 21:31:00,SEPSIS,0,1
64,62,62,116009,2113-02-15 00:19:00,2113-02-19 15:30:00,NONE,EMERGENCY,EMERGENCY ROOM ADMIT,HOME,MEDICARE,NONE,NOT SPECIFIED,MARRIED,PATIENT DECLINED TO ANSWER,2113-02-14 19:55:00,2113-02-15 01:17:00,"SEPSIS,URINARY TRACT INFECTION",0,1


In [9]:
septic_drg = drgcodes[drgcodes.DESCRIPTION.str.contains('SEPTIC')]

septic_drg['SUBJECT_ID'] = septic_drg['SUBJECT_ID'].astype(int)
septic_drg.head()

NameError: name 'drgcodes' is not defined

In [None]:
print(septic_adm.shape)
print(septic_drg.shape)

In [None]:
#get sepsis notes list ofr subject id
tmp1 = pd.merge(septic_adm, noteevents_meta, on='SUBJECT_ID', how='left').ROW_ID_y.to_list()
tmp2 = pd.merge(septic_drg, noteevents_meta, on='SUBJECT_ID', how='left').ROW_ID_y.to_list()
notes_set = set(tmp1 + tmp2)

In [None]:
#sepsis notes
septic_notes = get_df_from_table_from_db(mimiciii, 'noteevents', skip_cols=['CHARTDATE', 'CHARTTIME', 'STORETIME'])
septic_notes = septic_notes[septic_notes.SUBJECT_ID.isin(notes_set)]
septic_notes.head()

In [None]:
#pre-proc sepsis notes
septic_notes_list = septic_notes.TEXT.to_list()

filtered_notes = []
stop_words = set(stopwords.words('english') + ['*'])

for note in tqdm(septic_notes_list):
    tokens = word_tokenize(note)
    filtered_text = ' '.join([word for word in tokens if word.lower() not in stop_words])
    filtered_notes.append(filtered_text)

In [None]:
septic_notes['FILTERED_TEXT'] = filtered_notes
septic_notes.to_csv('septic_notes_filtered.csv', index=None)

In [None]:
'sepsis' in sample_chart

In [None]:
sample_chart = septic_notes.iloc[0].FILTERED_TEXT
llm_chain = LLMChain(prompt=prompt,
                     llm=ChatOpenAI(model_name='gpt-3.5-turbo', temperature=1e-5))

In [None]:
sample_chart = septic_notes.iloc[0].FILTERED_TEXT[:1000]
llm_chain = LLMChain(prompt=prompt,
                     llm=HuggingFaceHub(repo_id='google/flan-t5-xxl',
                                        model_kwargs={
                                            'temperature': 1e-5
                                        }))

In [None]:
print(llm_chain.run(question=question, chart=sample_chart))

In [None]:
template = """Question: {question}

Answer: """
prompt = PromptTemplate(
        template=template,
    input_variables=['question']
)

# user question
question = "Which NFL team won the Super Bowl in the 2010 season?"

In [None]:
# initialize Hub LLM
hub_llm = HuggingFaceHub(
        repo_id='google/flan-t5-xxl',
    model_kwargs={'temperature':1e-10}
)

# create prompt template > LLM chain
llm_chain = LLMChain(
    prompt=prompt,
    llm=hub_llm
)

# ask the user question about NFL 2010
print(llm_chain.run(question))

In [None]:
db = SQLDatabase.from_uri('sqlite:///F:/mimic-iii-clinical-database-1.4/mimic3.db')
db_chain = SQLDatabaseChain(llm=HuggingFaceHub(repo_id='google/flan-t5-xxl',
                                        model_kwargs={
                                            'temperature': .01
                                        }),
                            database=db,
                            verbose=True)

db_chain.run("How many tables are there?")