In [2]:
import sys
sys.path.append('../')

import regex as re
import pandas as pd
from src.utils.preprocessing import (load_data,
                                    save_data,
                                    build_combined_discharge,
                                    get_bhc_input,
                                    extract_clean_inputs,
                                    remove_unecessary_tokens)

from src.utils.token_count import get_token_list, get_token_count, plot_token_count
from src.utils.format_change import dataframe_to_jsonl
from tqdm import tqdm
# Register tqdm's `progress_apply` on pandas objects; it is used further down
# wherever a column is cleaned or token-counted.
tqdm.pandas()

# Autoreload (mode 2): re-import modules before each cell executes, so edits to
# the imported project helpers take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
# Root folder of the dataset splits, relative to this notebook.
data_path = "../data/"

In [18]:
# For each split: load the discharge notes and their targets, then hand both
# frames to build_combined_discharge.
train_discharge, train_targets = (
    load_data(data_path + 'train/discharge.csv.gz'),
    load_data(data_path + 'train/discharge_target.csv.gz'),
)
train_combined_discharge = build_combined_discharge(train_discharge, train_targets)

valid_discharge, valid_targets = (
    load_data(data_path + 'valid/discharge.csv.gz'),
    load_data(data_path + 'valid/discharge_target.csv.gz'),
)
valid_combined_discharge = build_combined_discharge(valid_discharge, valid_targets)

test_discharge, test_targets = (
    load_data(data_path + 'test_phase_1/discharge.csv.gz'),
    load_data(data_path + 'test_phase_1/discharge_target.csv.gz'),
)
test_combined_discharge = build_combined_discharge(test_discharge, test_targets)

# Report the number of rows in each combined split (train, valid, test).
for combined_split in (train_combined_discharge, valid_combined_discharge, test_combined_discharge):
    print(len(combined_split))

68785
14719
14702


**Stage1: First cleaning by meaningful extraction**
i.e. keep and organize the relevant sections without looking too much into the details

**Stage2: Removing some less important sections**
- Remove `social_history` and `family_history`

**Stage3 cleaning of specific token groups**
- Replacing 2 or more `__` by only one `_`
- Replacing at least 2 `======` by `\n`

**Stage4 adding the key behind the BHC section of input_of_di**

**Stage5 adding the prompt to the input**

**Stage6 filtering out the samples longer than 2k**

**Stage7 loading into the jsonl files**

In [19]:
# Sections of the discharge note used to build the brief-hospital-course (BHC) input.
bhc_feature_names = (
    'sex',
    'allergies',
    'chief_complaint',
    'major_surgical_procedures',
    'history_of_present_illness',
    'past_medical_history',
    'physical_exam',
    'pertinent_results',
)

# Each call receives its own list, just as the separate inline literals did.
clean_bhc_train_input = extract_clean_inputs(
    train_combined_discharge,
    features_to_include=list(bhc_feature_names),
)

clean_bhc_valid_input = extract_clean_inputs(
    valid_combined_discharge,
    features_to_include=list(bhc_feature_names),
)

  0%|          | 0/68785 [00:00<?, ?it/s]

100%|██████████| 68785/68785 [00:42<00:00, 1637.72it/s]
100%|██████████| 14719/14719 [00:09<00:00, 1627.05it/s]


In [20]:
# Sections of the discharge note used to build the discharge-instructions (DI) input.
di_feature_names = (
    'medication_on_admission',
    'discharge_medications',
    'discharge_disposition',
    'discharge_diagnosis',
    'discharge_condition',
)

# Each call receives its own list, just as the separate inline literals did.
clean_di_train_input = extract_clean_inputs(
    train_combined_discharge,
    features_to_include=list(di_feature_names),
)

clean_di_valid_input = extract_clean_inputs(
    valid_combined_discharge,
    features_to_include=list(di_feature_names),
)

  0%|          | 0/68785 [00:00<?, ?it/s]

100%|██████████| 68785/68785 [00:04<00:00, 16142.75it/s]
100%|██████████| 14719/14719 [00:00<00:00, 17078.50it/s]


In [21]:
# Test split: extract the BHC sections first, then the DI sections.
test_bhc_feature_names = [
    'sex',
    'allergies',
    'chief_complaint',
    'major_surgical_procedures',
    'history_of_present_illness',
    'past_medical_history',
    'physical_exam',
    'pertinent_results',
]
clean_bhc_test_input = extract_clean_inputs(
    test_combined_discharge,
    features_to_include=test_bhc_feature_names,
)

test_di_feature_names = [
    'medication_on_admission',
    'discharge_medications',
    'discharge_disposition',
    'discharge_diagnosis',
    'discharge_condition',
]
clean_di_test_input = extract_clean_inputs(
    test_combined_discharge,
    features_to_include=test_di_feature_names,
)

  0%|          | 0/14702 [00:00<?, ?it/s]

100%|██████████| 14702/14702 [00:08<00:00, 1636.92it/s]
100%|██████████| 14702/14702 [00:00<00:00, 16837.44it/s]


In [22]:
# Both task prompts share the same wording and differ only in the name of the
# section the model is asked to write.
prompt_template = "You are a medical assistant. Your task is to write the {} corresponding to the following hospital discharge.\n\n"

system_prompt1 = prompt_template.format("brief hospital course")   # BHC task
system_prompt2 = prompt_template.format("discharge instructions")  # DI task

In [23]:
# Build the BHC and DI model inputs for every split, clean them, and record
# token counts. All three splits go through exactly the same steps, so they are
# processed in one loop instead of one copy-pasted line per split and column.
#
# NOTE(review): the system prompt is prepended *before* remove_unecessary_tokens
# runs, so the prompt text itself is also passed through the cleaner — confirm
# that this is intended.
split_inputs = (
    (train_combined_discharge, clean_bhc_train_input, clean_di_train_input),
    (valid_combined_discharge, clean_bhc_valid_input, clean_di_valid_input),
    (test_combined_discharge, clean_bhc_test_input, clean_di_test_input),
)

for split_df, clean_bhc_input, clean_di_input in split_inputs:
    # BHC input: system prompt followed by the extracted sections.
    split_df['input_of_bhc'] = clean_bhc_input
    split_df['input_of_bhc'] = system_prompt1 + split_df['input_of_bhc']

    # DI input: system prompt, then the brief hospital course under its own
    # heading, then the extracted DI sections. (The earlier plain assignment of
    # the extracted sections to this column was overwritten immediately and has
    # been dropped.)
    split_df['input_of_di'] = "Brief Hospital Course:\n" + split_df['brief_hospital_course'] + "\n\n" + clean_di_input
    split_df['input_of_di'] = system_prompt2 + split_df['input_of_di']

    # remove the unecessary tokens from the input_of_bhc and input_of_di
    for input_column in ('input_of_bhc', 'input_of_di'):
        split_df[input_column] = split_df[input_column].progress_apply(remove_unecessary_tokens)

    # count the number of tokens in the targets and in the cleaned inputs
    split_df['bhc_token_count'] = split_df['brief_hospital_course'].progress_apply(get_token_count)
    split_df['input_of_bhc_token_count'] = split_df['input_of_bhc'].progress_apply(get_token_count)
    split_df['di_token_count'] = split_df['discharge_instructions'].progress_apply(get_token_count)
    split_df['input_of_di_token_count'] = split_df['input_of_di'].progress_apply(get_token_count)

100%|██████████| 68785/68785 [00:44<00:00, 1531.92it/s]
100%|██████████| 14719/14719 [00:09<00:00, 1528.00it/s]
100%|██████████| 14702/14702 [00:09<00:00, 1536.51it/s]
100%|██████████| 68785/68785 [00:41<00:00, 1648.40it/s]
100%|██████████| 14719/14719 [00:08<00:00, 1658.20it/s]
100%|██████████| 14702/14702 [00:08<00:00, 1656.50it/s]
100%|██████████| 68785/68785 [01:24<00:00, 814.00it/s] 
100%|██████████| 68785/68785 [02:59<00:00, 382.72it/s]
100%|██████████| 68785/68785 [00:51<00:00, 1323.22it/s]
100%|██████████| 68785/68785 [02:38<00:00, 432.82it/s]
100%|██████████| 14719/14719 [00:19<00:00, 771.41it/s]
100%|██████████| 14719/14719 [00:38<00:00, 384.32it/s]
100%|██████████| 14719/14719 [00:11<00:00, 1312.28it/s]
100%|██████████| 14719/14719 [00:33<00:00, 435.78it/s]
100%|██████████| 14702/14702 [00:19<00:00, 766.79it/s]
100%|██████████| 14702/14702 [00:38<00:00, 385.81it/s]
100%|██████████| 14702/14702 [00:11<00:00, 1324.14it/s]
100%|██████████| 14702/14702 [00:33<00:00, 435.81it/s]


In [24]:
# Keep only the samples whose input and target token counts sum to less than
# the budget. The empty-DataFrame placeholders that used to precede these
# assignments were overwritten immediately and have been removed.
max_total_tokens = 2048


def _within_token_budget(split_df, input_count_column, target_count_column):
    """Return the rows of `split_df` where input + target token counts are below `max_total_tokens`."""
    total_tokens = split_df[input_count_column] + split_df[target_count_column]
    return split_df[total_tokens < max_total_tokens]


bhc_train_df = _within_token_budget(train_combined_discharge, 'input_of_bhc_token_count', 'bhc_token_count')
bhc_valid_df = _within_token_budget(valid_combined_discharge, 'input_of_bhc_token_count', 'bhc_token_count')
bhc_test_df = _within_token_budget(test_combined_discharge, 'input_of_bhc_token_count', 'bhc_token_count')
di_train_df = _within_token_budget(train_combined_discharge, 'input_of_di_token_count', 'di_token_count')
di_valid_df = _within_token_budget(valid_combined_discharge, 'input_of_di_token_count', 'di_token_count')
di_test_df = _within_token_budget(test_combined_discharge, 'input_of_di_token_count', 'di_token_count')

# NOTE: despite the wording, the printed values are fractions in [0, 1]
# (kept rows / all rows), not percentages.
print('the percentage of the bhc train set remaining after filtering:', len(bhc_train_df)/len(train_combined_discharge))
print('the percentage of the bhc valid set remaining after filtering:', len(bhc_valid_df)/len(valid_combined_discharge))
print('the percentage of the bhc test set remaining after filtering:', len(bhc_test_df)/len(test_combined_discharge))
print('the percentage of the di train set remaining after filtering:', len(di_train_df)/len(train_combined_discharge))
print('the percentage of the di valid set remaining after filtering:', len(di_valid_df)/len(valid_combined_discharge))
print('the percentage of the di test set remaining after filtering:', len(di_test_df)/len(test_combined_discharge))

the percentage of the bhc train set remaining after filtering: 0.46239732499818276
the percentage of the bhc valid set remaining after filtering: 0.46463754331136625
the percentage of the bhc test set remaining after filtering: 0.45891715412869
the percentage of the di train set remaining after filtering: 0.7505851566475249
the percentage of the di valid set remaining after filtering: 0.7531761668591617
the percentage of the di test set remaining after filtering: 0.7516664399401441


In [25]:
# Destination files, one per task (BHC / DI) and split.
BHC_train_dataset = data_path + 'train/BHC_train_dataset_v2.jsonl'
DI_train_dataset = data_path + 'train/DI_train_dataset_v2.jsonl'
BHC_valid_dataset = data_path + 'valid/BHC_valid_dataset_v2.jsonl'
DI_valid_dataset = data_path + 'valid/DI_valid_dataset_v2.jsonl'
BHC_test_dataset = data_path + 'test_phase_1/BHC_test_dataset_v2.jsonl'
DI_test_dataset = data_path + 'test_phase_1/DI_test_dataset_v2.jsonl'

# (filtered frame, [input column, target column], output file) in write order.
export_jobs = [
    (bhc_train_df, ['input_of_bhc', 'brief_hospital_course'], BHC_train_dataset),
    (di_train_df, ['input_of_di', 'discharge_instructions'], DI_train_dataset),
    (bhc_valid_df, ['input_of_bhc', 'brief_hospital_course'], BHC_valid_dataset),
    (di_valid_df, ['input_of_di', 'discharge_instructions'], DI_valid_dataset),
    (bhc_test_df, ['input_of_bhc', 'brief_hospital_course'], BHC_test_dataset),
    (di_test_df, ['input_of_di', 'discharge_instructions'], DI_test_dataset),
]

# The input column is written under "prompt" and the target column under "gold".
for export_frame, export_columns, export_path in export_jobs:
    dataframe_to_jsonl(export_frame, attributes=export_columns, keys=['prompt', 'gold'], file_path=export_path)

31806it [00:03, 8375.50it/s]
51629it [00:05, 8654.17it/s]
6839it [00:00, 8628.55it/s]
11086it [00:01, 8616.07it/s]
6747it [00:00, 8531.90it/s]
11051it [00:01, 8621.22it/s]


## Rerun the code to construct the test set with hadm_id

In [5]:
# Reload the raw test files and rebuild the combined test frame from scratch.
test_discharge, test_targets = (
    load_data(data_path + 'test_phase_1/discharge.csv.gz'),
    load_data(data_path + 'test_phase_1/discharge_target.csv.gz'),
)
test_combined_discharge = build_combined_discharge(test_discharge, test_targets)

In [6]:
# Preview the first rows of the rebuilt test frame.
test_combined_discharge.head()

Unnamed: 0,hadm_id,text,discharge_instructions,brief_hospital_course
0,27988844,\nName: ___ Unit No: ___\n...,INSTRUCTIONS AFTER ORTHOPAEDIC SURGERY:\n\n- Y...,The patient presented to the emergency departm...
1,26381316,\nName: ___ Unit No: ___...,"Dear Ms. ___,\n\nYou were admitted to ___ for ...",___ yo f with h/o recently diagnosed metastati...
2,24947999,\nName: ___ Unit No: ___\n \nAdmi...,"Dear ___,\n\n___ were admitted to ___ on ___ w...","___ year old female with history of HTN, CVA, ..."
3,27060146,\nName: ___ Unit No: ___\n \n...,"Dear ___,\n\n___ did you come to the hospital?...",BRIEF SUMMARY:\n==============\n___ year old w...
4,28058085,\nName: ___ Unit No: ___\n \n...,"Dear Ms. ___, \n\nIt was a pleasure taking car...",Ms. ___ is an ___ year old woman with history ...


In [7]:
# Number of rows in the rebuilt test frame.
print(len(test_combined_discharge))

14702


In [8]:
# Re-extract the BHC sections and then the DI sections from the rebuilt test frame.
rerun_bhc_feature_names = [
    'sex',
    'allergies',
    'chief_complaint',
    'major_surgical_procedures',
    'history_of_present_illness',
    'past_medical_history',
    'physical_exam',
    'pertinent_results',
]
clean_bhc_test_input = extract_clean_inputs(
    test_combined_discharge,
    features_to_include=rerun_bhc_feature_names,
)

rerun_di_feature_names = [
    'medication_on_admission',
    'discharge_medications',
    'discharge_disposition',
    'discharge_diagnosis',
    'discharge_condition',
]
clean_di_test_input = extract_clean_inputs(
    test_combined_discharge,
    features_to_include=rerun_di_feature_names,
)

100%|██████████| 14702/14702 [00:09<00:00, 1605.58it/s]
100%|██████████| 14702/14702 [00:00<00:00, 16115.22it/s]


In [9]:
# Task prompts: identical wording apart from the section the model must write.
task_prompt_template = "You are a medical assistant. Your task is to write the {} corresponding to the following hospital discharge.\n\n"

system_prompt1 = task_prompt_template.format("brief hospital course")   # BHC task
system_prompt2 = task_prompt_template.format("discharge instructions")  # DI task

In [10]:
# Attach the extracted inputs to the test frame as new columns.
# NOTE(review): only the BHC input gets the system prompt and the cleaning step
# in this cell; the DI input is stored as extracted — confirm this is intended.
test_combined_discharge['input_of_bhc'] = clean_bhc_test_input
test_combined_discharge['input_of_di'] = clean_di_test_input

# Prepend the BHC system prompt, then run the cleaner over the full prompt text.
prompted_bhc_input = system_prompt1 + test_combined_discharge['input_of_bhc']
test_combined_discharge['input_of_bhc'] = prompted_bhc_input
test_combined_discharge['input_of_bhc'] = prompted_bhc_input.progress_apply(remove_unecessary_tokens)

100%|██████████| 14702/14702 [00:09<00:00, 1476.07it/s]


In [11]:
# Preview the test frame now that the input_of_bhc / input_of_di columns exist.
test_combined_discharge.head()

Unnamed: 0,hadm_id,text,discharge_instructions,brief_hospital_course,input_of_bhc,input_of_di
0,27988844,\nName: ___ Unit No: ___\n...,INSTRUCTIONS AFTER ORTHOPAEDIC SURGERY:\n\n- Y...,The patient presented to the emergency departm...,you are a medical assistant. your task is to w...,Medications on Admission: \nThe Preadmission M...
1,26381316,\nName: ___ Unit No: ___...,"Dear Ms. ___,\n\nYou were admitted to ___ for ...",___ yo f with h/o recently diagnosed metastati...,you are a medical assistant. your task is to w...,Medications on Admission: \nThe Preadmission M...
2,24947999,\nName: ___ Unit No: ___\n \nAdmi...,"Dear ___,\n\n___ were admitted to ___ on ___ w...","___ year old female with history of HTN, CVA, ...",you are a medical assistant. your task is to w...,Medications on Admission: \nThe Preadmission M...
3,27060146,\nName: ___ Unit No: ___\n \n...,"Dear ___,\n\n___ did you come to the hospital?...",BRIEF SUMMARY:\n==============\n___ year old w...,you are a medical assistant. your task is to w...,Medications on Admission: \nThe Preadmission M...
4,28058085,\nName: ___ Unit No: ___\n \n...,"Dear Ms. ___, \n\nIt was a pleasure taking car...",Ms. ___ is an ___ year old woman with history ...,you are a medical assistant. your task is to w...,Medications on Admission: \nThe Preadmission M...


In [None]:
# NOTE(review): this cell was never executed (In [None]) and repeats the BHC
# extraction already performed on the test frame above — consider removing it.
unexecuted_bhc_feature_names = [
    'sex',
    'allergies',
    'chief_complaint',
    'major_surgical_procedures',
    'history_of_present_illness',
    'past_medical_history',
    'physical_exam',
    'pertinent_results',
]
clean_bhc_test_input = extract_clean_inputs(
    test_combined_discharge,
    features_to_include=unexecuted_bhc_feature_names,
)

In [12]:
# Sections that can appear in a BHC prompt. Each one is extracted on its own,
# cleaned, and token-counted; the results are stored as two columns per section:
# "<section>" (cleaned text) and "<section>_tokens" (token count).
bhc_sections = ['sex', 'allergies', 'chief_complaint', 'major_surgical_procedures', 'history_of_present_illness', 'past_medical_history', 'physical_exam', 'pertinent_results']

for section_name in bhc_sections:
    print(section_name)
    token_column = section_name + "_tokens"

    # Store the raw extraction in the frame first, then clean that column.
    test_combined_discharge[section_name] = extract_clean_inputs(
        test_combined_discharge,
        features_to_include=[section_name],
    )
    cleaned_section = test_combined_discharge[section_name].progress_apply(remove_unecessary_tokens)
    test_combined_discharge[section_name] = cleaned_section

    # Token count of the cleaned section text.
    test_combined_discharge[token_column] = cleaned_section.progress_apply(get_token_count)

print("finished")

sex


100%|██████████| 14702/14702 [00:00<00:00, 44385.64it/s]
100%|██████████| 14702/14702 [00:00<00:00, 23795.43it/s]
100%|██████████| 14702/14702 [00:00<00:00, 18452.71it/s]


allergies


100%|██████████| 14702/14702 [00:00<00:00, 53487.26it/s]
100%|██████████| 14702/14702 [00:00<00:00, 21889.23it/s]
100%|██████████| 14702/14702 [00:01<00:00, 11257.81it/s]


chief_complaint


100%|██████████| 14702/14702 [00:00<00:00, 44019.58it/s]
100%|██████████| 14702/14702 [00:00<00:00, 22080.86it/s]
100%|██████████| 14702/14702 [00:01<00:00, 12496.20it/s]


major_surgical_procedures


100%|██████████| 14702/14702 [00:00<00:00, 50740.99it/s]
100%|██████████| 14702/14702 [00:00<00:00, 19964.17it/s]
100%|██████████| 14702/14702 [00:01<00:00, 9720.22it/s]


history_of_present_illness


100%|██████████| 14702/14702 [00:02<00:00, 5003.24it/s]
100%|██████████| 14702/14702 [00:04<00:00, 3407.76it/s]
100%|██████████| 14702/14702 [00:14<00:00, 1044.76it/s]


past_medical_history


100%|██████████| 14702/14702 [00:00<00:00, 15185.32it/s]
100%|██████████| 14702/14702 [00:01<00:00, 10530.39it/s]
100%|██████████| 14702/14702 [00:04<00:00, 3388.44it/s]


physical_exam


100%|██████████| 14702/14702 [00:01<00:00, 8617.82it/s]
100%|██████████| 14702/14702 [00:02<00:00, 6681.52it/s]
100%|██████████| 14702/14702 [00:07<00:00, 1978.07it/s]


pertinent_results


100%|██████████| 14702/14702 [00:02<00:00, 4984.98it/s]
100%|██████████| 14702/14702 [00:02<00:00, 5155.32it/s]
100%|██████████| 14702/14702 [00:12<00:00, 1186.86it/s]

finished





In [13]:
# Preview the test frame with the per-section text and "<section>_tokens" columns.
test_combined_discharge.head()

Unnamed: 0,hadm_id,text,discharge_instructions,brief_hospital_course,input_of_bhc,input_of_di,sex,sex_tokens,allergies,allergies_tokens,...,major_surgical_procedures,major_surgical_procedures_tokens,history_of_present_illness,history_of_present_illness_tokens,past_medical_history,past_medical_history_tokens,physical_exam,physical_exam_tokens,pertinent_results,pertinent_results_tokens
0,27988844,\nName: ___ Unit No: ___\n...,INSTRUCTIONS AFTER ORTHOPAEDIC SURGERY:\n\n- Y...,The patient presented to the emergency departm...,you are a medical assistant. your task is to w...,Medications on Admission: \nThe Preadmission M...,sex: \nF\n,8,\nallergies: \nomeprazole / iodine and iodide ...,33,...,\n\nmajor surgical or invasive procedure: \nst...,21,\n\nhistory of present illness: \nREASON FOR C...,104,\n\npast medical history: \n- GERD \n - hyper...,65,\n\nphysical exam: \ngeneral: well-appearing f...,25,\n\npertinent results: \n,10
1,26381316,\nName: ___ Unit No: ___...,"Dear Ms. ___,\n\nYou were admitted to ___ for ...",___ yo f with h/o recently diagnosed metastati...,you are a medical assistant. your task is to w...,Medications on Admission: \nThe Preadmission M...,sex: \nF\n,8,\nallergies: \nsulfa (sulfonamide antibiotics)...,32,...,\n\nmajor surgical or invasive procedure: \nnone,15,\n\nhistory of present illness: \nthis is a _ ...,572,\n\npast medical history: \nPMH: \n# high grad...,131,\n\nphysical exam: \nadmission physical exam: ...,564,\n\npertinent results: \nADMIT LABS:\n\n_ 04:3...,326
2,24947999,\nName: ___ Unit No: ___\n \nAdmi...,"Dear ___,\n\n___ were admitted to ___ on ___ w...","___ year old female with history of HTN, CVA, ...",you are a medical assistant. your task is to w...,Medications on Admission: \nThe Preadmission M...,sex: \nF\n,8,\nallergies: \nno known allergies / adverse dr...,22,...,\n\nmajor surgical or invasive procedure: \n_ ...,23,\n\nhistory of present illness: \n_ year old f...,1032,\n\npast medical history: \n- hypertension \n...,119,\n\nphysical exam: \nADMISSION PHYSICAL EXAM:\...,606,\n\npertinent results: \n ADMISSION LABS \n_ 0...,450
3,27060146,\nName: ___ Unit No: ___\n \n...,"Dear ___,\n\n___ did you come to the hospital?...",BRIEF SUMMARY:\n==============\n___ year old w...,you are a medical assistant. your task is to w...,Medications on Admission: \nThe Preadmission M...,sex: \nF\n,8,\nallergies: \nno known allergies / adverse dr...,22,...,\n\nmajor surgical or invasive procedure: \n_:...,60,\n\nhistory of present illness: \nMs. _ is an ...,374,\n\npast medical history: \n- hypertension \n...,253,\n\nphysical exam: \nADMISSION EXAM:\n\n\n vit...,439,\n\npertinent results: \nADMISSION LABS:\n\n\n...,451
4,28058085,\nName: ___ Unit No: ___\n \n...,"Dear Ms. ___, \n\nIt was a pleasure taking car...",Ms. ___ is an ___ year old woman with history ...,you are a medical assistant. your task is to w...,Medications on Admission: \nThe Preadmission M...,sex: \nF\n,8,\nallergies: \nno known allergies / adverse dr...,22,...,\n\nmajor surgical or invasive procedure: \nnone,15,\n\nhistory of present illness: \n_ y/o female...,433,\n\npast medical history: \n- hypertension \n...,253,\n\nphysical exam: \n\n\nADMISSION PHYSICAL EX...,366,\n\npertinent results: \n\n\nADMISSION LABS\n\...,789


In [31]:
# Candidate section subsets for the BHC prompt, listed from the most inclusive
# to the least inclusive.
_core_sections = ['sex', 'allergies', 'chief_complaint', 'major_surgical_procedures']
_hpi_section = ['history_of_present_illness']

# Combinations of the three optional sections, in the order they are tried.
_optional_tails = [
    ['physical_exam', 'pertinent_results', 'past_medical_history'],
    ['physical_exam', 'pertinent_results'],
    ['physical_exam', 'past_medical_history'],
    ['pertinent_results', 'past_medical_history'],
    ['physical_exam'],
    ['pertinent_results'],
    ['past_medical_history'],
]

select_strategy = (
    # core + history of present illness + optional sections
    [_core_sections + _hpi_section + tail for tail in _optional_tails]
    # core + history of present illness only
    + [_core_sections + _hpi_section]
    # same, without the surgical procedures section
    + [_core_sections[:3] + _hpi_section]
    # core + optional sections, without history of present illness
    + [_core_sections + tail for tail in _optional_tails]
)

In [41]:
# Re-select the BHC sections row by row so the prompt stays within the token
# budget: the strategies are tried in order and the first one whose summed
# per-section token counts fall below 2048 is used. If none fits, the last
# (smallest) strategy is used and a message is printed.
#
# NOTE(review): the budget sums per-section counts only; the system prompt that
# is prepended afterwards is not included — confirm whether the limit should
# account for it.
test_combined_discharge['input_of_bhc_new'] = test_combined_discharge['input_of_bhc']

for index, row in test_combined_discharge.iterrows():
    for select in select_strategy:
        total_tokens = sum(row[section + "_tokens"] for section in select)
        if total_tokens < 2048:
            final_select = select
            break
    else:
        # for/else: reached only when no strategy fit (the loop never hit `break`).
        # The previous check (`select in select_strategy[-1]`) compared a list
        # against the strings of the last strategy and was therefore never true,
        # leaving `final_select` stale from an earlier row.
        final_select = select_strategy[-1]
        print("no suitable strategy found")
    # `index` is the index *label* yielded by iterrows(), so look the row up with .loc.
    test_combined_discharge.at[index, 'input_of_bhc_new'] = extract_clean_inputs(test_combined_discharge.loc[index], features_to_include=final_select)
test_combined_discharge['input_of_bhc_new'] = system_prompt1 + test_combined_discharge['input_of_bhc_new']
test_combined_discharge['input_of_bhc_new'] = test_combined_discharge['input_of_bhc_new'].progress_apply(remove_unecessary_tokens)
test_combined_discharge['input_of_bhc_new_tokens'] = test_combined_discharge['input_of_bhc_new'].progress_apply(get_token_count)
# check how many rows where its input_of_bhc_new_tokens is greater than 2048
print(len(test_combined_discharge[test_combined_discharge['input_of_bhc_new_tokens'] > 2048]))
   
    
            
        


100%|██████████| 14702/14702 [00:08<00:00, 1790.04it/s]
100%|██████████| 14702/14702 [00:31<00:00, 471.60it/s]

108





In [42]:
# Preview the test frame including input_of_bhc_new and its token count.
test_combined_discharge.head()

Unnamed: 0,hadm_id,text,discharge_instructions,brief_hospital_course,input_of_bhc,input_of_di,sex,sex_tokens,allergies,allergies_tokens,...,history_of_present_illness,history_of_present_illness_tokens,past_medical_history,past_medical_history_tokens,physical_exam,physical_exam_tokens,pertinent_results,pertinent_results_tokens,input_of_bhc_new,input_of_bhc_new_tokens
0,27988844,\nName: ___ Unit No: ___\n...,INSTRUCTIONS AFTER ORTHOPAEDIC SURGERY:\n\n- Y...,The patient presented to the emergency departm...,you are a medical assistant. your task is to w...,Medications on Admission: \nThe Preadmission M...,sex: \nF\n,8,\nallergies: \nomeprazole / iodine and iodide ...,33,...,\n\nhistory of present illness: \nREASON FOR C...,104,\n\npast medical history: \n- GERD \n - hyper...,65,\n\nphysical exam: \ngeneral: well-appearing f...,25,\n\npertinent results: \n,10,you are a medical assistant. your task is to w...,291
1,26381316,\nName: ___ Unit No: ___...,"Dear Ms. ___,\n\nYou were admitted to ___ for ...",___ yo f with h/o recently diagnosed metastati...,you are a medical assistant. your task is to w...,Medications on Admission: \nThe Preadmission M...,sex: \nF\n,8,\nallergies: \nsulfa (sulfonamide antibiotics)...,32,...,\n\nhistory of present illness: \nthis is a _ ...,572,\n\npast medical history: \nPMH: \n# high grad...,131,\n\nphysical exam: \nadmission physical exam: ...,564,\n\npertinent results: \nADMIT LABS:\n\n_ 04:3...,326,you are a medical assistant. your task is to w...,1680
2,24947999,\nName: ___ Unit No: ___\n \nAdmi...,"Dear ___,\n\n___ were admitted to ___ on ___ w...","___ year old female with history of HTN, CVA, ...",you are a medical assistant. your task is to w...,Medications on Admission: \nThe Preadmission M...,sex: \nF\n,8,\nallergies: \nno known allergies / adverse dr...,22,...,\n\nhistory of present illness: \n_ year old f...,1032,\n\npast medical history: \n- hypertension \n...,119,\n\nphysical exam: \nADMISSION PHYSICAL EXAM:\...,606,\n\npertinent results: \n ADMISSION LABS \n_ 0...,450,you are a medical assistant. your task is to w...,1842
3,27060146,\nName: ___ Unit No: ___\n \n...,"Dear ___,\n\n___ did you come to the hospital?...",BRIEF SUMMARY:\n==============\n___ year old w...,you are a medical assistant. your task is to w...,Medications on Admission: \nThe Preadmission M...,sex: \nF\n,8,\nallergies: \nno known allergies / adverse dr...,22,...,\n\nhistory of present illness: \nMs. _ is an ...,374,\n\npast medical history: \n- hypertension \n...,253,\n\nphysical exam: \nADMISSION EXAM:\n\n\n vit...,439,\n\npertinent results: \nADMISSION LABS:\n\n\n...,451,you are a medical assistant. your task is to w...,1645
4,28058085,\nName: ___ Unit No: ___\n \n...,"Dear Ms. ___, \n\nIt was a pleasure taking car...",Ms. ___ is an ___ year old woman with history ...,you are a medical assistant. your task is to w...,Medications on Admission: \nThe Preadmission M...,sex: \nF\n,8,\nallergies: \nno known allergies / adverse dr...,22,...,\n\nhistory of present illness: \n_ y/o female...,433,\n\npast medical history: \n- hypertension \n...,253,\n\nphysical exam: \n\n\nADMISSION PHYSICAL EX...,366,\n\npertinent results: \n\n\nADMISSION LABS\n\...,789,you are a medical assistant. your task is to w...,1913
