In [1]:
import sys
sys.path.append('../')

import regex as re
import pandas as pd
from src.utils.preprocessing import (load_data,
                                    save_data,
                                    build_combined_discharge,
                                    get_bhc_input,
                                    extract_clean_inputs,
                                    remove_unecessary_tokens)

from src.utils.token_count import get_token_list, get_token_count, plot_token_count
from src.utils.format_change import dataframe_to_jsonl
from tqdm import tqdm
tqdm.pandas()

%load_ext autoreload
%autoreload 2

In [2]:
data_path = '../data/'

In [9]:
train_discharge = load_data(data_path + 'train/discharge.csv.gz')
train_targets = load_data(data_path + 'train/discharge_target.csv.gz')
train_combined_discharge = build_combined_discharge(train_discharge, train_targets)

valid_discharge = load_data(data_path + 'valid/discharge.csv.gz')
valid_targets = load_data(data_path + 'valid/discharge_target.csv.gz') 
valid_combined_discharge = build_combined_discharge(valid_discharge, valid_targets)

print(len(train_combined_discharge))
print(len(valid_combined_discharge))

68785
14719


**Stage1: First cleaning by meaningful extraction**
i.e. keep and organize relevent sections without looking to much in details

**Stage2 :Removing some not-such-important sections**
- Remove `social_history` and `family_history`

**Stage3 cleaning of specific token groups**
- Replacing 2 or more `__` by only one `_`
- Replacing at least 2 `======` by `\n`

**Stage4 adding the key behind the BHC section of input_of_di**

**Stage5 adding the prompt to the input**

**Stage6 filtering out the samples longer than 2k**

**Stage7 loading into the jsonl files**

In [4]:
clean2_bhc_train_input = extract_clean_inputs(train_combined_discharge,
                        features_to_include=[
                                'sex',
                                'allergies',
                                'chief_complaint',
                                'major_surgical_procedures',
                                'history_of_present_illness',
                                'past_medical_history',
                                'physical_exam',
                                'pertinent_results',
                            ])

clean2_bhc_valid_input = extract_clean_inputs(valid_combined_discharge,
                        features_to_include=[
                                'sex',
                                'allergies',
                                'chief_complaint',
                                'major_surgical_procedures',
                                'history_of_present_illness',
                                'past_medical_history',
                                'physical_exam',
                                'pertinent_results',
                            ])

100%|██████████| 68785/68785 [00:43<00:00, 1582.76it/s]
100%|██████████| 14719/14719 [00:09<00:00, 1556.95it/s]


In [5]:
clean2_di_train_input = extract_clean_inputs(train_combined_discharge,
                        features_to_include=[
                                'medication_on_admission',
                                'discharge_medications',
                                'discharge_disposition',
                                'discharge_diagnosis',
                                'discharge_condition',
                            ])

clean2_di_valid_input = extract_clean_inputs(valid_combined_discharge,
                        features_to_include=[
                                'medication_on_admission',
                                'discharge_medications',
                                'discharge_disposition',
                                'discharge_diagnosis',
                                'discharge_condition',
                            ])

100%|██████████| 68785/68785 [00:04<00:00, 15702.80it/s]
100%|██████████| 14719/14719 [00:00<00:00, 15552.42it/s]


In [10]:
system_prompt1 = "You are a medical assistant. Your task is to write the brief hospital course corresponding to the following hospital discharge.\n\n"
system_prompt2 = "You are a medical assistant. Your task is to write the discharge instructions corresponding to the following hospital discharge.\n\n"

In [12]:
# add the clean2_bhc/di_train_input and clean2_bhc/di_valid_input to the original dataframes as the new columns
train_combined_discharge['input_of_bhc'] = clean2_bhc_train_input
valid_combined_discharge['input_of_bhc'] = clean2_bhc_valid_input
train_combined_discharge['input_of_di'] = clean2_di_train_input
valid_combined_discharge['input_of_di'] = clean2_di_valid_input

# remove the unecessary tokens from the input_of_bhc and input_of_di
train_combined_discharge['input_of_bhc'] = train_combined_discharge['input_of_bhc'].progress_apply(remove_unecessary_tokens)
valid_combined_discharge['input_of_bhc'] = valid_combined_discharge['input_of_bhc'].progress_apply(remove_unecessary_tokens)

# add some key words into the input_of_di
train_combined_discharge['input_of_di'] = "Brief Hospital Course:\n" + train_combined_discharge['brief_hospital_course'] + "\n\n" + clean2_di_train_input
valid_combined_discharge['input_of_di'] = "Brief Hospital Course:\n" + valid_combined_discharge['brief_hospital_course'] + "\n\n" + clean2_di_valid_input

# add the system prompt into the input_of_bhc and input_of_di
train_combined_discharge['input_of_bhc'] = system_prompt1 + train_combined_discharge['input_of_bhc']
valid_combined_discharge['input_of_bhc'] = system_prompt1 + valid_combined_discharge['input_of_bhc']
train_combined_discharge['input_of_di'] = system_prompt2 + train_combined_discharge['input_of_di']
valid_combined_discharge['input_of_di'] = system_prompt2 + valid_combined_discharge['input_of_di']

# count the number of tokens in train_combined_discharge
train_combined_discharge['bhc_token_count'] = train_combined_discharge['brief_hospital_course'].progress_apply(get_token_count)
train_combined_discharge['input_of_bhc_token_count'] = train_combined_discharge['input_of_bhc'].progress_apply(get_token_count)
train_combined_discharge['di_token_count'] = train_combined_discharge['discharge_instructions'].progress_apply(get_token_count)
train_combined_discharge['input_of_di_token_count'] = train_combined_discharge['input_of_di'].progress_apply(get_token_count)

# count the number of tokens in valid_combined_discharge
valid_combined_discharge['bhc_token_count'] = valid_combined_discharge['brief_hospital_course'].progress_apply(get_token_count)
valid_combined_discharge['input_of_bhc_token_count'] = valid_combined_discharge['input_of_bhc'].progress_apply(get_token_count)
valid_combined_discharge['di_token_count'] = valid_combined_discharge['discharge_instructions'].progress_apply(get_token_count)
valid_combined_discharge['input_of_di_token_count'] = valid_combined_discharge['input_of_di'].progress_apply(get_token_count)

100%|██████████| 68785/68785 [00:42<00:00, 1611.26it/s]
100%|██████████| 14719/14719 [00:09<00:00, 1608.74it/s]
100%|██████████| 68785/68785 [01:23<00:00, 820.95it/s] 
100%|██████████| 68785/68785 [02:58<00:00, 384.77it/s]
100%|██████████| 68785/68785 [00:51<00:00, 1341.69it/s]
100%|██████████| 68785/68785 [02:43<00:00, 421.78it/s]
100%|██████████| 14719/14719 [00:18<00:00, 779.19it/s]
100%|██████████| 14719/14719 [00:37<00:00, 388.48it/s]
100%|██████████| 14719/14719 [00:11<00:00, 1334.12it/s]
100%|██████████| 14719/14719 [00:34<00:00, 423.97it/s]


In [46]:
bhc_train_df = pd.DataFrame()
bhc_valid_df = pd.DataFrame()
di_train_df = pd.DataFrame()
di_valid_df = pd.DataFrame()

bhc_train_df = train_combined_discharge[train_combined_discharge['input_of_bhc_token_count'] + train_combined_discharge['bhc_token_count'] < 2048]
bhc_valid_df = valid_combined_discharge[valid_combined_discharge['input_of_bhc_token_count'] + valid_combined_discharge['bhc_token_count'] < 2048]
di_train_df = train_combined_discharge[train_combined_discharge['input_of_di_token_count'] + train_combined_discharge['di_token_count'] < 2048]
di_valid_df = valid_combined_discharge[valid_combined_discharge['input_of_di_token_count'] + valid_combined_discharge['di_token_count'] < 2048]

print('the percentage of the bhc train set remaining after filtering:', len(bhc_train_df)/len(train_combined_discharge))
print('the percentage of the bhc valid set remaining after filtering:', len(bhc_valid_df)/len(valid_combined_discharge))
print('the percentage of the di train set remaining after filtering:', len(di_train_df)/len(train_combined_discharge))
print('the percentage of the di valid set remaining after filtering:', len(di_valid_df)/len(valid_combined_discharge))

the percentage of the bhc train set remaining after filtering: 0.46239732499818276
the percentage of the bhc valid set remaining after filtering: 0.46463754331136625
the percentage of the di train set remaining after filtering: 0.7097041506142328
the percentage of the di valid set remaining after filtering: 0.714111012976425


In [47]:
BHC_train_dataset = data_path + 'train/BHC_train_dataset.jsonl'
DI_train_dataset = data_path + 'train/DI_train_dataset.jsonl'
BHC_valid_dataset = data_path + 'valid/BHC_valid_dataset.jsonl'
DI_valid_dataset = data_path + 'valid/DI_valid_dataset.jsonl'

dataframe_to_jsonl(bhc_train_df, attributes=['input_of_bhc', 'brief_hospital_course'], keys=['prompt', 'gold'], file_path=BHC_train_dataset)
dataframe_to_jsonl(di_train_df, attributes=['input_of_di', 'discharge_instructions'], keys=['prompt', 'gold'], file_path=DI_train_dataset)
dataframe_to_jsonl(bhc_valid_df, attributes=['input_of_bhc', 'brief_hospital_course'], keys=['prompt', 'gold'], file_path=BHC_valid_dataset)
dataframe_to_jsonl(di_valid_df, attributes=['input_of_di', 'discharge_instructions'], keys=['prompt', 'gold'], file_path=DI_valid_dataset)

31806it [00:03, 8488.72it/s]
48817it [00:05, 8954.77it/s]
6839it [00:00, 8849.23it/s]
10511it [00:01, 8800.50it/s]
