In [None]:
import os
import sys
# Put the local medgpt checkout first on sys.path so `import medgpt` resolves to it
sys.path.insert(0, "/data/zeljko/projects/medgpt/")

# Point both Hugging Face cache locations at the same directory.
# NOTE(review): these are set before `datasets` / `transformers` are imported further down;
# presumably the libraries read them at import time - confirm before reordering cells.
os.environ['HF_DATASETS_CACHE'] = "/data/zeljko/.cache/huggingface"
os.environ['TRANSFORMERS_CACHE'] = "/data/zeljko/.cache/huggingface"

%load_ext autoreload
%autoreload 2

In [None]:
import pickle
import pandas as pd
import random
import re
from medgpt.config import Config

In [None]:
# Build the project Config from the MIMIC YAML file; `config.path.*` supplies every file location used below.
# NOTE(review): this absolute path lives under /home/ubuntu while the sys.path entry added in the
# first cell lives under /data/zeljko - confirm both refer to the same checkout.
config = Config(yaml_path='/home/ubuntu/projects/medgpt/configs/mimic.yaml')

In [None]:
# Merge the ten annotated-document shards (clean_part_0 .. clean_part_9) into one dict.
# The keys are later looked up with str(row.row_id).
data = {}
for i in range(10):
    shard_path = f"{config.path.dataset.annotated_documents}/clean_part_{i}.pickle"
    # Context manager: the file handle is closed as soon as the shard has been read
    with open(shard_path, 'rb') as shard_file:
        # NOTE: unpickling can execute arbitrary code - only load shards produced by this project
        data.update(pickle.load(shard_file))
    print(i)

In [None]:
# Load the raw notes table; the cells below read its row_id, subject_id and text columns
df = pd.read_csv(config.path.raw_data)
# Bare expression so the notebook renders the frame as this cell's output
df

In [None]:
def clean_text(text):
    """Collapse long runs of one repeated character down to three.

    Any non-digit character that appears four or more times in a row is
    reduced to exactly three occurrences. Runs of digits are left untouched.
    """
    run_of_four_or_more = "([^0-9])\\1{3,}"
    three_copies = "\\1\\1\\1"
    return re.sub(run_of_four_or_more, three_copies, text)

In [None]:
# Replace strings in text with CUIs
cuis = set()        # every CUI that was written into some text
not_exists = set()  # row ids with no (or empty) annotations; their text is left unchanged
for i, (ind, row) in enumerate(df.iterrows(), start=1):
    doc_id = str(row.row_id)  # annotations are looked up by the stringified row_id
    if doc_id in data and data[doc_id]:
        new_text = row.text
        # Walk the entities back-to-front so replacing one span does not shift the
        # offsets of the spans still to be processed (assumes entities are ordered by start)
        for item in reversed(data[doc_id]['entities']):
            start, end, cui = item['start'], item['end'], item['cui']
            cuis.add(cui)
            # Insert a separating space only when the entity is glued to a preceding
            # character. `start > 0` stops index -1 from wrapping round to the last
            # character of the text when the entity opens the document.
            needs_space = start > 0 and new_text[start] != ' ' and new_text[start - 1] != ' '
            separator = ' ' if needs_space else ''
            new_text = f"{new_text[:start]}{separator}{cui}{new_text[end:]}"
        # Clean after replacements are done
        new_text = clean_text(new_text)
        # Write back by index label and column name (the column the text was read from)
        # instead of relying on positional row/column numbers
        df.at[ind, 'text'] = new_text
    else:
        not_exists.add(doc_id)
    if i % 100000 == 0:
        print(i)

In [None]:
# Number of distinct CUIs inserted, and number of rows that had no (or empty) annotations
len(cuis), len(not_exists)

In [None]:
# Spot-check one note (row_id 1245792) to eyeball its text after the CUI replacement
print(df[df.row_id == 1245792].text.values[0])

In [None]:
# Persist the set of CUIs that were written into the text.
# The context manager closes the file, so the pickle is fully flushed to disk.
with open(config.path.dataset.cuis_in_text, 'wb') as cuis_file:
    pickle.dump(cuis, cuis_file)

In [None]:
# Save the full table (all original columns) with the CUI-annotated text.
# NOTE(review): unlike the train/test exports below, this call does not pass index=False,
# so the DataFrame index is written as an extra first column - confirm that is intended.
df.to_csv(config.path.dataset.text_with_codes)

In [None]:
# Remove all columns except text, subject_id and row_id, then add a column called source with the value 'MIMIC-III-text'

In [None]:
# Drop the date/time/category columns and tag every row with the corpus it came from
df = df.drop(columns=['chartdate', 'charttime', 'category']).assign(source='MIMIC-III-text')

In [None]:
# Split to train test based on subject ID
# Subjects (not individual notes) are partitioned: 95% go to train, the rest to test.
SPLIT_SEED = 42  # fixed seed so the split is identical after a kernel restart
# Sort first: a seeded shuffle is only reproducible when it starts from a stable order
all_subject_ids = sorted(set(df.subject_id.values))
shuffled_subject_ids = list(all_subject_ids)
# A private Random instance keeps the global `random` state untouched
random.Random(SPLIT_SEED).shuffle(shuffled_subject_ids)
split = int(0.95 * len(all_subject_ids))
train_subject_ids = set(shuffled_subject_ids[:split])
test_subject_ids = set(shuffled_subject_ids[split:])
print(len(train_subject_ids), len(test_subject_ids), len(all_subject_ids))
# The two sets must cover every subject exactly once
assert (len(train_subject_ids) + len(test_subject_ids)) == len(all_subject_ids)

In [None]:
# Boolean row selectors: a note is assigned to the split that contains its subject id
train_mask = [subject in train_subject_ids for subject in df.subject_id.values]
test_mask = [subject in test_subject_ids for subject in df.subject_id.values]

train_df = df[train_mask]
test_df = df[test_mask]

# Sanity check: the two frames together account for every row of df
n_train, n_test, n_total = len(train_df), len(test_df), len(df)
print(n_train, n_test, n_total)
assert n_train + n_test == n_total

In [None]:
# Write the training split to CSV, without the DataFrame index column
train_df.to_csv(config.path.dataset.train_df, index=False)

In [None]:
# Write the test split to CSV, without the DataFrame index column
test_df.to_csv(config.path.dataset.test_df, index=False)

In [None]:
# Concatenate all notes of each split into one space-separated string
# (used in the next cell to report total character counts)
train_text = " ".join(train_df.text)
test_text = " ".join(test_df.text)

In [None]:
# Total number of characters in the joined train and test text
len(train_text), len(test_text)

# Prepare for GPT training

This section is only needed if we want to train a pure LLM directly on all of the data.

In [None]:
from transformers import GPT2Config, GPT2LMHeadModel, Trainer, TrainingArguments, AutoTokenizer, pipeline, GPT2Tokenizer
from medgpt.tokenizers.simple_map_tokenizer import SimpleMapTokenizer
from medgpt.models.utils import add_cuis_to_model_and_tokenizer
from medgpt.tokenizers.utils import pack_text
import re
import pickle
from medcat.cat import CAT
import pandas as pd
import datasets
import random
import math

In [None]:
# Number of worker processes, passed as num_proc to dataset.map below
NUM_PROC = 8

In [None]:
# Load the tokenizer - this tokenizer has to have the codes in it
# NOTE(review): nothing in this notebook adds the CUI codes to the tokenizer, so it was
# presumably prepared elsewhere and saved at config.path.tokenizer.self - confirm.
gpt_tokenizer = AutoTokenizer.from_pretrained(config.path.tokenizer.self)

In [None]:
# Each csv should have two columns <source>: MIMIC-text, MIMIC-timelines, Wikipedia, ... and <text>: text
# Load the train/test CSVs written above as the 'train' and 'test' splits.
# NOTE(review): the CSVs saved above also carry subject_id and row_id, and their source value is 'MIMIC-III-text'.
dataset = datasets.load_dataset('csv', data_files={'train': [config.path.dataset.train_df],
                                                   'test': [config.path.dataset.test_df]})
# Bare expression so the notebook shows the dataset summary
dataset

In [None]:
# Tokenize the text column in batches using NUM_PROC worker processes.
# Only the raw "text" column is removed; the other columns are kept next to the tokenizer outputs.
encoded_dataset = dataset.map(lambda examples: gpt_tokenizer(examples['text']), 
                              batched=True, 
                              num_proc=NUM_PROC, 
                              remove_columns=["text"])

In [None]:
# Check one example: decode the token ids of train record 7 back into text
print(gpt_tokenizer.decode(encoded_dataset['train'][7]["input_ids"]))

In [None]:
# Persist the tokenized dataset to disk at the configured location
encoded_dataset.save_to_disk(config.path.dataset.text_with_codes_prepared)

In [None]:
# Reload the dataset that was just saved, to verify the round trip through disk
encoded_dataset_loaded = datasets.load_from_disk(config.path.dataset.text_with_codes_prepared)
# Bare expression so the notebook shows the reloaded dataset summary
encoded_dataset_loaded

In [None]:
# Display the reloaded dataset summary again (same expression as the end of the previous cell)
encoded_dataset_loaded

In [None]:
# Decode train record 7 from the reloaded dataset; it should read the same as the example decoded before saving
print(gpt_tokenizer.decode(encoded_dataset_loaded['train'][7]["input_ids"]))