# Pre-processing Notebook

Kolton Hauck
BMI6114

Pre-processing Notebook
<hr>

This notebook outlines the steps required to pre-process the Synthea data for use.

Synthea writes an `output/` folder containing all of its tables as CSV files. This notebook converts those tables into a single JSON file with the following (simplified) structure:

```JSON
[
    {
        "patient_id": "",
        "label": "",
        "encounters": [
            {
                "encounter": {},
                "conditions": [],
                "careplans": [],
                "procedures": []
            }
        ]
    }
]
```

## Imports

In [None]:
import json
import os
import pickle

import pandas as pd
import torch
from tqdm import tqdm
from transformers import AutoModel, AutoTokenizer

from utils import synthea_table_information, remove_text_inside_parentheses

## Pre-processing

Take the raw .csv files for each table of interest, then clean and filter them.


### Generate focused JSON files for the tables used, to simplify later processing


In [None]:
# Load the raw Synthea CSV exports. The files are read without a header row,
# so column names come from `synthea_table_information`.
synthea_path = "output/csv/"

#### patients ####
patients_df = pd.read_csv(synthea_path + "patients.csv", header=None)
patients_df.columns = synthea_table_information["patients.csv"]["columns"]

#### encounters ####
encounters_df = pd.read_csv(synthea_path + "encounters.csv", header=None)
encounters_df.columns = synthea_table_information["encounters.csv"]["columns"]
# Parse both timestamps as UTC, then order the rows chronologically by start time.
for _time_col in ("Start", "Stop"):
    encounters_df[_time_col] = pd.to_datetime(encounters_df[_time_col], utc=True)
encounters_df.sort_values(by="Start", inplace=True)
# Missing values become empty strings before the text columns are cleaned.
encounters_df.fillna("", inplace=True)
for _text_col in ("Description", "ReasonDescription"):
    # presumably strips parenthesised text from the description; see utils
    encounters_df[_text_col] = encounters_df[_text_col].apply(remove_text_inside_parentheses)

#### careplans ####
careplans_df = pd.read_csv(synthea_path + "careplans.csv", header=None, dtype=str)
careplans_df.columns = synthea_table_information["careplans.csv"]["columns"]
# Only the encounter link and the two description columns are needed downstream.
careplans_df.drop(columns=["Id", "Start", "Stop", "Patient", "Code", "ReasonCode"], inplace=True)
careplans_df.fillna("", inplace=True)
for _text_col in ("Description", "ReasonDescription"):
    careplans_df[_text_col] = careplans_df[_text_col].apply(remove_text_inside_parentheses)

#### conditions ####
conditions_df = pd.read_csv(synthea_path + "conditions.csv", header=None, dtype=str)
conditions_df.columns = synthea_table_information["conditions.csv"]["columns"]
# Note: unlike the other tables, missing values are not filled here.
conditions_df["Description"] = conditions_df["Description"].apply(remove_text_inside_parentheses)
conditions_df.drop(columns=["Start", "Stop", "Patient", "Code"], inplace=True)

#### procedures ####
procedures_df = pd.read_csv(synthea_path + "procedures.csv", header=None, dtype=str)
procedures_df.columns = synthea_table_information["procedures.csv"]["columns"]
for _time_col in ("Start", "Stop"):
    procedures_df[_time_col] = pd.to_datetime(procedures_df[_time_col], utc=True)
procedures_df.sort_values(by="Start", inplace=True)
procedures_df.fillna("", inplace=True)
for _text_col in ("Description", "ReasonDescription"):
    procedures_df[_text_col] = procedures_df[_text_col].apply(remove_text_inside_parentheses)
# The remaining procedure columns (Start, Stop, Patient, Code, ReasonCode, Base_Cost)
# are kept for now; they are intended to be dropped at a later stage.

In [None]:
# Write one JSON file per table, keyed by encounter id, so the dataset-building
# step can look records up by encounter without scanning the DataFrames.

# Make sure the target folder exists before any file is opened for writing.
os.makedirs("data_processed/tmp1", exist_ok=True)


def _records_by_encounter(table_df):
    """Map each encounter id to the list of row records belonging to it.

    Every column except "Encounter" is kept in each record. The groups are
    iterated explicitly instead of using `groupby(...).apply(...)`, because
    newer pandas releases deprecate handing the grouping column to `apply`.
    """
    value_columns = [column for column in table_df.columns if column != "Encounter"]
    return {
        encounter_id: group[value_columns].to_dict(orient="records")
        for encounter_id, group in table_df.groupby("Encounter")
    }


# CAREPLANS
grouped_tmp = _records_by_encounter(careplans_df)
with open("data_processed/tmp1/careplans_1.json", "w") as j_file:
    json.dump(grouped_tmp, j_file)

# CONDITIONS
grouped_tmp = _records_by_encounter(conditions_df)
with open("data_processed/tmp1/conditions_1.json", "w") as j_file:
    json.dump(grouped_tmp, j_file)

# PROCEDURES
# Only the two description columns are exported; rows are visited in the
# DataFrame's current order, so each list keeps that order.
grouped_tmp = {}
procedure_rows = zip(
    procedures_df["Encounter"],
    procedures_df["Description"],
    procedures_df["ReasonDescription"],
)
for curr_id, description, reason_description in tqdm(procedure_rows, total=procedures_df.shape[0]):
    grouped_tmp.setdefault(curr_id, []).append({
        "Description": description,
        "ReasonDescription": reason_description
    })

with open("data_processed/tmp1/procedures_1.json", "w") as j_file:
    json.dump(grouped_tmp, j_file)

# ENCOUNTERS
# One record per encounter id; if an id repeats, the last row wins.
grouped_tmp = {}
encounter_rows = zip(
    encounters_df["Id"],
    encounters_df["Description"],
    encounters_df["ReasonDescription"],
)
for encounter_id, description, reason_description in tqdm(encounter_rows, total=encounters_df.shape[0]):
    grouped_tmp[encounter_id] = {
        "Description": description,
        "ReasonDescription": reason_description
    }

with open("data_processed/tmp1/encounters_1.json", "w") as j_file:
    json.dump(grouped_tmp, j_file)

### Generate Dataset

In [3]:
# Reload the per-table JSON lookups (keyed by encounter id) written earlier.
with open("data_processed/tmp1/careplans_1.json", "r") as careplans_file:
    careplans_json = json.loads(careplans_file.read())
print("loaded careplans")

with open("data_processed/tmp1/conditions_1.json", "r") as conditions_file:
    conditions_json = json.loads(conditions_file.read())
print("loaded conditions")

with open("data_processed/tmp1/encounters_1.json", "r") as encounters_file:
    encounters_json = json.loads(encounters_file.read())
print("loaded encounters")

with open("data_processed/tmp1/procedures_1.json", "r") as procedures_file:
    procedures_json = json.loads(procedures_file.read())
print("loaded procedures")


loaded careplans
loaded conditions
loaded encounters
loaded procedures


In [None]:
# Load the three tables the dataset-building cells read directly.
# Column names come from `synthea_table_information` because the CSV files
# are read without a header row.
synthea_path = "output/csv/"

#### patients ####
patients_df = pd.read_csv(synthea_path + "patients.csv", header=None)
patients_df.columns = synthea_table_information["patients.csv"]["columns"]
print("loaded patients")

#### encounters ####
encounters_df = pd.read_csv(synthea_path + "encounters.csv", header=None)
encounters_df.columns = synthea_table_information["encounters.csv"]["columns"]
# Parse timestamps as UTC and order rows chronologically by start time.
for _time_col in ("Start", "Stop"):
    encounters_df[_time_col] = pd.to_datetime(encounters_df[_time_col], utc=True)
encounters_df.sort_values(by="Start", inplace=True)
encounters_df.fillna("", inplace=True)
for _text_col in ("Description", "ReasonDescription"):
    # presumably strips parenthesised text from the description; see utils
    encounters_df[_text_col] = encounters_df[_text_col].apply(remove_text_inside_parentheses)
print("loaded encounters")

#### procedures ####
procedures_df = pd.read_csv(synthea_path + "procedures.csv", header=None, dtype=str)
procedures_df.columns = synthea_table_information["procedures.csv"]["columns"]
for _time_col in ("Start", "Stop"):
    procedures_df[_time_col] = pd.to_datetime(procedures_df[_time_col], utc=True)
procedures_df.sort_values(by="Start", inplace=True)
procedures_df.fillna("", inplace=True)
for _text_col in ("Description", "ReasonDescription"):
    procedures_df[_text_col] = procedures_df[_text_col].apply(remove_text_inside_parentheses)
print("loaded procedures")


In [5]:
def load_enc_dict(encounter_id):
    """
    Collect everything recorded for one encounter into a single dict.

    Given a single encounter id, the result holds:
    - "encounter":  the entry stored for this id in `encounters_json`
                    (raises KeyError if the id is not present)
    - "conditions": the list stored for this id in `conditions_json`
    - "careplans":  the list stored for this id in `careplans_json`
    - "procedures": the list stored for this id in `procedures_json`

    When an id has no conditions, careplans or procedures, a one-element
    list with empty-string fields is used instead, so every section is
    always a non-empty list.
    """
    return {
        # exactly one entry per encounter
        "encounter": encounters_json[encounter_id],
        # zero or more per encounter; placeholder carries only a description
        "conditions": conditions_json.get(encounter_id, [{"Description": ""}]),
        # zero or more per encounter; placeholder carries both text fields
        "careplans": careplans_json.get(
            encounter_id, [{"Description": "", "ReasonDescription": ""}]
        ),
        "procedures": procedures_json.get(
            encounter_id, [{"Description": "", "ReasonDescription": ""}]
        ),
    }

In [6]:
def get_patient_dict(patient_id, label, date_poi):
    """
    Build the output record for a single patient.

    Parameters
    ----------
    patient_id : id matched against `patients_df.Id` and `encounters_df.Patient`.
    label : stored unchanged under the "label" key.
    date_poi : cutoff compared against the encounter "Start" column, or None
        to keep every encounter of the patient.

    Returns
    -------
    dict with the keys "patient_id", "label", "lat", "lon" and "encounters".
    "encounters" is a list with one `load_enc_dict` result per retained
    encounter, in the row order of `encounters_df`.

    Raises
    ------
    IndexError if `patients_df` has no row for `patient_id`.
    """
    patient_dict = patients_df[patients_df.Id == patient_id].iloc[0].to_dict()

    patient_encounters_df = encounters_df[encounters_df.Patient == patient_id]

    # Explicit None check: the cutoff is optional, and relying on the
    # truthiness of a timestamp object hides that intent.
    if date_poi is not None:
        # NOTE(review): `<=` keeps encounters whose Start equals date_poi. The
        # previous comment here said encounters "after/of poi" are dropped, and
        # the retained encounter's procedure list can itself contain the poi.
        # Confirm whether the poi encounter should be excluded.
        patient_encounters_df = patient_encounters_df[patient_encounters_df["Start"] <= date_poi]

    return {
        "patient_id": patient_dict["Id"],
        "label": label,
        "lat": patient_dict["Lat"],
        "lon": patient_dict["Lon"],
        "encounters": [
            load_enc_dict(encounter_id)
            for encounter_id in patient_encounters_df.Id.to_list()
        ],
    }

In [None]:
# v2
# 1. get all patient ids
# 2. get the patients with poi
#     2.1 retain only the encounters up to the date of poi
#     2.2 also get which patients are in class
# 3. iter over each patient
#     3.1 for each patient: get the 'dict' object

# Create the output folder up front, so a missing directory is reported
# before the long per-patient loop runs instead of after it.
os.makedirs("data_processed/json", exist_ok=True)

patient_ids = patients_df.Id.to_list()

# Procedure of interest: patients with at least one such procedure are in class.
poi = "Plain chest X-ray"
patients_in_class = procedures_df[procedures_df["Description"] == poi][["Patient", "Start"]].drop_duplicates()

# Precompute lookups once to avoid repeated DataFrame scans inside the loop.
patient_set = set(patients_in_class["Patient"])

# A patient can have the poi several times; use the latest occurrence as cutoff.
idx = patients_in_class.groupby("Patient")["Start"].idxmax()
patients_in_class_last_procedure = patients_in_class.loc[idx]
last_procedure_dict = patients_in_class_last_procedure.set_index("Patient")["Start"].to_dict()

patients_dicts = []

for patient_id in tqdm(patient_ids):
    label = patient_id in patient_set
    # None for patients without the poi: all of their encounters are kept.
    date_poi = last_procedure_dict.get(patient_id, None)

    patients_dicts.append(get_patient_dict(patient_id, label, date_poi))

# Nothing is written when there are no patients at all.
if patients_dicts:
    with open("data_processed/json/all.json", "w") as file:
        json.dump(patients_dicts, file)


### Generate Embeddings

In [None]:
def embed_descriptions(descriptions, model_directory, batch_size=32):
    """Embed a list of text descriptions with a pre-trained transformer model.

    Parameters
    ----------
    descriptions : list of str
        Texts to embed. Duplicates collapse to a single key in the result.
    model_directory : str
        Name or path passed to `AutoTokenizer.from_pretrained` and
        `AutoModel.from_pretrained`.
    batch_size : int, default 32
        Number of descriptions sent through the model per forward pass.

    Returns
    -------
    dict
        Maps each description to its embedding tensor (the model's
        `pooler_output` row). Tensors stay on the device the model ran on:
        CUDA when available, otherwise CPU. An empty input yields an empty
        dict without loading the model.
    """
    # Nothing to embed: skip model loading (tokenizing/concatenating nothing would fail).
    if len(descriptions) == 0:
        return {}

    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    tokenizer = AutoTokenizer.from_pretrained(model_directory)

    # Load pre-trained model (weights)
    model = AutoModel.from_pretrained(model_directory)
    model.to(device)
    model.eval()  # Put the model in "evaluation" mode, which turns off dropout
    print(f"Model loaded | Generating embeddings with batch size={batch_size}")

    # Tokenize everything once so all batches share the same padded length;
    # inputs longer than 64 tokens are truncated.
    inputs = tokenizer(descriptions, padding=True, truncation=True, return_tensors="pt", max_length=64)

    # Process in batches with tqdm for progress tracking
    embeddings = []
    for i in tqdm(range(0, len(descriptions), batch_size), desc="Generating Embeddings"):
        # Move only the current slice to the device, so device memory holds
        # one batch of inputs at a time instead of the whole tokenized corpus.
        batch = {k: v[i:i + batch_size].to(device) for k, v in inputs.items()}
        with torch.no_grad():
            outputs = model(**batch)

        # NOTE(review): uses the model's pooled output. Some sentence-embedding
        # checkpoints are meant to be mean-pooled over the last hidden state.
        # Confirm against the model card of the checkpoint in use.
        embeddings.append(outputs.pooler_output)

    # Concatenate batched embeddings into one (num_descriptions, hidden) tensor
    embeddings = torch.cat(embeddings, dim=0)

    return {text: embeddings[i] for i, text in enumerate(descriptions)}

In [None]:
# generate embeddings

# NOTE(review): the dataset cell above writes "data_processed/json/all.json",
# while this cell reads "data/all_v2.json". Confirm which file is intended,
# otherwise a fresh run may embed a stale dataset.
with open("data/all_v2.json","r") as j_file:
    data = json.load(j_file)

# Collect the set of all distinct descriptions / reason descriptions.
# The set is updated in place; rebuilding it with `|` for every encounter
# would copy the whole set each time.
texts = set()

for patient in tqdm(data):
    for encounter in patient["encounters"]:
        texts.add(encounter["encounter"]["Description"]) # enc description
        texts.add(encounter["encounter"]["ReasonDescription"]) # enc reasondescription

        # conditions carry a description only
        texts.update(item["Description"] for item in encounter["conditions"])

        # careplans and procedures carry both a description and a reason description
        for section in ("careplans", "procedures"):
            for item in encounter[section]:
                texts.add(item["Description"])
                texts.add(item["ReasonDescription"])

text2embeddings = embed_descriptions(list(texts), "FremyCompany/BioLORD-2023")

# Move every embedding to the CPU and give it its own storage before pickling.
# The per-text tensors are row slices of one large matrix, and serializing a
# view stores the whole backing storage; `.clone()` keeps each entry row-sized.
text2embeddings = {text: embedding.cpu().clone() for text, embedding in text2embeddings.items()}

with open('data/text2embeddings.pkl', 'wb') as f:
    pickle.dump(text2embeddings, f)