# Example of how to import data in the MEDS format

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime

from twinweaver import (
    convert_meds_to_dtc,
    DataManager,
    DataSplitterEvents,
    ConverterInstruction,
    Config,
)

  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


## Synthetic example

Here we provide synthetic example data as generated by Gemini.

In [2]:
# Code metadata: one (code, description) pair per code used in the synthetic events.
code_descriptions = [
    # Static measurements
    ("GENDER/Female", "Female sex"),
    ("GENDER/Male", "Male sex"),
    ("GENETIC/BRCA1_pos", "BRCA1 gene mutation"),
    # Visit and administrative codes
    ("ADMISSION/Outpatient", "Admission for an outpatient clinic visit"),
    ("ADMISSION/Inpatient", "Admission to the hospital for an inpatient stay"),
    ("DISCHARGE/Outpatient", "Discharge from an outpatient clinic visit"),
    ("DISCHARGE/Inpatient", "Discharge from an inpatient hospital stay"),
    ("NOTE/FollowUp", "Clinical note for a follow-up appointment"),
    # Diagnosis codes (ICD-10-CM)
    (
        "ICD10CM/C34.90",
        "Malignant neoplasm of unspecified part of unspecified bronchus or lung",
    ),
    ("ICD10CM/C61", "Malignant neoplasm of prostate"),
    # Symptom codes
    ("SYMPTOM/Cough", "Patient reports a persistent cough"),
    # Procedure codes (CPT)
    ("CPT/71250", "Procedure code for a CT scan of the thorax without contrast"),
    (
        "CPT/32408",
        "Procedure code for a core needle biopsy of the lung or mediastinum",
    ),
    ("CPT/55700", "Procedure code for a needle biopsy of the prostate"),
    ("CPT/55840", "Procedure code for a radical retropubic prostatectomy"),
    # Lab codes (LOINC)
    (
        "LOINC/6690-2",
        "Leukocytes [#/volume] in Blood by Automated count (White Blood Cell Count)",
    ),
    (
        "LOINC/2039-6",
        "Carcinoembryonic Ag [Mass/volume] in Serum or Plasma (CEA Tumor Marker)",
    ),
    ("LOINC/59261-8", "Comprehensive metabolic 2014 panel - Serum or Plasma"),
    (
        "LOINC/2857-1",
        "Prostate specific Ag [Mass/volume] in Serum or Plasma (PSA Test)",
    ),
    # Medication codes
    ("RX/Cisplatin", "Administration of Cisplatin chemotherapy agent"),
    # Death
    ("DEATH", "Death"),
]

# Expand the pairs into records so the frame gets "code" and "description" columns.
code_metadata_list = [
    {"code": code, "description": description}
    for code, description in code_descriptions
]

code_metadata_df = pd.DataFrame(code_metadata_list)


# Patient Events DataFrame
def _event(subject_id, time, code, numeric_value=np.nan, text_value=np.nan):
    """Return one event record; value fields that are not given default to NaN."""
    return {
        "subject_id": subject_id,
        "time": time,
        "code": code,
        "numeric_value": numeric_value,
        "text_value": text_value,
    }


patient_events_list = [
    # Patient 101: Jane Doe (Lung Cancer) - Assigned to 'train' split
    # Static data (no timestamp)
    _event(101, pd.NaT, "GENDER/Female", text_value="Female"),
    _event(101, pd.NaT, "GENETIC/BRCA1_pos", numeric_value=1, text_value="Positive"),
    # Visit 1 (Week 2, 2024): Diagnosis
    _event(101, datetime(2024, 1, 8), "ADMISSION/Outpatient"),
    _event(101, datetime(2024, 1, 8), "SYMPTOM/Cough", text_value="Persistent for 2 months"),
    _event(101, datetime(2024, 1, 8), "LOINC/6690-2", numeric_value=12.5),
    _event(101, datetime(2024, 1, 8), "CPT/71250", text_value="Nodule found in right lung"),
    _event(101, datetime(2024, 1, 8), "CPT/32408"),
    _event(101, datetime(2024, 1, 8), "ICD10CM/C34.90", text_value="Primary Diagnosis"),
    _event(101, datetime(2024, 1, 8), "DISCHARGE/Outpatient"),
    # Visit 2 (Week 4, 2024): Treatment
    _event(101, datetime(2024, 1, 22), "ADMISSION/Inpatient"),
    _event(101, datetime(2024, 1, 22), "LOINC/59261-8", text_value="All values within normal limits"),
    _event(101, datetime(2024, 1, 22), "RX/Cisplatin", text_value="Cisplatin"),
    _event(101, datetime(2024, 1, 22), "DISCHARGE/Inpatient"),
    # Visit 3 (Week 8, 2024): Follow-up
    _event(101, datetime(2024, 2, 19), "ADMISSION/Outpatient"),
    _event(101, datetime(2024, 2, 19), "NOTE/FollowUp", text_value="Patient tolerated first cycle well."),
    _event(101, datetime(2024, 2, 19), "LOINC/2039-6", numeric_value=50.2),
    _event(101, datetime(2024, 2, 19), "DISCHARGE/Outpatient"),
    # Patient 202: John Smith (Prostate Cancer) - Assigned to 'held_out' split
    # Static data (no timestamp)
    _event(202, pd.NaT, "GENDER/Male", text_value="Male"),
    # Visit 1 (Week 10, 2024): Diagnosis
    _event(202, datetime(2024, 3, 4), "ADMISSION/Outpatient"),
    _event(202, datetime(2024, 3, 4), "LOINC/2857-1", numeric_value=15.1),
    _event(202, datetime(2024, 3, 4), "CPT/55700", text_value="Biopsy taken"),
    _event(202, datetime(2024, 3, 4), "ICD10CM/C61", text_value="Primary Diagnosis"),
    _event(202, datetime(2024, 3, 4), "DISCHARGE/Outpatient"),
    # Visit 2 (Week 14, 2024): Treatment (Surgery)
    _event(202, datetime(2024, 4, 1), "ADMISSION/Inpatient"),
    _event(202, datetime(2024, 4, 1), "CPT/55840", text_value="Surgical procedure completed."),
    _event(202, datetime(2024, 4, 1), "LOINC/6690-2", numeric_value=8.2),
    _event(202, datetime(2024, 4, 1), "DISCHARGE/Inpatient"),
    # Visit 3 (Week 20, 2024): Follow-up
    _event(202, datetime(2024, 5, 13), "ADMISSION/Outpatient"),
    _event(202, datetime(2024, 5, 13), "LOINC/2857-1", numeric_value=0.1),
    _event(202, datetime(2024, 5, 13), "NOTE/FollowUp", text_value="PSA levels are undetectable post-op."),
    _event(202, datetime(2024, 5, 13), "DISCHARGE/Outpatient"),
    # Death, recorded one year after the last follow-up visit
    _event(202, datetime(2025, 5, 13), "DEATH"),
]

# Build the frame, then make "time" a datetime column and store subject ids as strings.
patient_events_df = pd.DataFrame(patient_events_list).assign(
    time=lambda frame: pd.to_datetime(frame["time"]),
    subject_id=lambda frame: frame["subject_id"].astype(str),
)

# Subject Splits DataFrame
# Each subject is assigned to one split; 'held_out' is often used for the final test set.
# NOTE(review): subject_id stays an integer here, whereas patient_events_df stores it
# as a string - confirm that convert_meds_to_dtc reconciles the two.
split_by_subject = {101: "train", 202: "held_out"}
subject_splits_list = [
    {"subject_id": subject, "split": split_name}
    for subject, split_name in split_by_subject.items()
]
subject_splits_df = pd.DataFrame(subject_splits_list)

In [3]:
# Preview only the first rows instead of dumping the whole events table
patient_events_df.head(10)

Unnamed: 0,subject_id,time,code,numeric_value,text_value
0,101,NaT,GENDER/Female,,Female
1,101,NaT,GENETIC/BRCA1_pos,1.0,Positive
2,101,2024-01-08,ADMISSION/Outpatient,,
3,101,2024-01-08,SYMPTOM/Cough,,Persistent for 2 months
4,101,2024-01-08,LOINC/6690-2,12.5,
5,101,2024-01-08,CPT/71250,,Nodule found in right lung
6,101,2024-01-08,CPT/32408,,
7,101,2024-01-08,ICD10CM/C34.90,,Primary Diagnosis
8,101,2024-01-08,DISCHARGE/Outpatient,,
9,101,2024-01-22,ADMISSION/Inpatient,,


## Conversion to DTC format

In [4]:
# Demo mapping from MEDS code to the value used in the event_category column.
# It is optional (a default is used when it is not provided), but it is especially
# handy when generating custom training data for LLMs.
demo_mapping = dict(
    [
        ("SYMPTOM/Cough", "symptom"),
        ("ICD10CM/C34.90", "diagnosis"),
        ("DEATH", "death"),
        ("RX/Cisplatin", "lot"),
    ]
)

In [5]:
#: Run the actual MEDS -> DTC conversion on the three synthetic frames.
# The rendered instruction further down suggests that text values win over numeric ones
# and that events without any value are reported as "observed".
conversion_options = {
    "prefer_text_value_over_numeric": True,
    "event_category_mapping": demo_mapping,
    "no_value_default": "observed",
}
(
    df_converted_constant,
    df_converted_constant_description,
    df_converted_events,
) = convert_meds_to_dtc(
    df_codes=code_metadata_df,
    df_data=patient_events_df,
    df_split=subject_splits_df,
    **conversion_options,
)

In [None]:
# Keep every constant column except the patient identifier, for use in the config below
constant_columns = [
    column
    for column in df_converted_constant.columns.tolist()
    if column != "patientid"
]

## Example usage in the `twinweaver` package

Here we're showing an example for inference (i.e. using a pretrained model), but check out the other examples if you need to e.g. generate training data.

In [None]:
# Basic settings for this demo
indication = "meds_demo"

# Start from the default configuration; override values here to customize the pipeline
config = Config()
config.constant_columns_to_use = constant_columns

# Switched off for the demo: the birthdate column and the LoT settings
config.constant_birthdate_column = config.lot_name_col = config.event_value_lot_start = None

In [None]:
# Load the converted frames into the data manager and run its preparation steps
dm = DataManager(config=config)
dm.load_indication_data(
    df_events=df_converted_events,
    df_constant=df_converted_constant,
    df_constant_description=df_converted_constant_description,
)
dm.process_indication_data()
dm.setup_unique_mapping_of_events()
dm.setup_dataset_splits()

# Splitter that produces the event-prediction splits
data_splitter_events = DataSplitterEvents(dm, config=config)
data_splitter_events.setup_variables()

# Converter that turns patient data into instruction text, with a total budget of 8192 tokens
constant_description = dm.data_frames["constant_description"]
converter = ConverterInstruction(
    constant_description,
    nr_tokens_budget_total=8192,
    config=config,
    dm=dm,
)

In [11]:
# Patient used for this example
patientid = 101

# Fetch the patient's data and put the events in chronological order
patient_data = dm.get_patient_data(patientid)
events_sorted = patient_data["events"].sort_values("date")
patient_data["events"] = events_sorted

# Split at the date of the last recorded event
split_date = events_sorted["date"].iloc[-1]

# Generate splits to predict whether death will occur in the next 52 weeks
observation_window = pd.Timedelta(weeks=52)
events_splits = data_splitter_events.get_splits_from_patient(
    patient_data,
    max_nr_samples_per_split=1,
    override_split_dates=[split_date],
    override_category="death",
    override_observation_time_delta=observation_window,
)
# Exactly one split date and one sample were requested, so take the first entry
events_split = events_splits[0][0]

#: no forecasting split in this example
forecast_split = None
forecasting_times_to_predict = None

# Convert to an instruction for inference
converted = converter.forward_conversion_inference(
    forecasting_split=forecast_split,
    forecasting_future_weeks_per_variable=forecasting_times_to_predict,
    event_split=events_split,
    custom_tasks=None,
)

print(converted["instruction"])

The following is a patient, starting with the demographic data, following visit by visit everything that the patient experienced. All lab codes refer to LOINC codes.

Starting with demographic data:
	Female sex is Female,
	BRCA1 gene mutation is Positive,
	No description available is train.

On the first visit, the patient experienced the following: 
	Malignant neoplasm of unspecified part of unspecified bronchus or lung is primary diagnosis,
	Admission for an outpatient clinic visit is observed,
	Procedure code for a core needle biopsy of the lung or mediastinum is observed,
	Procedure code for a CT scan of the thorax without contrast is nodule found in right lung,
	Discharge from an outpatient clinic visit is observed,
	Leukocytes [#/volume] in Blood by Automated count (White Blood Cell Count) is 12.5,
	Patient reports a persistent cough is persistent for 2 months.

2 weeks later, the patient visited and experienced the following: 
	Admission to the hospital for an inpatient stay is 