In [14]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
import os

dir_name = "PatientTrajectoryForecasting"
os.chdir(dir_name) if os.path.isdir(dir_name) else None

In [15]:
from utils.utils import (
    load_data,
    get_paths,
    store_files,
)
from utils.train import train_test_val_split, get_optimal_embedding_size
from utils.data_processing import format_data, prepare_sequences, filter_codes
from dataclasses import dataclass
import yaml

In [7]:
with open('paths.yaml', 'r') as file:
    path_config = yaml.safe_load(file)

In [8]:
@dataclass
class Config:
    strategy = 'SDP'
    predict_procedure : bool = False
    predict_drugs : bool = False
    procedure : bool = not(predict_procedure)
    drugs : bool = not(predict_drugs)
    truncate : bool = True
    pad : bool = True
    input_max_length :int = 448
    target_max_length :int = 64
    test_size : float = 0.05
    valid_size : float = 0.05
    
config = Config()

# This first part is for preparing the data

In [9]:
train_data_path = get_paths(path_config, config.strategy, config.predict_procedure, config.predict_procedure, train = True)

In [10]:
train_data_path

{'train_data_path': 'PatientTrajectoryForecasting/outputData/SDP/Inp_d_p_dr_out_d/'}

In [27]:
tokens_to_ids_map

{'PAD': 0,
 'BOH': 1,
 'BOS': 2,
 'BOV': 3,
 'EOV': 4,
 'EOH': 5,
 'D9_660': 6,
 'D9_122': 7,
 'D9_151': 8,
 'D9_159': 9,
 'D9_52': 10,
 'D9_55': 11,
 'D9_130': 12,
 'D9_155': 13,
 'D9_59': 14,
 'D9_651': 15,
 'P9_88': 16,
 'DR_338004304.0': 17,
 'DR_409176230.0': 18,
 'DR_409125830.0': 19,
 'DR_904150061.0': 20,
 'DR_409198530.0': 21,
 'DR_338004904.0': 22,
 'DR_781305714.0': 23,
 'DR_135019502.0': 24,
 'DR_904054460.0': 25,
 'DR_182050789.0': 26,
 'DR_409672924.0': 27,
 'DR_904224461.0': 28,
 'DR_904516561.0': 29,
 'DR_406055262.0': 30,
 'DR_904053061.0': 31,
 'DR_338004903.0': 32,
 'DR_172531210.0': 33,
 'DR_182844789.0': 34,
 'DR_172531110.0': 35,
 'D9_2': 36,
 'D9_663': 37,
 'DR_338500341.0': 38,
 'DR_409779362.0': 39,
 'DR_574200202.0': 40,
 'DR_338008504.0': 41,
 'DR_517760425.0': 42,
 'DR_409915801.0': 43,
 'D9_62': 44,
 'D9_205': 45,
 'D9_95': 46,
 'D9_238': 47,
 'D9_144': 48,
 'D9_4': 49,
 'D9_212': 50,
 'D9_259': 51,
 'D9_35': 52,
 'D9_209': 53,
 'P9_9': 54,
 'P9_172': 55,
 

In [28]:
patients_visits_sequences, tokens_to_ids_map, ids_to_types_map  = load_data(train = False)

In [29]:
source_target_sequences = format_data(patients_visits_sequences, config.strategy)

In [30]:
source_target_sequences, _ = filter_codes(source_target_sequences, ids_to_types_map, config.procedure, config.drugs,\
                                         reset_target_map = False )


Removing drug and procedure codes from target sequences


In [None]:
store_files(source_target_sequences = source_target_sequences,
            ids_to_types_map = ids_to_types_map, 
            tokens_to_ids_map = tokens_to_ids_map,
            ids_to_tokens_map = ids_to_tokens_map, 
            output_file = train_data_path['train_data_path'],
            train = True)

# Load data

## Next time start from here if using the same codes & strategy:

In [None]:
ls outputData/SDP/

In [None]:
train_data_path = get_paths(path_config, config.strategy, config.predict_procedure, config.predict_procedure, train = True)

In [None]:
source_target_sequences, ids_to_types_map, tokens_to_ids_map, ids_to_tokens_map = load_data(train_data_path['train_data_path'], train = True)

In [None]:
source_sequences, target_sequences = prepare_sequences(source_target_sequences, tokens_to_ids_map, config.truncate , config.pad,\
                            config.input_max_length, config.target_max_length)

In [None]:
data_properties = get_optimal_embedding_size(source_sequences, target_sequences)

In [None]:
train_data_path = get_paths(path_config, config.strategy, config.predict_procedure, config.predict_procedure, train = True, processed_data = True)

In [None]:
store_files(**data_properties, processed_data = True, output_file=train_data_path['processed_data_path'])

In [None]:
source_sequences, target_sequences, _ , new_to_old_ids_target = load_data(train_data_path['processed_data_path'], processed_data = True)

In [None]:
test_frac = 0.05
valid_frac = 0.05

In [None]:
train, test, val = train_test_val_split(source_sequences, target_sequences, test_size = 0.1, valid_size = 0.1)

# Model 

In [None]:
ids_to_tokens_map[2]

In [None]:
unique_source