# Preprocess

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import os, time
import vitaldb

input_path = 'vitaldb'

# Load clinical information table and labs table from api.vitaldb
#vitaldb_info = pd.read_csv('https://api.vitaldb.net/cases')
vitaldb_info = pd.read_csv(f'{input_path}/vitaldb_info_procedure_mapped.csv')
vitaldb_labs = pd.read_csv('https://api.vitaldb.net/labs')

# Drop duplicates rows of labs table
vitaldb_labs.drop_duplicates(inplace=True)

# Load mapped parameters of vitaldb dataset
vitaldb_params = pd.read_csv(f'{input_path}/mapped/parameters_vitaldb_mapped.csv')
vitaldb_params.dropna(subset='concept_id', inplace=True)

# Map labs table with vitaldb_params table
vitaldb_labs = vitaldb_labs.merge(vitaldb_params[['Label', 'concept_id', 'Unit', 'unit_concept_id']], left_on='name', right_on='Label', how='left')
vitaldb_labs.drop(columns='Label', inplace=True)
vitaldb_labs = vitaldb_labs.astype({'concept_id': 'Int64', 'unit_concept_id': 'Int64'})
#vitaldb_labs.to_csv(f'{mapped_path}/vitaldb_labs.csv', index=False)

In [2]:
# INSPIRE operations table
inspire_path = 'inspire_v2/mapped/operations.csv'
df_op = pd.read_csv(inspire_path)

## Extract caseid that are not included in INSPIRE
vitaldb_info = vitaldb_info[vitaldb_info['caseid'].isin(df_op['case_id'].unique())]

# caseid의 입원 구별여부
distinct_caseid = False

if not distinct_caseid:
    # Select one caseid for subjectid with multiple caseids
    vitaldb_info = vitaldb_info.drop_duplicates(subset=['subjectid'], keep='first')

else:
    ## Add lab info to info table
    # Extract all labs(hct, na) records for each caseid
    def join_str(x):
        return ','.join(x.astype(str))
    #hcts = vitaldb_labs[vitaldb_labs['name'] == 'hct'].groupby('caseid')['result'].apply(','.join).reset_index()
    hcts = vitaldb_labs[vitaldb_labs['name'] == 'hct'].groupby('caseid').agg({'result': join_str, 'dt': join_str}).reset_index()
    hcts.rename(columns={'result': 'hcts', 'dt':'hct_dt'}, inplace=True)

    #nas = vitaldb_labs[vitaldb_labs['name'] == 'na'].groupby('caseid')['result'].apply(','.join).reset_index()
    nas = vitaldb_labs[vitaldb_labs['name'] == 'na'].groupby('caseid').agg({'result': join_str, 'dt': join_str}).reset_index()
    nas.rename(columns={'result': 'nas', 'dt': 'nas_dt'}, inplace=True)

    # merge hct values
    vitaldb_info = vitaldb_info.merge(hcts, on='caseid', how='left')
    vitaldb_info['hcts_cut'] = vitaldb_info['hcts'].str[:15]
    vitaldb_info = vitaldb_info.merge(nas, on='caseid', how='left')
    vitaldb_info['nas_cut'] = vitaldb_info['nas'].str[:15]

## Assign hadm_id
# subjectid that have more than one caseid
subjects = vitaldb_info.groupby('subjectid').agg({'caseid': 'count'})
unique_subjects = subjects[subjects['caseid'] > 1].index

# Assign a unique value for hadm_id which subjectid only have one caseid (하나의 caseid만 있는 subjectid의 hamd_id 부여)
mask = ~vitaldb_info['subjectid'].isin(unique_subjects)
vitaldb_info.loc[mask, 'hadm_id'] = np.arange(mask.sum()) + 1

# Assign a base_time to discriminate the order of caseids in same subjectid
vitaldb_info.loc[mask, 'base_time'] = 0

# Cases with no hadm_id
#unmatch_cases = vitaldb_info[vitaldb_info['hadm_id'].isna()]

vitaldb_info = vitaldb_info.astype({'hadm_id': 'Int64', 'base_time': 'Int64'})

In [3]:
# Extract min, max value of dt for each caseid
df_dts = vitaldb_labs.groupby('caseid').agg({'dt':[min, max]}).reset_index(drop=False)
# Flatten the MultiIndex for columns
df_dts.columns = [' '.join(col).strip() for col in df_dts.columns.values]

vitaldb_info = vitaldb_info.merge(df_dts, on='caseid', how='left') 

vitaldb_info['max_time'] = vitaldb_info[['dis', 'dt max']].max(axis=1)
vitaldb_info['min_time'] = vitaldb_info[['adm', 'dt min']].min(axis=1)

#vitaldb_info.to_csv('vitaldb/mapped/vitaldb_info+adm.csv', index=False)

In [4]:
# Map intraoperative drugs from clinical information table
info_params = vitaldb_params[vitaldb_params['vocab'].str.contains('RxNorm')]
vitaldb_drugs = pd.melt(vitaldb_info, id_vars=['caseid', 'hadm_id', 'subjectid', 'opstart', 'opend'], value_vars= info_params['Label'])
vitaldb_drugs['chart_time'] = (vitaldb_drugs['opstart'] + vitaldb_drugs['opend']) / 2
vitaldb_drugs = vitaldb_drugs.merge(info_params[['Label', 'concept_id', 'Unit', 'unit_concept_id', 'base_time']], left_on='variable', right_on='Label', how='left')

# Drop rows of missing value or zero value
vitaldb_drugs = vitaldb_drugs.dropna(subset='value')
vitaldb_drugs = vitaldb_drugs[vitaldb_drugs['value']!=0]

In [5]:
vitaldb_meas = pd.read_parquet('vitaldb/mapped/vitaldb_measurements.parquet')
info_drugs = pd.read_parquet('vitaldb/mapped/vitaldb_drugs.parquet')

In [11]:
len(vitaldb_meas), len(info_drugs)

(315761528, 16498)

## Vital Files to measurement table

In [421]:
mapped_path = 'vitaldb/mapped'

# Load manually mapped parameters
params = pd.read_csv(f'{mapped_path}/vitals_vitaldb_mapped.csv')
params.dropna(subset='concept_id', inplace=True)

# Get track names and drop duplicated rows
params['tname'] = params['Parameter'].str.split('/').str[1]
params.drop_duplicates(subset='tname', inplace=True)
params.reset_index(drop=True, inplace=True)

# Extract parameters of measurements
param_meas = params[params['vocabulary'].isin(['LOINC', 'SNOMED'])].copy()
# Extract intraop drugs from vital files
param_drug = params[params['vocabulary'].str.contains('RxNorm')].copy()

# Track names to extract
m_trks = param_meas['tname'].values
d_trks = param_drug['tname'].values

In [259]:
from tqdm import tqdm

vital_path = 'vitaldb/vitals'

# Define measurement table for vitaldb
vitaldb_meas = pd.DataFrame(columns=['caseid'])

# Get a list of all caseids
caseids = vitaldb_info['caseid'].tolist()

for caseid in tqdm(caseids):
    ipath = f'{vital_path}/{caseid}.parquet'

    # Load and extract only neccessary tracks
    vf = pd.read_parquet(ipath)
    vf = vf[vf['tname'].isin(m_trks)]

    # Merge the standard concept of tracks
    vf = vf.merge(param_meas[['tname', 'Parameter', 'concept_id', 'Unit', 'unit_concept_id']], on='tname', how='left')
    vf['caseid'] = caseid

    vf.drop(columns='wval', inplace=True)
    vf = vf.astype({'concept_id': 'Int64', 'unit_concept_id': 'Int64'})

    vitaldb_meas = pd.concat([vitaldb_meas, vf], axis=0)

# Drop columns with no valid value
vitaldb_meas.dropna(subset=['nval'], inplace=True)
vitaldb_meas.reset_index(drop=True, inplace=True)

# Unify field names with labs table
vitaldb_meas = vitaldb_meas.drop(columns='Parameter')
vitaldb_meas.rename(columns={'tname': 'name', 'nval': 'result'}, inplace=True)

vitaldb_meas.to_csv('vitaldb/mapped/vitaldb_measurements.csv', index=False)
vitaldb_meas.to_parquet('vitaldb/mapped/vitaldb_measurements.parquet')

100%|██████████| 2496/2496 [4:20:34<00:00,  6.26s/it]  


In [266]:
# Multiprocessing : vitals to measurements
from concurrent.futures import ProcessPoolExecutor

# Function to process each caseid
def process_case(caseid):
    ipath = os.path.join(vital_path, f'{caseid}.parquet')

    # Load and extract only necessary tracks
    vf = pd.read_parquet(ipath)
    vf = vf[vf['tname'].isin(m_trks)]
                             
    # Merge the standard concept of tracks
    vf = vf.merge(param_meas[['tname', 'Parameter', 'concept_id', 'Unit', 'unit_concept_id']], on='tname', how='left')
    vf['caseid'] = caseid

    vf.drop(columns='wval', inplace=True)
    vf = vf.astype({'concept_id': 'Int64', 'unit_concept_id': 'Int64'})

    return vf

# Define measurement table for vitaldb
vm = pd.DataFrame(columns=['caseid'])

# Get a list of all caseids
caseids = vitaldb_info['caseid'].tolist()

# Use ProcessPoolExecutor to parallelize the operation
with ProcessPoolExecutor() as executor:
    # Use tqdm for progress bar
    results = list(tqdm(executor.map(process_case, caseids), total=len(caseids)))

# Concatenate all DataFrames from the results
vitaldb_meas = pd.concat(results, axis=0)
    
# Drop columns with no valid value
vitaldb_meas.dropna(subset=['nval'], inplace=True)
vitaldb_meas.reset_index(drop=True, inplace=True)

# Unify field names with labs table
vitaldb_meas = vitaldb_meas.drop(columns='Parameter')
vitaldb_meas.rename(columns={'tname': 'name', 'nval': 'result'}, inplace=True)

vitaldb_meas.to_csv('vitaldb/mapped/vitaldb_measurements.csv', index=False)
vitaldb_meas.to_parquet('vitaldb/mapped/vitaldb_measurements.parquet')

100%|██████████| 3/3 [00:01<00:00,  2.96it/s]


In [262]:
vitaldb_meas.info()

<class 'pandas.core.frame.DataFrame'>
Index: 315761528 entries, 0 to 118738
Data columns (total 8 columns):
 #   Column           Dtype  
---  ------           -----  
 0   caseid           object 
 1   tname            object 
 2   dt               float64
 3   nval             float32
 4   Parameter        object 
 5   concept_id       Int64  
 6   Unit             object 
 7   unit_concept_id  Int64  
dtypes: Int64(2), float32(1), float64(1), object(4)
memory usage: 20.6+ GB


## Vital files to drugs table

In [440]:
from tqdm import tqdm

vital_path = 'vitaldb/vitals'

# Define measurement table for vitaldb
vitaldb_drugs = pd.DataFrame(columns=['caseid'])

# Get a list of all caseids
caseids = vitaldb_info['caseid'].tolist()

for caseid in tqdm(caseids):
    ipath = f'{vital_path}/{caseid}.parquet'

    # Load and extract only neccessary tracks
    vf = pd.read_parquet(ipath)
    vf = vf[vf['tname'].isin(d_trks)]

    if len(vf) == 0:
        continue
        
    # Get average values for drug administration dose
    vf = vf.groupby('tname').agg({'nval': 'mean', 'dt': ['min', 'max']}).reset_index(drop=False)    
    vf.columns =  [' '.join(col).strip() for col in vf.columns.values]
    
    # Drop columns with no valid value
    vf = vf[vf['nval mean']!=0]
    
    # Merge the standard concept of tracks
    vf = vf.merge(param_drug[['tname', 'Parameter', 'concept_id', 'Unit', 'unit_concept_id']], on='tname', how='left')
    vf['caseid'] = caseid

    vf = vf.astype({'concept_id': 'Int64', 'unit_concept_id': 'Int64'})

    vitaldb_drugs = pd.concat([vitaldb_drugs, vf], axis=0)

vitaldb_drugs = vitaldb_drugs[vitaldb_drugs['nval mean']!=0]
vitaldb_drugs.reset_index(drop=True, inplace=True)

vitaldb_drugs.to_csv('vitaldb/mapped/vitaldb_drugs.csv', index=False)
vitaldb_drugs.to_parquet('vitaldb/mapped/vitaldb_drugs.parquet')

  1%|          | 15/2496 [00:05<15:53,  2.60it/s]

KeyboardInterrupt



In [442]:
# Multiprocessing : vitals to drug
from concurrent.futures import ProcessPoolExecutor

# Function to process each caseid
def process_case(caseid):
    ipath = f'{vital_path}/{caseid}.parquet'

    # Load and extract only neccessary tracks
    vf = pd.read_parquet(ipath)
    vf = vf[vf['tname'].isin(d_trks)]

    if len(vf) == 0:
        return None
        
    # Get average values for drug administration dose
    vf = vf.groupby('tname').agg({'nval': 'mean', 'dt': ['min', 'max']}).reset_index(drop=False)
    vf.columns =  [' '.join(col).strip() for col in vf.columns.values]
    
    # Drop columns with no valid value
    vf = vf[vf['nval mean']!=0]

    # Merge the standard concept of tracks
    vf = vf.merge(param_drug[['tname', 'Parameter', 'concept_id', 'Unit', 'unit_concept_id']], on='tname', how='left')
    vf['caseid'] = caseid

    vf = vf.astype({'concept_id': 'Int64', 'unit_concept_id': 'Int64'})

    return vf


# Get a list of all caseids
caseids = vitaldb_info['caseid'].tolist()

# Use ProcessPoolExecutor to parallelize the operation
with ProcessPoolExecutor() as executor:
    # Use tqdm for progress bar
    results = list(tqdm(executor.map(process_case, caseids), total=len(caseids)))

# Concatenate all DataFrames from the results
vitaldb_drugs = pd.concat(results, axis=0)

vitaldb_drugs = vitaldb_drugs[vitaldb_drugs['nval mean']!=0]
vitaldb_drugs.reset_index(drop=True, inplace=True)

vitaldb_drugs.to_csv('vitaldb/mapped/vitaldb_drugs.csv', index=False)
vitaldb_drugs.to_parquet('vitaldb/mapped/vitaldb_drugs.parquet')

100%|██████████| 2496/2496 [10:51<00:00,  3.83it/s]  


In [443]:
vitaldb_drugs

Unnamed: 0,tname,nval mean,dt min,dt max,Parameter,concept_id,Unit,unit_concept_id,caseid
0,PPF20_CE,2.432182,2.1943,4393.2004,Orchestra/PPF20_CE,753626,mcg/mL,8859,3
1,PPF20_CP,2.444900,2.1943,4393.2004,Orchestra/PPF20_CP,753626,mcg/mL,8859,3
2,PPF20_CT,2.531261,2.1943,4393.2004,Orchestra/PPF20_CT,753626,mcg/mL,8859,3
3,PPF20_RATE,21.936598,2.1943,4393.1994,Orchestra/PPF20_RATE,753626,mL/hr,44777613,3
4,PPF20_VOL,15.679538,2.1943,4393.2004,Orchestra/PPF20_VOL,753626,mL,8587,3
...,...,...,...,...,...,...,...,...,...
2,RFTN20_CE,0.713624,2.6933,8254.5683,Orchestra/RFTN20_CE,19016749,ng/mL,8842,6388
3,RFTN20_CP,0.726831,2.6933,8254.5683,Orchestra/RFTN20_CP,19016749,ng/mL,8842,6388
4,RFTN20_CT,0.372694,2.6933,8254.5683,Orchestra/RFTN20_CT,19016749,ng/mL,8842,6388
5,RFTN20_RATE,5.334596,2.6933,8254.5683,Orchestra/RFTN20_RATE,19016749,mL/hr,44777613,6388


# Load source tables

In [None]:
mapped_path = 'vitaldb/mapped'
vitaldb_info = pd.read_csv(f'{mapped_path}/vitaldb_info+adm.csv')
vitaldb_labs = pd.read_csv(f'{mapped_path}/vitaldb_labs.csv')
vitaldb_meas = pd.read_parquet(f'{mapped_path}/vitaldb_measurements.parquet')

### vital file eda

In [12]:
f_vital = pd.read_parquet('vitaldb/vitals/1.parquet')
f_vital = f_vital[f_vital['tname'].isin(vitaldb_meas['tname'])]
f_vital

Unnamed: 0,tname,dt,nval,wval
0,BIS,0.156,0.0,
1,BIS,1.156,0.0,
2,BIS,2.156,0.0,
3,BIS,3.156,0.0,
4,BIS,4.156,0.0,
...,...,...,...,...
381475,VENT_TV,10906.800,458.0,
381476,VENT_TV,10908.800,458.0,
381477,VENT_TV,10910.800,458.0,
381478,VENT_TV,10914.400,53.0,


In [None]:
f_vital.merge(vitaldb_meas[['Paramter', 'concept_id', 'Unit', 'unit_concept_id']], on= , how='left')

In [3]:
vitaldb.vital_recs(1, list(vitaldb_meas['Parameter']), return_pandas=True)

ZeroDivisionError: division by zero

In [3]:
vitaldb.VitalFile(1)

AttributeError: 'VitalFile' object has no attribute 'header_only'

In [35]:
vf = vitaldb.VitalFile(ipath=2, track_names=list(vitaldb_meas['Parameter']), header_only=False, skip_records=True)
vf

AttributeError: 'VitalFile' object has no attribute 'track_names'

## vitaldb hadm_id - manually mapping

In [83]:
# Load INSPIRE operation table
input_path = 'inspire_v2/mapped'
df_op = pd.read_csv(f'{input_path}/operations.csv')

# Get caseids that overlap with Vitaldb
df_op = df_op.rename(columns={'case_id': 'caseid'})
df_op.drop_duplicates(subset=['caseid'], inplace=True)
df_op.dropna(subset='caseid', inplace=True)
df_op = df_op[(df_op['caseid']>0) & (df_op['caseid']<6389)]
#df_op.groupby('caseid').apply(lambda x: len(x['hadm_id'].unique()))

In [85]:
# Merge INSPIRE adm info to Vitaldb for overlapping caseids
vital_adm = vitaldb_info.merge(df_op[['caseid', 'hadm_id', 'admission_time']], on='caseid', how='left')

# subjectid that have more than one caseid
subjects = vital_adm.groupby('subjectid').agg({'caseid': 'count'})
unique_subjects = subjects[subjects['caseid'] > 1].index

# Assign a unique identifier for hadm_id which subjectid only have one caseid (하나의 caseid만 있는 subjectid의 hamd_id 부여)
#vital_adm.loc[~vtial_adm['subjectid'].isin(unique_subjects), 'hadm_id'] = vital_adm.loc[~vital_adm['subjectid'].isin(unique_subjects), 'hadm_id'].apply(lambda x: next(unique_ids) if pd.isna(x) else x)
unique_ids = iter(range(1, 6000)) # result: 1~3447
vital_adm['hadm_id'] = vital_adm.apply(lambda x: (next(unique_ids) if x['subjectid'] not in unique_subjects else np.nan) if pd.isna(x['hadm_id']) else x['hadm_id'], axis=1)

# length of stay
vital_adm['los'] = vital_adm['dis'] - vital_adm['adm']

# Change the lab results to str type
vitaldb_labs.sort_values(by=['dt', 'caseid'], inplace=True)
vitaldb_labs['result'] = vitaldb_labs['result'].astype(str)

# Add total Lab results for each caseid
hcts = vitaldb_labs[vitaldb_labs['name'] == 'hct'].groupby('caseid')['result'].apply(','.join).reset_index()
hcts.rename(columns={'result': 'hcts'}, inplace=True)

nas = vitaldb_labs[vitaldb_labs['name'] == 'na'].groupby('caseid')['result'].apply(','.join).reset_index()
nas.rename(columns={'result': 'nas'}, inplace=True)

ks = vitaldb_labs[vitaldb_labs['name'] == 'k'].groupby('caseid')['result'].apply(','.join).reset_index()
ks.rename(columns={'result': 'ks'}, inplace=True)

# merge the lab results: hct, na, k
vital_adm = vital_adm.merge(hcts, on='caseid', how='left')
vital_adm = vital_adm.merge(nas, on='caseid', how='left')
vital_adm = vital_adm.merge(ks, on='caseid', how='left')

In [92]:
vital_adm[['caseid', 'subjectid', 'age', 'adm', 'los', 'hcts', 'nas', 'ks', 'hadm_id']].sort_values(by='subjectid').to_csv('vitaldb_hadmid.csv', index=False)

In [147]:
vital_adm['hadm_id'].isna().sum()

346

In [110]:
# sorting 
df1 = pd.DataFrame({'val': numbers})
df1 = df1.sort_values(by='val')
df1.reset_index(drop=False, inplace=True)
10 - df1.sort_values(by='index').index

Index([6, 4, 10, 9, 8, 1, 2, 5, 7, 3], dtype='int64')

In [86]:
vital_adm

Unnamed: 0,caseid,subjectid,casestart,caseend,anestart,aneend,opstart,opend,adm,dis,...,pred_p,conf,pred_o,pred_a,code_pred,hadm_id,los,hcts,nas,ks
0,1,5955,0,11542,-552,10848.0,1668,10368,-236220,627780,...,97,0.823296,Q,0,09Q70,254691603.0,864000,"35.0,39.0,37.0,38.9,37.5,35.0,32.7","138.0,141.0,141.0,134.0,132.0,132.0,141.0,138....","3.1,3.1,3.5,2.8,3.1,2.9,2.8,3.4,3.3,3.7,3.5"
1,2,2487,0,15741,-1039,14921.0,1721,14621,-221160,1506840,...,D6,0.999843,J,0,0DJ60,1.0,1728000,"32.5,26.7,24.7,22.4,29.2,30.8,33.4,33.5,34.0,3...","143.0,138.0,137.0,137.0,141.0,140.0,139.0","4.7,4.0,4.1,3.7,4.1,4.2,4.1"
2,3,2861,0,4394,-590,4210.0,1090,3010,-218640,40560,...,TB,0.999781,T,0,0TTB0,256459137.0,259200,,,
3,4,1903,0,20990,-778,20222.0,2522,17822,-201120,576480,...,D6,0.999843,J,0,0DJ60,2.0,777600,"33.0,32.0,30.0,33.0,37.0,36.0,34.0,37.0,32.7,3...","144.0,135.0,135.0,137.0,140.0,139.0,138.0,138....","4.3,3.5,3.6,3.5,3.6,3.7,3.7,3.6,3.8,3.6,3.8,3.9"
4,5,4416,0,21531,-1009,22391.0,2591,20291,-67560,3734040,...,S9,0.945371,Q,0,0SQ90,203326242.0,3801600,"45.2,38.0,28.0,34.0,33.0,31.0,38.1,36.3,34.0,3...","139.0,136.0,137.0,136.0,135.0,137.0,142.0,142....","4.9,3.8,3.7,4.0,4.7,4.5,5.0,4.5,4.0,4.7,4.5,4...."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6383,6384,5583,0,15248,-260,15640.0,2140,14140,-215340,648660,...,D6,0.999843,J,0,0DJ60,3446.0,864000,"38.0,36.6,38.8,37.1","143.0,137.0,139.0,140.0,140.0","4.2,4.0,5.1,4.3,4.3"
6384,6385,2278,0,20643,-544,20996.0,2396,19496,-225600,1675200,...,97,0.823368,Q,0,09Q70,267730479.0,1900800,"44.2,39.0,39.0,37.0,37.9,38.8,38.4,36.8,39.7,4...","143.0,144.0,134.0,135.0,133.0,144.0,137.0,136....","3.6,4.0,3.5,3.7,3.1,4.2,3.9,5.0,5.0,5.2,4.7,4...."
6385,6386,4045,0,19451,-667,19133.0,3533,18233,-200460,836340,...,D6,0.999843,J,0,0DJ60,245189449.0,1036800,"30.0,31.0,30.9,27.4,35.1","142.0,136.0,136.0,138.0,139.0","4.5,3.7,3.7,4.3,4.3"
6386,6387,5230,0,12025,-550,12830.0,1730,11030,-227760,377040,...,TB,0.999781,T,0,0TTB0,3447.0,604800,"39.2,33.0,38.2,39.3,34.3","142.0,136.0,136.0,140.0,138.0","3.6,3.5,4.4,4.0,3.5"


In [93]:
vital_adm[vital_adm['subjectid']==1584].to_csv('examp.csv', index=False)

## vitaldb align

In [10]:
df = pd.read_csv('https://api.vitaldb.net/cases')

# only include subjects with more than one cases
subjects = df.groupby('subjectid').agg({'caseid': 'count'})
unique_subjects = subjects[subjects['caseid'] > 1].index
vitaldb_labs[vitaldb_labs['subjectid'].isin(unique_subjects)]

KeyError: 'subjectid'

In [12]:
hb_comp = vitaldb_labs[vitaldb_labs['name'] == 'hb'].groupby('caseid').agg({'result': set, 'dt': set}).reset_index()
hb_comp

Unnamed: 0,caseid,result,dt
0,1,"{13.4, 11.5, 12.2, 12.9}","{12609, 594465, 137867, 399857}"
1,2,"{7.8, 8.8, 9.5, 10.2, 7.3, 11.0, 11.1, 11.3, 1...","{5776355, 240548, 152325, 6203997, -101973, 58..."
2,4,"{9.9, 10.5, 10.1, 10.4, 10.8, 11.5, 11.9}","{347297, 434406, 606791, 519791, 173521, 21558..."
3,5,"{8.4, 9.8, 10.0, 10.1, 9.9, 10.5, 10.7, 10.8, ...","{562571, 192020, 388886, 95642, 606366, 267055..."
4,6,"{11.5, 12.1, 12.3}","{-225295, -71044, -148191}"
...,...,...,...
5366,6384,"{12.7, 13.0, 12.0, 12.2}","{590496, 415779, 345779, 160517}"
5367,6385,"{12.6, 13.0, 12.8, 12.9, 13.1, 13.2, 13.3, 15.2}","{1273056, 415906, 841186, -154429, 929320, 583..."
5368,6386,"{8.9, 9.7, 11.1}","{263888, 433672, 174389}"
5369,6387,"{11.5, 12.3, 12.5}","{406177, -148636, 231053, 58966}"


In [27]:
vitaldb_labs[vitaldb_labs['name'] == 'hb'].groupby('caseid')[['result', 'dt']].apply(set).reset_index()

Unnamed: 0,caseid,result
0,1,"{12.9, 13.4, 11.5, 12.2}"
1,2,"{11.1, 10.2, 11.4, 9.5, 7.3, 11.9, 8.8, 11.3, ..."
2,4,"{10.5, 10.8, 9.9, 11.9, 10.1, 11.5, 10.4}"
3,5,"{15.3, 10.5, 13.1, 10.8, 9.9, 12.6, 10.9, 8.4,..."
4,6,"{12.3, 12.1, 11.5}"
...,...,...
5366,6384,"{13.0, 12.7, 12.2, 12.0}"
5367,6385,"{13.3, 12.8, 13.1, 12.6, 13.2, 15.2, 12.9, 13.0}"
5368,6386,"{8.9, 9.7, 11.1}"
5369,6387,"{12.5, 12.3, 11.5}"


In [4]:
import pandas as pd

# extract hb values for each case
vitaldb_labs = pd.read_csv('https://api.vitaldb.net/labs')
# df = vitaldb_labs[['caseid','name']].groupby('name').agg({'caseid': 'count'}).reset_index().sort_values('caseid', ascending=False)
# print(df.head())
# quit()

vitaldb_labs.sort_values(by=['dt', 'caseid'], inplace=True)
vitaldb_labs['result'] = vitaldb_labs['result'].astype(str)

hbs = vitaldb_labs[vitaldb_labs['name'] == 'hb'].groupby('caseid')['result'].apply(','.join).reset_index()
hbs.rename(columns={'result': 'hbs'}, inplace=True)

nas = vitaldb_labs[vitaldb_labs['name'] == 'na'].groupby('caseid')['result'].apply(','.join).reset_index()
nas.rename(columns={'result': 'nas'}, inplace=True)

df = pd.read_csv('https://api.vitaldb.net/cases')
df['los'] = (df['dis'] - df['adm']) // 86400

# only include subjects with more than one cases
subjects = df.groupby('subjectid').agg({'caseid': 'count'})
unique_subjects = subjects[subjects['caseid'] > 1].index
df = df[df['subjectid'].isin(unique_subjects)]

# merge hb values
df = df.merge(hbs, on='caseid', how='left')
df['hbs_cut'] = df['hbs'].str[:15]
df = df.merge(nas, on='caseid', how='left')
df['nas_cut'] = df['nas'].str[:15]

# find confirmed cases
df.drop_duplicates(subset=['subjectid', 'hbs_cut'], keep=False, inplace=True)
df.drop_duplicates(subset=['subjectid', 'nas_cut'], keep=False, inplace=True)

# sort and merge
df = df.loc[:,['subjectid', 'age', 'caseid', 'adm', 'dis', 'los', 'icu_days', 'hbs', 'nas', 'department', 'death_inhosp']].sort_values(by=['death_inhosp', 'subjectid'])

In [18]:
def max_consecutive_overlap(arr1, arr2):
    n1, n2 = len(arr1), len(arr2)
    dp = [[0] * (n2 + 1) for _ in range(n1 + 1)]
    max_length = 0

    for i in range(1, n1 + 1):
        for j in range(1, n2 + 1):
            if arr1[i - 1] == arr2[j - 1]:
                dp[i][j] = dp[i - 1][j - 1] + 1
                max_length = max(max_length, dp[i][j])

    return max_length

max_consecutive_overlap([5, 4, 2, 1, 3], [2, 5, 1, 3, 7, 9, 8, 5])
max_consecutive_overlap([11.1,10.3,9.7,10.2,9.0,9.6,10.2,9.2,8.9], [11.3,11.1,10.3,9.7,10.2,9.0,9.6,10.2])

7

In [19]:
def find_overlap_index(list1, list2):
    """Find the index in list2 where list1 overlaps, or -1 if no overlap."""
    for i in range(len(list2)):
        if list2[i:i+len(list1)] == list1:
            return i
    return -1

# Example DataFrame
data = {
    'subjectid': [1, 2, 3, 4],
    'hb': [[12.1, 12.3, 12.5], [12.5, 12.7, 12.9], [12.9, 13.0], [11.3, 11.5]]
}
df = pd.DataFrame(data)

# Compare each hb list with others to estimate order
order_estimates = []
for i, hb1 in df['hb'].iteritems():
    for j, hb2 in df['hb'].iteritems():
        if i != j:
            overlap_index = find_overlap_index(hb1, hb2)
            if overlap_index != -1:
                order_estimates.append((i, j, overlap_index))

# Sort based on where the overlap was found
order_estimates.sort(key=lambda x: x[2])

# Extracting the estimated order
estimated_order = [i for i, j, idx in order_estimates]
print("Estimated order of base times:", estimated_order)

AttributeError: 'Series' object has no attribute 'iteritems'

In [None]:
df.loc[df['subjectid'].isin(unique_subjects), 'hadm_id'] = np.arange(len(unique_subjects)) + 1
df2 = df[~df['subjectid'].isin(unique_subjects)]

df2

In [15]:
df = pd.read_csv('https://api.vitaldb.net/cases')
df['los'] = (df['dis'] - df['adm']) // 86400

# only include subjects with more than one cases
subjects = df.groupby('subjectid').agg({'caseid': 'count'})
unique_subjects = subjects[subjects['caseid'] > 1].index
df = df[df['subjectid'].isin(unique_subjects)]

# merge hb values
df = df.merge(hbs, on='caseid', how='left')
df['hbs_cut'] = df['hbs'].str[:15]
df = df.merge(nas, on='caseid', how='left')
df['nas_cut'] = df['nas'].str[:15]

df

Unnamed: 0,caseid,subjectid,casestart,caseend,anestart,aneend,opstart,opend,adm,dis,...,intraop_vecu,intraop_eph,intraop_phe,intraop_epi,intraop_ca,los,hbs,hbs_cut,nas,nas_cut
0,63,496,0,6034,-99,6021.0,1221,5421,-119940,398460,...,0,0,0,0,0,6,"15.2,14.4,16.4","15.2,14.4,16.4","142.0,139.0,135.0","142.0,139.0,135"
1,77,948,0,10764,-603,10917.0,1917,9717,-126480,1342320,...,0,0,0,0,0,17,"10.7,11.9,9.0,9.0,8.7,9.0,8.6,9.1,9.2,8.9,9.5,...","10.7,11.9,9.0,9","139.0,136.0,136.0,135.0,140.0,137.0,137.0,138....","139.0,136.0,136"
2,99,4306,0,5495,-251,6109.0,2809,4609,-1874880,3827520,...,0,10,80,0,0,66,"11.1,10.3,10.5,10.3,10.6,10.5,9.4,10.6,10.1,10...","11.1,10.3,10.5,","139.0,137.0,140.0,138.0,140.0,140.0,140.0,142....","139.0,137.0,140"
3,133,88,0,10708,-93,9447.0,2247,10047,-474720,648480,...,0,15,0,0,0,13,"12.8,12.6,11.7,12.1","12.8,12.6,11.7,","139.0,138.0,138.0,133.0","139.0,138.0,138"
4,146,1414,0,16471,-1705,16235.0,4835,15635,-1381740,1555860,...,0,50,60,60,1200,34,"9.5,9.7,9.5,8.7,8.6,6.8,7.9,7.1,8.6,8.1,7.2,7....","9.5,9.7,9.5,8.7","143.0,146.0,147.0,146.0,151.0,148.0,155.0,155....","143.0,146.0,147"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
530,6337,5565,0,24223,-449,25651.0,4051,23551,-1286520,1391880,...,0,0,0,0,0,31,"10.0,8.8,6.8,9.4,8.5,10.6,9.8,8.2,10.3,12.8,13...","10.0,8.8,6.8,9.","136.0,133.0,130.0,136.0,137.0,138.0,136.0,136....","136.0,133.0,130"
531,6338,1367,0,4709,-109,4991.0,1091,4391,-222360,296040,...,0,0,0,0,0,6,11.8,11.8,140.0,140.0
532,6358,5173,0,18896,-37,18863.0,3563,17963,-287100,404100,...,0,55,0,0,0,8,"8.5,6.0,11.5,10.9,9.7","8.5,6.0,11.5,10","139.0,140.0,137.0,137.0,143.0","139.0,140.0,137"
533,6364,4702,0,8717,105,9405.0,1005,8505,-199980,404820,...,0,0,0,0,0,7,10.4,10.4,141.0,141.0


In [24]:
df.drop_duplicates(subset=['subjectid', 'los'])

Unnamed: 0,caseid,subjectid,casestart,caseend,anestart,aneend,opstart,opend,adm,dis,...,intraop_vecu,intraop_eph,intraop_phe,intraop_epi,intraop_ca,los,hbs,hbs_cut,nas,nas_cut
0,63,496,0,6034,-99,6021.0,1221,5421,-119940,398460,...,0,0,0,0,0,6,"15.2,14.4,16.4","15.2,14.4,16.4","142.0,139.0,135.0","142.0,139.0,135"
1,77,948,0,10764,-603,10917.0,1917,9717,-126480,1342320,...,0,0,0,0,0,17,"10.7,11.9,9.0,9.0,8.7,9.0,8.6,9.1,9.2,8.9,9.5,...","10.7,11.9,9.0,9","139.0,136.0,136.0,135.0,140.0,137.0,137.0,138....","139.0,136.0,136"
2,99,4306,0,5495,-251,6109.0,2809,4609,-1874880,3827520,...,0,10,80,0,0,66,"11.1,10.3,10.5,10.3,10.6,10.5,9.4,10.6,10.1,10...","11.1,10.3,10.5,","139.0,137.0,140.0,138.0,140.0,140.0,140.0,142....","139.0,137.0,140"
3,133,88,0,10708,-93,9447.0,2247,10047,-474720,648480,...,0,15,0,0,0,13,"12.8,12.6,11.7,12.1","12.8,12.6,11.7,","139.0,138.0,138.0,133.0","139.0,138.0,138"
4,146,1414,0,16471,-1705,16235.0,4835,15635,-1381740,1555860,...,0,50,60,60,1200,34,"9.5,9.7,9.5,8.7,8.6,6.8,7.9,7.1,8.6,8.1,7.2,7....","9.5,9.7,9.5,8.7","143.0,146.0,147.0,146.0,151.0,148.0,155.0,155....","143.0,146.0,147"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
528,6302,5593,0,17166,-122,17098.0,2998,16198,-121500,483300,...,0,0,0,0,0,7,"12.2,11.7,11.5,10.7,12.2,12.2,12.3,12.4,12.7","12.2,11.7,11.5,","138.0,139.0,139.0,139.0,134.0,132.0,136.0,138....","138.0,139.0,139"
529,6321,5307,0,8248,-1378,8342.0,1442,7442,-141000,377400,...,0,10,0,0,0,6,"14.5,12.4,13.0","14.5,12.4,13.0","143.0,134.0,138.0","143.0,134.0,138"
531,6338,1367,0,4709,-109,4991.0,1091,4391,-222360,296040,...,0,0,0,0,0,6,11.8,11.8,140.0,140.0
533,6364,4702,0,8717,105,9405.0,1005,8505,-199980,404820,...,0,0,0,0,0,7,10.4,10.4,141.0,141.0


In [None]:
같은 los여도, 다른 입원일 수 있음. 랩 3개 겹치면 

In [19]:
df['subjectid'].value_counts()

subjectid
3131    14
1609    10
4306     8
2291     6
4748     5
        ..
5511     2
2328     2
3384     2
4561     2
4873     2
Name: count, Length: 237, dtype: int64

In [20]:
df[df['subjectid']==2291]

Unnamed: 0,caseid,subjectid,casestart,caseend,anestart,aneend,opstart,opend,adm,dis,...,intraop_vecu,intraop_eph,intraop_phe,intraop_epi,intraop_ca,los,hbs,hbs_cut,nas,nas_cut
59,912,2291,0,7660,-1160,7900.0,3400,7000,-3600780,373620,...,0,0,0,0,0,46,"11.1,10.3,9.7,10.2,9.0,9.6,10.2,9.2,8.9,9.6,9....","11.1,10.3,9.7,1","134.0,135.0,139.0,139.0,141.0,141.0,135.0,136....","134.0,135.0,139"
91,1297,2291,0,5497,-72,5328.0,1728,4728,-2374860,1599540,...,0,15,250,0,0,46,"11.1,10.3,9.7,10.2,9.0,9.6,10.2,9.2,8.9,9.6,9....","11.1,10.3,9.7,1","134.0,135.0,139.0,139.0,141.0,141.0,135.0,136....","134.0,135.0,139"
204,2600,2291,0,5488,-354,5646.0,1746,4746,-2739600,1234800,...,0,0,150,0,0,46,"11.1,10.3,9.7,10.2,9.0,9.6,10.2,9.2,8.9,9.6,9....","11.1,10.3,9.7,1","134.0,135.0,139.0,139.0,141.0,141.0,135.0,136....","134.0,135.0,139"
214,2695,2291,0,6610,-1779,6021.0,2901,5301,-2983260,991140,...,0,20,0,0,0,46,"11.1,10.3,9.7,10.2,9.0,9.6,10.2,9.2,8.9,9.6,9....","11.1,10.3,9.7,1","134.0,135.0,139.0,139.0,141.0,141.0,135.0,136....","134.0,135.0,139"
445,5311,2291,0,34876,-831,33969.0,3969,30969,-1094580,2879820,...,0,10,270,0,1200,46,"11.3,11.1,10.3,9.7,10.2,9.0,9.6,10.2,9.2,8.9,9...","11.3,11.1,10.3,","137.0,134.0,135.0,139.0,139.0,141.0,141.0,135....","137.0,134.0,135"
498,5945,2291,0,4708,-702,4698.0,1698,3798,-3345660,628740,...,0,10,150,0,0,46,"11.1,10.3,9.7,10.2,9.0,9.6,10.2,9.2,8.9,9.6,9....","11.1,10.3,9.7,1","134.0,135.0,139.0,139.0,141.0,141.0,135.0,136....","134.0,135.0,139"


In [18]:
df[df['subjectid']==10]

Unnamed: 0,caseid,subjectid,casestart,caseend,anestart,aneend,opstart,opend,adm,dis,...,intraop_vecu,intraop_eph,intraop_phe,intraop_epi,intraop_ca,los,hbs,hbs_cut,nas,nas_cut
9,264,10,0,36526,-487,36173.0,7373,35873,-1534620,3130980,...,0,8,9,89,240,54,"10.0,8.5,8.7,9.8,9.7","10.0,8.5,8.7,9.","142.0,143.0,135.0,135.0","142.0,143.0,135"
16,366,10,0,11419,-3101,11299.0,3499,10399,-4091460,574140,...,0,6,0,53,90,54,"10.0,8.5,8.7,9.8,9.7","10.0,8.5,8.7,9.","142.0,143.0,135.0,135.0","142.0,143.0,135"
249,3230,10,0,7630,-544,7676.0,1376,6776,-2212920,2452680,...,0,0,0,0,0,54,"10.0,8.5,8.7,9.8,9.7","10.0,8.5,8.7,9.","142.0,143.0,135.0,135.0","142.0,143.0,135"
270,3486,10,0,22928,-802,24098.0,1898,22327,-3281580,1384020,...,0,0,0,7,990,54,"10.0,8.5,8.7,9.8,9.7","10.0,8.5,8.7,9.","142.0,143.0,135.0,135.0","142.0,143.0,135"
461,5503,10,0,8538,-57,9243.0,1443,8043,-2056500,2609100,...,0,0,0,9,60,54,"10.0,8.5,8.7,9.8,9.7","10.0,8.5,8.7,9.","142.0,143.0,135.0,135.0","142.0,143.0,135"


In [17]:
sub = df.groupby(['subjectid', 'hbs_cut']).agg({'caseid':'count'})
sub[sub['caseid']>1]

Unnamed: 0_level_0,Unnamed: 1_level_0,caseid
subjectid,hbs_cut,Unnamed: 2_level_1
10,"10.0,8.5,8.7,9.",5
128,"12.9,12.4,10.3,",2
150,"12.4,10.8,11.5,",2
194,"12.0,10.9,9.8,1",2
229,"10.8,11.4,11.6,",2
...,...,...
5985,"9.4,11.0,9.4,10",2
5998,"15.1,15.0,14.7,",2
6033,"12.8,12.6,11.3,",2
6041,"10.3,8.3,9.2,9.",2


In [13]:
df.drop_duplicates(subset=['subjectid', 'hbs_cut'], keep=False)

Unnamed: 0,caseid,subjectid,casestart,caseend,anestart,aneend,opstart,opend,adm,dis,...,intraop_vecu,intraop_eph,intraop_phe,intraop_epi,intraop_ca,los,hbs,hbs_cut,nas,nas_cut
0,63,496,0,6034,-99,6021.0,1221,5421,-119940,398460,...,0,0,0,0,0,6,"15.2,14.4,16.4","15.2,14.4,16.4","142.0,139.0,135.0","142.0,139.0,135"
3,133,88,0,10708,-93,9447.0,2247,10047,-474720,648480,...,0,15,0,0,0,13,"12.8,12.6,11.7,12.1","12.8,12.6,11.7,","139.0,138.0,138.0,133.0","139.0,138.0,138"
7,250,1092,0,11950,-661,11939.0,1139,11339,-147660,629940,...,0,0,0,0,0,9,"13.3,13.5,13.7,12.5,11.3,11.6,12.0,12.1,12.3,1...","13.3,13.5,13.7,","141.0,137.0,140.0,142.0,138.0,135.0,136.0,136....","141.0,137.0,140"
12,295,5778,0,12473,-271,12029.0,1529,11429,-388740,648060,...,0,5,0,0,0,12,"13.5,12.2,12.5,12.1","13.5,12.2,12.5,","136.0,137.0,129.0,132.0,132.0,137.0,135.0,138.0","136.0,137.0,129"
13,311,1435,0,5984,-1055,4645.0,745,4345,-129120,216480,...,0,15,0,0,0,4,"13.3,12.5","13.3,12.5","140.0,140.0","140.0,140.0"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
525,6275,5660,0,19769,-2382,19878.0,2178,18378,-218580,2114220,...,0,0,0,0,1200,27,"12.1,13.3,9.1,10.0,9.6,8.7,8.3,9.0,8.9,9.6,12....","12.1,13.3,9.1,1","142.0,139.0,142.0,133.0,135.0,135.0,137.0,132....","142.0,139.0,142"
529,6321,5307,0,8248,-1378,8342.0,1442,7442,-141000,377400,...,0,10,0,0,0,6,"14.5,12.4,13.0","14.5,12.4,13.0","143.0,134.0,138.0","143.0,134.0,138"
531,6338,1367,0,4709,-109,4991.0,1091,4391,-222360,296040,...,0,0,0,0,0,6,11.8,11.8,140.0,140.0
532,6358,5173,0,18896,-37,18863.0,3563,17963,-287100,404100,...,0,55,0,0,0,8,"8.5,6.0,11.5,10.9,9.7","8.5,6.0,11.5,10","139.0,140.0,137.0,137.0,143.0","139.0,140.0,137"


In [8]:
df

Unnamed: 0,subjectid,age,caseid,adm,dis,los,icu_days,hbs,nas,department,death_inhosp
123,32,47.0,1598,-229860,288540,6,0,"12.2,11.2","140.0,141.0,140.0",General surgery,0
141,32,46.0,1845,-1159740,3592260,55,0,"8.0,12.3,14.3,12.9,12.1,12.9,11.9,10.7,11.5","137.0,135.0,133.0,134.0,135.0,131.0,126.0,130....",General surgery,0
20,35,54.0,504,-225840,292560,6,0,"12.6,12.5,12.0","142.0,141.0,142.0,141.0",General surgery,0
153,35,55.0,1947,-200340,145260,4,0,"14.2,13.5,13.2,12.4","142.0,137.0,140.0,142.0",Thoracic surgery,0
3,88,61.0,133,-474720,648480,13,0,"12.8,12.6,11.7,12.1","139.0,138.0,138.0,133.0",General surgery,0
...,...,...,...,...,...,...,...,...,...,...,...
169,6044,62.0,2121,-212820,219180,5,0,"13.0,12.2,12.5,11.5","142.0,136.0,139.0,136.0,135.0",Thoracic surgery,0
247,6055,51.0,3205,-212400,219600,5,1,"8.8,8.1,8.2,8.7,8.2,8.4,9.9,8.2,8.2,8.5,8.0,8....","132.0,136.0,134.0,139.0,132.0,134.0,135.0,131....",Thoracic surgery,0
423,6055,51.0,5040,-207060,484140,8,1,"10.1,10.4,8.6,8.3,8.3,8.3,10.3,9.8,9.6,8.5,7.9...","139.0,132.0,132.0,130.0,133.0,132.0,134.0,134....",General surgery,0
281,1123,78.0,3552,-400560,2623440,35,1,"11.1,11.9,14.0,12.5,14.4,12.1,11.3,11.1,10.9,1...","139.0,135.0,134.0,129.0,125.0,133.0,130.0,135....",General surgery,1


### EDA

In [None]:
# Load and extract only neccessary tracks
vf = pd.read_parquet(ipath)

# OMOP-CDM

In [6]:
save_path = 'VITALDB_ETL'
save_csv = True
dname = 'VITALDB'

# Make directory to save the results
if not os.path.exists(save_path):
    os.mkdir(save_path)
os.makedirs(os.path.join(save_path, 'sample'), exist_ok=True)

# PERSON

In [7]:
# start_index for each table_id
start_index = {
'person': 1000000,
'observation_period': 1000000,
'visit_occurrence': 1000000,
'visit_detail': 1000000,
'condition_occurrence': 1000000,
'drug_exposure': 1000000,
'procedure_occurrence': 1000000,
'measurement': 1000000,
'note': 1000000,
'location': 1000000 
}

In [8]:
### PERSON TABLE ###
print('PERSON TABLE...', end='')
# Create an empty dataframe for PERSON table
df_person = pd.DataFrame(columns=['PERSON_ID'])

# Assign unique IDs to each distinct 'subjectid' from the operations data
unique_ids = vitaldb_info['subjectid'].unique()
df_person['PERSON_ID'] = start_index['person'] + np.arange(len(unique_ids)) + 1
df_person['subjectid'] = unique_ids

# Merge relevant columns from the operations dataframe with the PERSON dataframe based on 'subject_id'
usecols = ['subjectid', 'age', 'sex']
df_person = df_person.merge(vitaldb_info[usecols], on = 'subjectid')
# Ensure only the latest discharge_time is retained for each unique PERSON_ID
df_person.drop_duplicates(subset = 'PERSON_ID', keep = 'first', inplace = True, ignore_index = True)

# Map gender values ('M' or 'F') to corresponding GENDER_CONCEPT_ID values
df_person['GENDER_CONCEPT_ID'] = df_person['sex'].map({'M': 8507, 'F': 8532}, na_action='ignore')
# Remove any rows with missing gender values
df_person.dropna(subset=['GENDER_CONCEPT_ID'])

# Set the first date of all patients to 2011.01.01 since the exact year is not specified
start_date = datetime(2011, 1, 1)

# Calculate and assign the year of birth based on age and the start date
df_person['YEAR_OF_BIRTH'] = start_date.year - df_person['age']
# Compute the exact birth datetime using age and start date
df_person['BIRTH_DATETIME'] = pd.to_datetime(start_date) - pd.to_timedelta(df_person['age']*365.25, unit = 'days')

# Set RACE_CONCEPT_ID to indicate all individuals are ASIAN
#df_person['RACE_CONCEPT_ID'] = 8515

# Assign value for LOCATION_ID (1: Vitaldb)
df_person['LOCATION_ID'] = 'vitaldb'

# Populate source value columns based on values from the operations data
df_person['PERSON_SOURCE_VALUE'] = df_person['subjectid']
df_person['GENDER_SOURCE_VALUE'] = df_person['sex']

# Remove columns that aren't part of the final PERSON table format
df_person.drop(columns=usecols, inplace=True)

# Write the processed data to a parquet file
df_person.to_parquet(f'{save_path}/{dname}_PERSON.parquet')
if save_csv:
    df_person.to_csv(f'{save_path}/{dname}_PERSON.csv', index=False)
df_person[:1000].to_csv(f'{save_path}/sample/{dname}_PERSON.csv', index=False)    
print('done')

PERSON TABLE...

In [29]:
df_person

Unnamed: 0,PERSON_ID,GENDER_CONCEPT_ID,YEAR_OF_BIRTH,BIRTH_DATETIME,LOCATION_ID,PERSON_SOURCE_VALUE,GENDER_SOURCE_VALUE
0,1000001,8507,1934.0,1933-12-31 18:00:00,vitaldb,5955,M
1,1000002,8507,1949.0,1948-12-31 12:00:00,vitaldb,2861,M
2,1000003,8507,1945.0,1944-12-31 12:00:00,vitaldb,4416,M
3,1000004,8532,1930.0,1929-12-31 18:00:00,vitaldb,4328,F
4,1000005,8532,1979.0,1979-01-01 00:00:00,vitaldb,2008,F
...,...,...,...,...,...,...,...
2491,1002492,8532,1941.0,1940-12-31 12:00:00,vitaldb,3096,F
2492,1002493,8507,1945.0,1944-12-31 12:00:00,vitaldb,4763,M
2493,1002494,8532,1943.0,1943-01-01 00:00:00,vitaldb,3181,F
2494,1002495,8532,1950.0,1949-12-31 18:00:00,vitaldb,4045,F


# OBSERVATION_PERIOD

In [9]:
### OBSERVATION_PERIOD ###
print('OBSERVATION_PERIOD TABLE...', end='')
# Create an empty dataframe for OBSERVATION_PERIOD table
df_obs = pd.DataFrame(columns=['OBSERVATION_PERIOD_ID'])

# Copy PERSON_ID from PERSON table to OBSERVATION_PERIOD table
df_obs['OBSERVATION_PERIOD_ID'] =  start_index['observation_period'] - start_index['person'] + df_person['PERSON_ID']
# Assign OBSERVATION_PERIOD_ID to each PERSON_ID
df_obs['PERSON_ID'] = df_person['PERSON_ID']
# Copy PERSON_SOURCE_VALUE from PERSON table to subject_id in OBSERVATION_PERIOD table for merging purposes
df_obs['subjectid'] = df_person['PERSON_SOURCE_VALUE']

# Merge min_time, max_time, base_time from information table
usecols = ['subjectid', 'min_time', 'max_time', 'base_time']
df_obs = df_obs.merge(vitaldb_info[usecols], on='subjectid', how='left')

# Define the base date for the observation period
base_date = datetime(2011, 1, 1) + pd.to_timedelta(df_obs['base_time'], unit='sec')

# Set the OBSERVATION_PERIOD_START_DATE, OBSERVATION_PERIOD_END_DATE to the earliest and latest record of subjectid 
df_obs['OBSERVATION_PERIOD_START_DATE'] = pd.to_datetime(base_date) + pd.to_timedelta(df_obs['min_time'], unit='sec')
df_obs['OBSERVATION_PERIOD_END_DATE'] = pd.to_datetime(base_date) + pd.to_timedelta(df_obs['max_time'], unit='sec')

# Assign the PERIOD_TYPE_CONCEPT_ID indicating the data source is an EHR
df_obs['PERIOD_TYPE_CONCEPT_ID'] = 32817

# Remove columns that aren't part of the final OBSERVATION_PERIOD table format
df_obs.drop(columns=usecols, inplace=True)   

# Write the processed data to a parquet file
df_obs.to_parquet(f'{save_path}/{dname}_OBSERVATION_PERIOD.parquet')
if save_csv:
    df_obs.to_csv(f'{save_path}/{dname}_OBSERVATION_PERIOD.csv', index=False)
df_obs[:1000].to_csv(f'{save_path}/sample/{dname}_OBSERVATION_PERIOD.csv', index=False)    
print('done')

OBSERVATION_PERIOD TABLE...done


In [31]:
df_obs

Unnamed: 0,OBSERVATION_PERIOD_ID,PERSON_ID,OBSERVATION_PERIOD_START_DATE,OBSERVATION_PERIOD_END_DATE,PERIOD_TYPE_CONCEPT_ID
0,1000001,1000001,2010-12-29 06:23:00,2011-01-08 06:23:00,32817
1,1000002,1000002,2010-12-29 11:16:00,2011-01-01 11:16:00,32817
2,1000003,1000003,2010-12-31 05:14:00,2011-02-13 05:14:00,32817
3,1000004,1000004,2010-12-29 10:58:00,2011-01-03 10:58:00,32817
4,1000005,1000005,2010-12-30 10:15:00,2011-01-01 10:15:00,32817
...,...,...,...,...,...
2491,1002492,1002492,2010-12-30 12:58:00,2011-01-05 12:58:00,32817
2492,1002493,1002493,2010-12-30 16:25:00,2011-01-01 16:25:00,32817
2493,1002494,1002494,2010-12-12 06:02:15,2011-01-07 17:08:39,32817
2494,1002495,1002495,2010-12-29 16:19:00,2011-01-10 16:19:00,32817


# VISIT_OCCURRENCE

In [10]:
### VISIT_OCCURRENCE TABLE ###
# Match admission records
print('VISIT_OCCURRENCE TABLE...', end='')
# Create an empty dataframe for VISIT_OCCURRENCE table
df_visit_occ = pd.DataFrame(columns=['VISIT_OCCURRENCE_ID'])

# Copy PERSON_ID, PERSON_SOURCE_VALUE from df_person to df_visit_occ
df_visit_occ['PERSON_ID'] = df_person['PERSON_ID']
df_visit_occ['subjectid'] = df_person['PERSON_SOURCE_VALUE']

# Merge admission and discharge time to VISIT_OCCURRENCE table
df_visit_occ = df_visit_occ.merge(vitaldb_info[['subjectid', 'hadm_id', 'adm', 'dis', 'base_time']], on='subjectid', how='left')

# Assign sequential IDs starting from 1 to VISIT_OCCURRENCE_ID column
df_visit_occ['VISIT_OCCURRENCE_ID'] = start_index['visit_occurrence'] + np.arange(len(df_visit_occ)) + 1

# Set VISIT_CONCEPT_ID to indicate all individuals are admitted to hospital
df_visit_occ['VISIT_CONCEPT_ID'] = 9201

# Define the base date
base_date = datetime(2011, 1, 1) + pd.to_timedelta(df_visit_occ['base_time'], unit='sec')
# Set VISIT_START_DATE to the admission time
df_visit_occ['VISIT_START_DATETIME'] = pd.to_datetime(base_date) + pd.to_timedelta(df_visit_occ['adm'], unit='sec')
df_visit_occ['VISIT_START_DATE'] = pd.to_datetime(df_visit_occ['VISIT_START_DATETIME'].dt.date)
# Set VISIT_END_DATE to the discharge time
df_visit_occ['VISIT_END_DATETIME'] = pd.to_datetime(base_date) + pd.to_timedelta(df_visit_occ['dis'], unit='sec')
df_visit_occ['VISIT_END_DATE'] = pd.to_datetime(df_visit_occ['VISIT_END_DATETIME'].dt.date)

# Assign the VISIT_TYPE_CONCEPT_ID indicating the data source is an EHR
df_visit_occ['VISIT_TYPE_CONCEPT_ID'] = 32817

## Mapping PRECEDING_VISIT_OCCURRENCE_ID
# Generate a column with the previous 'subject_id' for determining preceding visit occurrence
df_visit_occ['prev_subjectid'] = df_visit_occ['subjectid'].shift(1).astype('Int64')
# Create a new boolean column 'nadm' to check if the current row's subject_id matches the previous one
df_visit_occ['nadm'] = df_visit_occ['subjectid'] == df_visit_occ['prev_subjectid']
# Set the first row's 'nadm' value to False since there's no preceding record
df_visit_occ.at[0, 'nadm'] = False 
# Compute PRECEDING_VISIT_OCCURRENCE_ID based on 'nadm'
df_visit_occ['PRECEDING_VISIT_OCCURRENCE_ID'] = np.where(df_visit_occ['nadm'], df_visit_occ['VISIT_OCCURRENCE_ID'].shift(1), np.nan)
df_visit_occ['PRECEDING_VISIT_OCCURRENCE_ID'] = df_visit_occ['PRECEDING_VISIT_OCCURRENCE_ID'].astype('Int64')

# Remove columns that aren't part of the final VISIT_OCCURRENCE table format except for 'hadm_id'
df_visit_occ.drop(columns=['subjectid', 'prev_subjectid', 'nadm', 'adm', 'dis', 'base_time'], inplace=True)


VISIT_OCCURRENCE TABLE...

In [33]:
df_visit_occ

Unnamed: 0,VISIT_OCCURRENCE_ID,PERSON_ID,hadm_id,VISIT_CONCEPT_ID,VISIT_START_DATETIME,VISIT_START_DATE,VISIT_END_DATETIME,VISIT_END_DATE,VISIT_TYPE_CONCEPT_ID,PRECEDING_VISIT_OCCURRENCE_ID
0,1000001,1000001,1,9201,2010-12-29 06:23:00,2010-12-29,2011-01-08 06:23:00,2011-01-08,32817,
1,1000002,1000002,2,9201,2010-12-29 11:16:00,2010-12-29,2011-01-01 11:16:00,2011-01-01,32817,
2,1000003,1000003,3,9201,2010-12-31 05:14:00,2010-12-31,2011-02-13 05:14:00,2011-02-13,32817,
3,1000004,1000004,4,9201,2010-12-29 10:58:00,2010-12-29,2011-01-03 10:58:00,2011-01-03,32817,
4,1000005,1000005,5,9201,2010-12-30 10:15:00,2010-12-30,2011-01-01 10:15:00,2011-01-01,32817,
...,...,...,...,...,...,...,...,...,...,...
2491,1002492,1002492,2492,9201,2010-12-30 12:58:00,2010-12-30,2011-01-05 12:58:00,2011-01-05,32817,
2492,1002493,1002493,2493,9201,2010-12-30 16:25:00,2010-12-30,2011-01-01 16:25:00,2011-01-01,32817,
2493,1002494,1002494,2494,9201,2010-12-27 09:04:00,2010-12-27,2011-01-07 09:04:00,2011-01-07,32817,
2494,1002495,1002495,2495,9201,2010-12-29 16:19:00,2010-12-29,2011-01-10 16:19:00,2011-01-10,32817,


# VISIT_DETAIL

In [11]:
### VISIT_DETAIL TABLE ###
# Match ICU_ADMIN record
# Since there is no icu_in, icu_out time, we use hospital admission, discharge time as visit detail start, end time.
# Create a new DataFrame for VISIT_DETAIL data with the specified columns
df_visit_detail = pd.DataFrame(columns=['VISIT_DETAIL_ID'])

# Populate the PERSON_ID and MRN columns with data from the df_person DataFrame
df_visit_detail['PERSON_ID'] = df_person['PERSON_ID']
df_visit_detail['subjectid'] = df_person['PERSON_SOURCE_VALUE']

# Integrate visit detail data with operational data from df_op using 'MRN'
usecols = ['hadm_id', 'subjectid', 'adm', 'dis', 'icu_days', 'base_time']
df_visit_detail = df_visit_detail.merge(vitaldb_info[usecols], on='subjectid')

# Remove rows that do not have ICU record
df_visit_detail = df_visit_detail[df_visit_detail['icu_days']==0].reset_index(drop=True)

# Generate unique sequential IDs for VISIT_DETAIL_ID
df_visit_detail['VISIT_DETAIL_ID'] = start_index['visit_detail'] + np.arange(len(df_visit_detail)) + 1

# Designate a concept ID representing ICU visits
df_visit_detail['VISIT_DETAIL_CONCEPT_ID'] = 32037

# Calculate visit start and end datetime values using base_date and ICU admission/discharge times
base_date = datetime(2011, 1, 1) + pd.to_timedelta(df_visit_detail['base_time'], unit='sec')
# Assign visit start and end datetime values to hosptial admission, discharge times
df_visit_detail['VISIT_DETAIL_START_DATETIME'] = base_date + pd.to_timedelta(df_visit_detail['adm'], unit='sec')
df_visit_detail['VISIT_DETAIL_START_DATE'] = df_visit_detail['VISIT_DETAIL_START_DATETIME'].dt.date
df_visit_detail['VISIT_DETAIL_END_DATETIME'] = base_date + pd.to_timedelta(df_visit_detail['dis'], unit='sec')
df_visit_detail['VISIT_DETAIL_END_DATE'] = df_visit_detail['VISIT_DETAIL_END_DATETIME'].dt.date

# Specify the concept ID for the visit detail type as sourced from EHR
df_visit_detail['VISIT_DETAIL_TYPE_CONCEPT_ID'] = 32817

# Merge with df_visit_occ to fetch 'VISIT_OCCURRENCE_ID' values
df_visit_detail['VISIT_OCCURRENCE_ID'] = df_visit_detail.merge(df_visit_occ[['hadm_id', 'VISIT_OCCURRENCE_ID']], on='hadm_id', how='left')['VISIT_OCCURRENCE_ID']

# Remove columns that aren't part of the final VISIT_DETAIL table
df_visit_detail.drop(columns=usecols[1:], inplace=True)


In [35]:
df_visit_detail

Unnamed: 0,VISIT_DETAIL_ID,PERSON_ID,hadm_id,VISIT_DETAIL_CONCEPT_ID,VISIT_DETAIL_START_DATETIME,VISIT_DETAIL_START_DATE,VISIT_DETAIL_END_DATETIME,VISIT_DETAIL_END_DATE,VISIT_DETAIL_TYPE_CONCEPT_ID,VISIT_OCCURRENCE_ID
0,1000001,1000001,1,32037,2010-12-29 06:23:00,2010-12-29,2011-01-08 06:23:00,2011-01-08,32817,1000001
1,1000002,1000002,2,32037,2010-12-29 11:16:00,2010-12-29,2011-01-01 11:16:00,2011-01-01,32817,1000002
2,1000003,1000004,4,32037,2010-12-29 10:58:00,2010-12-29,2011-01-03 10:58:00,2011-01-03,32817,1000004
3,1000004,1000005,5,32037,2010-12-30 10:15:00,2010-12-30,2011-01-01 10:15:00,2011-01-01,32817,1000005
4,1000005,1000007,7,32037,2010-12-30 06:14:00,2010-12-30,2011-01-01 06:14:00,2011-01-01,32817,1000007
...,...,...,...,...,...,...,...,...,...,...
2059,1002060,1002490,2490,32037,2010-12-27 15:57:00,2010-12-27,2011-01-13 15:57:00,2011-01-13,32817,1002490
2060,1002061,1002493,2493,32037,2010-12-30 16:25:00,2010-12-30,2011-01-01 16:25:00,2011-01-01,32817,1002493
2061,1002062,1002494,2494,32037,2010-12-27 09:04:00,2010-12-27,2011-01-07 09:04:00,2011-01-07,32817,1002494
2062,1002063,1002495,2495,32037,2010-12-29 16:19:00,2010-12-29,2011-01-10 16:19:00,2011-01-10,32817,1002495


In [304]:
len(vitaldb_info['hadm_id'].unique())

2496

# CONDITION_OCCURRENCE

In [12]:
### CONDITION_OCCURRENCE TABLE ###
print('CONDITION_OCCURRENCE TABLE...', end='')
# Create an empty DataFrame using the predefined column names
df_cond_occ = pd.DataFrame(columns=['CONDITION_OCCURRENCE_ID'])

# Map PERSON_ID values from the df_person DataFrame to the new CONDITION_OCCURRENCE DataFrame
df_cond_occ['PERSON_ID'] = df_person['PERSON_ID']

# Transfer subject_id values (stored as PERSON_SOURCE_VALUE) from df_person to df_cond_occ
df_cond_occ['subjectid'] = df_person['PERSON_SOURCE_VALUE']

# Merge the diagnosis information from information table
usecols = ['subjectid', 'hadm_id', 'dx', 'adm', 'dis', 'base_time']
df_cond_occ = df_cond_occ.merge(vitaldb_info[usecols], on='subjectid')

# Assign sequential IDs starting from 1 to CONDITION_OCCURRENCE_ID column
df_cond_occ['CONDITION_OCCURRENCE_ID'] = start_index['condition_occurrence'] + np.arange(len(df_cond_occ)) + 1

# Define the base date
base_date = datetime(2011, 1, 1) + pd.to_timedelta(df_cond_occ['base_time'], unit='sec')
# Set condition_start_date as hospital admission time since there is no condition start_time.
df_cond_occ['CONDITION_START_DATETIME'] = base_date + pd.to_timedelta(df_cond_occ['adm'], unit='sec')
df_cond_occ['CONDITION_START_DATE'] = pd.to_datetime(df_cond_occ['CONDITION_START_DATETIME'].dt.date)

# Set condition_end_date as hospital discharge time since there is no condition_end_time.
df_cond_occ['CONDITION_END_DATETIME'] = base_date + pd.to_timedelta(df_cond_occ['dis'], unit='sec')
df_cond_occ['CONDITION_END_DATE'] = pd.to_datetime(df_cond_occ['CONDITION_END_DATETIME'].dt.date)

# Assign the CONDITION_TYPE_CONCEPT_ID indicating the data source is an EHR
df_cond_occ['CONDITION_TYPE_CONCEPT_ID'] = 32817

## Match visit_occurrence_id, visit_detail based on hadm_id
# It is possible since time span of visit_occ, visit_detail id are same with hadm_id 
df_cond_occ = df_cond_occ.merge(df_visit_occ[['hadm_id', 'VISIT_OCCURRENCE_ID']], on='hadm_id', how='left')
df_cond_occ = df_cond_occ.merge(df_visit_detail[['hadm_id', 'VISIT_DETAIL_ID']], on='hadm_id', how='left')

# Remove columns that aren't part of the CONDITION_OCCURRENCE table format
df_cond_occ.drop(columns=usecols, inplace=True)
df_cond_occ = df_cond_occ.astype({'VISIT_OCCURRENCE_ID':'Int64', 'VISIT_DETAIL_ID':'Int64'})

# Save the final df_cond_occ DataFrame to a parqeut file
df_cond_occ.to_parquet(f'{save_path}/{dname}_CONDITION_OCCURRENCE.parquet')
if save_csv:
    df_cond_occ.to_csv(f'{save_path}/{dname}_CONDITION_OCCURRENCE.csv', index=False)
print('done')

CONDITION_OCCURRENCE TABLE...done


In [371]:
df_cond_occ

Unnamed: 0,CONDITION_OCCURRENCE_ID,PERSON_ID,CONDITION_START_DATETIME,CONDITION_START_DATE,CONDITION_END_DATETIME,CONDITION_END_DATE,CONDITION_TYPE_CONCEPT_ID,VISIT_OCCURRENCE_ID,VISIT_DETAIL_ID
0,1000001,1000001,2010-12-29 06:23:00,2010-12-29,2011-01-08 06:23:00,2011-01-08,32817,1000001,1000001
1,1000002,1000002,2010-12-29 11:16:00,2010-12-29,2011-01-01 11:16:00,2011-01-01,32817,1000002,1000002
2,1000003,1000003,2010-12-31 05:14:00,2010-12-31,2011-02-13 05:14:00,2011-02-13,32817,1000003,
3,1000004,1000004,2010-12-29 10:58:00,2010-12-29,2011-01-03 10:58:00,2011-01-03,32817,1000004,1000003
4,1000005,1000005,2010-12-30 10:15:00,2010-12-30,2011-01-01 10:15:00,2011-01-01,32817,1000005,1000004
...,...,...,...,...,...,...,...,...,...
2491,1002492,1002492,2010-12-30 12:58:00,2010-12-30,2011-01-05 12:58:00,2011-01-05,32817,1002492,
2492,1002493,1002493,2010-12-30 16:25:00,2010-12-30,2011-01-01 16:25:00,2011-01-01,32817,1002493,1002061
2493,1002494,1002494,2010-12-27 09:04:00,2010-12-27,2011-01-07 09:04:00,2011-01-07,32817,1002494,1002062
2494,1002495,1002495,2010-12-29 16:19:00,2010-12-29,2011-01-10 16:19:00,2011-01-10,32817,1002495,1002063


# DRUG_EXPOSURE

In [46]:
### DRUG_EXPOSURE TABLE ###
print('DRUG_EXPOSURE TABLE...', end='')
# Create an empty DataFrame using the predefined column names
df_drug = pd.DataFrame(columns = ['DRUG_EXPOSURE_ID'])

# Copy PERSON_ID values from the PERSON table to the DRUG_EXPOSURE table
df_drug['PERSON_ID'] = df_person['PERSON_ID']
# Copy PERSON_SOURCE_VALUE values as MRN from df_person to df_drug
df_drug['subjectid'] = df_person['PERSON_SOURCE_VALUE']

# Merge the drug information from source drugs table
usecols = ['subjectid', 'hadm_id', 'value', 'opstart', 'opend', 'Label', 'concept_id', 'Unit', 'unit_concept_id', 'base_time']
df_drug = df_drug.merge(vitaldb_drugs[usecols], on='subjectid', how='left')

# Assign unique sequential IDs to the 'DRUG_EXPOSURE_ID' column
df_drug['DRUG_EXPOSURE_ID'] = start_index['drug_exposure'] + np.arange(len(df_drug)) + 1
# Map 'concept_id' values to 'DRUG_CONCEPT_ID' column
df_drug['DRUG_CONCEPT_ID'] = df_drug['concept_id']

# Define the reference date for start and end times
base_date = datetime(2011, 1, 1) + pd.to_timedelta(df_drug['base_time'], unit='sec')
# Convert 'opstart' values (in seconds) to dates using the reference base_date
df_drug['DRUG_EXPOSURE_START_DATETIME'] = pd.to_datetime(base_date) + pd.to_timedelta(df_drug['opstart'], unit='sec')
# Extract the date part for DRUG_EXPOSURE_START_DATE
df_drug['DRUG_EXPOSURE_START_DATE'] = pd.to_datetime(df_drug['DRUG_EXPOSURE_START_DATETIME'].dt.date)
# Assign the start datetime to the end datetime column (assuming no gap)
df_drug['DRUG_EXPOSURE_END_DATETIME'] = pd.to_datetime(base_date) + pd.to_timedelta(df_drug['opend'], unit='sec')
# Assign the start date to the end date column
df_drug['DRUG_EXPOSURE_END_DATE'] = pd.to_datetime(df_drug['DRUG_EXPOSURE_END_DATETIME'].dt.date)

# Assign 32838 (EHR Episode REcord) for Intra-OP record
df_drug['DRUG_TYPE_CONCEPT_ID'] = 32838

# Map drug quantity values from 'value' column
df_drug['QUANTITY'] = df_drug['value']
# Map drug administration route to intravenous route
df_drug['ROUTE_CONCEPT_ID'] = 4171047

## Match visit_occurrence_id, visit_detail based on hadm_id
df_drug = df_drug.merge(df_visit_occ[['hadm_id', 'VISIT_OCCURRENCE_ID']], on='hadm_id', how='left')
df_drug = df_drug.merge(df_visit_detail[['hadm_id', 'VISIT_DETAIL_ID']], on='hadm_id', how='left')

# Map drug names to the 'DRUG_SOURCE_VALUE' column
df_drug['DRUG_SOURCE_VALUE'] = df_drug['Label']

# Map drug dose units
df_drug['DOSE_UNIT_CONCEPT_ID'] = df_drug['unit_concept_id'].astype('Int64')
df_drug['DOSE_UNIT_SOURCE_VALUE'] = df_drug['Unit']

# Remove columns that aren't part of the DRUG_EXPOSURE table format
df_drug.drop(columns=usecols, inplace=True)
df_drug = df_drug.astype({'DRUG_CONCEPT_ID': 'Int64', 'DRUG_TYPE_CONCEPT_ID': 'Int64', 'ROUTE_CONCEPT_ID': 'Int64', 'VISIT_OCCURRENCE_ID':'Int64', 'VISIT_DETAIL_ID':'Int64'})

# Save the final df_drug DataFrame to a parqeut file
df_drug.to_csv(f'{save_path}/{dname}_DRUG_EXPOSURE.csv', index=False)
df_drug.to_parquet(f'{save_path}/{dname}_DRUG_EXPOSURE.parquet')    

DRUG_EXPOSURE TABLE...

In [47]:
df_drug

Unnamed: 0,DRUG_EXPOSURE_ID,PERSON_ID,DRUG_CONCEPT_ID,DRUG_EXPOSURE_START_DATETIME,DRUG_EXPOSURE_START_DATE,DRUG_EXPOSURE_END_DATETIME,DRUG_EXPOSURE_END_DATE,DRUG_TYPE_CONCEPT_ID,QUANTITY,ROUTE_CONCEPT_ID,VISIT_OCCURRENCE_ID,VISIT_DETAIL_ID,DRUG_SOURCE_VALUE,DOSE_UNIT_CONCEPT_ID,DOSE_UNIT_SOURCE_VALUE
0,1000001,1000001,753626,2011-01-01 00:27:48,2011-01-01,2011-01-01 02:52:48,2011-01-01,32838,120.0,4171047,1000001,1000001,intraop_ppf,8576,mg
1,1000002,1000001,1154029,2011-01-01 00:27:48,2011-01-01,2011-01-01 02:52:48,2011-01-01,32838,100.0,4171047,1000001,1000001,intraop_ftn,9655,ug
2,1000003,1000001,19003953,2011-01-01 00:27:48,2011-01-01,2011-01-01 02:52:48,2011-01-01,32838,70.0,4171047,1000001,1000001,intraop_rocu,8576,mg
3,1000004,1000001,1143374,2011-01-01 00:27:48,2011-01-01,2011-01-01 02:52:48,2011-01-01,32838,10.0,4171047,1000001,1000001,intraop_eph,8576,mg
4,1000005,1000002,19003953,2011-01-01 00:18:10,2011-01-01,2011-01-01 00:50:10,2011-01-01,32838,50.0,4171047,1000002,1000002,intraop_rocu,8576,mg
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5841,1005842,1002495,19003953,2011-01-01 00:58:53,2011-01-01,2011-01-01 05:03:53,2011-01-01,32838,130.0,4171047,1002495,1002063,intraop_rocu,8576,mg
5842,1005843,1002495,1143374,2011-01-01 00:58:53,2011-01-01,2011-01-01 05:03:53,2011-01-01,32838,10.0,4171047,1002495,1002063,intraop_eph,8576,mg
5843,1005844,1002496,753626,2011-01-01 00:38:41,2011-01-01,2011-01-01 02:33:41,2011-01-01,32838,120.0,4171047,1002496,1002064,intraop_ppf,8576,mg
5844,1005845,1002496,19003953,2011-01-01 00:38:41,2011-01-01,2011-01-01 02:33:41,2011-01-01,32838,90.0,4171047,1002496,1002064,intraop_rocu,8576,mg


# PROCEDURE_OCCURRENCE

In [312]:
vitaldb_info[['optype', 'dx', 'opname', 'approach', 'position']]

Unnamed: 0,optype,dx,opname,approach,position
0,Colorectal,Rectal cancer,Low anterior resection,Open,Lithotomy
1,Biliary/Pancreas,Gallbladder stone,Cholecystectomy,Videoscopic,Reverse Trendelenburg
2,Vascular,Aortic aneurysm,Aneurysmal repair,Open,Prone
3,Breast,Malignant neoplasm of breast left,Breast-conserving surgery,Open,Supine
4,Biliary/Pancreas,Gallbladder stone,Cholecystectomy,Videoscopic,Supine
...,...,...,...,...,...
2491,Major resection,Lung cancer right,Lung lobectomy,Videoscopic,Left lateral decubitus
2492,Vascular,Venous insufficiency with swelling or pain,Ligation and stripping,Open,Supine
2493,Hepatic,Hepatocellular carcinoma,Hemihepatectomy,Open,Supine
2494,Stomach,Early gastric cancer,Distal gastrectomy,Videoscopic,Supine


## maps standard concept id (icd10pcs) to opname (in text)

In [44]:
vd_info = pd.read_csv('vitaldb/vitaldb_info_procedure(dx+opname)_mapped.csv')
#vd_info.drop(columns=['pred_p', 'pred_o', 'pred_a'], inplace=True)

import csv
import pandas as pd

df_vocab = pd.read_csv(f'vocab/VOCABULARY.csv', sep='\t', on_bad_lines='error')
df_concept_rel = pd.read_csv(f'vocab/CONCEPT_RELATIONSHIP.csv', sep='\t', on_bad_lines='error')
df_concept = pd.read_csv(f'vocab/CONCEPT.csv', sep='\t')

  df_concept = pd.read_csv(f'vocab/CONCEPT.csv', sep='\t')


In [45]:
# Filter the relationship data to only include 'Maps to' relationships
maps_to_df = df_concept_rel[df_concept_rel['relationship_id'] == 'Maps to']

# Merge the operation data with the concept data using the ICD10PCS vocabulary to obtain the source_concept_id
merged_data = vd_info.merge(df_concept[df_concept['vocabulary_id'] == 'ICD10PCS'], left_on='code_pred', right_on='concept_code', how = 'left')

# Merge the resulting data with the 'maps_to' relationship data to obtain the corresponding standard_concept_id
df_op_mapped = merged_data.merge(maps_to_df, left_on='concept_id', right_on='concept_id_1', how='left')

# Populate the 'source_value', 'source_concept_id', and 'standard_concept_id' columns in the df_op dataframe
#df_op['source_value'] = df_op_mapped['icd10_pcs']
#df_op['source_concept_id'] = df_op_mapped['concept_id']
vd_info['standard_concept_id'] = df_op_mapped['concept_id_2']
vd_info.rename(columns={'standard_concept_id': 'opname_concept_id'}, inplace=True)

In [348]:
vd_info.loc[vd_info['opname_concept_id'].isna(), ['caseid', 'subjectid', 'dx', 'opname', 'code_pred']]

Unnamed: 0,caseid,subjectid,dx,opname,code_pred
11,12,491,Hepatocellular carcinoma,Liver transplantation,02Y60
27,28,2782,Invasive thymoma,Thymectomy,00T00
28,29,3720,Living liver donor,Donor hepatectomy,02Y60
51,52,1724,End stage renal disease,Kidney transplantation,02Y60
53,54,1517,Kidney donor,Donor nephrectomy,02Y60
...,...,...,...,...,...
6345,6346,5525,End stage renal disease,Kidney transplantation,02Y60
6348,6349,4805,Epidermal cyst,Wide excision,00T00
6362,6363,5396,Liver cirrhosis,Liver transplantation,02Y60
6366,6367,2349,Low back pain site unspecified,Excision,00T00


In [50]:
vd_info.loc[vd_info['opname_concept_id'].isna(), ['caseid', 'subjectid', 'dx', 'opname', 'code_pred']]

Unnamed: 0,caseid,subjectid,dx,opname,code_pred
10,11,3963,Gross hematuria,Transurethral resection of bladder tumor,0HB00
11,12,491,Hepatocellular carcinoma,Liver transplantation,0TYB0
12,13,531,"Abnormal chest CT, lung nodule",Metastasectomy,0HB00
28,29,3720,Living liver donor,Donor hepatectomy,0TYB0
30,31,4194,"Abnormal chest CT, lung",Lung wedge resection,0HB00
...,...,...,...,...,...
6362,6363,5396,Liver cirrhosis,Liver transplantation,0TYB0
6369,6370,1271,Solitary pulmonary nodule,Lung wedge resection,0HB00
6372,6373,3314,Pulmonary sclerosing hemangioma,Lung wedge resection,0HB00
6375,6376,1556,"Abnormal chest CT, lung nodule",Lung wedge resection,0HB00


In [None]:
vd_info.loc[vd_info['opname_concept_id'].isna(), ['caseid', 'subjectid', 'dx', 'opname', 'code_pred']]

In [343]:
vd_info['opname_concept_id'].isna().sum()

1202

# MEASUREMENT

In [15]:
### MEASUREMENT TABLE ###
print('MEASUREMENT TABLE...', end='') 
# Create an empty DataFrame using the predefined column names
df_measure = pd.DataFrame(columns = ['MEASUREMENT_ID'])

# Copy PERSON_ID values from the PERSON table to the MEASUREMENT table
df_measure['PERSON_ID'] = df_person['PERSON_ID']
# Copy PERSON_SOURCE_VALUE values as MRN from df_person to df_measure
df_measure['subjectid'] = df_person['PERSON_SOURCE_VALUE']

# Merge caseid from information table
usecols = ['caseid', 'subjectid', 'hadm_id', 'base_time']
df_measure = df_measure.merge(vitaldb_info[usecols], on='subjectid', how='left')

## Get measurement data from vital files
#vitaldb_meas['record_type'] = 'INTRA-OP'
vf_measure = df_measure.merge(vitaldb_meas, on='caseid', how='left')

## Get measurement data from labs table
#vitaldb_labs['record_type'] = 'PERI-OP'
labs_measure = df_measure.merge(vitaldb_labs, on='caseid', how='left')

# Concat measurement data from vital files and labs table
df_measure = pd.concat([vf_measure, labs_measure], axis=0)
df_measure.reset_index(drop=True, inplace=True)

# Extract and set the relevant concept and datetime details for each measurement
df_measure['MEASUREMENT_CONCEPT_ID'] = df_measure['concept_id']

base_date = datetime(2011, 1, 1) + pd.to_timedelta(df_measure['base_time'])
df_measure['MEASUREMENT_DATETIME'] = base_date + pd.to_timedelta(df_measure['dt'], unit='sec')
df_measure['MEASUREMENT_DATE'] = df_measure['MEASUREMENT_DATETIME'].dt.date
# Assign 32838 (EHR Episode Record) for Intra-Op record, and 32817 (EHR) for others (Post-Op, Pre-Op).
df_measure['MEASUREMENT_TYPE_CONCEPT_ID'] = 32817 # df_measure['record_type'].map({'INTRA-OP': 32838 , 'PERI-OP': 32817})
df_measure['OPERATOR_CONCEPT_ID'] = 4172703  # '=' operation

# Drop rows that have non-valid meas_value (not float)
print('Removing invalid rows...', end='')
tlen = len(df_measure)
df_measure['result'] = pd.to_numeric(df_measure['result'], errors='coerce')
df_measure.dropna(subset='value', inplace=True)
print(f'removed {tlen-len(df_measure)} rows out of {tlen} rows ...', end='')

# Handle special cases for 'VALUE_AS_NUMBER' based on specific concept IDs
# In ETL conventions, it is recommended to set the VALUE_AS_NUMBER to NULL when the value from source data is negative with the exceptions below 
#exceptions = [3003396, 3002032, 3006277, 3012501, 3003129, 3004959, 3007435]
#valid_mask = (df_measure['result'] >= 0) | df_measure['concept_id'].isin(exceptions)
#df_measure.loc[valid_mask, 'VALUE_AS_NUMBER'] = df_measure['result']
#df_measure.loc[~valid_mask, 'VALUE_AS_NUMBER'] = None
df_measure['VALUE_AS_NUMBER'] = df_measure['result']


# Assign unique MEASUREMENT_IDs to each row
df_measure['MEASUREMENT_ID'] = start_index['measurement'] + np.arange(1, len(df_measure) + 1)

# Set the 'UNIT_CONCEPT_ID' values
df_measure['UNIT_CONCEPT_ID'] = df_measure['unit_concept_id']

## Match visit_occurrence_id, visit_detail based on hadm_id
df_measure = df_measure.merge(df_visit_occ[['hadm_id', 'VISIT_OCCURRENCE_ID']], on='hadm_id', how='left')
df_measure = df_measure.merge(df_visit_detail[['hadm_id', 'VISIT_DETAIL_ID']], on='hadm_id', how='left')

# Retain only the relevant columns in the final MEASUREMENT table
df_measure.drop(columns=usecols + list(vitaldb_labs.columns), inplace=True)
df_measure = df_measure.astype({'MEASUREMENT_CONCEPT_ID': 'Int64', 'UNIT_CONCEPT_ID':'Int64', 'VISIT_OCCURRENCE_ID':'Int64', 'VISIT_DETAIL_ID':'Int64'})

# Export the final MEASUREMENT table to CSV
df_measure.to_parquet(f'{save_path}/{dname}_MEASUREMENT.parquet')
if save_csv:
    df_measure.to_csv(f'{save_path}/{dname}_MEASUREMENT.csv', index=False)
df_measure[:1000].to_csv(f'{save_path}/sample/{dname}_MEASUREMENT.csv', index=False)

Removing invalid rows...removed 272 rows out of 316077861 rows ...

ValueError: operands could not be broadcast together with shapes (4,) (7,) 

In [19]:
df_measure.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 316077589 entries, 0 to 316077588
Data columns (total 11 columns):
 #   Column                       Dtype         
---  ------                       -----         
 0   MEASUREMENT_ID               int64         
 1   PERSON_ID                    int64         
 2   MEASUREMENT_CONCEPT_ID       Int64         
 3   MEASUREMENT_DATETIME         datetime64[ns]
 4   MEASUREMENT_DATE             object        
 5   MEASUREMENT_TYPE_CONCEPT_ID  int64         
 6   OPERATOR_CONCEPT_ID          int64         
 7   VALUE_AS_NUMBER              float64       
 8   UNIT_CONCEPT_ID              Int64         
 9   VISIT_OCCURRENCE_ID          Int64         
 10  VISIT_DETAIL_ID              Int64         
dtypes: Int64(4), datetime64[ns](1), float64(1), int64(4), object(1)
memory usage: 27.1+ GB


In [24]:
df_measure[:1000]

Unnamed: 0,MEASUREMENT_ID,PERSON_ID,MEASUREMENT_CONCEPT_ID,MEASUREMENT_DATETIME,MEASUREMENT_DATE,MEASUREMENT_TYPE_CONCEPT_ID,OPERATOR_CONCEPT_ID,VALUE_AS_NUMBER,UNIT_CONCEPT_ID,VISIT_OCCURRENCE_ID,VISIT_DETAIL_ID
0,1000001,1000001,21490711,2011-01-01 00:00:00.156,2011-01-01,32817,4172703,0.000000,0,1000001,1000001
1,1000002,1000001,21490711,2011-01-01 00:00:01.156,2011-01-01,32817,4172703,0.000000,0,1000001,1000001
2,1000003,1000001,21490711,2011-01-01 00:00:02.156,2011-01-01,32817,4172703,0.000000,0,1000001,1000001
3,1000004,1000001,21490711,2011-01-01 00:00:03.156,2011-01-01,32817,4172703,0.000000,0,1000001,1000001
4,1000005,1000001,21490711,2011-01-01 00:00:04.156,2011-01-01,32817,4172703,0.000000,0,1000001,1000001
...,...,...,...,...,...,...,...,...,...,...,...
995,1000996,1000001,21490711,2011-01-01 00:16:35.156,2011-01-01,32817,4172703,32.500000,0,1000001,1000001
996,1000997,1000001,21490711,2011-01-01 00:16:36.156,2011-01-01,32817,4172703,32.099998,0,1000001,1000001
997,1000998,1000001,21490711,2011-01-01 00:16:37.156,2011-01-01,32817,4172703,33.799999,0,1000001,1000001
998,1000999,1000001,21490711,2011-01-01 00:16:38.156,2011-01-01,32817,4172703,33.599998,0,1000001,1000001


In [56]:
vitaldb_meas.info()

<class 'pandas.core.frame.DataFrame'>
Index: 315761528 entries, 0 to 118738
Data columns (total 7 columns):
 #   Column           Dtype  
---  ------           -----  
 0   caseid           int64  
 1   name             object 
 2   dt               float64
 3   result           float32
 4   concept_id       Int64  
 5   Unit             object 
 6   unit_concept_id  Int64  
dtypes: Int64(2), float32(1), float64(1), int64(1), object(2)
memory usage: 18.2+ GB


# DEATH

In [30]:
### DEATH ###
print('DEATH TABLE...', end='')
# Create an empty dataframe for DEATH table
df_death = pd.DataFrame()

# Populate 'PERSON_ID' and 'subjectid' columns in the DEATH table from the PERSON table
df_death['PERSON_ID'] = df_person['PERSON_ID']
df_death['subjectid'] = df_person['PERSON_SOURCE_VALUE']

# Merge 'inhosp_death_time' from the operations (df_op) table into the DEATH table using 'subject_id'
usecols = ['subjectid', 'death_inhosp', 'dis', 'base_time']
df_death = df_death.merge(vitaldb_info[usecols], on='subjectid', how='left')
df_death = df_death[df_death['death_inhosp']==1]

# Define the reference date for datetime calculations
base_date = datetime(2011, 1, 1) + pd.to_timedelta(df_death['base_time'], unit='sec')

# Convert in-hospital death times to actual datetime objects using the base_date as the reference point
df_death['DEATH_DATETIME'] = pd.to_datetime(base_date) + pd.to_timedelta(df_death['dis'], unit='sec')
df_death['DEATH_DATE'] = pd.to_datetime(df_death['DEATH_DATETIME'].dt.date)

# Set the DEATH_TYPE_CONCEPT_ID to represent data sourced from an Electronic Health Record (EHR)
df_death['DEATH_TYPE_CONCEPT_ID'] = 32817

# Retain only the relevant columns in the final DEATH table
df_death.drop(columns = usecols, inplace=True)
df_death.reset_index(drop=True, inplace=True)

# Save the final df_drug DataFrame to a CSV file
df_death.to_parquet(f'{save_path}/{dname}_DEATH.parquet')
if save_csv:
    df_death.to_csv(f'{save_path}/{dname}_DEATH.csv', index=False)
df_death[:1000].to_csv(f'{save_path}/sample/{dname}_DEATH.csv', index=False)

DEATH TABLE...

In [31]:
df_death

Unnamed: 0,PERSON_ID,DEATH_DATETIME,DEATH_DATE,DEATH_TYPE_CONCEPT_ID
0,1000168,2011-01-10 10:27:00,2011-01-10,32817
1,1000199,2011-01-03 11:25:00,2011-01-03,32817
2,1000337,2011-01-07 16:02:00,2011-01-07,32817
3,1000723,2011-01-01 13:41:00,2011-01-01,32817
4,1000781,2011-01-03 16:22:00,2011-01-03,32817
5,1000820,2011-04-06 07:46:00,2011-04-06,32817
6,1000859,2011-01-07 12:20:00,2011-01-07,32817
7,1000982,2011-01-07 16:09:00,2011-01-07,32817
8,1001049,2011-01-11 15:54:00,2011-01-11,32817
9,1001099,2011-01-01 14:15:00,2011-01-01,32817


# NOTE

In [37]:
### NOTE TABLE ###
print('NOTE TABLE...', end='')
# Create an empty dataframe for NOTE table
df_note = pd.DataFrame(columns=['NOTE_ID'])

# Populate 'PERSON_ID' and 'subject_id' columns in the NOTE table from the PERSON table
df_note['PERSON_ID'] = df_person['PERSON_ID']
df_note['subjectid'] = df_person['PERSON_SOURCE_VALUE']


# residual fields that are not mapped in operation table
res_fields = ['height', 'weight', 'asa', 'emop', 'department', 'ane_type', 'anestart', 'aneend']
res_op = pd.melt(vitaldb_info, id_vars=['subjectid', 'hadm_id', 'opstart', 'base_time'], value_vars=res_fields)
df_note = df_note.merge(res_op, on='subjectid', how='left')
df_note.dropna(subset='value', inplace=True, ignore_index=True)

# Assign unique sequential IDs to the 'NOTE_ID' column
df_note['NOTE_ID'] = start_index['note'] + np.arange(len(df_note)) + 1

base_date = datetime(2011, 1, 1) + pd.to_timedelta(df_note['base_time'], unit='sec')
df_note['NOTE_DATETIME'] = base_date + pd.to_timedelta(df_note['opstart'], unit='sec')
df_note['NOTE_DATE'] = df_note['NOTE_DATETIME'].dt.date

# Set the NOTE_TYPE_CONCEPT_ID to represent data sourced from an Electronic Health Record (EHR)
df_note['NOTE_TYPE_CONCEPT_ID'] = 32817

# Use the concept id  706617(Anesthesiology) or 706502(Surgical operation).
res_ane = ['asa', 'antype','anstart_time', 'anend_time']
df_note.loc[df_note['variable'].isin(res_ane), 'NOTE_CLASS_CONCEPT_ID'] = 706617
df_note.loc[~df_note['variable'].isin(res_ane), 'NOTE_CLASS_CONCEPT_ID'] = 706502
df_note['NOTE_CLASS_CONCEPT_ID'] = df_note['NOTE_CLASS_CONCEPT_ID'].astype('Int32')

df_note['NOTE_TITLE'] = df_note['variable']
df_note['NOTE_TEXT'] = df_note['value'].astype('str')

# Use the concept_id 32678(UTF-8)
df_note['ENCODING_CONCEPT_ID'] = 32678

# Use the concept_id 4180186(English language)
df_note['LANGUAGE_CONCEPT_ID'] = 4180186 

# Matches visit_occurrence_id and visit_detail_id based on chart_time
df_note = df_note.merge(df_visit_occ[['hadm_id', 'VISIT_OCCURRENCE_ID']], on='hadm_id', how='left')
df_note = df_note.merge(df_visit_detail[['hadm_id', 'VISIT_DETAIL_ID']], on='hadm_id', how='left')

# Retain only the relevant columns in the final NOTE table
df_note = df_note.drop(columns=['subjectid', 'hadm_id', 'opstart', 'variable', 'value', 'base_time'])

# Export the final NOTE table to CSV
df_note.to_parquet(f'{save_path}/{dname}_NOTE.parquet')
if save_csv:
    df_note.to_csv(f'{save_path}/{dname}_NOTE.csv', index=False)
df_note[:1000].to_csv(f'{save_path}/sample/{dname}_NOTE.csv', index=False)
print('done')

NOTE TABLE...done


In [38]:
df_note

Unnamed: 0,NOTE_ID,PERSON_ID,NOTE_DATETIME,NOTE_DATE,NOTE_TYPE_CONCEPT_ID,NOTE_CLASS_CONCEPT_ID,NOTE_TITLE,NOTE_TEXT,ENCODING_CONCEPT_ID,LANGUAGE_CONCEPT_ID,VISIT_OCCURRENCE_ID,VISIT_DETAIL_ID
0,1000001,1000001,2011-01-01 00:27:48,2011-01-01,32817,706502,height,160.2,32678,4180186,1000001,1000001.0
1,1000002,1000001,2011-01-01 00:27:48,2011-01-01,32817,706502,weight,67.5,32678,4180186,1000001,1000001.0
2,1000003,1000001,2011-01-01 00:27:48,2011-01-01,32817,706617,asa,2.0,32678,4180186,1000001,1000001.0
3,1000004,1000001,2011-01-01 00:27:48,2011-01-01,32817,706502,emop,0,32678,4180186,1000001,1000001.0
4,1000005,1000001,2011-01-01 00:27:48,2011-01-01,32817,706502,department,General surgery,32678,4180186,1000001,1000001.0
...,...,...,...,...,...,...,...,...,...,...,...,...
19910,1019911,1002496,2011-01-01 00:38:41,2011-01-01,32817,706502,emop,0,32678,4180186,1002496,1002064.0
19911,1019912,1002496,2011-01-01 00:38:41,2011-01-01,32817,706502,department,Gynecology,32678,4180186,1002496,1002064.0
19912,1019913,1002496,2011-01-01 00:38:41,2011-01-01,32817,706502,ane_type,General,32678,4180186,1002496,1002064.0
19913,1019914,1002496,2011-01-01 00:38:41,2011-01-01,32817,706502,anestart,-79,32678,4180186,1002496,1002064.0


In [39]:
# Drop hadm_id column from VISIT_OCCURRENCE and VISIT_DETAIL tables
df_visit_occ.drop(columns='hadm_id', inplace=True)
df_visit_detail.drop(columns='hadm_id', inplace=True)

print('saving visit_occurrence and visit_detail tables...', end='')
# Save the processed data to a parquet file
df_visit_occ.to_csv(f'{save_path}/{dname}_VISIT_OCCURRENCE.parquet')
if save_csv:
    df_visit_occ.to_csv(f'{save_path}/{dname}_VISIT_OCCURRENCE.csv', index=False)
df_visit_occ[:1000].to_csv(f'{save_path}/sample/{dname}_VISIT_OCCURRENCE.csv', index=False)

# Save the processed VISIT_DETAIL table to a parquet file
df_visit_detail.to_csv(f'{save_path}/{dname}_VISIT_DETAIL.parquet')
if save_csv:
    df_visit_detail.to_csv(f'{save_path}/{dname}_VISIT_DETAIL.csv', index=False)
df_visit_detail[:1000].to_csv(f'{save_path}/sample/{dname}_VISIT_DETAIL.csv', index=False)    
print('done')

saving visit_occurrence and visit_detail tables...done


In [40]:
df_visit_occ

Unnamed: 0,VISIT_OCCURRENCE_ID,PERSON_ID,VISIT_CONCEPT_ID,VISIT_START_DATETIME,VISIT_START_DATE,VISIT_END_DATETIME,VISIT_END_DATE,VISIT_TYPE_CONCEPT_ID,PRECEDING_VISIT_OCCURRENCE_ID
0,1000001,1000001,9201,2010-12-29 06:23:00,2010-12-29,2011-01-08 06:23:00,2011-01-08,32817,
1,1000002,1000002,9201,2010-12-29 11:16:00,2010-12-29,2011-01-01 11:16:00,2011-01-01,32817,
2,1000003,1000003,9201,2010-12-31 05:14:00,2010-12-31,2011-02-13 05:14:00,2011-02-13,32817,
3,1000004,1000004,9201,2010-12-29 10:58:00,2010-12-29,2011-01-03 10:58:00,2011-01-03,32817,
4,1000005,1000005,9201,2010-12-30 10:15:00,2010-12-30,2011-01-01 10:15:00,2011-01-01,32817,
...,...,...,...,...,...,...,...,...,...
2491,1002492,1002492,9201,2010-12-30 12:58:00,2010-12-30,2011-01-05 12:58:00,2011-01-05,32817,
2492,1002493,1002493,9201,2010-12-30 16:25:00,2010-12-30,2011-01-01 16:25:00,2011-01-01,32817,
2493,1002494,1002494,9201,2010-12-27 09:04:00,2010-12-27,2011-01-07 09:04:00,2011-01-07,32817,
2494,1002495,1002495,9201,2010-12-29 16:19:00,2010-12-29,2011-01-10 16:19:00,2011-01-10,32817,
