# Loading INSPIRE dataset

In [1]:
# INSPIRE v0.2 (about 65,000 cases, 50% of the surgical cases)
import pandas as pd
import numpy as np

# loading INSPIRE dataset (6 source tables)
input_path = 'inspire_v2'
df_diag = pd.read_csv(f'{input_path}/diagnosis.csv')
df_labs = pd.read_csv(f'{input_path}/labs.csv')
df_medi = pd.read_csv(f'{input_path}/medications.csv')
df_op = pd.read_csv(f'{input_path}/operations.csv')
df_vitals = pd.read_csv(f'{input_path}/vitals.csv')
df_ward = pd.read_csv(f'{input_path}/ward_vitals.csv')


# length of the dataset
print(f'length of the dataset: operations {len(df_op)}, diagnosis {len(df_diag)}, labs {len(df_labs)}, medications {len(df_medi)}, vitals {len(df_vitals)}, ward_vitals {len(df_ward)}')

# total subject_id in the dataset
subject_ids = df_diag['subject_id'].tolist() + df_labs['subject_id'].tolist() + df_medi['subject_id'].tolist() + df_op['subject_id'].tolist() + df_vitals['subject_id'].tolist() + df_ward['subject_id'].tolist()
print(f'total subjects in INSPIRE dataset: {len(np.unique(subject_ids))}')

length of the dataset: operations 131109, diagnosis 4733046, labs 21367131, medications 9926794, vitals 66127940, ward_vitals 42679760
total subjects in INSPIRE dataset: 101469


In [4]:
# INSPIRE v0.1 (only about 52,000 cases, 20% of the surgical cases)
import pandas as pd
import numpy as np

# loading INSPIRE dataset (6 source tables)
input_path = 'inspire'
df_diag = pd.read_csv(f'{input_path}/diagnosis.csv')
df_labs = pd.read_csv(f'{input_path}/labs.csv')
df_medi = pd.read_csv(f'{input_path}/medications.csv')
df_op = pd.read_csv(f'{input_path}/operations.csv')
df_vitals = pd.read_csv(f'{input_path}/vitals.csv')
df_ward = pd.read_csv(f'{input_path}/ward_vitals.csv')

# length of the dataset
print(f'length of the dataset: operations {len(df_op)}, diagnosis {len(df_diag)}, labs {len(df_labs)}, medications {len(df_medi)}, vitals {len(df_vitals)}, ward_vitals {len(df_ward)}')

# total subject_id in the dataset
subject_ids = df_diag['subject_id'].tolist() + df_labs['subject_id'].tolist() + df_medi['subject_id'].tolist() + df_op['subject_id'].tolist() + df_vitals['subject_id'].tolist() + df_ward['subject_id'].tolist()
print(f'total subjects in INSPIRE dataset: {len(np.unique(subject_ids))}')

length of the dataset: operations 52136, diagnosis 1888376, labs 8490061, medications 3887696, vitals 26305136, ward_vitals 16869864
total subjects in INSPIRE dataset: 40459


# PERSON

In [2]:
from tqdm import tqdm
import random
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta

random.seed(98)

# columns for PERSON
columns = ['PERSON_ID',
 'GENDER_CONCEPT_ID',
 'YEAR_OF_BIRTH',
 'MONTH_OF_BIRTH',
 'DAY_OF_BIRTH',
 'BIRTH_DATETIME',
 'RACE_CONCEPT_ID',
 'ETHNICITY_CONCEPT_ID',
 'LOCATION_ID',
 'PROVIDER_ID',
 'CARE_SITE_ID',
 'PERSON_SOURCE_VALUE',
 'GENDER_SOURCE_VALUE',
 'GENDER_SOURCE_CONCEPT_ID',
 'RACE_SOURCE_VALUE',
 'RACE_SOURCE_CONCEPT_ID',
 'ETHNICITY_SOURCE_VALUE',
 'ETHNICITY_SOURCE_CONCEPT_ID', 'base_time']

# create a new PERSON table (clinical data table)
df_person = pd.DataFrame(columns=columns)

# Define the start and end dates
start_date = datetime(2010, 1, 1)
end_date = datetime(2020, 12, 31)

idx = 0
# first match: operations table (source) -> PERSON table (CDM V5.4)
for _, row in tqdm(df_op.iterrows(), total = df_op.shape[0]):
    # Add only new subjects in operations.csv
    if row['subject_id'] in df_person['PERSON_ID'].tolist():
        continue

        
    ### PERSON_ID ###
    # Identical with the source data.
    df_person.loc[idx, 'PERSON_ID'] = row['subject_id']
    
    
    ### GENDER_CONCEPT_ID ###
    # When gender = ‘M’ then set GENDER_CONCEPT_ID to 8507, when gender = ‘F’ then set to 8532. 
    # Drop any rows with missing/unknown gender. 
    # These two concepts were chosen as they are the only two standard concepts in the gender domain.
    df_person.loc[idx, 'GENDER_CONCEPT_ID'] = 8507 if row['sex'] == 'M' else (8532 if row['sex'] == 'F' else np.nan)
    
    
    ### YEAR_OF_BIRTH, BIRTH_DATETIME
    # Generate random operation date for each subject_id
    row_subject = df_op[df_op['subject_id'] == row['subject_id']]
    final_opdate = row_subject['opdate'].iloc[-1] / 60 / 24
    
    # Calculate the range in days
    days_range = (end_date - start_date).days - final_opdate
    # Generate a random number of days within the range
    random_days = random.randint(0, days_range)
    # Calculate the random date of first op_date
    random_date = start_date + timedelta(days = random_days)
    # Base time of the time zero (first admission date) for each subject_id
    df_person.loc[idx, 'base_time'] = random_date
    # Calculate the birth year
    df_person.loc[idx, 'YEAR_OF_BIRTH'] = random_date.year - row['age']
    df_person.loc[idx, 'BIRTH_DATETIME'] = random_date - relativedelta(years=row['age'])
            
        
    ### RACE_CONCEPT_ID : all ASIAN data
    df_person.loc[idx, 'RACE_CONCEPT_ID'] = 8515
    
    
    ### LOCATION_ID, PROVIDER_ID
    df_person.loc[idx, 'LOCATION_ID'] = 0
    df_person.loc[idx, 'PROVIDER_ID'] = 0

    
    ### PERSON_SOURCE_VALUE, GENDER_SOURCE_VALUE, RACE_SOURCE_VALUE
    df_person.loc[idx, 'PERSON_SOURCE_VALUE'] = row['subject_id']
    df_person.loc[idx, 'GENDER_SOURCE_VALUE'] = row['sex']
    df_person.loc[idx, 'RACE_SOURCE_VALUE'] = row['race']
    df_person.loc[idx, 'RACE_SOURCE_CONCEPT_ID'] = 8515
                  
    idx = idx + 1
    
# Save the results
df_person.to_csv('INSPIRE_ETL/INSPIRE_PERSON.csv')

100%|██████████| 131109/131109 [17:19<00:00, 126.18it/s]


In [8]:
len(unique_ids)

99900

In [7]:
# Find unique subject IDs in df_op that are not already in df_person
unique_ids = df_op['subject_id'].unique()
df_person = pd.DataFrame({'PERSON_ID':np.arange(1, len(unique_ids) + 1), 'subject_id': unique_ids})



df_person = df_person.merge(df_op[['subject_id', 'sex']], on='subject_id')
df_person

Unnamed: 0,PERSON_ID,subject_id,sex
0,1,178742874,F
1,2,158995752,M
2,2,158995752,M
3,2,158995752,M
4,3,108553242,F
...,...,...,...
131104,99896,163862002,M
131105,99897,126772283,F
131106,99898,144363433,F
131107,99899,195835964,M


In [6]:
df_person

Unnamed: 0,PERSON_ID,subject_id
0,1,178742874
1,2,158995752
2,3,108553242
3,4,133278262
4,5,116924034
...,...,...
99895,99896,163862002
99896,99897,126772283
99897,99898,144363433
99898,99899,195835964


In [4]:
np.arange(1,10)

array([1, 2, 3, 4, 5, 6, 7, 8, 9])

In [53]:
df_person

Unnamed: 0,PERSON_ID,GENDER_CONCEPT_ID,YEAR_OF_BIRTH,MONTH_OF_BIRTH,DAY_OF_BIRTH,BIRTH_DATETIME,RACE_CONCEPT_ID,ETHNICITY_CONCEPT_ID,LOCATION_ID,PROVIDER_ID,CARE_SITE_ID,PERSON_SOURCE_VALUE,GENDER_SOURCE_VALUE,GENDER_SOURCE_CONCEPT_ID,RACE_SOURCE_VALUE,RACE_SOURCE_CONCEPT_ID,ETHNICITY_SOURCE_VALUE,ETHNICITY_SOURCE_CONCEPT_ID,base_time
0,153073110,8532,1963,,,1963-12-30 00:00:00,8515,,,,,153073110,F,,Asian,,,,2013-12-30 00:00:00
1,134213281,8532,1956,,,1956-06-13 00:00:00,8515,,,,,134213281,F,,Asian,,,,2016-06-13 00:00:00
2,134195201,8532,1984,,,1984-04-28 00:00:00,8515,,,,,134195201,F,,Asian,,,,2019-04-28 00:00:00
3,163619571,8532,1935,,,1935-06-26 00:00:00,8515,,,,,163619571,F,,Asian,,,,2010-06-26 00:00:00
4,132888590,8532,1953,,,1953-12-14 00:00:00,8515,,,,,132888590,F,,Asian,,,,2013-12-14 00:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39792,124555170,8532,1968,,,1968-01-31 00:00:00,8515,,,,,124555170,F,,Asian,,,,2013-01-31 00:00:00
39793,145733500,8532,1972,,,1972-04-05 00:00:00,8515,,,,,145733500,F,,Asian,,,,2012-04-05 00:00:00
39794,179496761,8532,1955,,,1955-09-30 00:00:00,8515,,,,,179496761,F,,Asian,,,,2015-09-30 00:00:00
39795,118074180,8507,1951,,,1951-05-30 00:00:00,8515,,,,,118074180,M,,Asian,,,,2016-05-30 00:00:00


# OBSERVATION_PERIOD

* Definition of OBSERVATION_PERIOD record : 각 subject_id 마다 첫 입원 시점부터 operations.csv, vitals.csv, ward_vitals.csv, labs.csv, medications.csv 에 기록된 가장 마지막 시점까지를 observation-period record로 정의 (즉, 한 사람 당 하나의 record)

    * 다른 가능한 정의) 하나의 subject_id에 대해 각 hadm_id를 하나의 Clinical Event로 정의함. 이 Clinical Event의 시작 시점과 끝 시점은 operations.csv

In [67]:
from tqdm import tqdm
import random
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta


# load PERSON table
df_person = pd.read_csv('INSPIRE_ETL/INSPIRE_PERSON.csv')

# columns for OBSERVATION_PERIOD
columns = ['OBSERVATION_PERIOD_ID',
 'PERSON_ID',
 'OBSERVATION_PERIOD_START_DATE',
 'OBSERVATION_PERIOD_END_DATE',
 'PERIOD_TYPE_CONCEPT_ID']

# create a new OBSERVATION_PERIOD table (clinical data table)
df_obs_period = pd.DataFrame(columns=columns)
obs_id = 1

# second match: operations table (source) -> OBSERVATION_PERIOD table (CDM V5.4)
for idx, row in tqdm(df_person.iterrows(), total = df_person.shape[0]):
    ### OBSERVATION_PERIOD_ID
    df_obs_period.loc[idx, 'OBSERVATION_PERIOD_ID'] = obs_id
    obs_id = obs_id + 1
   
    
    ### PERSON_ID ###
    # Identical with the source data.
    subject_id = row['PERSON_ID']
    df_obs_period.loc[idx, 'PERSON_ID'] = subject_id
    
    
    ### OBSERVATION_PERIOD_START_DATE, OBSERVATION_PERIOD_END_DATE
    subject_op = df_op[df_op['subject_id'] == subject_id]
    # base_time is time_zero for each subject
    time_zero = datetime.strptime(row['base_time'], '%Y-%m-%d %H:%M:%S')
    
    # period_start : time_zero
    df_obs_period.loc[idx, 'OBSERVATION_PERIOD_START_DATE'] = time_zero
    
    # period_end : the latest discharge_time 
    # since discharge_time is recorded in days, input as the time of 23:56 of the respective day
    df_obs_period.loc[idx, 'OBSERVATION_PERIOD_END_DATE'] = time_zero + timedelta(minutes = int(subject_op['discharge_time'].max()))
    
    
    ### PERIOD_TYPE_CONCEPT_ID : EHR admission note (https://github.com/OHDSI/Vocabulary-v5.0/wiki/Vocab.-TYPE_CONCEPT)
    df_obs_period.loc[idx, 'PERIOD_TYPE_CONCEPT_ID'] = 32819

# Save the results
df_obs_period.to_csv('INSPIRE_ETL/INSPIRE_OBSERVATION_PERIOD.csv')

100%|██████████| 39797/39797 [00:55<00:00, 722.72it/s] 


### multi-processing

In [136]:
import pandas as pd
import numpy as np
from datetime import timedelta
from multiprocessing import Pool
from tqdm import tqdm
import threading

# Define your dataframe 'columns' and other dataframes here
# columns for PERSON
columns = ['OBSERVATION_PERIOD_ID',
 'PERSON_ID',
 'OBSERVATION_PERIOD_START_DATE',
 'OBSERVATION_PERIOD_END_DATE',
 'PERIOD_TYPE_CONCEPT_ID']

# create a new PERSON table (clinical data table)
df_obs_period = pd.DataFrame(columns=columns)
obs_id = 1



def process_row(args):
    idx, row = args
    global obs_id
    
    df_obs_period = pd.DataFrame(columns=columns)
    ### OBSERVATION_PERIOD_ID
    with obs_id_lock:
        df_obs_period.loc[idx, 'OBSERVATION_PERIOD_ID'] = obs_id
        obs_id = obs_id + 1
        
    ### PERSON_ID ###
    # Identical with the source data.
    subject_id = row['PERSON_ID']
    df_obs_period.loc[idx, 'PERSON_ID'] = subject_id

    ### OBSERVATION_PERIOD_START_DATE, OBSERVATION_PERIOD_END_DATE
    rows_subject = df_op[df_op['subject_id'] == subject_id]
    
    # admission_date (period_start) = base_date - op_start
    op_start = rows_subject.iloc[0, 15]
    
    # period_end : 있는 기록 중에 가장 오래된 시점 (diagnosis.csv 제외) 
    period_end = np.nanmax(row_subject.iloc[:, 15:27].values.flatten())
    time_max = pd.concat([df[df['subject_id'] == subject_id]['chart_time'] for df in list_df]).max()
    period_end = time_max if time_max > period_end else period_end
    
    df_obs_period.loc[idx, 'OBSERVATION_PERIOD_START_DATE'] = row['base_time'] - timedelta(minutes=int(op_start))
    df_obs_period.loc[idx, 'OBSERVATION_PERIOD_END_DATE'] = row['base_time'] + timedelta(minutes=int(period_end))

    ### PERIOD_TYPE_CONCEPT_ID : EHR admission note (https://github.com/OHDSI/Vocabulary-v5.0/wiki/Vocab.-TYPE_CONCEPT)
    df_obs_period.loc[idx, 'PERIOD_TYPE_CONCEPT_ID'] = 32819
    
    return df_obs_period

# Create a lock for the shared 'obs_id' variable
obs_id_lock = threading.Lock()
obs_id = 1  # Initialize obs_id

if __name__ == "__main__":
    # Use multiprocessing Pool for parallel processing
    num_processes = 32  # Adjust this based on your system's capabilities
    with Pool(num_processes) as pool:
        rows_to_process = [(idx, row) for idx, row in df_person[:100].iterrows()]
        df = list(tqdm(pool.imap(process_row, rows_to_process), total=len(rows_to_process)))
        df_obs_period = pd.concat(df)
    # Save the results
    #df_obs_period.to_csv('INSPIRE_ETL/INSPIRE_OBSERVATION_PERIOD.csv')


100%|██████████| 100/100 [00:00<00:00, 121.44it/s]


In [None]:
from multiprocessing import Pool
import multiprocessing
from tqdm import tqdm
import threading, random

random.seed(98)

# INSPIRE csv
list_df = [df_labs, df_medi, df_vitals, df_ward]
list_name = ['labs', 'medications', 'vitals', 'ward']

# columns for PERSON
columns = ['OBSERVATION_PERIOD_ID',
 'PERSON_ID',
 'OBSERVATION_PERIOD_START_DATE',
 'OBSERVATION_PERIOD_END_DATE',
 'PERIOD_TYPE_CONCEPT_ID']


# Function to process a single row
def process_row(row, list_df):
    obs_id = row['index'] + 1
    
    ### OBSERVATION_PERIOD_START_DATE, OBSERVATION_PERIOD_END_DATE
    rows_subject = df_op[df_op['subject_id'] == subject_id]
    
    # admission_date (period_start) = base_date - op_start
    op_start = rows_subject.iloc[0, 15]
    
    # period_end : 있는 기록 중에 가장 오래된 시점 (diagnosis.csv 제외) 
    period_end = np.nanmax(row_subject.iloc[:, 15:27].values.flatten())
    time_max = pd.concat([df[df['subject_id'] == subject_id]['chart_time'] for df in list_df]).max()
    period_end = time_max if time_max > period_end else period_end
    
    start_date = row['base_time'] - timedelta(minutes=int(op_start))
    end_date = row['base_time'] + timedelta(minutes=int(period_end))
    
    return {
        'OBSERVATION_PERIOD_ID': obs_id,
        'PERSON_ID': row['PERSON_ID'],
        'OBSERVATION_PERIOD_START_DATE': start_date,
        'OBSERVATION_PERIOD_END_DATE': end_date,
        'PERIOD_TYPE_CONCEPT_ID': 32819  # Modify this as needed
    }

# Function to process rows in parallel
def process_rows_parallel(args):
    df_chunk, list_df = args
    results = [process_row(row, list_df) for _, row in tqdm(df_chunk.iterrows(), total=len(df_chunk))]
    return results

num_processes = 4
# Split the df_person DataFrame into chunks for parallel processing
chunk_size = len(df_person) // num_processes
chunks = [df_person[i:i + chunk_size] for i in range(0, len(df_person), chunk_size)]

print('start')
# Create a multiprocessing Pool and process the chunks in parallel
with multiprocessing.Pool(processes=num_processes) as pool:
    args = [(chunk, list_df) for chunk in chunks]
    results = pool.map(process_rows_parallel, args)


In [137]:
df_obs_period

Unnamed: 0,OBSERVATION_PERIOD_ID,PERSON_ID,OBSERVATION_PERIOD_START_DATE,OBSERVATION_PERIOD_END_DATE,PERIOD_TYPE_CONCEPT_ID
0,1,153073110,2013-12-28 10:15:00,2014-01-10 09:00:00,32819
1,1,134213281,2016-06-11 13:45:00,2025-11-26 13:05:00,32819
2,1,134195201,2019-04-26 08:30:00,2019-05-15 12:15:00,32819
3,1,163619571,2010-06-25 15:00:00,2010-07-10 10:05:00,32819
4,1,132888590,2013-12-13 12:20:00,2024-05-11 08:25:00,32819
...,...,...,...,...,...
95,4,138385750,2013-02-18 09:35:00,2023-04-04 10:45:00,32819
96,3,170529270,2010-01-11 16:00:00,2018-05-15 08:55:00,32819
97,4,166857230,2016-01-31 01:10:00,2016-02-15 11:00:00,32819
98,3,135312720,2010-09-03 10:40:00,2012-08-10 04:45:00,32819


# VISIT_OCCURRENCE

* 환자가 수술을 위한 입원 외에는 방문한 경우가 없다

In [100]:
# Load PERSON clinial data table
df_person = pd.read_csv('INSPIRE_ETL/INSPIRE_PERSON.csv')

# Define the dataframe for VISIT_OCCURRENCE table
columns = ['VISIT_OCCURRENCE_ID',
 'PERSON_ID',
 'VISIT_CONCEPT_ID',
 'VISIT_START_DATE',
 'VISIT_START_DATETIME',
 'VISIT_END_DATE',
 'VISIT_END_DATETIME',
 'VISIT_TYPE_CONCEPT_ID',
 'PROVIDER_ID',
 'CARE_SITE_ID',
 'VISIT_SOURCE_VALUE',
 'VISIT_SOURCE_CONCEPT_ID',
 'ADMITTING_SOURCE_CONCEPT_ID',
 'ADMITTING_SOURCE_VALUE',
 'DISCHARGE_TO_CONCEPT_ID',
 'DISCHARGE_TO_SOURCE_VALUE',
 'PRECEDING_VISIT_OCCURRENCE_ID']

df_visit_occ = pd.DataFrame(columns = columns)

# third match:  (source) -> VISIT_OCCURRENCE table (CDM V5.4)
n_visit = 0
for idx, row in tqdm(df_person.iterrows(), total = df_person.shape[0]):
    # operations table for subject_id
    subject_id = row['PERSON_ID']
    subject_op = df_op[df_op['subject_id'] == subject_id]
    
    
    hadm_cnt = 0
    # define visit for each hadm_id
    for _, row_adm in subject_op[['hadm_id', 'admission_time', 'discharge_time']].groupby('hadm_id', sort=False).agg(['min', 'max']).iterrows():
        df_visit_occ.loc[n_visit, 'VISIT_OCCURRENCE_ID'] = n_visit + 1
        df_visit_occ.loc[n_visit, 'PERSON_ID'] = row['PERSON_ID']
        
        
        ### VISIT_CONCEPT_ID : Inpatient visit (ID 9201)
        df_visit_occ.loc[n_visit, 'VISIT_CONCEPT_ID'] = 9201
        
        
        ### VISIT_START_DATE, VISIT_END_DATE
        # base_time is time_zero for each subject        
        time_zero = datetime.strptime(row['base_time'], '%Y-%m-%d %H:%M:%S')
        df_visit_occ.loc[n_visit, 'VISIT_START_DATE'] = time_zero + timedelta(minutes = int(row_adm['admission_time', 'min']))
        df_visit_occ.loc[n_visit, 'VISIT_START_DATETIME'] = time_zero + timedelta(minutes = int(row_adm['admission_time', 'min']))
        df_visit_occ.loc[n_visit, 'VISIT_END_DATE'] = time_zero + timedelta(minutes = int(row_adm['discharge_time', 'max']) + 60*24)
        df_visit_occ.loc[n_visit, 'VISIT_END_DATETIME'] = time_zero + timedelta(minutes = int(row_adm['discharge_time', 'max']) + 60*24)
    
    
        ### VISIT_TYPE_CONCEPT_ID : EHR admission note (https://github.com/OHDSI/Vocabulary-v5.0/wiki/Vocab.-TYPE_CONCEPT)
        df_visit_occ.loc[n_visit, 'VISIT_TYPE_CONCEPT_ID'] = 32819        
        
        df_visit_occ.loc[n_visit, 'PROVIDER_ID'] = 0
        df_visit_occ.loc[n_visit, 'CARE_SITE_ID'] = 1
        
        
        ### VISIT_SOURCE_, ADMITTED_FROM_, DISCHARGED_TO_ are not code
        ### PRECEDING_VISIT_OCCURRENCE_ID
        df_visit_occ.loc[n_visit, 'PRECEDING_VISIT_OCCURRENCE_ID'] = df_visit_occ.loc[n_visit-1, 'VISIT_OCCURRENCE_ID'] if hadm_cnt > 0 else np.nan

        hadm_cnt = hadm_cnt + 1
        n_visit = n_visit + 1

# save the results
df_visit_occ.to_csv('INSPIRE_ETL/INSPIRE_VISIT_OCCURRENCE.csv')

100%|██████████| 39797/39797 [06:06<00:00, 108.55it/s]


In [98]:
df_visit_occ

Unnamed: 0,VISIT_OCCURRENCE_ID,PERSON_ID,VISIT_CONCEPT_ID,VISIT_START_DATE,VISIT_START_DATETIME,VISIT_END_DATE,VISIT_END_DATETIME,VISIT_TYPE_CONCEPT_ID,PROVIDER_ID,CARE_SITE_ID,VISIT_SOURCE_VALUE,VISIT_SOURCE_CONCEPT_ID,ADMITTING_SOURCE_CONCEPT_ID,ADMITTING_SOURCE_VALUE,DISCHARGE_TO_CONCEPT_ID,DISCHARGE_TO_SOURCE_VALUE,PRECEDING_VISIT_OCCURRENCE_ID,preceding_visit_occurrence_id
0,1,153073110,9201,2013-12-30 00:00:00,2013-12-30 00:00:00,2014-01-11 00:00:00,2014-01-11 00:00:00,32819,0,1,,,,,,,,
1,2,134213281,9201,2016-06-13 00:00:00,2016-06-13 00:00:00,2016-06-18 00:00:00,2016-06-18 00:00:00,32819,0,1,,,,,,,,
2,3,134195201,9201,2019-04-28 00:00:00,2019-04-28 00:00:00,2019-05-04 00:00:00,2019-05-04 00:00:00,32819,0,1,,,,,,,,
3,4,163619571,9201,2010-06-26 00:00:00,2010-06-26 00:00:00,2010-06-27 00:00:00,2010-06-27 00:00:00,32819,0,1,,,,,,,,
4,5,163619571,9201,2010-07-10 00:00:00,2010-07-10 00:00:00,2010-07-11 00:00:00,2010-07-11 00:00:00,32819,0,1,,,,,,,,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50463,50464,124555170,9201,2013-01-31 00:00:00,2013-01-31 00:00:00,2013-02-04 00:00:00,2013-02-04 00:00:00,32819,0,1,,,,,,,,
50464,50465,145733500,9201,2012-04-05 00:00:00,2012-04-05 00:00:00,2012-04-10 00:00:00,2012-04-10 00:00:00,32819,0,1,,,,,,,,
50465,50466,179496761,9201,2015-09-30 00:00:00,2015-09-30 00:00:00,2015-10-06 00:00:00,2015-10-06 00:00:00,32819,0,1,,,,,,,,
50466,50467,118074180,9201,2016-05-30 00:00:00,2016-05-30 00:00:00,2016-06-07 00:00:00,2016-06-07 00:00:00,32819,0,1,,,,,,,,


# VISIT_DETAIL

* **Table Description**  
  The VISIT_DETAIL table is an optional table used to represents details of each record in the parent VISIT_OCCURRENCE table. A good example of this would be the movement between units in a hospital during an inpatient stay or claim lines associated with a one insurance claim. For every record in the VISIT_OCCURRENCE table there may be 0 or more records in the VISIT_DETAIL table with a 1:n relationship where n may be 0. The VISIT_DETAIL table is structurally very similar to VISIT_OCCURRENCE table and belongs to the visit domain.

In [None]:
# Define the dataframe for VISIT_DETAIL table
columns = ['VISIT_DETAIL_ID',
 'PERSON_ID',
 'VISIT_DETAIL_CONCEPT_ID',
 'VISIT_DETAIL_START_DATE',
 'VISIT_DETAIL_START_DATETIME',
 'VISIT_DETAIL_END_DATE',
 'VISIT_DETAIL_END_DATETIME',
 'VISIT_DETAIL_TYPE_CONCEPT_ID',
 'PROVIDER_ID',
 'CARE_SITE_ID',
 'VISIT_DETAIL_SOURCE_VALUE',
 'VISIT_DETAIL_SOURCE_CONCEPT_ID',
 'ADMITTING_SOURCE_VALUE',
 'ADMITTING_SOURCE_CONCEPT_ID',
 'DISCHARGE_TO_SOURCE_VALUE',
 'DISCHARGE_TO_CONCEPT_ID',
 'PRECEDING_VISIT_DETAIL_ID',
 'VISIT_DETAIL_PARENT_ID',
 'VISIT_OCCURRENCE_ID']

df_visit_detail = pd.DataFrame(columns = columns)

# Load PERSON clinial data table
df_visit_occ = pd.read_csv('INSPIRE_ETL/INSPIRE_VISIT_OCCURRENCE.csv')

# fourth match:  (source) -> VISIT_DETAIL table (CDM V5.4)
for idx, row in tqdm(df_visit_occ.iterrows(), total=df_visit_occ.shape[0]):
    # Visit detail is equal with vist defined in VISIT_OCCURRENCE table
    df_visit_detail.loc[idx, 'VISIT_DETAIL_ID'] = row['VISIT_OCCURRENCE_ID']
    df_visit_detail.loc[idx, 'PERSON_ID'] = row['PERSON_ID']
    
    
    ### VISIT_DETAIL_CONCEPT_ID : Inpatient visit (ID 9201)
    df_visit_detail.loc[idx, 'VISIT_DETAIL_CONCEPT_ID'] = 9201
    
    
    ### VISIT_DETAIL_START_DATE, END_DATE : same as VISIT_OCCURRENCE table
    df_visit_detail.loc[idx, 'VISIT_DETAIL_START_DATE'] = row['VISIT_START_DATE']
    df_visit_detail.loc[idx, 'VISIT_DETAIL_START_DATETIME'] = row['VISIT_START_DATETIME']
    df_visit_detail.loc[idx, 'VISIT_DETAIL_END_DATE'] = row['VISIT_END_DATE']
    df_visit_detail.loc[idx, 'VISIT_DETAIL_END_DATETIME'] = row['VISIT_END_DATETIME']
    
    
    ### VISIT_TYPE_CONCEPT_ID : EHR admission note (https://github.com/OHDSI/Vocabulary-v5.0/wiki/Vocab.-TYPE_CONCEPT)
    df_visit_detail.loc[idx, 'VISIT_DETAIL_TYPE_CONCEPT_ID'] = 32819      
    
    
    df_visit_detail.loc[idx, 'PROVIDER_ID'] = 0
    df_visit_detail.loc[idx, 'PRECEDING_VISIT_DETAIL_ID'] = row['PRECEDING_VISIT_OCCURRENCE_ID']
    df_visit_detail.loc[idx, 'VISIT_OCCURRENCE_ID'] = row['VISIT_OCCURRENCE_ID']
    
# save the results
df_visit_detail.to_csv('INSPIRE_ETL/INSPIRE_VISIT_DETAIL.csv')

In [None]:
df_visit_detail

# CONDITION_OCCURRENCE

In [2]:
import pandas as pd

# columns of PERSON
df_hira = pd.read_csv('sample/HIRA/HIRA_CONDITION_OCCURRENCE_v1.csv')
columns = list(df_hira.columns)
columns

['CONDITION_OCCURRENCE_ID',
 'PERSON_ID',
 'CONDITION_CONCEPT_ID',
 'CONDITION_START_DATE',
 'CONDITION_START_DATETIME',
 'CONDITION_END_DATE',
 'CONDITION_END_DATETIME',
 'CONDITION_TYPE_CONCEPT_ID',
 'CONDITION_STATUS_CONCEPT_ID',
 'STOP_REASON',
 'PROVIDER_ID',
 'VISIT_OCCURRENCE_ID',
 'VISIT_DETAIL_ID',
 'CONDITION_SOURCE_VALUE',
 'CONDITION_SOURCE_CONCEPT_ID',
 'CONDITION_STATUS_SOURCE_VALUE']

In [3]:
df_concept = 
df_diag

Unnamed: 0,subject_id,chart_time,icd10_cm
0,190852492,325440.0,R06
1,190852492,325440.0,G20
2,142367193,0.0,I61
3,142367193,658080.0,E11
4,142367193,658080.0,G44
...,...,...,...
432342,115653810,28800.0,K30
432343,115653810,28800.0,R94
432344,115653810,36000.0,H66
432345,115653810,38880.0,H66


In [None]:
# Define the dataframe for CONDITION_OCCURRENCE table
columns = ['CONDITION_OCCURRENCE_ID',
 'PERSON_ID',
 'CONDITION_CONCEPT_ID',
 'CONDITION_START_DATE',
 'CONDITION_START_DATETIME',
 'CONDITION_END_DATE',
 'CONDITION_END_DATETIME',
 'CONDITION_TYPE_CONCEPT_ID',
 'CONDITION_STATUS_CONCEPT_ID',
 'STOP_REASON',
 'PROVIDER_ID',
 'VISIT_OCCURRENCE_ID',
 'VISIT_DETAIL_ID',
 'CONDITION_SOURCE_VALUE',
 'CONDITION_SOURCE_CONCEPT_ID',
 'CONDITION_STATUS_SOURCE_VALUE']

df_condition_occ = pd.DataFrame(columns = columns)

# load PERSON table
df_person = pd.read_csv('INSPIRE_ETL/INSPIRE_PERSON.csv')

# Load CONCEPT table to map source vocab into Standard Concept ID
df_concept = pd.read_csv(f'vocab/CONCEPT.csv', sep='\t')

# fifth match:  (source) -> CONDITION_OCCURRENCE table (CDM V5.4)
for idx, row in tqdm(df_person.iterrows(), total = df_person.shape[0]):
    subject_id = df_PERSON['PERSON_ID']
    
    subject_diag = df_diag[df_diag['subject_id'] == subject_id]
    
    for _, row_diag in subject_diag.iterrows():
        
        df_condition_occ['CONDITION_CONCEPT_ID'] = df_concept[(df_concept['vocabulary_id']=='ICD10CM') & (df_concept['concept_code']==row_diag['icd10_cm'])]

# DRUG_EXPOSURE

# MEASUREMENT

# OBSERVATION