# Loading INSPIRE dataset

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from datetime import datetime

# INSPIRE v0.2 (about 65,000 cases, 50% of the surgical cases)
# Define the path to the INSPIRE v2 dataset
input_path = 'inspire_v2/mapped'

# Load the source tables within INSPIRE v2 into dataframes
df_diag = pd.read_csv(f'{input_path}/diagnosis.csv')         # Load diagnosis data
df_labs = pd.read_csv(f'{input_path}/labs.csv')              # Load labs data
df_medi = pd.read_csv(f'{input_path}/medications.csv')       # Load medications data
df_op = pd.read_csv(f'{input_path}/operations.csv')          # Load operations data
df_vitals = pd.read_csv(f'{input_path}/vitals.csv')          # Load vitals data
df_ward = pd.read_csv(f'{input_path}/ward_vitals.csv')       # Load ward vitals data
df_params = pd.read_csv(f'{input_path}/parameters_mapped.csv') 


# Display the number of records in each dataset
print(f'Size of the tables: operations {len(df_op)}, diagnosis {len(df_diag)}, labs {len(df_labs)}, medications {len(df_medi)}, vitals {len(df_vitals)}, ward_vitals {len(df_ward)}')

# Combine all the subject_ids from the loaded datasets
subject_ids = df_diag['subject_id'].tolist() + df_labs['subject_id'].tolist() + df_medi['subject_id'].tolist() + df_op['subject_id'].tolist() + df_vitals['subject_id'].tolist() + df_ward['subject_id'].tolist()

# Display the total unique subjects present in the combined dataset
print(f'total subjects in INSPIRE dataset: {len(np.unique(subject_ids))}')
print(f"total subjects in operations.csv: {len(np.unique(df_op['subject_id']))}")

Size of the tables: operations 131109, diagnosis 4733046, labs 21367131, medications 6952960, vitals 64443624, ward_vitals 42679760
total subjects in INSPIRE dataset: 101469
total subjects in operations.csv: 99900


In [92]:
# Load the CONCEPT_RELATIONSHIP table with tab as a delimiter and error handling for bad lines
df_concept_rel = pd.read_csv(f'vocab/CONCEPT_RELATIONSHIP.csv', sep='\t', on_bad_lines='error')
# LOAD the CONCEPT table with tab as a delimiter
df_concept = pd.read_csv(f'vocab/CONCEPT.csv', sep='\t')

  df_concept = pd.read_csv(f'vocab/CONCEPT.csv', sep='\t')


# PERSON

In [5]:
# start_index for each table_id
start_index = {
    'person': 1000000,
    'observation_period': 2000000,
    'visit_occurrence': 3000000,
    'visit_detail': 4000000,
    'condition_occurrence': 5000000,
    'drug_exposure': 6000000,
    'procedure_occurrence': 7000000,
    'measurement': 8000000,
    'death': 9000000,
    'note': 10000000,
    'location': 20000000 
}

In [44]:
# Create an empty dataframe with defined columns for PERSON table
df_person = pd.DataFrame(columns=['PERSON_ID'])

# Assign unique IDs to each distinct 'subject_id' from the operations data
unique_ids = df_op['subject_id'].unique()
df_person['PERSON_ID'] = start_index['person'] + np.arange(1, len(unique_ids) + 1)
df_person['subject_id'] = unique_ids

# Merge relevant columns from the operations dataframe with the PERSON dataframe based on 'subject_id'
usecols = ['subject_id', 'age', 'sex', 'race']
df_person = df_person.merge(df_op[usecols], on = 'subject_id')
# Ensure only the latest discharge_time is retained for each unique PERSON_ID
df_person.drop_duplicates(subset = 'PERSON_ID', keep = 'last', inplace = True, ignore_index = True)

# Map gender values ('M' or 'F') to corresponding GENDER_CONCEPT_ID values
df_person['GENDER_CONCEPT_ID'] = df_person['sex'].map({'M': 8507, 'F': 8532}, na_action='ignore')

# Remove any rows with missing gender values
df_person.dropna(subset=['GENDER_CONCEPT_ID'])

# Set the first date to 2011.01.01 since the exact year is not specified
start_date = datetime(2011, 1, 1)

# Calculate and assign the year of birth based on age and the start date
df_person['YEAR_OF_BIRTH'] = start_date.year - df_person['age']
# Compute the exact birth datetime using age and start date
df_person['BIRTH_DATETIME'] = pd.to_datetime(start_date) - pd.to_timedelta(df_person['age']*365.25, unit = 'days')

# Set RACE_CONCEPT_ID to indicate all individuals are ASIAN
df_person['RACE_CONCEPT_ID'] = 8515

# Assign value for LOCATION_ID (1: INSPIRE)
df_person['LOCATION_ID'] = 1

# Populate source value columns based on values from the operations data
df_person['PERSON_SOURCE_VALUE'] = df_person['subject_id']
df_person['GENDER_SOURCE_VALUE'] = df_person['sex']
df_person['RACE_SOURCE_VALUE'] = df_person['race']
df_person['RACE_SOURCE_CONCEPT_ID'] = 8515

# Remove columns that aren't part of the final PERSON table format
df_person.drop(columns=usecols, inplace=True)

# Write the processed data to a CSV file
df_person.to_csv('INSPIRE_ETL/INSPIRE_PERSON.csv', index=False)

In [7]:
df_person

Unnamed: 0,PERSON_ID,GENDER_CONCEPT_ID,YEAR_OF_BIRTH,BIRTH_DATETIME,RACE_CONCEPT_ID,LOCATION_ID,PERSON_SOURCE_VALUE,GENDER_SOURCE_VALUE,RACE_SOURCE_VALUE,RACE_SOURCE_CONCEPT_ID
0,1000001,8532,1981,1980-12-31 12:00:00,8515,1,178742874,F,Asian,8515
1,1000002,8507,1941,1940-12-31 12:00:00,8515,1,158995752,M,Asian,8515
2,1000003,8532,1956,1956-01-01 06:00:00,8515,1,108553242,F,Asian,8515
3,1000004,8532,1976,1976-01-01 06:00:00,8515,1,133278262,F,Asian,8515
4,1000005,8532,1966,1965-12-31 18:00:00,8515,1,116924034,F,Asian,8515
...,...,...,...,...,...,...,...,...,...,...
99895,1099896,8507,1926,1925-12-31 18:00:00,8515,1,163862002,M,Asian,8515
99896,1099897,8532,1941,1940-12-31 12:00:00,8515,1,126772283,F,Asian,8515
99897,1099898,8532,1946,1945-12-31 18:00:00,8515,1,144363433,F,Asian,8515
99898,1099899,8507,1926,1925-12-31 18:00:00,8515,1,195835964,M,Asian,8515


# OBSERVATION_PERIOD

* Definition of OBSERVATION_PERIOD record : 각 subject_id 마다 첫 입원 시점부터 operations.csv, vitals.csv, ward_vitals.csv, labs.csv, medications.csv 에 기록된 가장 마지막 시점까지를 observation-period record로 정의 (즉, 한 사람 당 하나의 record)

    * 다른 가능한 정의) 하나의 subject_id에 대해 각 hadm_id를 하나의 Clinical Event로 정의함. 이 Clinical Event의 시작 시점과 끝 시점은 operations.csv

In [8]:
# Load the previously created PERSON table
#df_person = pd.read_csv('INSPIRE_ETL/INSPIRE_PERSON.csv')

# Create an empty dataframe with specified columns for OBSERVATION_PERIOD table
df_obs = pd.DataFrame(columns=['OBSERVATION_PERIOD_ID'])

# Copy PERSON_ID from PERSON table to OBSERVATION_PERIOD_ID and PERSON_ID columns in OBSERVATION_PERIOD table
df_obs['OBSERVATION_PERIOD_ID'] =  start_index['observation_period'] - start_index['person'] + df_person['PERSON_ID']
df_obs['PERSON_ID'] = df_person['PERSON_ID']
# Copy PERSON_SOURCE_VALUE from PERSON table to subject_id in OBSERVATION_PERIOD table for merging purposes
df_obs['subject_id'] = df_person['PERSON_SOURCE_VALUE']

# Define the base date for the observation period
base_date = datetime(2011, 1, 1)

# Merge the 'discharge_time' column from the operations dataframe to the OBSERVATION_PERIOD table
df_obs = df_obs.merge(df_op[['subject_id', 'discharge_time']], on = 'subject_id', how = 'left')
# Retain only the latest 'discharge_time' for each subject
df_obs.drop_duplicates(subset='subject_id', keep='last', inplace=True, ignore_index=True)

# Aggregate the maximum chart time across all source tables (diagnosis, labs, medications, vitals, ward)
# This helps in determining the end of the observation period for each subject
df_time = df_obs[['subject_id', 'discharge_time']].merge(df_diag[['subject_id', 'chart_time']], on='subject_id', how = 'left').drop_duplicates(subset='subject_id', keep = 'last', inplace = False, ignore_index = True)
df_time = df_time.merge(df_labs[['subject_id', 'chart_time']], on = 'subject_id', how = 'left', suffixes=("_diag", "_labs")).drop_duplicates(subset='subject_id', keep = 'last', inplace = False, ignore_index = True)
df_time = df_time.merge(df_medi[['subject_id', 'chart_time']], on = 'subject_id', how = 'left', suffixes=(None, "_medi")).drop_duplicates(subset='subject_id', keep = 'last', inplace = False, ignore_index = True)
df_time = df_time.merge(df_vitals[['subject_id', 'chart_time']], on = 'subject_id', how = 'left', suffixes=(None, "_vitals")).drop_duplicates(subset='subject_id', keep = 'last', inplace = False, ignore_index = True)
df_time = df_time.merge(df_ward[['subject_id', 'chart_time']], on = 'subject_id', how = 'left', suffixes=(None, "_ward")).drop_duplicates(subset='subject_id', keep = 'last', inplace = False, ignore_index = True)
df_time['max_time'] = df_time.iloc[:,1:].max(axis=1)

# Set the OBSERVATION_PERIOD_START_DATE to the base date
df_obs['OBSERVATION_PERIOD_START_DATE'] = pd._to_datetime(base_date)
# Calculate and set the OBSERVATION_PERIOD_END_DATE using the base date and the aggregated maximum time
df_obs['OBSERVATION_PERIOD_END_DATE'] = pd.to_datetime(base_date) + pd.to_timedelta(df_time['max_time'], unit='min')
# Convert the OBSERVATION_PERIOD_END_DATE to just date format (remove time)
df_obs['OBSERVATION_PERIOD_END_DATE'] = pd.to_datetime(df_obs['OBSERVATION_PERIOD_END_DATE'].dt.date)

# Assign the PERIOD_TYPE_CONCEPT_ID indicating the data source is an EHR since it is not specified
df_obs['PERIOD_TYPE_CONCEPT_ID'] = 32817

# Remove columns that aren't part of the final OBSERVATION_PERIOD table format
df_obs.drop(columns=['discharge_time', 'subject_id'], inplace=True)

# Write the processed data to a CSV file
df_obs.to_csv('INSPIRE_ETL/INSPIRE_OBSERVATION_PERIOD.csv', index=False)

In [9]:
df_obs

Unnamed: 0,OBSERVATION_PERIOD_ID,PERSON_ID,OBSERVATION_PERIOD_START_DATE,OBSERVATION_PERIOD_END_DATE,PERIOD_TYPE_CONCEPT_ID
0,2000001,1000001,2011-01-01,2016-01-21,32817
1,2000002,1000002,2011-01-01,2011-08-03,32817
2,2000003,1000003,2011-01-01,2012-04-26,32817
3,2000004,1000004,2011-01-01,2011-01-04,32817
4,2000005,1000005,2011-01-01,2015-01-24,32817
...,...,...,...,...,...
99895,2099896,1099896,2011-01-01,2011-04-16,32817
99896,2099897,1099897,2011-01-01,2011-05-06,32817
99897,2099898,1099898,2011-01-01,2011-01-08,32817
99898,2099899,1099899,2011-01-01,2011-05-24,32817


# VISIT_OCCURRENCE

* hadm_id 단위로 visit으로 정의

In [35]:
# Initialize an empty DataFrame for VISIT_OCCURRENCE with predefined columns
df_visit_occ = pd.DataFrame(columns=['VISIT_OCCURRENCE_ID'])

# Load the PERSON table data from the CSV file
df_person = pd.read_csv('INSPIRE_ETL/INSPIRE_PERSON.csv')

# Copy PERSON_ID values from df_person to df_visit_occ
df_visit_occ['PERSON_ID'] = df_person['PERSON_ID']

# Copy PERSON_SOURCE_VALUE values (as subject_id) from df_person to df_visit_occ
df_visit_occ['subject_id'] = df_person['PERSON_SOURCE_VALUE']

# Merge visit_occurrence data with operation data based on 'subject_id'
usecols = ['hadm_id', 'subject_id', 'admission_time', 'discharge_time']
df_visit_occ = df_visit_occ.merge(df_op[usecols], on = 'subject_id', how = 'left')

# Remove duplicate entries based on 'hadm_id', keeping only the first occurrence
df_visit_occ.drop_duplicates(subset=['hadm_id'], keep='first', inplace=True, ignore_index=True)

# Assign sequential IDs starting from 1 to VISIT_OCCURRENCE_ID column
df_visit_occ['VISIT_OCCURRENCE_ID'] = start_index['visit_occurrence'] + np.arange(len(df_visit_occ)) + 1

# Set a default value for VISIT_CONCEPT_ID
df_visit_occ['VISIT_CONCEPT_ID'] = 9201

# Define the base date
base_date = datetime(2011, 1, 1)

# Calculate and assign VISIT_START_DATETIME based on admission time in minutes from the base date
df_visit_occ['VISIT_START_DATETIME'] = pd.to_datetime(base_date) + pd.to_timedelta(df_visit_occ['admission_time'], unit='min')

# Extract the date part for VISIT_START_DATE
df_visit_occ['VISIT_START_DATE'] = pd.to_datetime(df_visit_occ['VISIT_START_DATETIME'].dt.date)

# Calculate and assign VISIT_END_DATETIME based on discharge time in minutes from the base date
df_visit_occ['VISIT_END_DATETIME'] = pd.to_datetime(base_date)  + pd.to_timedelta(df_visit_occ['discharge_time'], unit='min')

# Extract the date part for VISIT_END_DATE
df_visit_occ['VISIT_END_DATE'] = pd.to_datetime(df_visit_occ['VISIT_END_DATETIME'].dt.date)

# Assign the VISIT_TYPE_CONCEPT_ID indicating the data source is an EHR since it is not specified
df_visit_occ['VISIT_TYPE_CONCEPT_ID'] = 32817

# Assign default values for PROVIDER_ID and CARE_SITE_ID
#df_visit_occ['PROVIDER_ID'] = 0
#df_visit_occ['CARE_SITE_ID'] = 0

## Mapping PRECEIDING_VISIT_OCCURRENCE_ID
# Generate a column with the previous 'subject_id' for determining preceding visit occurrence
df_visit_occ['prev_subject_id'] = df_visit_occ['subject_id'].shift(1).astype('Int64')

# Create a new boolean column 'nadm' to check if the current row's subject_id matches the previous one
df_visit_occ['nadm'] = df_visit_occ['subject_id'] == df_visit_occ['prev_subject_id']

# Set the first row's 'nadm' value to False since there's no preceding record
df_visit_occ.at[0, 'nadm'] = False 

# Compute PRECEDING_VISIT_OCCURRENCE_ID based on 'nadm'
df_visit_occ['PRECEDING_VISIT_OCCURRENCE_ID'] = np.where(df_visit_occ['nadm'], df_visit_occ['VISIT_OCCURRENCE_ID'].shift(1), np.nan)
df_visit_occ['PRECEDING_VISIT_OCCURRENCE_ID'] = df_visit_occ['PRECEDING_VISIT_OCCURRENCE_ID'].astype('Int64')

 # Remove columns that aren't part of the final VISIT_OCCURRENCE table format except hadm_id
df_visit_occ.drop(columns=usecols[1:], inplace=True)

# Save the processed data to a CSV file
df_visit_occ.to_csv('INSPIRE_ETL/INSPIRE_VISIT_OCCURRENCE.csv', index=False)

In [22]:
df_visit_occ

Unnamed: 0,VISIT_OCCURRENCE_ID,PERSON_ID,hadm_id,VISIT_CONCEPT_ID,VISIT_START_DATETIME,VISIT_START_DATE,VISIT_END_DATETIME,VISIT_END_DATE,VISIT_TYPE_CONCEPT_ID,prev_subject_id,nadm,PRECEDING_VISIT_OCCURRENCE_ID
0,3000001,1000001,229842382,9201,2011-01-01,2011-01-01,2011-01-05 23:55:00,2011-01-05,32817,,False,
1,3000002,1000002,257857903,9201,2011-01-01,2011-01-01,2011-02-18 23:55:00,2011-02-18,32817,178742874,False,
2,3000003,1000003,200664328,9201,2011-01-01,2011-01-01,2011-05-04 23:55:00,2011-05-04,32817,158995752,False,
3,3000004,1000003,288713144,9201,2011-12-26,2011-12-26,2012-04-26 23:55:00,2012-04-26,32817,108553242,True,3000003
4,3000005,1000004,277235295,9201,2011-01-01,2011-01-01,2011-01-04 23:55:00,2011-01-04,32817,108553242,False,
...,...,...,...,...,...,...,...,...,...,...,...,...
126749,3126750,1099896,265373847,9201,2011-01-01,2011-01-01,2011-01-16 23:55:00,2011-01-16,32817,165516522,False,
126750,3126751,1099897,273139806,9201,2011-01-01,2011-01-01,2011-01-05 23:55:00,2011-01-05,32817,163862002,False,
126751,3126752,1099898,275833861,9201,2011-01-01,2011-01-01,2011-01-08 23:55:00,2011-01-08,32817,126772283,False,
126752,3126753,1099899,293939099,9201,2011-01-01,2011-01-01,2011-01-19 23:55:00,2011-01-19,32817,144363433,False,


# VISIT_DETAIL

* **Table Description**  
  The VISIT_DETAIL table is an optional table used to represents details of each record in the parent VISIT_OCCURRENCE table. A good example of this would be the movement between units in a hospital during an inpatient stay or claim lines associated with a one insurance claim. For every record in the VISIT_OCCURRENCE table there may be 0 or more records in the VISIT_DETAIL table with a 1:n relationship where n may be 0. The VISIT_DETAIL table is structurally very similar to VISIT_OCCURRENCE table and belongs to the visit domain.


  VISIT_DETAIL: ICU 기록이 있는 경우 매핑

In [128]:
## previous version (visit_detail = visit_occurrence)
# Define columns for the VISIT_DETAIL table
columns = ['VISIT_DETAIL_ID',
 'PERSON_ID',
 'VISIT_DETAIL_CONCEPT_ID',
 'VISIT_DETAIL_START_DATE',
 'VISIT_DETAIL_START_DATETIME',
 'VISIT_DETAIL_END_DATE',
 'VISIT_DETAIL_END_DATETIME',
 'VISIT_DETAIL_TYPE_CONCEPT_ID',
 'PROVIDER_ID',
 'CARE_SITE_ID',
 'VISIT_DETAIL_SOURCE_VALUE',
 'VISIT_DETAIL_SOURCE_CONCEPT_ID',
 'ADMITTING_SOURCE_VALUE',
 'ADMITTING_SOURCE_CONCEPT_ID',
 'DISCHARGE_TO_SOURCE_VALUE',
 'DISCHARGE_TO_CONCEPT_ID',
 'PRECEDING_VISIT_DETAIL_ID',
 'VISIT_DETAIL_PARENT_ID',
 'VISIT_OCCURRENCE_ID']

# Initialize an empty DataFrame for VISIT_DETAIL with the predefined columns
df_visit_detail = pd.DataFrame(columns=columns)

# Load the VISIT_OCCURRENCE data from the CSV file
df_visit_occ = pd.read_csv('INSPIRE_ETL/INSPIRE_VISIT_OCCURRENCE.csv')

# Map VISIT_OCCURRENCE_ID from df_visit_occ to VISIT_DETAIL_ID in df_visit_detail
df_visit_detail['VISIT_DETAIL_ID'] = df_visit_occ['VISIT_OCCURRENCE_ID']

# Copy PERSON_ID values from df_visit_occ to df_visit_detail
df_visit_detail['PERSON_ID'] = df_visit_occ['PERSON_ID']

# Assign a default value for VISIT_DETAIL_CONCEPT_ID (Inpatient visit)
df_visit_detail['VISIT_DETAIL_CONCEPT_ID'] = 9201

# Map visit start and end dates and datetimes from df_visit_occ to df_visit_detail
df_visit_detail['VISIT_DETAIL_START_DATE'] = df_visit_occ['VISIT_START_DATE']
df_visit_detail['VISIT_DETAIL_START_DATETIME'] = df_visit_occ['VISIT_START_DATETIME']
df_visit_detail['VISIT_DETAIL_END_DATE'] = df_visit_occ['VISIT_END_DATE']
df_visit_detail['VISIT_DETAIL_END_DATETIME'] = df_visit_occ['VISIT_END_DATETIME']

# Assign the VISIT_TYPE_CONCEPT_ID indicating the data source is an EHR since it is not specified
df_visit_detail['VISIT_TYPE_CONCEPT_ID'] = 32817

# Set the default value for PROVIDER_ID
#df_visit_detail['PROVIDER_ID'] = 0

# Map PRECEDING_VISIT_OCCURRENCE_ID from df_visit_occ to PRECEDING_VISIT_DETAIL_ID in df_visit_detail
df_visit_detail['PRECEDING_VISIT_DETAIL_ID'] = df_visit_occ['PRECEDING_VISIT_OCCURRENCE_ID']

# Map VISIT_OCCURRENCE_ID from df_visit_occ to VISIT_OCCURRENCE_ID in df_visit_detail
df_visit_detail['VISIT_OCCURRENCE_ID'] = df_visit_occ['VISIT_OCCURRENCE_ID']

# Save the processed VISIT_DETAIL data to a CSV file
df_visit_detail.to_csv('INSPIRE_ETL/INSPIRE_VISIT_DETAIL.csv', index=False)

In [34]:
# Create a new DataFrame for VISIT_DETAIL data with the specified columns
df_visit_detail = pd.DataFrame(columns=['VISIT_DETAIL_ID'])

# Populate the PERSON_ID and subject_id columns with data from the df_person DataFrame
df_visit_detail['PERSON_ID'] = df_person['PERSON_ID']
df_visit_detail['subject_id'] = df_person['PERSON_SOURCE_VALUE']

# Integrate visit detail data with operational data from df_op using 'subject_id'
usecols = ['hadm_id', 'subject_id', 'icuin_time', 'icuout_time']
df_visit_detail = df_visit_detail.merge(df_op[usecols], on='subject_id')

# Remove duplicate visit records based on hospital admission ID and ICU admission time
df_visit_detail.drop_duplicates(subset=['hadm_id', 'icuin_time'], keep='first', inplace=True, ignore_index=True)
# Exclude rows with missing ICU admission time
df_visit_detail.dropna(subset='icuin_time', inplace=True, ignore_index=True)

# Generate unique sequential IDs for VISIT_DETAIL_ID
df_visit_detail['VISIT_DETAIL_ID'] = start_index['visit_detail'] + np.arange(len(df_visit_detail)) + 1

# Designate a concept ID representing ICU visits
df_visit_detail['VISIT_DETAIL_CONCEPT_ID'] = 32037

# Calculate visit start and end datetime values using base_date and ICU admission/discharge times
base_date = datetime(2011, 1, 1)
df_visit_detail['VISIT_DETAIL_START_DATETIME'] = pd.to_datetime(base_date) + pd.to_timedelta(df_visit_detail['icuin_time'], unit='min')
df_visit_detail['VISIT_DETAIL_START_DATE'] = pd.to_datetime(df_visit_detail['VISIT_DETAIL_START_DATETIME'].dt.date)
df_visit_detail['VISIT_DETAIL_END_DATETIME'] = pd.to_datetime(base_date) + pd.to_timedelta(df_visit_detail['icuout_time'], unit='min')
df_visit_detail['VISIT_DETAIL_END_DATE'] = pd.to_datetime(df_visit_detail['VISIT_DETAIL_END_DATETIME'].dt.date)

# Specify the concept ID for the visit detail type as sourced from EHR
df_visit_detail['VISIT_DETAIL_TYPE_CONCEPT_ID'] = 32817

# Determine preceding visits for each entry by comparing 'hadm_id' with its previous entry
df_visit_detail['prev_hadm_id'] = df_visit_detail['hadm_id'].shift(1).astype('Int64')
df_visit_detail['nadm'] = df_visit_detail['hadm_id'] == df_visit_detail['prev_hadm_id']
df_visit_detail.at[0, 'nadm'] = False  # The first entry won't have a preceding visit
df_visit_detail['PRECEDING_VISIT_DETAIL_ID'] = np.where(df_visit_detail['nadm'], df_visit_detail['VISIT_DETAIL_ID'].shift(1), np.nan)
df_visit_detail['PRECEDING_VISIT_DETAIL_ID'] = df_visit_detail['PRECEDING_VISIT_DETAIL_ID'].astype('Int64')

# Merge with df_visit_occ to fetch 'VISIT_OCCURRENCE_ID' values
df_visit_detail['VISIT_OCCURRENCE_ID'] = df_visit_detail.merge(df_visit_occ[['hadm_id', 'VISIT_OCCURRENCE_ID']], on='hadm_id', how='left')['VISIT_OCCURRENCE_ID']

# Retain only the required columns including 'hadm_id' for the final DataFrame
df_visit_detail.drop(columns=usecols[1:], inplace=True)

# Save the processed VISIT_DETAIL data to a CSV file
df_visit_detail.to_csv('INSPIRE_ETL/INSPIRE_VISIT_DETAIL.csv', index=False)

In [24]:
df_visit_detail

Unnamed: 0,VISIT_DETAIL_ID,PERSON_ID,hadm_id,VISIT_DETAIL_CONCEPT_ID,VISIT_DETAIL_START_DATETIME,VISIT_DETAIL_START_DATE,VISIT_DETAIL_END_DATETIME,VISIT_DETAIL_END_DATE,VISIT_DETAIL_TYPE_CONCEPT_ID,prev_hadm_id,nadm,PRECEDING_VISIT_DETAIL_ID,VISIT_OCCURRENCE_ID
0,4000001,1000002,257857903,32037,2011-01-02 01:50:00,2011-01-02,2011-01-14 14:35:00,2011-01-14,32817,,False,,3000002
1,4000002,1000015,259299532,32037,2011-02-13 22:35:00,2011-02-13,2011-02-24 13:20:00,2011-02-24,32817,257857903,False,,3000028
2,4000003,1000023,238035661,32037,2011-01-08 16:40:00,2011-01-08,2011-01-14 13:15:00,2011-01-14,32817,259299532,False,,3000041
3,4000004,1000024,267911567,32037,2011-01-05 22:15:00,2011-01-05,2011-01-06 15:25:00,2011-01-06,32817,238035661,False,,3000042
4,4000005,1000025,285012362,32037,2011-01-02 16:55:00,2011-01-02,2011-01-03 19:40:00,2011-01-03,32817,267911567,False,,3000043
...,...,...,...,...,...,...,...,...,...,...,...,...,...
14205,4014206,1099827,271165170,32037,2011-01-02 12:55:00,2011-01-02,2011-01-03 15:15:00,2011-01-03,32817,253900635,False,,3126681
14206,4014207,1099828,276660326,32037,2011-01-02 17:15:00,2011-01-02,2011-01-03 12:15:00,2011-01-03,32817,271165170,False,,3126682
14207,4014208,1099868,274747413,32037,2011-01-09 15:15:00,2011-01-09,2011-01-11 14:30:00,2011-01-11,32817,276660326,False,,3126722
14208,4014209,1099874,298918264,32037,2011-01-05 20:10:00,2011-01-05,2011-01-06 13:35:00,2011-01-06,32817,274747413,False,,3126728


In [23]:
df_visit_detail.groupby('hadm_id').filter(lambda x: len(x['icuin_time'].unique()) > 1)

Unnamed: 0,PERSON_ID,DEATH_DATE,DEATH_DATETIME,DEATH_TYPE_CONCEPT_ID,CAUSE_CONCEPT_ID,CAUSE_SOURCE_VALUE,CAUSE_SOURCE_CONCEPT_ID,subject_id,hadm_id,icuin_time,icuout_time
45,26,,,,,,,157101224,262140617,,
46,26,,,,,,,157101224,262140617,,
47,26,,,,,,,157101224,262140617,27395.0,28395.0
108,62,,,,,,,115517784,270999584,18790.0,44125.0
109,62,,,,,,,115517784,270999584,100505.0,101630.0
...,...,...,...,...,...,...,...,...,...,...,...
130104,98930,,,,,,,100382443,230742603,3980.0,5200.0
130105,98930,,,,,,,100382443,230742603,,
130106,98930,,,,,,,100382443,230742603,,
130213,99036,,,,,,,168713033,264040571,1665.0,2535.0


# CONDITION_OCCURRENCE

In [25]:
# Create an empty DataFrame using the predefined column names
df_cond_occ = pd.DataFrame(columns=['CONDITION_OCCURRENCE_ID'])

# Map PERSON_ID values from the df_person DataFrame to the new CONDITION_OCCURRENCE DataFrame
df_cond_occ['PERSON_ID'] = df_person['PERSON_ID']

# Transfer subject_id values (stored as PERSON_SOURCE_VALUE) from df_person to df_cond_occ
df_cond_occ['subject_id'] = df_person['PERSON_SOURCE_VALUE']

# Merge the df_cond_occ DataFrame with df_diag using the 'subject_id' as a common column
df_cond_occ = df_cond_occ.merge(df_diag, on = 'subject_id', how = 'left')

# Generate unique IDs for each row in the CONDITION_OCCURRENCE table
df_cond_occ['CONDITION_OCCURRENCE_ID'] = start_index['condition_occurrence'] + np.arange(len(df_cond_occ)) + 1

# Rename columns to match the target column names in the CONDITION_OCCURRENCE table
df_cond_occ.rename(columns={'standard_concept_id': 'CONDITION_CONCEPT_ID', 
                            'source_value': 'CONDITION_SOURCE_VALUE', 
                            'source_concept_id': 'CONDITION_SOURCE_CONCEPT_ID'}, inplace=True)

# Establish a reference starting date for generating dates in the observation period
base_date = datetime(2011, 1, 1)

# Convert 'chart_time' values (in minutes) to datetime objects, with the reference as the base_date
df_cond_occ['CONDITION_START_DATETIME'] = base_date + pd.to_timedelta(df_cond_occ['chart_time'], unit='min')
df_cond_occ['CONDITION_START_DATE'] = pd.to_datetime(df_cond_occ['CONDITION_START_DATETIME'].dt.date)

# Set end dates equal to start dates as there's no separate end time
df_cond_occ['CONDITION_END_DATETIME'] = df_cond_occ['CONDITION_START_DATETIME']
df_cond_occ['CONDITION_END_DATE'] = df_cond_occ['CONDITION_START_DATE']

# Assign the CONDITION_TYPE_CONCEPT_ID indicating the data source is an EHR since it is not specified
df_cond_occ['CONDITION_TYPE_CONCEPT_ID'] = 32817

## Match visit_occurrence_id, visit_detail based on chart_time
# Convert 'chart_time' values (in minutes) to dates using the reference base_date
df_cond_occ['chart_date'] = base_date + pd.to_timedelta(df_cond_occ['chart_time'], unit='min')

# Match relevant VISIT_OCCURRENCE_ID
# Merge on 'PERSON_ID' to match the 'chart_date' with the relevant visit dates in df_visit_occ
result = pd.merge(df_cond_occ[['PERSON_ID', 'chart_date', 'CONDITION_OCCURRENCE_ID']], 
                df_visit_occ, on='PERSON_ID', how='left')
result = result[(result['chart_date'] >= result['VISIT_START_DATETIME']) & 
                (result['chart_date'] <= result['VISIT_END_DATETIME'])]

# Merge the filtered results with df_cond_occ to add 'VISIT_OCCURRENCE_ID' details
df_cond_occ['VISIT_OCCURRENCE_ID'] = df_cond_occ.merge(result[['CONDITION_OCCURRENCE_ID', 'VISIT_OCCURRENCE_ID']], 
                                on='CONDITION_OCCURRENCE_ID', 
                                how='left', 
                                suffixes=('_x', None))['VISIT_OCCURRENCE_ID']

## Match relevant VISIT_DETAIL_ID based on chart_time
# Merge on 'PERSON_ID' to match the 'chart_date' with the relevant visit dates in df_visit_detail
result = pd.merge(df_cond_occ[['PERSON_ID', 'chart_date', 'CONDITION_OCCURRENCE_ID']], 
                df_visit_detail, on='PERSON_ID', how='left')
result = result[(result['chart_date'] >= result['VISIT_DETAIL_START_DATETIME']) & 
                (result['chart_date'] <= result['VISIT_DETAIL_END_DATETIME'])]

# Merge the filtered results with df_cond_occ to add 'VISIT_DETAIL_ID' details
df_cond_occ['VISIT_DETAIL_ID'] = df_cond_occ.merge(result[['CONDITION_OCCURRENCE_ID', 'VISIT_DETAIL_ID']], 
                                on='CONDITION_OCCURRENCE_ID', 
                                how='left', 
                                suffixes=('_x', None))['VISIT_DETAIL_ID']

# Free up memory by deleting result
del result

# Filter columns to keep only the ones defined initially
df_cond_occ.drop(columns=['subject_id', 'chart_date'], inplace=True)
df_cond_occ = df_cond_occ.astype({'CONDITION_SOURCE_CONCEPT_ID':'Int64', 'CONDITION_CONCEPT_ID':'Int64', 'VISIT_OCCURRENCE_ID':'Int64', 'VISIT_DETAIL_ID':'Int64'})

# Save the final df_cond_occ DataFrame to a CSV file
df_cond_occ.to_csv('INSPIRE_ETL/INSPIRE_CONDITION_OCCURRENCE.csv', index=False)

In [32]:
df_cond_occ

Unnamed: 0,CONDITION_OCCURRENCE_ID,PERSON_ID,chart_time,CONDITION_SOURCE_VALUE,CONDITION_SOURCE_CONCEPT_ID,CONDITION_CONCEPT_ID,CONDITION_START_DATETIME,CONDITION_START_DATE,CONDITION_END_DATETIME,CONDITION_END_DATE,CONDITION_TYPE_CONCEPT_ID,VISIT_OCCURRENCE_ID,VISIT_DETAIL_ID
0,5000001,1000001,0.0,Z94,45562431,260427,2011-01-01,2011-01-01,2011-01-01,2011-01-01,32817,3000001,
1,5000002,1000001,0.0,T81,45581629,375545,2011-01-01,2011-01-01,2011-01-01,2011-01-01,32817,3000001,
2,5000003,1000001,5760.0,Z94,45571518,4162253,2011-01-05,2011-01-05,2011-01-05,2011-01-05,32817,3000001,
3,5000004,1000001,-205920.0,H17,45543186,443454,2010-08-11,2010-08-11,2010-08-11,2010-08-11,32817,,
4,5000005,1000001,-205920.0,H05,45571405,1340204,2010-08-11,2010-08-11,2010-08-11,2010-08-11,32817,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4737162,9737163,1099900,-44640.0,K81,45591621,435613,2010-12-01,2010-12-01,2010-12-01,2010-12-01,32817,3126752,
4737163,9737164,1099900,-11520.0,K81,45568132,433316,2010-12-24,2010-12-24,2010-12-24,2010-12-24,32817,,
4737164,9737165,1099900,-2880.0,Z11,45571611,40492458,2010-12-30,2010-12-30,2010-12-30,2010-12-30,32817,,
4737165,9737166,1099900,0.0,K81,45547484,199754,2011-01-01,2011-01-01,2011-01-01,2011-01-01,32817,,


In [54]:
df_cond_occ

Unnamed: 0,CONDITION_OCCURRENCE_ID,PERSON_ID,CONDITION_CONCEPT_ID,CONDITION_CONCEPT_ID.1,CONDITION_START_DATE,CONDITION_START_DATETIME,CONDITION_END_DATE,CONDITION_END_DATETIME,CONDITION_TYPE_CONCEPT_ID,CONDITION_STATUS_CONCEPT_ID,STOP_REASON,PROVIDER_ID,VISIT_OCCURRENCE_ID,VISIT_DETAIL_ID,CONDITION_SOURCE_VALUE,CONDITION_SOURCE_VALUE.1,CONDITION_SOURCE_CONCEPT_ID,CONDITION_SOURCE_CONCEPT_ID.1,CONDITION_STATUS_SOURCE_VALUE
0,1,1,,260427.0,2011-01-01,2011-01-01,,2011-01-01,32817,,,,1.0,,,Z94,,45562431.0,
1,2,1,,375545.0,2011-01-01,2011-01-01,,2011-01-01,32817,,,,1.0,,,T81,,45581629.0,
2,3,1,,4162253.0,2011-01-05,2011-01-05,,2011-01-05,32817,,,,1.0,,,Z94,,45571518.0,
3,4,1,,443454.0,2010-08-11,2010-08-11,,2010-08-11,32817,,,,,,,H17,,45543186.0,
4,5,1,,1340204.0,2010-08-11,2010-08-11,,2010-08-11,32817,,,,,,,H05,,45571405.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4737241,4737163,99900,,435613.0,2010-12-01,2010-12-01,,2010-12-01,32817,,,,,,,K81,,45591621.0,
4737242,4737164,99900,,433316.0,2010-12-24,2010-12-24,,2010-12-24,32817,,,,,,,K81,,45568132.0,
4737243,4737165,99900,,40492458.0,2010-12-30,2010-12-30,,2010-12-30,32817,,,,,,,Z11,,45571611.0,
4737244,4737166,99900,,199754.0,2011-01-01,2011-01-01,,2011-01-01,32817,,,,126754.0,,,K81,,45547484.0,


# DRUG_EXPOSURE

In [51]:
df_vitals

Unnamed: 0,op_id,subject_id,chart_time,item_name,value,Unit,concept_id,vocab
0,435959808,181409183,1985,minvol,4.8,L/min,42527120.0,LOINC
1,435959808,181409183,1985,vt,520.0,mL,21490854.0,LOINC
2,435959808,181409183,1985,rr,9.0,/min,3024171.0,LOINC
3,435959808,181409183,1985,pip,23.0,cmH2O,21490650.0,LOINC
4,435959808,181409183,2005,minvol,4.5,L/min,42527120.0,LOINC
...,...,...,...,...,...,...,...,...
64443619,447098707,159399111,1511530,nibp_dbp,94.0,mmHg,21492240.0,LOINC
64443620,447098707,159399111,1511530,nibp_mbp,123.0,mmHg,21492241.0,LOINC
64443621,447098707,159399111,1511530,spo2,100.0,%,3013502.0,LOINC
64443622,447098707,159399111,1511535,hr,80.0,/min,3027018.0,LOINC


In [119]:
#df_vitals = pd.read_parquet('inspire_v2/mapped/vitals.parquet')

# Create an empty dataframe for DRUG_EXPOSURE table
df_drug = pd.DataFrame(columns = ['DRUG_EXPOSURE_ID'])

# Copy PERSON_ID values from the PERSON table to the DRUG_EXPOSURE table
df_drug['PERSON_ID'] = df_person['PERSON_ID']
# Copy PERSON_SOURCE_VALUE values as subject_id from df_person to df_drug
df_drug['subject_id'] = df_person['PERSON_SOURCE_VALUE']

# Filter out rows in df_vitals with null 'vocab' values
df_v = df_vitals.dropna(subset='vocab')
# Filter rows in df_vitals where 'vocab' column contains the string 'RxNorm'
df_v = df_v.loc[df_v['vocab'].str.contains('RxNorm')]
# Select relevant columns and rename 'item_name' to 'drug_name'
df_v = df_v[['subject_id', 'chart_time', 'item_name', 'value', 'concept_id']]
df_v.rename(columns={'item_name': 'drug_name'}, inplace=True)
# Assign a source where the data come from
df_v['source'] = 'vital'
# Assign a default value 'iv' to the new 'route' column
df_v['route'] = 'iv'

# Remove the 'concept_name' column from df_medi
df_m = df_medi.drop(columns='concept_name')
# Assign a source where the data come from
df_m['source'] = 'medi'
# Concatenate df_v and df_m vertically
df_merge = pd.concat([df_v, df_m], axis = 0)

# Free up memory by deleting df_v and df_m
del df_v, df_m

# Merge df_drug with df_merge on 'subject_id' to add details from df_merge
df_drug = df_drug.merge(df_merge, on='subject_id', how='left')

# Assign unique sequential IDs to the 'DRUG_EXPOSURE_ID' column
df_drug['DRUG_EXPOSURE_ID'] = start_index['drug_exposure'] + np.arange(len(df_drug)) + 1
# Map 'concept_id' values to 'DRUG_CONCEPT_ID' column
df_drug['DRUG_CONCEPT_ID'] = df_drug['concept_id']

# Define the reference date for drug exposure start and end times
base_date = datetime(2011, 1, 1)
# Convert 'chart_time' values (in minutes) to dates using the reference base_date
df_drug['DRUG_EXPOSURE_START_DATETIME'] = pd.to_datetime(base_date) + pd.to_timedelta(df_drug['chart_time'], unit='min')
# Extract the date part for DRUG_EXPOSURE_START_DATE
df_drug['DRUG_EXPOSURE_START_DATE'] = pd.to_datetime(df_drug['DRUG_EXPOSURE_START_DATETIME'].dt.date)
# Assign the start datetime to the end datetime column (assuming no gap)
df_drug['DRUG_EXPOSURE_END_DATETIME'] = df_drug['DRUG_EXPOSURE_START_DATETIME']
# Assign the start date to the end date column
df_drug['DRUG_EXPOSURE_END_DATE'] = df_drug['DRUG_EXPOSURE_START_DATE']

# Assign 32838 (EHR Episode Record) for Intra-Op record(Vital), and 32817 (EHR) for others (Post-Op, Pre-Op).
df_drug['DRUG_TYPE_CONCEPT_ID'] = df_drug['source'].map({'vital': 32838, 'medi': 32817})

# Map drug quantity values from 'value' column
df_drug['QUANTITY'] = df_drug['value']

# Map drug administration route to corresponding Standard Concept IDs
df_drug['ROUTE_CONCEPT_ID'] = df_drug['route'].map({'po': 4132161, 'iv': 4171047, 'ex': 4263689}, na_action='ignore')

## Match visit_occurrence_id, visit_detail_id based on chart_time
# Convert 'chart_time' values (in minutes) again for merging with visit occurrences
df_drug['chart_date'] = pd.to_datetime(base_date) + pd.to_timedelta(df_drug['chart_time'], unit='min')

# Match drug exposure dates with visit occurrences based on 'PERSON_ID'
result = pd.merge(df_drug[['PERSON_ID', 'chart_date', 'DRUG_EXPOSURE_ID']], 
                  df_visit_occ, on='PERSON_ID', how='left')
# Filter results to keep only those rows where 'chart_date' falls within a visit's start and end times
result = result[(result['chart_date'] >= result['VISIT_START_DATETIME']) & 
                (result['chart_date'] <= result['VISIT_END_DATETIME'])]

# Merge the filtered results with df_cond_occ to add 'VISIT_OCCURRENCE_ID' details to df_drug
df_drug['VISIT_OCCURRENCE_ID'] = df_drug.merge(result[['DRUG_EXPOSURE_ID', 'VISIT_OCCURRENCE_ID']], 
                                on='DRUG_EXPOSURE_ID', 
                                how='left')['VISIT_OCCURRENCE_ID']
                                
# Match drug exposure dates with visit occurrences based on 'PERSON_ID'
result = pd.merge(df_drug[['PERSON_ID', 'chart_date', 'DRUG_EXPOSURE_ID']], 
                  df_visit_detail, on='PERSON_ID', how='left')
# Filter results to keep only those rows where 'chart_date' falls within a visit's start and end times
result = result[(result['chart_date'] >= result['VISIT_DETAIL_START_DATETIME']) & 
                (result['chart_date'] <= result['VISIT_DETAIL_END_DATETIME'])]

# Merge the filtered results with df_cond_occ to add 'VISIT_DETAIL_ID' details to df_drug
df_drug['VISIT_DETAIL_ID'] = df_drug.merge(result[['DRUG_EXPOSURE_ID', 'VISIT_DETAIL_ID']], 
                                on='DRUG_EXPOSURE_ID', 
                                how='left')['VISIT_DETAIL_ID']      

# Free up memory by deleting result
del result

# Map drug names to the 'DRUG_SOURCE_VALUE' column
df_drug['DRUG_SOURCE_VALUE'] = df_drug['drug_name']

# Map drug routes to the 'ROUTE_SOURCE_VALUE' column
df_drug['ROUTE_SOURCE_VALUE'] = df_drug['route']

# Filter the columns in df_drug to keep only the originally defined columns
df_drug.drop(columns=['subject_id', 'chart_time', 'value', 'concept_id', 'drug_name', 'route', 'source', 'chart_date'], inplace=True)
df_drug = df_drug.astype({'DRUG_CONCEPT_ID': 'Int64', 'ROUTE_CONCEPT_ID': 'Int64', 'VISIT_OCCURRENCE_ID':'Int64', 'VISIT_DETAIL_ID':'Int64'})

# Save the final df_drug DataFrame to a CSV file
df_drug.to_csv('INSPIRE_ETL/INSPIRE_DRUG_EXPOSURE.csv', index=False)
df_drug.to_parquet('INSPIRE_ETL/parquet/INSPIRE_DRUG_EXPOSURE.parquet')

In [88]:
df_drug

Unnamed: 0,DRUG_EXPOSURE_ID,PERSON_ID,DRUG_CONCEPT_ID,DRUG_EXPOSURE_START_DATETIME,DRUG_EXPOSURE_START_DATE,DRUG_EXPOSURE_END_DATETIME,DRUG_EXPOSURE_END_DATE,DRUG_TYPE_CONCEPT_ID,QUANTITY,ROUTE_CONCEPT_ID,VISIT_OCCURRENCE_ID,VISIT_DETAIL_ID,DRUG_SOURCE_VALUE,ROUTE_SOURCE_VALUE
0,6000001,1000001,40166953,2011-01-01 17:00:00,2011-01-01 17:00:00,2011-01-01 17:00:00,2011-01-01 17:00:00,32838.0,0.0,4171047,3000001,,hs,iv
1,6000002,1000001,40166953,2011-01-01 19:40:00,2011-01-01 19:40:00,2011-01-01 19:40:00,2011-01-01 19:40:00,32838.0,300.0,4171047,3000001,,hs,iv
2,6000003,1000001,1771162,2011-01-01 15:30:00,2011-01-01 15:30:00,2011-01-01 15:30:00,2011-01-01 15:30:00,32817.0,,4171047,3000001,,cefazolin,iv
3,6000004,1000001,1771162,2011-01-03 09:00:00,2011-01-03 09:00:00,2011-01-03 09:00:00,2011-01-03 09:00:00,32817.0,,4171047,3000001,,cefazolin,iv
4,6000005,1000001,1771162,2011-01-02 09:00:00,2011-01-02 09:00:00,2011-01-02 09:00:00,2011-01-02 09:00:00,32817.0,,4171047,3000001,,cefazolin,iv
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7899028,13899029,1099900,1114122,2011-01-04 06:30:00,2011-01-04 06:30:00,2011-01-04 06:30:00,2011-01-04 06:30:00,32817.0,,4171047,3126753,,nalbuphine,iv
7899029,13899030,1099900,19015602,2011-01-03 10:20:00,2011-01-03 10:20:00,2011-01-03 10:20:00,2011-01-03 10:20:00,32817.0,,4171047,3126753,,nefopam,iv
7899030,13899031,1099900,1774932,2011-01-03 20:20:00,2011-01-03 20:20:00,2011-01-03 20:20:00,2011-01-03 20:20:00,32817.0,,4171047,3126753,,cefotetan,iv
7899031,13899032,1099900,1774932,2011-01-04 07:40:00,2011-01-04 07:40:00,2011-01-04 07:40:00,2011-01-04 07:40:00,32817.0,,4171047,3126753,,cefotetan,iv


In [139]:
drugs.reset_index(drop=False)

Unnamed: 0,concept_name,count,perc
0,calcium chloride / lactate / potassium chlorid...,63363,63.426426
1,propofol,58802,58.860861
2,ambroxol,53961,54.015015
3,famotidine,52731,52.783784
4,fentanyl,39257,39.296296
...,...,...,...
752,ulipristal,1,0.001001
753,carbomer,1,0.001001
754,rotigotine,1,0.001001
755,temsirolimus,1,0.001001


In [174]:
# calculate the counts of medicine used in patients
df_med_concept = df_concept[((df_concept['vocabulary_id'] == 'RxNorm') | (df_concept['vocabulary_id'] == 'RxNorm Extension')) & (df_concept['standard_concept'] == 'S')][['concept_name', 'concept_id']]
df_med_concept['concept_name'] = df_med_concept['concept_name'].str.lower()
df_med_concept.rename(columns={'concept_id': 'DRUG_CONCEPT_ID'}, inplace=True)

drug_counts = df_drug[['PERSON_ID', 'DRUG_CONCEPT_ID']].drop_duplicates(subset=['PERSON_ID', 'DRUG_CONCEPT_ID'])
drug_counts = drug_counts.merge(df_med_concept, on='DRUG_CONCEPT_ID', how='left')

drugs = drug_counts['concept_name'].value_counts().to_frame()
drugs = drugs.reset_index(drop=False)
drugs['perc'] = drugs['count'] / len(df_person) * 100
drugs['concept_id'] = drugs.merge(df_med_concept, on='concept_name', how='left')['DRUG_CONCEPT_ID']

drugs.rename(columns={'concept_name':'ingredient'}, inplace=True)
drugs.to_csv('inspire_v+medi_counts.csv', index=False)

drugs_medi = pd.read_csv('results/mover_medi_counts.csv')
drugs_medi['concept_id'] = drugs_medi['concept_id'].astype('Int64')
#drugs_medi.drop(columns='concept_id', inplace=True)

In [181]:
medi_total = pd.concat([drugs, drugs_medi], axis=0).groupby('concept_id').agg({'ingredient': 'first', 'count': 'sum', 'perc': 'sum'}).sort_values(by='count', ascending=False)
medi_total

Unnamed: 0_level_0,ingredient,count,perc
concept_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
753626,propofol,95712.0,154.252792
1154029,fentanyl,75377.0,132.646515
40166953,calcium chloride / lactate / potassium chlorid...,63363.0,63.426426
953076,famotidine,61721.0,76.017961
989878,lidocaine,57095.0,113.370812
...,...,...,...
754270,protriptyline hcl,1.0,0.002584
1836391,sulfadiazine,1.0,0.002584
715458,tiagabine hcl,1.0,0.002584
1836948,tetracycline,1.0,0.001001


In [182]:
medi_total = medi_total.merge(drugs, on='ingredient', how='left', suffixes=(None, '_inspire'))
medi_total = medi_total.merge(drugs_medi, on='ingredient', how='left', suffixes=(None, '_mover'))
medi_total

Unnamed: 0,ingredient,count,perc,count_inspire,perc_inspire,concept_id,concept_id_mover,count_mover,perc_mover
0,propofol,95712.0,154.252792,58802.0,58.860861,753626.0,753626,36895.0,95.353165
1,fentanyl,75377.0,132.646515,39257.0,39.296296,1154029.0,1154029,1859.0,4.804487
2,calcium chloride / lactate / potassium chlorid...,63363.0,63.426426,63363.0,63.426426,40166953.0,,,
3,famotidine,61721.0,76.017961,52731.0,52.783784,953076.0,953076,8985.0,23.221254
4,lidocaine,57095.0,113.370812,21591.0,21.612613,989878.0,989878,16594.0,42.886310
...,...,...,...,...,...,...,...,...,...
1231,protriptyline hcl,1.0,0.002584,,,,754270,1.0,0.002584
1232,sulfadiazine,1.0,0.002584,,,,1836391,1.0,0.002584
1233,tiagabine hcl,1.0,0.002584,,,,715458,1.0,0.002584
1234,tetracycline,1.0,0.001001,1.0,0.001001,1836948.0,,,


In [183]:
medi_total.to_csv('results/total_medi_counts.csv', index=False)

# PROCEDURE_OCCURRENCE

In [59]:
# Define the columns for the PROCEDURE_OCCURRENCE table
columns = [
    'PROCEDURE_OCCURRENCE_ID', 'PERSON_ID', 'PROCEDURE_CONCEPT_ID', 'PROCEDURE_DATE',
    'PROCEDURE_DATETIME', 'PROCEDURE_TYPE_CONCEPT_ID', 'MODIFIER_CONCEPT_ID',
    'QUANTITY', 'PROVIDER_ID', 'VISIT_OCCURRENCE_ID', 'VISIT_DETAIL_ID',
    'PROCEDURE_SOURCE_VALUE', 'PROCEDURE_SOURCE_CONCEPT_ID', 'MODIFIER_SOURCE_VALUE'
]

# Initialize an empty DataFrame for the PROCEDURE_OCCURRENCE table using the specified columns
df_proc = pd.DataFrame(columns=columns)

# Map corresponding PERSON_ID values from the PERSON table to the new PROCEDURE_OCCURRENCE DataFrame
df_proc['PERSON_ID'] = df_person['PERSON_ID']

# Extract subject_id from PERSON_SOURCE_VALUE for mapping with the operation data
df_proc['subject_id'] = df_person['PERSON_SOURCE_VALUE']

# Merge operation data with the newly initialized df_proc based on subject_id
df_proc = df_proc.merge(df_op, on='subject_id', how='left')

# Generate unique identifiers for each procedure occurrence
df_proc['PROCEDURE_OCCURRENCE_ID'] = np.arange(len(df_proc)) + 1

# Assign the standard concept IDs to the procedure concept ID column
df_proc['PROCEDURE_CONCEPT_ID'] = df_proc['standard_concept_id']

# Convert operation start and end times to datetime format using a defined base date
base_date = datetime(2011, 1, 1)
df_proc['PROCEDURE_DATETIME'] = base_date + pd.to_timedelta(df_proc['opstart_time'], unit='min')
df_proc['PROCEDURE_DATE'] = df_proc['PROCEDURE_DATETIME'].dt.date
df_proc['PROCEDURE_END_DATETIME'] = base_date + pd.to_timedelta(df_proc['opend_time'], unit='min')
df_proc['PROCEDURE_END_DATE'] = df_proc['PROCEDURE_END_DATETIME'].dt.date

# Assign a type concept ID indicating the data is sourced from an EHR
df_proc['PROCEDURE_TYPE_CONCEPT_ID'] = 32817

# Link each procedure to a corresponding visit by merging with the visit occurrence data
df_proc['VISIT_OCCURRENCE_ID'] = df_proc.merge(df_visit_occ[['hadm_id', 'VISIT_OCCURRENCE_ID']], 
                                               on='hadm_id', suffixes=('_x', None), how='left')['VISIT_OCCURRENCE_ID']
df_proc['VISIT_DETAIL_ID'] = df_proc.merge(df_visit_detail[['hadm_id', 'VISIT_DETAIL_ID']], 
                                                on='hadm_id', suffixes=('_x', None), how='left')['VISIT_DETAIL_ID']

# Populate source value and source concept ID columns using the operation data
df_proc['PROCEDURE_SOURCE_VALUE'] = df_proc['icd10_pcs']
df_proc['PROCEDURE_SOURCE_CONCEPT_ID'] = df_proc['source_concept_id']

# Filter the DataFrame to only include the originally defined columns
df_proc = df_proc[columns]

# Export the final PROCEDURE_OCCURRENCE data to a CSV file
df_proc.to_csv('INSPIRE_ETL/INSPIRE_PROCEDURE_OCCURRENCE.csv', index=False)

# MEASUREMENT

In [11]:
start = time.time()
columns = ['MEASUREMENT_ID',
 'PERSON_ID',
 'MEASUREMENT_CONCEPT_ID',
 'MEASUREMENT_DATE',
 'MEASUREMENT_DATETIME',
 'MEASUREMENT_TIME',
 'MEASUREMENT_TYPE_CONCEPT_ID',
 'OPERATOR_CONCEPT_ID',
 'VALUE_AS_NUMBER',
 'VALUE_AS_CONCEPT_ID',
 'UNIT_CONCEPT_ID',
 'RANGE_LOW',
 'RANGE_HIGH',
 'PROVIDER_ID',
 'VISIT_OCCURRENCE_ID',
 'VISIT_DETAIL_ID',
 'MEASUREMENT_SOURCE_VALUE',
 'MEASUREMENT_SOURCE_CONCEPT_ID',
 'UNIT_SOURCE_VALUE',
 'VALUE_SOURCE_VALUE']

df_measure = pd.DataFrame(columns = columns)

# Copy PERSON_ID values from the PERSON table to the MEASUREMENT table
df_measure['PERSON_ID'] = df_person['PERSON_ID']
# Copy PERSON_SOURCE_VALUE values as subject_id from df_person to df_measure
df_measure['subject_id'] = df_person['PERSON_SOURCE_VALUE']

# Filter out rows in df_vitals with null 'vocab' values
df_v = df_vitals.dropna(subset='vocab')
df_v = df_v.loc[df_v['vocab']=='LOINC'].drop(columns=['op_id', 'vocab'])

print(f'line34 : {time.time() - start}')
start = time.time()
# Concatenate df_v and df_m vertically
records = pd.concat([df_labs, df_v, df_ward], axis = 0)
records = records.merge(df_params[['Unit', 'unit_concept_id']].drop_duplicates(subset='Unit'), on='Unit', how='left')
del df_v

df_measure = df_measure.merge(records, on='subject_id', how='left')
del records

print(f'line44 : {time.time() - start}')
start = time.time()

# Generate unique identifiers for each measurement
df_measure['MEASUREMENT_ID'] = np.arange(len(df_measure)) + 1

df_measure['MEASUREMENT_CONCEPT_ID'] = df_measure['concept_id']

base_date = datetime(2011, 1, 1)
df_measure['MEASUREMENT_DATETIME'] = base_date + pd.to_timedelta(df_measure['chart_time'], unit='min')
df_measure['MEASUREMENT_DATE'] = df_measure['MEASUREMENT_DATETIME'].dt.date

# Assign a type concept ID indicating the data is sourced from an EHR
df_measure['MEASUREMENT_TYPE_CONCEPT_ID'] = 32817

# Concept for '=' operation
df_measure['OPERATOR_CONCEPT_ID'] = 4172703

print(f'line62 : {time.time() - start}')
start = time.time()
#df_measure['VALUE_AS_NUMBER'] = df_measure['value']
exceptions = [3003396, 3002032, 3006277, 3012501, 3003129, 3004959, 3007435]
valid_mask = (df_measure['value'] >= 0) | (df_measure['concept_id'].isin(exceptions))
df_measure.loc[valid_mask, 'VALUE_AS_NUMBER'] = df_measure['value']
df_measure.loc[~valid_mask, 'VALUE_AS_NUMBER'] = None

print(f'line70 : {time.time() - start}')
start = time.time()
df_measure['UNIT_CONCEPT_ID'] = df_measure['unit_concept_id']

# Convert 'chart_time' values (in minutes) again for merging with visit occurrences
df_measure['chart_date'] = base_date + pd.to_timedelta(df_measure['chart_time'], unit='min')

# Match measure exposure dates with visit occurrences based on 'PERSON_ID'
result = pd.merge(df_measure[['PERSON_ID', 'chart_date', 'MEASUREMENT_ID']], 
                df_visit_occ, on='PERSON_ID', how='left')
# Filter results to keep only those rows where 'chart_date' falls within a visit's start and end times
result = result[(result['chart_date'] >= result['VISIT_START_DATETIME']) & 
                (result['chart_date'] <= result['VISIT_END_DATETIME'])]

# Merge the filtered results with df_cond_occ to add 'VISIT_OCCURRENCE_ID' details to df_measure
df_measure['VISIT_OCCURRENCE_ID'] =  df_measure.merge(result[['MEASUREMENT_ID', 'VISIT_OCCURRENCE_ID']], 
                                on='MEASUREMENT_ID', 
                                how='left', 
                                suffixes=('_x', None))['VISIT_OCCURRENCE_ID']


# Link measurements to their associated visit detail
linked_visits = pd.merge(df_measure[['PERSON_ID', 'chart_date', 'MEASUREMENT_ID']], df_visit_detail, on='PERSON_ID', how='left')
# Filter to include only measurements taken during a visit's duration
linked_visits = linked_visits[(linked_visits['chart_date'] >= linked_visits['VISIT_DETAIL_START_DATETIME']) & 
                              (linked_visits['chart_date'] <= linked_visits['VISIT_DETAIL_END_DATETIME'])]
df_measure['VISIT_DETAIL_ID'] = df_measure.merge(linked_visits[['MEASUREMENT_ID', 'VISIT_DETAIL_ID']], 
                                                     on='MEASUREMENT_ID', how='left',suffixes=('_x', None))['VISIT_DETAIL_ID']

del linked_visits

print(f'line90 : {time.time() - start}')
start = time.time()
#df_measure['VISIT_DETAIL_ID'] = df_measure['VISIT_OCCURRENCE_ID']

df_measure['MEASUREMENT_SOURCE_VALUE'] = df_measure['value']

df_measure['UNIT_SOURCE_VALUE'] = df_measure['Unit']

df_measure = df_measure[columns]

df_measure.to_csv('INSPIRE_ETL/INSPIRE_MEASUREMENT.csv', index=False)

print(f'line102 : {time.time() - start}')

start = time.time()

line34 : 10.768936157226562
line44 : 71.09569549560547
line62 : 26.593946933746338
line70 : 46.20182418823242
line90 : 174.40139317512512
line102 : 887.9683330059052


In [6]:
start = time.time()

# Define the columns for the MEASUREMENT table in OMOP CDM format
columns = ['MEASUREMENT_ID',
 'PERSON_ID',
 'MEASUREMENT_CONCEPT_ID',
 'MEASUREMENT_DATE',
 'MEASUREMENT_DATETIME',
 'MEASUREMENT_TIME',
 'MEASUREMENT_TYPE_CONCEPT_ID',
 'OPERATOR_CONCEPT_ID',
 'VALUE_AS_NUMBER',
 'VALUE_AS_CONCEPT_ID',
 'UNIT_CONCEPT_ID',
 'RANGE_LOW',
 'RANGE_HIGH',
 'PROVIDER_ID',
 'VISIT_OCCURRENCE_ID',
 'VISIT_DETAIL_ID',
 'MEASUREMENT_SOURCE_VALUE',
 'MEASUREMENT_SOURCE_CONCEPT_ID',
 'UNIT_SOURCE_VALUE',
 'VALUE_SOURCE_VALUE']

# Initialize an empty DataFrame for storing MEASUREMENT data
df_measure = pd.DataFrame(columns=columns)

# Populate 'PERSON_ID' and 'subject_id' columns in MEASUREMENT table from the PERSON table
df_measure['PERSON_ID'] = df_person['PERSON_ID']
df_measure['subject_id'] = df_person['PERSON_SOURCE_VALUE']

# Filter out measurements from df_vitals where the vocabulary is not LOINC
df_v = df_vitals.dropna(subset='vocab')
df_v = df_v[df_v['vocab']=='LOINC'].drop(['op_id', 'vocab'], axis=1)

# Combine data from various sources (labs, vitals, wards) into a single DataFrame
records = pd.concat([df_labs, df_v, df_ward], axis=0)
# Enrich the combined records with associated unit concept IDs
records = records.merge(df_params[['Unit', 'unit_concept_id']].drop_duplicates(subset='Unit'), on='Unit', how='left')
# Release memory
del df_v

# Merge the enriched records with MEASUREMENT table on 'subject_id'
df_measure = df_measure.merge(records, on='subject_id', how='left')
# Release memory
del records

# Assign unique MEASUREMENT_IDs to each row
df_measure['MEASUREMENT_ID'] = np.arange(1, len(df_measure) + 1)

# Extract and set the relevant concept and datetime details for each measurement
df_measure['MEASUREMENT_CONCEPT_ID'] = df_measure['concept_id']
base_date = datetime(2011, 1, 1)
df_measure['MEASUREMENT_DATETIME'] = base_date + pd.to_timedelta(df_measure['chart_time'], unit='min')
df_measure['MEASUREMENT_DATE'] = df_measure['MEASUREMENT_DATETIME'].dt.date
df_measure['MEASUREMENT_TYPE_CONCEPT_ID'] = 32817  # EHR as data source
df_measure['OPERATOR_CONCEPT_ID'] = 4172703  # '=' operation

# Handle special cases for 'VALUE_AS_NUMBER' based on specific concept IDs
exceptions = [3003396, 3002032, 3006277, 3012501, 3003129, 3004959, 3007435]
valid_mask = (df_measure['value'] >= 0) | df_measure['concept_id'].isin(exceptions)
df_measure.loc[valid_mask, 'VALUE_AS_NUMBER'] = df_measure['value']
df_measure.loc[~valid_mask, 'VALUE_AS_NUMBER'] = None

# Set the 'UNIT_CONCEPT_ID' values
df_measure['UNIT_CONCEPT_ID'] = df_measure['unit_concept_id']

## Match visit_occurrence_id, visit_detail_id based on chart_time
# Convert 'chart_time' to 'chart_date' to facilitate visit occurrence matching
df_measure['chart_date'] = base_date + pd.to_timedelta(df_measure['chart_time'], unit='min')

# Link measurements to their associated visit occurrences
linked_visits = pd.merge(df_measure[['PERSON_ID', 'chart_date', 'MEASUREMENT_ID']], df_visit_occ, on='PERSON_ID', how='left')
# Filter to include only measurements taken during a visit's duration
linked_visits = linked_visits[(linked_visits['chart_date'] >= linked_visits['VISIT_START_DATETIME']) & 
                              (linked_visits['chart_date'] <= linked_visits['VISIT_END_DATETIME'])]
df_measure['VISIT_OCCURRENCE_ID'] = df_measure.merge(linked_visits[['MEASUREMENT_ID', 'VISIT_OCCURRENCE_ID']], 
                                                     on='MEASUREMENT_ID', how='left', suffixes=('_x', None))['VISIT_OCCURRENCE_ID']


# Link measurements to their associated visit detail
linked_visits = pd.merge(df_measure[['PERSON_ID', 'chart_date', 'MEASUREMENT_ID']], df_visit_detail, on='PERSON_ID', how='left')
# Filter to include only measurements taken during a visit's duration
linked_visits = linked_visits[(linked_visits['chart_date'] >= linked_visits['VISIT_DETAIL_START_DATETIME']) & 
                              (linked_visits['chart_date'] <= linked_visits['VISIT_DETAIL_END_DATETIME'])]
df_measure['VISIT_DETAIL_ID'] = df_measure.merge(linked_visits[['MEASUREMENT_ID', 'VISIT_DETAIL_ID']], 
                                                     on='MEASUREMENT_ID', how='left',suffixes=('_x', None))['VISIT_DETAIL_ID']

del linked_visits

# Set source value columns
df_measure['MEASUREMENT_SOURCE_VALUE'] = df_measure['value']
df_measure['UNIT_SOURCE_VALUE'] = df_measure['Unit']

# Retain only the relevant columns in the final MEASUREMENT table
df_measure = df_measure[columns]

print(time.time() - start)
start = time.time()

# Export the final MEASUREMENT table to CSV
df_measure.to_csv('INSPIRE_ETL/INSPIRE_MEASUREMENT.csv', index=False)

print(time.time() - start)

984.7650737762451
1737.8386561870575


In [9]:
df_measure[~df_measure['VISIT_DETAIL_ID'].isna()]

Unnamed: 0,MEASUREMENT_ID,PERSON_ID,MEASUREMENT_CONCEPT_ID,MEASUREMENT_DATE,MEASUREMENT_DATETIME,MEASUREMENT_TIME,MEASUREMENT_TYPE_CONCEPT_ID,OPERATOR_CONCEPT_ID,VALUE_AS_NUMBER,VALUE_AS_CONCEPT_ID,UNIT_CONCEPT_ID,RANGE_LOW,RANGE_HIGH,PROVIDER_ID,VISIT_OCCURRENCE_ID,VISIT_DETAIL_ID,MEASUREMENT_SOURCE_VALUE,MEASUREMENT_SOURCE_CONCEPT_ID,UNIT_SOURCE_VALUE,VALUE_SOURCE_VALUE
549,550,2,3014111.0,2011-01-03,2011-01-03 15:20:00,,32817,4172703,2.6,,8753.0,,,,2.0,1.0,2.60,,mmol/L,
550,551,2,3014111.0,2011-01-06,2011-01-06 09:05:00,,32817,4172703,1.9,,8753.0,,,,2.0,1.0,1.90,,mmol/L,
552,553,2,3016723.0,2011-01-03,2011-01-03 19:05:00,,32817,4172703,2.76,,8840.0,,,,2.0,1.0,2.76,,mg/dL,
553,554,2,3016723.0,2011-01-03,2011-01-03 23:15:00,,32817,4172703,2.41,,8840.0,,,,2.0,1.0,2.41,,mg/dL,
554,555,2,3016723.0,2011-01-04,2011-01-04 04:20:00,,32817,4172703,2.49,,8840.0,,,,2.0,1.0,2.49,,mg/dL,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
127393840,127393841,99893,21490528.0,2011-01-02,2011-01-02 14:40:00,,32817,4172703,34.0,,8554.0,,,,,14209.0,34.00,,%,
127393841,127393842,99893,3024171.0,2011-01-02,2011-01-02 14:40:00,,32817,4172703,12.0,,8541.0,,,,,14209.0,12.00,,/min,
127393842,127393843,99893,3027018.0,2011-01-02,2011-01-02 14:45:00,,32817,4172703,66.0,,8541.0,,,,,14209.0,66.00,,/min,
127393843,127393844,99893,3013502.0,2011-01-02,2011-01-02 14:45:00,,32817,4172703,99.0,,8554.0,,,,,14209.0,99.00,,%,


In [7]:
len(df_measure)

127398829

In [8]:
df_measure[:100]

Unnamed: 0,MEASUREMENT_ID,PERSON_ID,MEASUREMENT_CONCEPT_ID,MEASUREMENT_DATE,MEASUREMENT_DATETIME,MEASUREMENT_TIME,MEASUREMENT_TYPE_CONCEPT_ID,OPERATOR_CONCEPT_ID,VALUE_AS_NUMBER,VALUE_AS_CONCEPT_ID,UNIT_CONCEPT_ID,RANGE_LOW,RANGE_HIGH,PROVIDER_ID,VISIT_OCCURRENCE_ID,VISIT_DETAIL_ID,MEASUREMENT_SOURCE_VALUE,MEASUREMENT_SOURCE_CONCEPT_ID,UNIT_SOURCE_VALUE,VALUE_SOURCE_VALUE
0,1,1,3018677.0,2011-01-01,2011-01-01 15:25:00,,32817,4172703,36.7,,8555.0,,,,1.0,,36.70,,sec,
1,2,1,3034426.0,2011-01-01,2011-01-01 15:25:00,,32817,4172703,1.05,,44818586.0,,,,1.0,,1.05,,INR,
2,3,1,3016407.0,2011-01-01,2011-01-01 15:25:00,,32817,4172703,222.0,,8840.0,,,,1.0,,222.00,,mg/dL,
3,4,1,3000963.0,2011-01-01,2011-01-01 15:25:00,,32817,4172703,8.7,,8713.0,,,,1.0,,8.70,,g/dL,
4,5,1,3009542.0,2011-01-01,2011-01-01 15:25:00,,32817,4172703,30.5,,8554.0,,,,1.0,,30.50,,%,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,96,1,3020891.0,2011-01-01,2011-01-01 18:40:00,,32817,4172703,28.6,,586323.0,,,,1.0,,28.60,,Celsius,
96,97,1,21490569.0,2011-01-01,2011-01-01 18:40:00,,32817,4172703,19.0,,8876.0,,,,1.0,,19.00,,mmHg,
97,98,1,21490624.0,2011-01-01,2011-01-01 18:40:00,,32817,4172703,3.15,,8554.0,,,,1.0,,3.15,,%,
98,99,1,21490718.0,2011-01-01,2011-01-01 18:40:00,,32817,4172703,8.0,,8698.0,,,,1.0,,8.00,,L/min,


# DEATH

In [13]:
import pandas as pd

# columns of PERSON
df_hira = pd.read_csv('sample/HIRA/HIRA_DEATH_v1.csv')
columns = list(df_hira.columns)
columns

['PERSON_ID',
 'DEATH_DATE',
 'DEATH_DATETIME',
 'DEATH_TYPE_CONCEPT_ID',
 'CAUSE_CONCEPT_ID',
 'CAUSE_SOURCE_VALUE',
 'CAUSE_SOURCE_CONCEPT_ID']

In [14]:
columns = ['PERSON_ID',
 'DEATH_DATE',
 'DEATH_DATETIME',
 'DEATH_TYPE_CONCEPT_ID',
 'CAUSE_CONCEPT_ID',
 'CAUSE_SOURCE_VALUE',
 'CAUSE_SOURCE_CONCEPT_ID']

# Initialize an empty DataFrame for storing DEATH table
df_death = pd.DataFrame(columns=columns)

# Populate 'PERSON_ID' and 'subject_id' columns in the DEATH table from the PERSON table
df_death['PERSON_ID'] = df_person['PERSON_ID']
df_death['subject_id'] = df_person['PERSON_SOURCE_VALUE']

# Merge 'inhosp_death_time' from the operations (df_op) table into the DEATH table using 'subject_id'
df_death = df_death.merge(df_op[['subject_id', 'inhosp_death_time']], on='subject_id', how='left')

# Define the reference date for datetime calculations
base_date = datetime(2011, 1, 1)

# Convert in-hospital death times to actual datetime objects using the base_date as the reference point
df_death['DEATH_DATETIME'] = base_date + pd.to_timedelta(df_death['inhosp_death_time'], unit='min')
df_death['DEATH_DATE'] = df_death['DEATH_DATETIME'].dt.date

# Set the DEATH_TYPE_CONCEPT_ID to represent data sourced from an Electronic Health Record (EHR)
df_death['DEATH_TYPE_CONCEPT_ID'] = 32817

# Retain only the relevant columns in the final DEATH table
df_death = df_death[columns]

# Export the final DEATH table to CSV
df_death.to_csv('INSPIRE_ETL/INSPIRE_DEATH.csv', index=False)

# NOTE

In [37]:
# Define the columns for the NOTE table in the OMOP CDM format
columns = [
    'NOTE_ID', 'PERSON_ID', 'NOTE_DATE', 'NOTE_DATETIME', 'NOTE_TYPE_CONCEPT_ID',
    'NOTE_CLASS_CONCEPT_ID', 'NOTE_TITLE', 'NOTE_TEXT', 'ENCODING_CONCEPT_ID',
    'LANGUAGE_CONCEPT_ID', 'PROVIDER_ID', 'VISIT_OCCURRENCE_ID', 'VISIT_DETAIL_ID',
    'NOTE_SOURCE_VALUE', 'NOTE_EVENT_ID', 'NOTE_EVENT_FIELD_CONCEPT_ID']

# Initialize an empty DataFrame for storing NOTE table
df_note = pd.DataFrame(columns=columns)

# Populate 'PERSON_ID' and 'subject_id' columns in the NOTE table from the PERSON table
df_note['PERSON_ID'] = df_person['PERSON_ID']
df_note['subject_id'] = df_person['PERSON_SOURCE_VALUE']


# residual fields that are not mapped in operation table
res_fields = ['asa', 'emop', 'department', 'antype', 'orin_time', 'orout_time', 'anstart_time', 'anend_time', 'cpbon_time', 'cpboff_time']
res_op = pd.melt(df_op, id_vars=['subject_id', 'opdate'], value_vars=res_fields)
df_note = df_note.merge(res_op, on='subject_id', how='left')
df_note.dropna(subset='value', inplace=True, ignore_index=True)

# Assign unique sequential IDs to the 'NOTE_ID' column
df_note['NOTE_ID'] = np.arange(len(df_note)) + 1

base_date = datetime(2011, 1, 1)
df_note['NOTE_DATETIME'] = base_date + pd.to_timedelta(df_note['opdate'], unit='min')
df_note['NOTE_DATE'] = df_note['NOTE_DATETIME'].dt.date

# Set the NOTE_TYPE_CONCEPT_ID to represent data sourced from an Electronic Health Record (EHR)
df_note['NOTE_TYPE_CONCEPT_ID'] = 32817

# Use the concept id  706617(Anesthesiology) or 706502(Surgical operation).
res_ane = ['asa', 'antype','anstart_time', 'anend_time']
df_note.loc[df_note['variable'].isin(res_ane), 'NOTE_CLASS_CONCEPT_ID'] = 706617
df_note.loc[~df_note['variable'].isin(res_ane), 'NOTE_CLASS_CONCEPT_ID'] = 706502

df_note['NOTE_TITLE'] = df_note['variable']
df_note['NOTE_TEXT'] = df_note['value']

# Use the concept_id 32678(UTF-8)
df_note['ENCODING_CONCEPT_ID'] = 32678

# Use the concept_id 4180186(English language)
df_note['LANGUAGE_CONCEPT_ID'] = 4180186 

df_note = match_visit(df_note, 'NOTE_ID', df_visit_occ, df_visit_detail, on = 'opdate')

# Retain only the relevant columns in the final NOTE table
df_note = df_note[columns]

# Export the final NOTE table to CSV
df_note.to_csv('INSPIRE_ETL/INSPIRE_NOTE.csv', index=False)

In [38]:
df_note

Unnamed: 0,NOTE_ID,PERSON_ID,NOTE_DATE,NOTE_DATETIME,NOTE_TYPE_CONCEPT_ID,NOTE_CLASS_CONCEPT_ID,NOTE_TITLE,NOTE_TEXT,ENCODING_CONCEPT_ID,LANGUAGE_CONCEPT_ID,PROVIDER_ID,VISIT_OCCURRENCE_ID,VISIT_DETAIL_ID,NOTE_SOURCE_VALUE,NOTE_EVENT_ID,NOTE_EVENT_FIELD_CONCEPT_ID
0,1,1,2011-01-01,2011-01-01,32817,706502,emop,1,32678,4180186,,1.0,,,,
1,2,1,2011-01-01,2011-01-01,32817,706502,department,OT,32678,4180186,,1.0,,,,
2,3,1,2011-01-01,2011-01-01,32817,706617,antype,General,32678,4180186,,1.0,,,,
3,4,1,2011-01-01,2011-01-01,32817,706502,orin_time,1110,32678,4180186,,1.0,,,,
4,5,1,2011-01-01,2011-01-01,32817,706502,orout_time,1245,32678,4180186,,1.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1049989,1049950,99900,2011-01-03,2011-01-03,32817,706617,antype,General,32678,4180186,,126754.0,,,,
1049990,1049951,99900,2011-01-03,2011-01-03,32817,706502,orin_time,3355,32678,4180186,,126754.0,,,,
1049991,1049952,99900,2011-01-03,2011-01-03,32817,706502,orout_time,3430,32678,4180186,,126754.0,,,,
1049992,1049953,99900,2011-01-03,2011-01-03,32817,706617,anstart_time,3360.0,32678,4180186,,126754.0,,,,


In [36]:
def match_visit(table, unique_id, df_visit_occ, df_visit_detail, on='chart_time'):
    # Matches visit_occurrence_id and visit_detail_id based on chart_time
    # table: a target table that needs to match visit_ids
    # unique_id: an unique identifier of a table
    # on: the column name for the reference
    
    # Convert 'chart_time' values (in minutes) again for merging with visit occurrences
    table['chart_date'] = base_date + pd.to_timedelta(table[on], unit='min')

    # Match drug exposure dates with visit occurrences based on 'PERSON_ID'
    result = pd.merge(table[['PERSON_ID', 'chart_date', unique_id]], 
                    df_visit_occ[['PERSON_ID', 'VISIT_OCCURRENCE_ID', 'VISIT_START_DATETIME', 'VISIT_END_DATETIME']], 
                      on='PERSON_ID', how='left')
    # Filter results to keep only those rows where 'chart_date' falls within a visit's start and end times
    result = result[(result['chart_date'] >= result['VISIT_START_DATETIME']) & 
                    (result['chart_date'] <= result['VISIT_END_DATETIME'])]

    
    # Merge the filtered results with df_cond_occ to add 'VISIT_OCCURRENCE_ID' details to table
    table = table.merge(result[[unique_id, 'VISIT_OCCURRENCE_ID']], 
                                    on=unique_id, 
                                    how='left', 
                                    suffixes=('_x', None))
    del result

    # Match drug exposure dates with visit occurrences based on 'PERSON_ID'
    result = pd.merge(table[['PERSON_ID', 'chart_date', unique_id]], 
                    df_visit_detail[['PERSON_ID', 'VISIT_DETAIL_ID', 'VISIT_DETAIL_START_DATETIME', 'VISIT_DETAIL_END_DATETIME']], 
                      on='PERSON_ID', how='left')
    # Filter results to keep only those rows where 'chart_date' falls within a visit's start and end times
    result = result[(result['chart_date'] >= result['VISIT_DETAIL_START_DATETIME']) & 
                    (result['chart_date'] <= result['VISIT_DETAIL_END_DATETIME'])]

    # Merge the filtered results with df_cond_occ to add 'VISIT_DETAIL_ID' details to table
    table = table.merge(result[[unique_id, 'VISIT_DETAIL_ID']], 
                                    on=unique_id, 
                                    how='left', 
                                    suffixes=('_x', None))  
    del result
    
    return table

# EDA of data

In [116]:
df_op[df_op['subject_id']==178742874]

Unnamed: 0,op_id,subject_id,hadm_id,case_id,opdate,age,sex,weight,height,race,...,opend_time,admission_time,discharge_time,anstart_time,anend_time,cpbon_time,cpboff_time,icuin_time,icuout_time,inhosp_death_time
0,484069807,178742874,229842382,,0,30,F,48.0,153.0,Asian,...,1230.0,0,7195,1120.0,1235.0,,,,,


In [118]:
df1 = df_vitals[df_vitals['subject_id']==178742874]
df1

Unnamed: 0,op_id,subject_id,chart_time,item_name,value,concept_id,vocab
22468646,484069807,178742874,1120,rr,12.0,3024171.0,LOINC
22468647,484069807,178742874,1125,rr,26.5,3024171.0,LOINC
22468648,484069807,178742874,1130,rr,15.0,3024171.0,LOINC
22468649,484069807,178742874,1135,rr,15.0,3024171.0,LOINC
22468650,484069807,178742874,1140,rr,15.0,3024171.0,LOINC
...,...,...,...,...,...,...,...
62482745,484069807,178742874,1240,etco2,38.0,21490569.0,LOINC
62482746,484069807,178742874,1240,minvol,4.4,42527120.0,LOINC
62482747,484069807,178742874,1240,vt,288.0,21490854.0,LOINC
62482748,484069807,178742874,1240,rr,28.0,3024171.0,LOINC


In [119]:
df1[df1['vocab'] == 'RxNorm']

Unnamed: 0,op_id,subject_id,chart_time,item_name,value,concept_id,vocab
62482396,484069807,178742874,1020,hs,0.0,40166953.0,RxNorm
62482594,484069807,178742874,1180,hs,300.0,40166953.0,RxNorm


In [120]:
df_labs[df_labs['subject_id']==178742874]

Unnamed: 0,subject_id,chart_time,item_name,value,concept_id
10668380,178742874,925,aptt,36.7,3018677
10668381,178742874,925,ptinr,1.05,3034426
10668382,178742874,925,fibrinogen,222.0,3016407
10668383,178742874,925,hb,8.7,3000963
10668384,178742874,925,hct,30.5,3009542
10668385,178742874,925,wbc,5.62,3010813
10668386,178742874,925,platelet,232.0,3007461
10668387,178742874,925,rbc,3.79,3040494
10668388,178742874,925,lymphocyte,27.2,3019198
10668389,178742874,925,sodium,138.0,3019550


# Appendix

In [42]:
df_params

Unnamed: 0,Table,Label,Unit,Description,concept_name,concept_id,vocab,unit_concept_id
0,labs,albumin,g/dL,Albumin,Albumin [Mass/volume] in Serum or Plasma,3024561,LOINC,8713.0
1,labs,alp,IU/L,Alkaline phosphatase,Alkaline phosphatase [Enzymatic activity/volum...,3035995,LOINC,8923.0
2,labs,alt,IU/L,Alanine transaminase,Alanine aminotransferase [Enzymatic activity/v...,3006923,LOINC,8923.0
3,labs,aptt,sec,Activated partial thromboplastin time,aPTT in Platelet poor plasma by Coagulation assay,3018677,LOINC,8555.0
4,labs,ast,IU/L,Aspartate transaminase,Aspartate aminotransferase [Enzymatic activity...,3013721,LOINC,8923.0
...,...,...,...,...,...,...,...,...
115,ward_vitals,nibp_mbp,mmHg,Non-invasive mean blood pressure,Mean blood pressure by Noninvasive,21492241,LOINC,8876.0
116,ward_vitals,nibp_sbp,mmHg,Non-invasive systolic blood pressure,Systolic blood pressure by Noninvasive,21492239,LOINC,8876.0
117,ward_vitals,rr,/min,Respiration rate,Respiratory rate,3024171,LOINC,8541.0
118,ward_vitals,spo2,%,Peripheral oxygen saturation,Oxygen saturation in Blood,3013502,LOINC,8554.0


## measurement

In [46]:
m_concepts = df_measure['MEASUREMENT_CONCEPT_ID'].values

In [65]:
df = pd.DataFrame({'concept_id': m_concepts})

df = df.merge(df_params[['concept_id', 'Table', 'Label', 'Description']], on='concept_id', how='left')

In [67]:
df1 = df[df['Table']=='labs']
(df1['Label'].value_counts() / len(df) * 100).to_csv('labs_count.csv')

df1 = df[df['Table']=='vitals']
(df1['Label'].value_counts() / len(df) * 100).to_csv('vitals_count.csv')

df1 = df[df['Table']=='ward_vitals']
(df1['Label'].value_counts() / len(df) * 100).to_csv('ward-vitals_count.csv')

In [63]:
(df['Label'].value_counts() / len(df) * 100).to_csv('params_count.csv')

In [58]:
df['Label'].value_counts()

Label
hr            29309356
rr            23218476
nibp_sbp      17755264
nibp_dbp      17754254
spo2          17466000
                ...   
pc                5009
troponin_t         488
d_dimer            317
pheresis           135
etiso                5
Name: count, Length: 83, dtype: int64

In [None]:
# Medi_counts
drugs_per_subject = df_medi_mapped.drop_duplicates(subset=['subject_id', 'drug_name'])
drug_counts = drugs_per_subject['drug_name'].value_counts().to_frame()
drug_counts.reset_index(inplace=True)

drug_counts['perc'] = drug_counts['count'] / 99900 * 100
drug_counts['mismatch'] = np.where(drug_counts['drug_name'].isin(df_mismatch['drug_name']), 1, np.nan)
drug_counts.to_csv('results/medi_counts.csv', index=False)
drug_counts

In [48]:
element_counts = {}
for item in df['Label']:
    if item in element_counts:
        element_counts[item] += 1
    else:
        element_counts[item] = 1

print(element_counts)

{3018677.0: 335644, 3034426.0: 384883, 3016407.0: 279062, 3000963.0: 886026, 3009542.0: 1043957, 3010813.0: 812463, 3007461.0: 797465, 3040494.0: 861561, 3002030.0: 755610, 3019550.0: 1002519, 3023103.0: 1005922, 3014576.0: 769319, 3013682.0: 735447, 3016723.0: 1456067, 3013721.0: 667302, 3006923.0: 667574, 3011904.0: 723416, 3020630.0: 657604, 3024561.0: 710169, 3035995.0: 652247, 3024171.0: 11609238, 21490718.0: 316036, 21490716.0: 262851, 21490634.0: 140322, 21492239.0: 8877632, 21492240.0: 8877127, 21492241.0: 4037275, 3020891.0: 8393042, 3027018.0: 14654678, 3013502.0: 8733000, 21490569.0: 3781409, 21490624.0: 1457438, 42527120.0: 3241996, 21490854.0: 3293827, 21490650.0: 3436921, 21490528.0: 3612297, 3014111.0: 235385, 3004410.0: 45367, 3006906.0: 961487, 3043744.0: 683499, 3004501.0: 1483749, 3019977.0: 354407, 3027946.0: 353186, 3027801.0: 353435, 3008152.0: 409909, 3016502.0: 410824, 3003396.0: 258351, 3007220.0: 66549, 3005785.0: 68998, 3021337.0: 55519, 3020460.0: 465703, 30

## medication

In [73]:
df = df_medi['drug_name'].value_counts()
df.to_frame()

Unnamed: 0_level_0,count
drug_name,Unnamed: 1_level_1
ambroxol,773924
famotidine,396963
magnesium oxide,315366
cefazolin,238256
acetylcysteine,205115
...,...
estradiol,1
fulvestrant,1
clobetasol,1
mepivacaine,1


In [75]:
df = df_medi['drug_name'].value_counts().to_frame()
df['perc'] = df['count'] / len(df_medi) * 100
df.to_csv('results/inspire_medi_counts.csv')

In [83]:
df_medi

Unnamed: 0,subject_id,chart_time,drug_name,route,concept_name,concept_id
0,117512122,2832985,pregabalin,po,pregabalin,734354
24,136795633,2305320,pregabalin,po,pregabalin,734354
25,167150933,2447040,pregabalin,po,pregabalin,734354
198,147607600,79260,pregabalin,po,pregabalin,734354
315,188223291,140160,pregabalin,po,pregabalin,734354
...,...,...,...,...,...,...
6952774,116930432,1320,diazoxide,po,diazoxide,1523280
6952775,197368441,357665,nintedanib,po,nintedanib,45775396
6952934,172516290,507720,carbomer,ex,carbomer,19055217
6952935,179336344,9015,dolutegravir,po,dolutegravir,43560385


In [82]:
drugs_per_subject = df_medi.drop_duplicates(subset=['subject_id', 'drug_name'])
drug_counts = drugs_per_subject['drug_name'].value_counts()
drug_counts

TypeError: 'NoneType' object is not subscriptable

In [79]:
df_mismatch = df_medi[df_medi['concept_id'].isna()]

df_mismatch.drop_duplicates(subset=['subject_id', 'drug_name'])

df_mismatch['drug_name'].value_counts()
#df_mismatch['drug_name'].value_counts().to_csv('medi_mismatch.csv')

Series([], Name: count, dtype: int64)

# Final
* person table에 subject_id 제거