# Loading MOVER SIS_EMR dataset

In [6]:
# Import necessary libraries
import pandas as pd
import numpy as np
from datetime import datetime

# MOVER (about 39,,685 patients, 64,354 cases)
# Define the path to the INSPIRE v2 dataset
input_path = 'mover/SIS_EMR'

# Load the source tables within INSPIRE v2 into dataframes
df_info = pd.read_csv(f'{input_path}/patient_information.csv')         # Load information data
df_medi = pd.read_csv(f'{input_path}/patient_medication.csv')       # Load medications data
df_labs = pd.read_csv(f'{input_path}/patient_labs.csv')       # Load labs data
df_io = pd.read_csv(f'{input_path}/patient_input_output.csv')       # Load input_output data
df_vitals = pd.read_csv(f'{input_path}/patient_vitals.csv', on_bad_lines='skip')          # Load  vitals data
df_obs = pd.read_csv(f'{input_path}/patient_observations.csv', on_bad_lines='skip')    # Load observations table
df_vent = pd.read_csv(f'{input_path}/patient_ventilator.csv', on_bad_lines='skip')
df_aline = pd.read_csv(f'{input_path}/patient_a_line.csv', on_bad_lines='skip')
df_events = pd.read_csv(f'{input_path}/patient_procedure_events.csv', on_bad_lines='skip')          # Load vitals data


# Display the number of records in each dataset
print(f'Size of the tables: information {len(df_info)}, medication {len(df_medi)},\
      labs {len(df_labs)}, input_output {len(df_io)}, vitals {len(df_vitals)}, observations {len(df_obs)},\
      ventilator {len(df_vent)}, aline {len(df_aline)}, events {len(df_events)}')

# Combine all the subject_ids from the loaded datasets
subject_ids = df_io['PID'].tolist() + df_labs['PID'].tolist() + df_medi['PID'].tolist() + df_info['PID'].tolist() + df_obs['PID'].tolist() + df_events['PID'].tolist() + df_vent['PID'].tolist() + df_aline['PID'].tolist()

# Display the total unique subjects present in the combined dataset
print(f'total subjects in MOVER dataset: {len(np.unique(subject_ids))}')
print(f"total subjects in patient_information.csv: {len(np.unique(df_info['PID']))}")

  df_obs = pd.read_csv(f'{input_path}/patient_observations.csv', on_bad_lines='skip')    # Load observations table


Size of the tables: information 19114, medication 373852,      labs 14733, input_output 100993, vitals 3847548, observations 3663066,      ventilator 1048575, aline 2989, events 40801
total subjects in MOVER dataset: 19158
total subjects in patient_information.csv: 19114


In [36]:
# Load Athena concept, concept_relationship table
df_concept_rel = pd.read_csv(f'vocab/CONCEPT_RELATIONSHIP.csv', sep='\t', on_bad_lines='error')
df_concept = pd.read_csv(f'vocab/CONCEPT.csv', sep='\t')

  df_concept = pd.read_csv(f'vocab/CONCEPT.csv', sep='\t')


In [17]:
df_info = df_info.drop_duplicates().reset_index(drop=True)
df_io = df_io.drop_duplicates().reset_index(drop=True)
df_medi = df_medi.drop_duplicates().reset_index(drop=True)
df_vitals = df_vitals.drop_duplicates().reset_index(drop=True)
df_events = df_events.drop_duplicates().reset_index(drop=True)
df_labs = df_labs.drop_duplicates().reset_index(drop=True)
df_obs = df_obs.drop_duplicates().reset_index(drop=True)
df_vent = df_vent.drop_duplicates().reset_index(drop=True)
df_aline = df_aline.drop_duplicates().reset_index(drop=True)

# Display the number of records in each dataset
print(f'Size of the tables: information {len(df_info)}, medication {len(df_medi)},\
      labs {len(df_labs)}, input_output {len(df_io)}, vitals {len(df_vitals)}, observations {len(df_obs)},\
      ventilator {len(df_vent)}, aline {len(df_aline)}, events {len(df_events)}')


Size of the tables: information 19114, medication 373616,      labs 14733, input_output 100967, vitals 3595596, observations 2684312,      ventilator 1048575, aline 2951, events 40697


In [24]:
### patient_information table
# Convert the datetime in string into datetime64[ns] type
df_info['OR_start'] = pd.to_datetime(df_info['OR_start'], format='%m/%d/%y %H:%M', errors='raise')
df_info['OR_end'] = pd.to_datetime(df_info['OR_end'], format='%m/%d/%y %H:%M', errors='raise')
df_info['Surgery_start'] = pd.to_datetime(df_info['Surgery_start'], format='%m/%d/%y %H:%M', errors='raise')
df_info['Surgery_end'] = pd.to_datetime(df_info['Surgery_end'], format='%m/%d/%y %H:%M', errors='raise')

# Change string into numeric value for height and weight
df_info['Ht'] = pd.to_numeric(df_info['Ht'], errors='coerce')
df_info['Wt'] = pd.to_numeric(df_info['Wt'], errors='coerce')

df_info.to_parquet(f'{input_path}/parquet/patient_information.parquet', index=False)

In [53]:
### patient medication table
# Regular expression to extract the string before a number or '/'
df_medi['ingredient'] = df_medi['Drug_name'].str.extract(r'([^0-9/]+)')
# Lower cases and strip whitespace
df_medi['ingredient'] = df_medi['ingredient'].str.lower().str.strip()

# Split ingredients that are coupled by '-'
df_medi = df_medi.assign(ingredient=df_medi['ingredient'].str.split('-')).explode('ingredient').reset_index(drop=True)
df_medi['ingredient'] = df_medi['ingredient'].str.strip()

# Matches the ingredient names with 1, 2, 3 words
df_medi['ingredients'] = df_medi['ingredient'].str.split()
df_medi['ingredient3'] = df_medi['ingredients'].apply(lambda x: ' '.join(x[:3])).str.strip()
df_medi['ingredient2'] = df_medi['ingredients'].apply(lambda x: ' '.join(x[:2])).str.strip()
df_medi['ingredient1'] = df_medi['ingredients'].apply(lambda x: ' '.join(x[:1])).str.strip()

# Map the source concept of drugs into standard concept (RxNorm)
rxnorm_concepts = df_concept[((df_concept['vocabulary_id'] == 'RxNorm') | (df_concept['vocabulary_id'] == 'RxNorm Extension')) & (df_concept['standard_concept'] == 'S')][['concept_name', 'concept_id']]
rxnorm_concepts['concept_name'] = rxnorm_concepts['concept_name'].str.lower()

df_medi['concept_id3'] = df_medi.merge(rxnorm_concepts, left_on='ingredient3', right_on='concept_name', how='left')['concept_id']
df_medi['concept_id2'] = df_medi.merge(rxnorm_concepts, left_on='ingredient2', right_on='concept_name', how='left')['concept_id']
df_medi['concept_id1'] = df_medi.merge(rxnorm_concepts, left_on='ingredient1', right_on='concept_name', how='left')['concept_id']

df_medi['concept_id'] = df_medi[['concept_id3', 'concept_id2', 'concept_id1']].bfill(axis=1).iloc[:, 0]
df_medi['concept_id'] = df_medi['concept_id'].astype('Int64')

# Calculate and print the number of MEDICATIONS table records that couldn't be mapped to a standard concept
nan_sum = df_medi['concept_id'].isna().sum()
print(f'mismatched concepts in MEDICATIONS table: {nan_sum} / {len(df_medi)} ({nan_sum/len(df_medi)*100:.1f}%)')

#df_medi.dropna(columns='concept_id', inplace=True)
               
df_medi.to_parquet(f'{input_path}/parquet/patient_medications.parquet')

mismatched concepts in MEDICATIONS table: 94071 / 415736 (22.6%)


In [55]:
df_medi.drop(columns='concept_id', inplace=True)

### patient medication table
# Regular expression to extract the string before a number or '/'
df_medi['ingredient'] = df_medi['Drug_name'].str.extract(r'([^0-9/]+)')
# Lower cases and strip whitespace
df_medi['ingredient'] = df_medi['ingredient'].str.lower().str.strip()

# Split ingredients that are coupled by '-'
df_medi = df_medi.assign(ingredient=df_medi['ingredient'].str.split('-')).explode('ingredient').reset_index(drop=True)
df_medi['ingredient'] = df_medi['ingredient'].str.strip()

# Map the source concept of drugs into standard concept (RxNorm)
drug_concept = df_concept[((df_concept['vocabulary_id'] == 'RxNorm') | (df_concept['vocabulary_id'] == 'RxNorm Extension')) & (df_concept['standard_concept'] == 'S')][['concept_name', 'concept_id']]
drug_concept['concept_name'] = drug_concept['concept_name'].str.lower()

df_medi['concept_id'] = df_medi.merge(drug_concept, left_on='ingredient', right_on='concept_name', how='left')['concept_id']

# Calculate and print the number of MEDICATIONS table records that couldn't be mapped to a standard concept
nan_sum = df_medi['concept_id'].isna().sum()

#df_medi.dropna(columns='

print(f'mismatched concepts in MEDICATIONS table: {nan_sum} / {len(df_medi)} ({nan_sum/len(df_medi)*100:.1f}%)')


mismatched concepts in MEDICATIONS table: 133241 / 460664 (28.9%)


* medi_counts

In [58]:
# Medi_counts
drugs_per_subject = df_medi.drop_duplicates(subset=['PID', 'ingredient'])
drug_counts = drugs_per_subject['ingredient'].value_counts().to_frame()
drug_counts.reset_index(inplace=True)
# 문제는 ingredient 하나에 여러개 medication이 대응될 수 있음 (medication을 split해서 ingredient를 만들어서, 원래 ingredient 하나 자체가 medication인 경우와 중복)
drug_counts['medication'] = drug_counts.merge(drugs_per_subject.drop_duplicates(['ingredient']), on='ingredient', how='left')['Drug_name']

# Map the source concept of drugs into standard concept (RxNorm)
rxnorm_concepts = df_concept[((df_concept['vocabulary_id'] == 'RxNorm') | (df_concept['vocabulary_id'] == 'RxNorm Extension')) & (df_concept['standard_concept'] == 'S')][['concept_name', 'concept_id']]
rxnorm_concepts['concept_name'] = rxnorm_concepts['concept_name'].str.lower()

drug_mapped = drug_counts.merge(rxnorm_concepts, left_on='ingredient', right_on='concept_name', how='left')

drug_mismatch = drug_mapped[drug_mapped['concept_id'].isna()]

drug_counts['perc'] = drug_counts['count'] / len(df_medi['PID'].unique()) * 100
#drug_counts['mismatch'] = np.where(drug_counts['ingredient'].isin(drug_mismatch['ingredient']), 1, np.nan)
#drug_counts.to_csv('results/mover_medi_counts.csv', index=False)

# Remove rows that have empty ingredient
drug_counts = drug_counts[drug_counts['ingredient']!='']

# Add an auxiliary column to maintain the order
drug_counts['order'] = range(len(drug_counts))

# Step 1: Extract the first word of 'ingredient' if it has more than one word
drug_counts['first_word'] = drug_counts['ingredient'].str.split().str[0]

# Step 2: Merge with rxnorm_concepts on 'ingredient' for exact matches
exact_matches = pd.merge(drug_counts, rxnorm_concepts, left_on='ingredient', right_on='concept_name', how='left')

# Step 3: Identify which entries didn't get a match
non_matches = drug_counts[~drug_counts['ingredient'].isin(exact_matches['concept_name'])]
exact_matches.dropna(subset='concept_id', inplace=True)

# Step 4: Merge the non-matches with the 'concept_name' based on 'first_word'
partial_matches = pd.merge(non_matches, rxnorm_concepts, left_on='first_word', right_on='concept_name', how='left')

# Step 5: Combine the exact matches with the partial matches
combined_matches = pd.concat([exact_matches, partial_matches]).sort_values(by='order')

# Step 6: Drop the temporary 'first_word' columns and any duplicates that might have arisen
final_df = combined_matches.drop_duplicates(['first_word', 'order']).drop(columns=['first_word', 'order'])

# Check mismatch
final_df['mismatch'] = (final_df['concept_id'].isna()).astype(int)

final_df.to_csv('results/mover_sis_medi_counts.csv', index=False)

In [None]:
medi_mapped = pd.read_csv('results/mover_sis_medi_counts+manual.csv')


In [33]:
df_labs

Unnamed: 0,PID,Obs_time,Na,K,Ca,Gluc,Ph,PCO2,PO2,BE,HCO3,HgB
0,aca232a2eb2d82af,2018-08-13 13:40:00,\N,\N,\N,\N,7.31,42,\N,\N,\N,\N
1,aca232a2eb2d82af,2018-08-13 13:41:00,142,2.6,4.4,101,\N,\N,98,\N,21,11.7
2,aca232a2eb2d82af,2018-08-13 14:19:00,\N,\N,\N,\N,7.33,42,89,\N,22,\N
3,aca232a2eb2d82af,2018-08-13 14:20:00,141,2.7,4.3,103,\N,\N,\N,\N,\N,10.6
4,aca232a2eb2d82af,2018-08-13 15:44:00,140,3.3,4,102,7.34,37,99,\N,20,10.6
...,...,...,...,...,...,...,...,...,...,...,...,...
14728,4a0edb6dd14cfed8,2016-05-04 09:38:00,136,3.8,6,239,\N,\N,109,\N,21.6,10.1
14729,4a0edb6dd14cfed8,2016-05-04 10:48:00,137,3.4,5.9,274,7.35,39.8,185,\N,21.7,10.1
14730,4a0edb6dd14cfed8,2016-05-04 12:23:00,137,3.5,5.8,292,7.34,41.2,229,\N,22.3,\N
14731,4a0edb6dd14cfed8,2016-05-04 12:24:00,\N,\N,\N,\N,\N,\N,\N,\N,\N,9.7


In [5]:
df_io

Unnamed: 0,PID,Type,Volume,IO_datetime
0,aca232a2eb2d82af,Urine,-40,2018-08-13 08:04:00
1,aca232a2eb2d82af,Urine,0,2018-08-13 11:01:00
2,aca232a2eb2d82af,Urine,0,2018-08-13 11:59:00
3,aca232a2eb2d82af,Urine,-5,2018-08-13 12:59:00
4,aca232a2eb2d82af,Urine,-40,2018-08-13 15:30:00
...,...,...,...,...
100988,0ec274d3b559cb59,Plasmalyte,300,2016-06-24 14:42:00
100989,3464694e5045572d,Urine,0,2016-11-01 09:16:00
100990,3464694e5045572d,Ebl,0,2016-11-01 09:15:00
100991,3464694e5045572d,Lactated ringers,500,2016-11-01 08:04:00


In [11]:
df_vitals

Unnamed: 0,PID,Obs_time,HRe,HRp,nSBP,nMAP,nDBP,SP02
0,61d41071aed1b140,2017-12-16 15:20:00,\N,\N,\N,\N,\N,\N
1,61d41071aed1b140,2017-12-16 15:21:00,85,85,110,80,65,100
2,61d41071aed1b140,2017-12-16 15:22:00,80,80,\N,\N,\N,100
3,61d41071aed1b140,2017-12-16 15:23:00,91,90,\N,\N,\N,100
4,61d41071aed1b140,2017-12-16 15:24:00,76,84,102,77,65,100
...,...,...,...,...,...,...,...,...
3847543,001448a1d86bf1ee,2017-05-17 08:59:00,\N,\N,\N,\N,\N,\N
3847544,001448a1d86bf1ee,2017-05-17 09:00:00,\N,\N,\N,\N,\N,\N
3847545,001448a1d86bf1ee,2017-05-17 09:01:00,\N,\N,\N,\N,\N,\N
3847546,001448a1d86bf1ee,2017-05-17 09:07:00,\N,\N,117,\N,\N,\N


In [12]:
df_obs

Unnamed: 0,PID,Obs_time,SVV,PAPs,PAPm,PAPd,LAPm,CO,Cer_ox_r,Cer_ox_l,...,Temp2,TOF,SBP_FEM,MAP_FEM,DBP_FEM,ICPm,SBP_ART,MAP_ART,DBP_ART,CVPm
0,29b49d461c89934d,2017-08-15 08:43:00,16.0,,,,,,,,...,\N,\N,,,,,97,55,46,284.0
1,cc2953f1644ddbf1,2017-10-14 09:02:00,6.0,,,,,,,,...,\N,\N,,,,,103,72,52,
2,cc2953f1644ddbf1,2017-10-14 10:07:00,10.0,,,,,,,,...,\N,\N,,,,,90,69,49,
3,cc2953f1644ddbf1,2017-10-14 10:22:00,8.0,,,,,,,,...,\N,\N,,,,,113,75,57,
4,1c46ba7836e46ed4,2017-06-27 16:31:00,37.0,,,,,,,,...,,,,,,,85,61,52,26.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3663061,3b965584614e30f5,2016-07-17 16:07:00,,,,,,,,,...,,,,,,,,,,234.0
3663062,1dad65d78bca7a9d,2016-03-14 09:25:00,,,,,,,,,...,,,,,,,,,,17.0
3663063,1dad65d78bca7a9d,2016-03-14 10:55:00,,,,,,,,,...,,,,,,,,,,19.0
3663064,a3875a9b6b569a0a,2016-04-15 17:18:00,,,,,,,,,...,,,,,,,,,,24.0


# Patient