Permalink
Please sign in to comment.
Showing
with
680 additions
and 390 deletions.
- +47 −0 buildmimic/oracle/README.md
- +100 −0 buildmimic/oracle/add_oracle_rowdelimiter.py
- +72 −0 buildmimic/oracle/build_mimic_oracle.sh
- +4 −4 buildmimic/oracle/controlfiles/noteevents.ctl
- +28 −0 buildmimic/oracle/controlfiles/noteevents_output.ctl
- +76 −21 buildmimic/oracle/oracle_add_constraints.sql
- +1 −1 buildmimic/oracle/oracle_add_indexes.sql
- +352 −352 buildmimic/oracle/oracle_create_tables.sql
- +0 −12 buildmimic/oracle/oracle_load_data.md
| @@ -0,0 +1,47 @@ | ||
| +# Loading data in Oracle | ||
| + | ||
| +This folder contains scripts for loading the data into an Oracle database system. | ||
| +There are three steps to loading the data into Oracle: | ||
| + | ||
| +1. The tables are created using an SQL script | ||
| +2. `sqlldr` is used to load data into these tables (either individually or with a shell script) | ||
| +3. Indices and constraints are added using an SQL script | ||
| + | ||
| +The largest challenge in loading in data in Oracle is NOTEEVENTS. Oracle cannot handle reasonably standard formatted CSVs: instead it requires a special character to delineate a newline from a new row (most database systems can handle newlines if they are quoted, which is how the data for MIMIC-III is distributed). As a result, we have provided an additional Python script which will run through NOTEEVENTS and append a unique string of characters to the end of each row. The script can be called as follows: | ||
| + | ||
| +```python | ||
| +python add_oracle_rowdelimiter.py -d ',' -i 'NOTEEVENTS.csv' -r '><><?~`;;`' | ||
| +``` | ||
| + | ||
| +The function will output a file called `NOTEEVENTS_output.csv`. You may then use the noteevents_output.ctl file to load in the data, e.g. where the tables are on the MIMICIII user's schema: | ||
| + | ||
| +```sqlldr '\ as SYSDBA' control=noteevents_output.ctl log=noteevents_output.log``` | ||
| + | ||
| +# Step by step guide for loading data | ||
| + | ||
| +A shell script called `build_mimic_oracle.sh` is provided to build MIMIC on an Oracle database, but note you will likely need to customize this script for your own purposes, including: | ||
| + | ||
| +* modifying the default schema | ||
| +* changing your authentication | ||
| + | ||
| +Alternatively, you can follow this step by step guide: | ||
| + | ||
| +1. Download the MIMIC-III data and extract it to a working folder | ||
| +2. Add the control files, SQL scripts, and python script to the working folder | ||
| +3. Open up either `sqlplus` or SQLDeveloper | ||
| +4. Convert NOTEEVENTS.csv into a workable format: `python add_oracle_rowdelimiter.py -d ',' -i 'NOTEEVENTS.csv' -r '><><?~`;;`'` | ||
| +5. Run `oracle_create_tables.sql` | ||
| +6. Add permissions to run the bash script: `chmod a+x load_data_oracle.sh` | ||
| +7. Run the bash script: `./load_data_oracle.sh` | ||
| + * It helps to be logged in as the Oracle user, as the script uses OS authentication to speed things up | ||
| + * If you do not know how to use OS authentication, remove '/ as SYSDBA' from the beginning of each line in the script | ||
| +8. Run the `oracle_add_indexes.sql` and `oracle_add_constrants.sql` files | ||
| + | ||
| +These scripts have yet to be fully tested and we would welcome pull requests, bug reports, or suggestions via GitHub. | ||
| + | ||
| +# Load time | ||
| + | ||
| +Load time for chartevents (the largest table) with partitions is ~30min | ||
| + | ||
| +Elapsed time was: 00:33:46.44 | ||
| +CPU time was: 00:28:30.03 |
| @@ -0,0 +1,100 @@ | ||
| +from __future__ import print_function | ||
| + | ||
| +import sys | ||
| +import getopt | ||
| +import os.path | ||
| + | ||
| +import csv | ||
| + | ||
| +# This function takes a CSV file and appends a string to the end of each row | ||
| +# This facilitates using the CSV with programs which can't handle newlines in fields | ||
| +# For example, Oracle's SQLLDR requires a unique string at the end of each row to indicate the row delimiter. | ||
| + | ||
| +# FUNCTION ASSUMPTIONS: | ||
| +# 1) file is in proper CSV format, where "proper" is defined as: | ||
| +# comma delimited | ||
| +# if a string contains a comma, it is double quoted | ||
| +# if a string contains a newline, it is double quoted | ||
| +# double quotes occurring within a string are escaped by another double quote | ||
| +# 2) file does *not* have a header row | ||
| + | ||
| +def main(argv): | ||
| + """ | ||
| + Run `Remove newlines` from a CSV file. | ||
| + | ||
| + Arguments | ||
| + ---------- | ||
| + -h: print help | ||
| + -i: str | ||
| + Absolute path to a valid CSV file. | ||
| + -d: str | ||
| + Delimiter (','). For delimiters with special characters, quote the delimiter in apostrophes. | ||
| + """ | ||
| + # parse input arguments | ||
| + fn_in='' | ||
| + delimiter='' | ||
| + fn_sql='' | ||
| + | ||
| + output_type = 'oracle' | ||
| + newline_char = '\n' | ||
| + oracle_newrow = '><><?~`;;`' | ||
| + | ||
| + # 'oracle' - delete newlines, replace with spaces. | ||
| + try: | ||
| + opts, args = getopt.getopt(argv,"hi:d:c:o",["ifile=","delimiter=","ctl="]) | ||
| + except getopt.GetoptError: | ||
| + print('remove_newlines.py -i <input_file> -d <delimiter> -r <row_delimiter>') | ||
| + sys.exit(2) | ||
| + for opt, arg in opts: | ||
| + if opt == '-h': | ||
| + print('remove_newlines.py -i <input_file> -d <delimiter> -r <row_delimiter>') | ||
| + sys.exit() | ||
| + elif opt in ("-i", "--ifile"): | ||
| + fn_in = arg | ||
| + elif opt in ("-d", "--delimiter"): | ||
| + delimiter = arg | ||
| + elif opt in ("-r", "--row-delimiter"): | ||
| + oracle_newrow = arg | ||
| + | ||
| + # input argument checking | ||
| + if os.path.isfile(fn_in) == 1: | ||
| + print('Using input file {}'.format(fn_in)) | ||
| + else: | ||
| + print('Cannot find input file {}'.format(fn_in)) | ||
| + sys.exit(2) | ||
| + | ||
| + fn_out=fn_in.strip('.csv')+'_output.csv' | ||
| + print('\n'+'~'*40) | ||
| + print('Input filename = {}'.format(fn_in)) | ||
| + print('Delimiter = {}'.format(delimiter)) | ||
| + print('New row character(s) = {}'.format(oracle_newrow)) | ||
| + print('Output filename = {}'.format(fn_out)) | ||
| + print('Please note all output fields will be double quoted.') | ||
| + print('~'*40+'\n') | ||
| + #raw_input('Press any key to continue.') | ||
| + | ||
| + with open(fn_in , 'rb') as input_file: | ||
| + reader = csv.reader(input_file, delimiter=',', | ||
| + doublequote=True, | ||
| + quoting=csv.QUOTE_MINIMAL) | ||
| + # QUOTE_NONNUMERIC doesn't work because it tries to convert dates are floats | ||
| + # consequently, all fields are output quoted | ||
| + # not a big deal for oracle, which has optionally enclosed by double quotes parameter | ||
| + | ||
| + with open(fn_out,'wb') as fout: | ||
| + out = csv.writer(fout, doublequote=True, quoting=csv.QUOTE_NONNUMERIC, | ||
| + lineterminator=oracle_newrow + '\r\n') | ||
| + for row in reader: | ||
| + out.writerow(row) | ||
| + if reader.line_num % 100000 == 0: | ||
| + print('Finished {} million lines.'.format(reader.line_num / 1000000)) | ||
| + | ||
| + # Summarise output | ||
| + print('\n'+'~'*40) | ||
| + print('Merging complete\n') | ||
| + print('Number of rows processed: {}'.format(reader.line_num)) | ||
| + print('New file created: {}'.format(fout.name)) | ||
| + print('~'*40) | ||
| + | ||
| +if __name__ == "__main__": | ||
| + main(sys.argv[1:]) |
| @@ -0,0 +1,72 @@ | ||
| +#!/bin/bash | ||
| + | ||
| +# This script attempts to build MIMIC on an Oracle instance. | ||
| +# You will likely need to modify it to fit your own system, for example, | ||
| +# you may need to change the authentication, e.g. replace '\ as SYSDBA' with 'myusername/mypassword' | ||
| +# The script requires sqlldr and sqlplus | ||
| + | ||
| + | ||
| +# Create the tables | ||
| + | ||
| +sqlplus '\ as SYSDBA' << EOF | ||
| +WHENEVER OSERROR EXIT 9; | ||
| +WHENEVER SQLERROR EXIT SQL.SQLCODE; | ||
| + | ||
| +ALTER SESSION SET CURRENT_SCHEMA = MIMICIII; | ||
| +@oracle_create_tables.sql | ||
| + | ||
| +EOF | ||
| + | ||
| +# Alternatively, you could specify a username/password here, and use the below snippet | ||
| + | ||
| +#db_username= | ||
| +#db_password= | ||
| + | ||
| +#sqlplus -s /nolog << EOF | ||
| +#WHENEVER OSERROR EXIT 9; | ||
| +#WHENEVER SQLERROR EXIT SQL.SQLCODE; | ||
| +#CONNECT ${db_username}/${db_password}; | ||
| +#@oracle_create_tables.sql | ||
| +#EOF | ||
| + | ||
| + | ||
| +# Call sqlldr to load the data | ||
| + | ||
| +sqlldr '\ as SYSDBA' control='controlfiles/admissions.ctl' log=admissions.log parallel=true direct=true multithreading=true skip_index_maintenance=true | ||
| +sqlldr '\ as SYSDBA' control='controlfiles/callout.ctl' log=callout.log parallel=true direct=true multithreading=true skip_index_maintenance=true | ||
| +sqlldr '\ as SYSDBA' control='controlfiles/caregivers.ctl' log=caregivers.log parallel=true direct=true multithreading=true skip_index_maintenance=true | ||
| +sqlldr '\ as SYSDBA' control='controlfiles/chartevents.ctl' log=chartevents.log parallel=true direct=true multithreading=true skip_index_maintenance=true | ||
| +sqlldr '\ as SYSDBA' control='controlfiles/cptevents.ctl' log=cptevents.log parallel=true direct=true multithreading=true skip_index_maintenance=true | ||
| +sqlldr '\ as SYSDBA' control='controlfiles/d_cpt.ctl' log=d_cpt.log parallel=true direct=true multithreading=true skip_index_maintenance=true | ||
| +sqlldr '\ as SYSDBA' control='controlfiles/d_icd_diagnoses.ctl' log=d_icd_diagnoses.log parallel=true direct=true multithreading=true skip_index_maintenance=true | ||
| +sqlldr '\ as SYSDBA' control='controlfiles/d_icd_procedures.ctl' log=d_icd_procedures.log parallel=true direct=true multithreading=true skip_index_maintenance=true | ||
| +sqlldr '\ as SYSDBA' control='controlfiles/d_items.ctl' log=d_items.log parallel=true direct=true multithreading=true skip_index_maintenance=true | ||
| +sqlldr '\ as SYSDBA' control='controlfiles/d_labitems.ctl' log=d_labitems.log parallel=true direct=true multithreading=true skip_index_maintenance=true | ||
| +sqlldr '\ as SYSDBA' control='controlfiles/datetimeevents.ctl' log=datetimeevents.log parallel=true direct=true multithreading=true skip_index_maintenance=true | ||
| +sqlldr '\ as SYSDBA' control='controlfiles/diagnoses_icd.ctl' log=diagnoses_icd.log parallel=true direct=true multithreading=true skip_index_maintenance=true | ||
| +sqlldr '\ as SYSDBA' control='controlfiles/drgcodes.ctl' log=drgcodes.log parallel=true direct=true multithreading=true skip_index_maintenance=true | ||
| +sqlldr '\ as SYSDBA' control='controlfiles/icustays.ctl' log=icustays.log parallel=true direct=true multithreading=true skip_index_maintenance=true | ||
| +sqlldr '\ as SYSDBA' control='controlfiles/inputevents_cv.ctl' log=inputevents_cv.log parallel=true direct=true multithreading=true skip_index_maintenance=true | ||
| +sqlldr '\ as SYSDBA' control='controlfiles/inputevents_mv.ctl' log=inputevents_mv.log parallel=true direct=true multithreading=true skip_index_maintenance=true | ||
| +sqlldr '\ as SYSDBA' control='controlfiles/labevents.ctl' log=labevents.log parallel=true direct=true multithreading=true skip_index_maintenance=true | ||
| +sqlldr '\ as SYSDBA' control='controlfiles/microbiologyevents.ctl' log=microbiologyevents.log parallel=true direct=true multithreading=true skip_index_maintenance=true | ||
| +sqlldr '\ as SYSDBA' control='controlfiles/noteevents_output.ctl' log=noteevents_output.log parallel=true direct=true multithreading=true skip_index_maintenance=true | ||
| +sqlldr '\ as SYSDBA' control='controlfiles/outputevents.ctl' log=outputevents.log parallel=true direct=true multithreading=true skip_index_maintenance=true | ||
| +sqlldr '\ as SYSDBA' control='controlfiles/patients.ctl' log=patients.log parallel=true direct=true multithreading=true skip_index_maintenance=true | ||
| +sqlldr '\ as SYSDBA' control='controlfiles/prescriptions.ctl' log=prescriptions.log parallel=true direct=true multithreading=true skip_index_maintenance=true | ||
| +sqlldr '\ as SYSDBA' control='controlfiles/procedureevents_mv.ctl' log=procedureevents_mv.log parallel=true direct=true multithreading=true skip_index_maintenance=true | ||
| +sqlldr '\ as SYSDBA' control='controlfiles/procedures_icd.ctl' log=procedures_icd.log parallel=true direct=true multithreading=true skip_index_maintenance=true | ||
| +sqlldr '\ as SYSDBA' control='controlfiles/services.ctl' log=services.log parallel=true direct=true multithreading=true skip_index_maintenance=true | ||
| +sqlldr '\ as SYSDBA' control='controlfiles/transfers.ctl' log=transfers.log parallel=true direct=true multithreading=true skip_index_maintenance=true | ||
| + | ||
| +# Now, create the indexes and constraints | ||
| + | ||
| +sqlplus '\ as SYSDBA' << EOF | ||
| +WHENEVER OSERROR EXIT 9; | ||
| +WHENEVER SQLERROR EXIT SQL.SQLCODE; | ||
| + | ||
| +ALTER SESSION SET CURRENT_SCHEMA = MIMICIII; | ||
| +@oracle_add_indexes.sql | ||
| +@oracle_add_constraints.sql | ||
| + | ||
| +EOF |
| @@ -0,0 +1,28 @@ | ||
| +OPTIONS ( | ||
| +skip=1, | ||
| +errors=0, | ||
| +direct=true, | ||
| +multithreading=true | ||
| +) | ||
| +LOAD DATA | ||
| +INFILE 'NOTEEVENTS.csv' "str '\n'><><?~`;;`" | ||
| +BADFILE 'logfile.bad' | ||
| +DISCARDFILE 'logfile.discard' | ||
| +APPEND | ||
| +INTO TABLE noteevents | ||
| +FIELDS TERMINATED BY ',' | ||
| +OPTIONALLY ENCLOSED BY '"' AND '"' | ||
| +TRAILING nullcols | ||
| +( | ||
| +ROW_ID, | ||
| +SUBJECT_ID, | ||
| +HADM_ID, | ||
| +CHARTDATE DATE "YYYY-MM-DD HH24:MI:SS", | ||
| +CHARTTIME DATE "YYYY-MM-DD HH24:MI:SS", | ||
| +STORETIME DATE "YYYY-MM-DD HH24:MI:SS", | ||
| +CATEGORY, | ||
| +DESCRIPTION, | ||
| +CGID, | ||
| +ISERROR, | ||
| +TEXT | ||
| +) |
Oops, something went wrong.
0 comments on commit
70425d1