From 70425d1c5fb3c90e5821940b03929f9e54aee089 Mon Sep 17 00:00:00 2001 From: Alistair Johnson Date: Sun, 4 Sep 2016 16:37:40 -0400 Subject: [PATCH] fix up oracle build, refers to #122 --- buildmimic/oracle/README.md | 47 ++ buildmimic/oracle/add_oracle_rowdelimiter.py | 100 +++ buildmimic/oracle/build_mimic_oracle.sh | 72 +++ buildmimic/oracle/controlfiles/noteevents.ctl | 8 +- .../oracle/controlfiles/noteevents_output.ctl | 28 + buildmimic/oracle/oracle_add_constraints.sql | 97 ++- buildmimic/oracle/oracle_add_indexes.sql | 2 +- buildmimic/oracle/oracle_create_tables.sql | 704 ++++++++++----------- buildmimic/oracle/oracle_load_data.md | 12 - 9 files changed, 680 insertions(+), 390 deletions(-) create mode 100644 buildmimic/oracle/README.md create mode 100644 buildmimic/oracle/add_oracle_rowdelimiter.py create mode 100644 buildmimic/oracle/build_mimic_oracle.sh create mode 100644 buildmimic/oracle/controlfiles/noteevents_output.ctl delete mode 100644 buildmimic/oracle/oracle_load_data.md diff --git a/buildmimic/oracle/README.md b/buildmimic/oracle/README.md new file mode 100644 index 0000000..62e19a7 --- /dev/null +++ b/buildmimic/oracle/README.md @@ -0,0 +1,47 @@ +# Loading data in Oracle + +This folder contains scripts for loading the data into an Oracle database system. +There are three steps to loading the data into Oracle: + +1. The tables are created using an SQL script +2. `sqlldr` is used to load data into these tables (either individually or with a shell script) +3. Indices and constraints are added using an SQL script + +The largest challenge in loading in data in Oracle is NOTEEVENTS. Oracle cannot handle reasonably standard formatted CSVs: instead it requires a special character to delineate a newline from a new row (most database systems can handle newlines if they are quoted, which is how the data for MIMIC-III is distributed). As a result, we have provided an additional Python script which will run through NOTEEVENTS and append a unique string of characters to the end of each row. The script can be called as follows: + +```python +python add_oracle_rowdelimiter.py -d ',' -i 'NOTEEVENTS.csv' -r '><><><> -d -r ') + sys.exit(2) + for opt, arg in opts: + if opt == '-h': + print('remove_newlines.py -i -d -r ') + sys.exit() + elif opt in ("-i", "--ifile"): + fn_in = arg + elif opt in ("-d", "--delimiter"): + delimiter = arg + elif opt in ("-r", "--row-delimiter"): + oracle_newrow = arg + + # input argument checking + if os.path.isfile(fn_in) == 1: + print('Using input file {}'.format(fn_in)) + else: + print('Cannot find input file {}'.format(fn_in)) + sys.exit(2) + + fn_out=fn_in.strip('.csv')+'_output.csv' + print('\n'+'~'*40) + print('Input filename = {}'.format(fn_in)) + print('Delimiter = {}'.format(delimiter)) + print('New row character(s) = {}'.format(oracle_newrow)) + print('Output filename = {}'.format(fn_out)) + print('Please note all output fields will be double quoted.') + print('~'*40+'\n') + #raw_input('Press any key to continue.') + + with open(fn_in , 'rb') as input_file: + reader = csv.reader(input_file, delimiter=',', + doublequote=True, + quoting=csv.QUOTE_MINIMAL) + # QUOTE_NONNUMERIC doesn't work because it tries to convert dates are floats + # consequently, all fields are output quoted + # not a big deal for oracle, which has optionally enclosed by double quotes parameter + + with open(fn_out,'wb') as fout: + out = csv.writer(fout, doublequote=True, quoting=csv.QUOTE_NONNUMERIC, + lineterminator=oracle_newrow + '\r\n') + for row in reader: + out.writerow(row) + if reader.line_num % 100000 == 0: + print('Finished {} million lines.'.format(reader.line_num / 1000000)) + + # Summarise output + print('\n'+'~'*40) + print('Merging complete\n') + print('Number of rows processed: {}'.format(reader.line_num)) + print('New file created: {}'.format(fout.name)) + print('~'*40) + +if __name__ == "__main__": + main(sys.argv[1:]) diff --git a/buildmimic/oracle/build_mimic_oracle.sh b/buildmimic/oracle/build_mimic_oracle.sh new file mode 100644 index 0000000..49c73cf --- /dev/null +++ b/buildmimic/oracle/build_mimic_oracle.sh @@ -0,0 +1,72 @@ +#!/bin/bash + +# This script attempts to build MIMIC on an Oracle instance. +# You will likely need to modify it to fit your own system, for example, +# you may need to change the authentication, e.g. replace '\ as SYSDBA' with 'myusername/mypassword' +# The script requires sqlldr and sqlplus + + +# Create the tables + +sqlplus '\ as SYSDBA' << EOF +WHENEVER OSERROR EXIT 9; +WHENEVER SQLERROR EXIT SQL.SQLCODE; + +ALTER SESSION SET CURRENT_SCHEMA = MIMICIII; +@oracle_create_tables.sql + +EOF + +# Alternatively, you could specify a username/password here, and use the below snippet + +#db_username= +#db_password= + +#sqlplus -s /nolog << EOF +#WHENEVER OSERROR EXIT 9; +#WHENEVER SQLERROR EXIT SQL.SQLCODE; +#CONNECT ${db_username}/${db_password}; +#@oracle_create_tables.sql +#EOF + + +# Call sqlldr to load the data + +sqlldr '\ as SYSDBA' control='controlfiles/admissions.ctl' log=admissions.log parallel=true direct=true multithreading=true skip_index_maintenance=true +sqlldr '\ as SYSDBA' control='controlfiles/callout.ctl' log=callout.log parallel=true direct=true multithreading=true skip_index_maintenance=true +sqlldr '\ as SYSDBA' control='controlfiles/caregivers.ctl' log=caregivers.log parallel=true direct=true multithreading=true skip_index_maintenance=true +sqlldr '\ as SYSDBA' control='controlfiles/chartevents.ctl' log=chartevents.log parallel=true direct=true multithreading=true skip_index_maintenance=true +sqlldr '\ as SYSDBA' control='controlfiles/cptevents.ctl' log=cptevents.log parallel=true direct=true multithreading=true skip_index_maintenance=true +sqlldr '\ as SYSDBA' control='controlfiles/d_cpt.ctl' log=d_cpt.log parallel=true direct=true multithreading=true skip_index_maintenance=true +sqlldr '\ as SYSDBA' control='controlfiles/d_icd_diagnoses.ctl' log=d_icd_diagnoses.log parallel=true direct=true multithreading=true skip_index_maintenance=true +sqlldr '\ as SYSDBA' control='controlfiles/d_icd_procedures.ctl' log=d_icd_procedures.log parallel=true direct=true multithreading=true skip_index_maintenance=true +sqlldr '\ as SYSDBA' control='controlfiles/d_items.ctl' log=d_items.log parallel=true direct=true multithreading=true skip_index_maintenance=true +sqlldr '\ as SYSDBA' control='controlfiles/d_labitems.ctl' log=d_labitems.log parallel=true direct=true multithreading=true skip_index_maintenance=true +sqlldr '\ as SYSDBA' control='controlfiles/datetimeevents.ctl' log=datetimeevents.log parallel=true direct=true multithreading=true skip_index_maintenance=true +sqlldr '\ as SYSDBA' control='controlfiles/diagnoses_icd.ctl' log=diagnoses_icd.log parallel=true direct=true multithreading=true skip_index_maintenance=true +sqlldr '\ as SYSDBA' control='controlfiles/drgcodes.ctl' log=drgcodes.log parallel=true direct=true multithreading=true skip_index_maintenance=true +sqlldr '\ as SYSDBA' control='controlfiles/icustays.ctl' log=icustays.log parallel=true direct=true multithreading=true skip_index_maintenance=true +sqlldr '\ as SYSDBA' control='controlfiles/inputevents_cv.ctl' log=inputevents_cv.log parallel=true direct=true multithreading=true skip_index_maintenance=true +sqlldr '\ as SYSDBA' control='controlfiles/inputevents_mv.ctl' log=inputevents_mv.log parallel=true direct=true multithreading=true skip_index_maintenance=true +sqlldr '\ as SYSDBA' control='controlfiles/labevents.ctl' log=labevents.log parallel=true direct=true multithreading=true skip_index_maintenance=true +sqlldr '\ as SYSDBA' control='controlfiles/microbiologyevents.ctl' log=microbiologyevents.log parallel=true direct=true multithreading=true skip_index_maintenance=true +sqlldr '\ as SYSDBA' control='controlfiles/noteevents_output.ctl' log=noteevents_output.log parallel=true direct=true multithreading=true skip_index_maintenance=true +sqlldr '\ as SYSDBA' control='controlfiles/outputevents.ctl' log=outputevents.log parallel=true direct=true multithreading=true skip_index_maintenance=true +sqlldr '\ as SYSDBA' control='controlfiles/patients.ctl' log=patients.log parallel=true direct=true multithreading=true skip_index_maintenance=true +sqlldr '\ as SYSDBA' control='controlfiles/prescriptions.ctl' log=prescriptions.log parallel=true direct=true multithreading=true skip_index_maintenance=true +sqlldr '\ as SYSDBA' control='controlfiles/procedureevents_mv.ctl' log=procedureevents_mv.log parallel=true direct=true multithreading=true skip_index_maintenance=true +sqlldr '\ as SYSDBA' control='controlfiles/procedures_icd.ctl' log=procedures_icd.log parallel=true direct=true multithreading=true skip_index_maintenance=true +sqlldr '\ as SYSDBA' control='controlfiles/services.ctl' log=services.log parallel=true direct=true multithreading=true skip_index_maintenance=true +sqlldr '\ as SYSDBA' control='controlfiles/transfers.ctl' log=transfers.log parallel=true direct=true multithreading=true skip_index_maintenance=true + +# Now, create the indexes and constraints + +sqlplus '\ as SYSDBA' << EOF +WHENEVER OSERROR EXIT 9; +WHENEVER SQLERROR EXIT SQL.SQLCODE; + +ALTER SESSION SET CURRENT_SCHEMA = MIMICIII; +@oracle_add_indexes.sql +@oracle_add_constraints.sql + +EOF diff --git a/buildmimic/oracle/controlfiles/noteevents.ctl b/buildmimic/oracle/controlfiles/noteevents.ctl index c034c9c..7739582 100644 --- a/buildmimic/oracle/controlfiles/noteevents.ctl +++ b/buildmimic/oracle/controlfiles/noteevents.ctl @@ -2,7 +2,7 @@ OPTIONS ( skip=1, errors=0, direct=true, -multithreading=true +multithreading=true ) LOAD DATA INFILE 'NOTEEVENTS.csv' "str '\n'" @@ -14,7 +14,7 @@ FIELDS TERMINATED BY ',' OPTIONALLY ENCLOSED BY '"' AND '"' TRAILING nullcols ( -ROW_ID, +ROW_ID, SUBJECT_ID, HADM_ID, CHARTDATE DATE "YYYY-MM-DD HH24:MI:SS", @@ -24,5 +24,5 @@ CATEGORY, DESCRIPTION, CGID, ISERROR, -TEXT -) \ No newline at end of file +TEXT +) diff --git a/buildmimic/oracle/controlfiles/noteevents_output.ctl b/buildmimic/oracle/controlfiles/noteevents_output.ctl new file mode 100644 index 0000000..5c8aa55 --- /dev/null +++ b/buildmimic/oracle/controlfiles/noteevents_output.ctl @@ -0,0 +1,28 @@ +OPTIONS ( +skip=1, +errors=0, +direct=true, +multithreading=true +) +LOAD DATA +INFILE 'NOTEEVENTS.csv' "str '\n'><>