Permalink
Browse files

fix up oracle build, refers to #122

  • Loading branch information...
1 parent dcfde93 commit 70425d1c5fb3c90e5821940b03929f9e54aee089 @alistairewj alistairewj committed Sep 4, 2016
@@ -0,0 +1,47 @@
+# Loading data in Oracle
+
+This folder contains scripts for loading the data into an Oracle database system.
+There are three steps to loading the data into Oracle:
+
+1. The tables are created using an SQL script
+2. `sqlldr` is used to load data into these tables (either individually or with a shell script)
+3. Indices and constraints are added using an SQL script
+
+The largest challenge in loading in data in Oracle is NOTEEVENTS. Oracle cannot handle reasonably standard formatted CSVs: instead it requires a special character to delineate a newline from a new row (most database systems can handle newlines if they are quoted, which is how the data for MIMIC-III is distributed). As a result, we have provided an additional Python script which will run through NOTEEVENTS and append a unique string of characters to the end of each row. The script can be called as follows:
+
+```python
+python add_oracle_rowdelimiter.py -d ',' -i 'NOTEEVENTS.csv' -r '><><?~`;;`'
+```
+
+The function will output a file called `NOTEEVENTS_output.csv`. You may then use the noteevents_output.ctl file to load in the data, e.g. where the tables are on the MIMICIII user's schema:
+
+```sqlldr '\ as SYSDBA' control=noteevents_output.ctl log=noteevents_output.log```
+
+# Step by step guide for loading data
+
+A shell script called `build_mimic_oracle.sh` is provided to build MIMIC on an Oracle database, but note you will likely need to customize this script for your own purposes, including:
+
+* modifying the default schema
+* changing your authentication
+
+Alternatively, you can follow this step by step guide:
+
+1. Download the MIMIC-III data and extract it to a working folder
+2. Add the control files, SQL scripts, and python script to the working folder
+3. Open up either `sqlplus` or SQLDeveloper
+4. Convert NOTEEVENTS.csv into a workable format: `python add_oracle_rowdelimiter.py -d ',' -i 'NOTEEVENTS.csv' -r '><><?~`;;`'`
+5. Run `oracle_create_tables.sql`
+6. Add permissions to run the bash script: `chmod a+x load_data_oracle.sh`
+7. Run the bash script: `./load_data_oracle.sh`
+ * It helps to be logged in as the Oracle user, as the script uses OS authentication to speed things up
+ * If you do not know how to use OS authentication, remove '/ as SYSDBA' from the beginning of each line in the script
+8. Run the `oracle_add_indexes.sql` and `oracle_add_constrants.sql` files
+
+These scripts have yet to be fully tested and we would welcome pull requests, bug reports, or suggestions via GitHub.
+
+# Load time
+
+Load time for chartevents (the largest table) with partitions is ~30min
+
+Elapsed time was: 00:33:46.44
+CPU time was: 00:28:30.03
@@ -0,0 +1,100 @@
+from __future__ import print_function
+
+import sys
+import getopt
+import os.path
+
+import csv
+
+# This function takes a CSV file and appends a string to the end of each row
+# This facilitates using the CSV with programs which can't handle newlines in fields
+# For example, Oracle's SQLLDR requires a unique string at the end of each row to indicate the row delimiter.
+
+# FUNCTION ASSUMPTIONS:
+# 1) file is in proper CSV format, where "proper" is defined as:
+# comma delimited
+# if a string contains a comma, it is double quoted
+# if a string contains a newline, it is double quoted
+# double quotes occurring within a string are escaped by another double quote
+# 2) file does *not* have a header row
+
+def main(argv):
+ """
+ Run `Remove newlines` from a CSV file.
+
+ Arguments
+ ----------
+ -h: print help
+ -i: str
+ Absolute path to a valid CSV file.
+ -d: str
+ Delimiter (','). For delimiters with special characters, quote the delimiter in apostrophes.
+ """
+ # parse input arguments
+ fn_in=''
+ delimiter=''
+ fn_sql=''
+
+ output_type = 'oracle'
+ newline_char = '\n'
+ oracle_newrow = '><><?~`;;`'
+
+ # 'oracle' - delete newlines, replace with spaces.
+ try:
+ opts, args = getopt.getopt(argv,"hi:d:c:o",["ifile=","delimiter=","ctl="])
+ except getopt.GetoptError:
+ print('remove_newlines.py -i <input_file> -d <delimiter> -r <row_delimiter>')
+ sys.exit(2)
+ for opt, arg in opts:
+ if opt == '-h':
+ print('remove_newlines.py -i <input_file> -d <delimiter> -r <row_delimiter>')
+ sys.exit()
+ elif opt in ("-i", "--ifile"):
+ fn_in = arg
+ elif opt in ("-d", "--delimiter"):
+ delimiter = arg
+ elif opt in ("-r", "--row-delimiter"):
+ oracle_newrow = arg
+
+ # input argument checking
+ if os.path.isfile(fn_in) == 1:
+ print('Using input file {}'.format(fn_in))
+ else:
+ print('Cannot find input file {}'.format(fn_in))
+ sys.exit(2)
+
+ fn_out=fn_in.strip('.csv')+'_output.csv'
+ print('\n'+'~'*40)
+ print('Input filename = {}'.format(fn_in))
+ print('Delimiter = {}'.format(delimiter))
+ print('New row character(s) = {}'.format(oracle_newrow))
+ print('Output filename = {}'.format(fn_out))
+ print('Please note all output fields will be double quoted.')
+ print('~'*40+'\n')
+ #raw_input('Press any key to continue.')
+
+ with open(fn_in , 'rb') as input_file:
+ reader = csv.reader(input_file, delimiter=',',
+ doublequote=True,
+ quoting=csv.QUOTE_MINIMAL)
+ # QUOTE_NONNUMERIC doesn't work because it tries to convert dates are floats
+ # consequently, all fields are output quoted
+ # not a big deal for oracle, which has optionally enclosed by double quotes parameter
+
+ with open(fn_out,'wb') as fout:
+ out = csv.writer(fout, doublequote=True, quoting=csv.QUOTE_NONNUMERIC,
+ lineterminator=oracle_newrow + '\r\n')
+ for row in reader:
+ out.writerow(row)
+ if reader.line_num % 100000 == 0:
+ print('Finished {} million lines.'.format(reader.line_num / 1000000))
+
+ # Summarise output
+ print('\n'+'~'*40)
+ print('Merging complete\n')
+ print('Number of rows processed: {}'.format(reader.line_num))
+ print('New file created: {}'.format(fout.name))
+ print('~'*40)
+
+if __name__ == "__main__":
+ main(sys.argv[1:])
@@ -0,0 +1,72 @@
+#!/bin/bash
+
+# This script attempts to build MIMIC on an Oracle instance.
+# You will likely need to modify it to fit your own system, for example,
+# you may need to change the authentication, e.g. replace '\ as SYSDBA' with 'myusername/mypassword'
+# The script requires sqlldr and sqlplus
+
+
+# Create the tables
+
+sqlplus '\ as SYSDBA' << EOF
+WHENEVER OSERROR EXIT 9;
+WHENEVER SQLERROR EXIT SQL.SQLCODE;
+
+ALTER SESSION SET CURRENT_SCHEMA = MIMICIII;
+@oracle_create_tables.sql
+
+EOF
+
+# Alternatively, you could specify a username/password here, and use the below snippet
+
+#db_username=
+#db_password=
+
+#sqlplus -s /nolog << EOF
+#WHENEVER OSERROR EXIT 9;
+#WHENEVER SQLERROR EXIT SQL.SQLCODE;
+#CONNECT ${db_username}/${db_password};
+#@oracle_create_tables.sql
+#EOF
+
+
+# Call sqlldr to load the data
+
+sqlldr '\ as SYSDBA' control='controlfiles/admissions.ctl' log=admissions.log parallel=true direct=true multithreading=true skip_index_maintenance=true
+sqlldr '\ as SYSDBA' control='controlfiles/callout.ctl' log=callout.log parallel=true direct=true multithreading=true skip_index_maintenance=true
+sqlldr '\ as SYSDBA' control='controlfiles/caregivers.ctl' log=caregivers.log parallel=true direct=true multithreading=true skip_index_maintenance=true
+sqlldr '\ as SYSDBA' control='controlfiles/chartevents.ctl' log=chartevents.log parallel=true direct=true multithreading=true skip_index_maintenance=true
+sqlldr '\ as SYSDBA' control='controlfiles/cptevents.ctl' log=cptevents.log parallel=true direct=true multithreading=true skip_index_maintenance=true
+sqlldr '\ as SYSDBA' control='controlfiles/d_cpt.ctl' log=d_cpt.log parallel=true direct=true multithreading=true skip_index_maintenance=true
+sqlldr '\ as SYSDBA' control='controlfiles/d_icd_diagnoses.ctl' log=d_icd_diagnoses.log parallel=true direct=true multithreading=true skip_index_maintenance=true
+sqlldr '\ as SYSDBA' control='controlfiles/d_icd_procedures.ctl' log=d_icd_procedures.log parallel=true direct=true multithreading=true skip_index_maintenance=true
+sqlldr '\ as SYSDBA' control='controlfiles/d_items.ctl' log=d_items.log parallel=true direct=true multithreading=true skip_index_maintenance=true
+sqlldr '\ as SYSDBA' control='controlfiles/d_labitems.ctl' log=d_labitems.log parallel=true direct=true multithreading=true skip_index_maintenance=true
+sqlldr '\ as SYSDBA' control='controlfiles/datetimeevents.ctl' log=datetimeevents.log parallel=true direct=true multithreading=true skip_index_maintenance=true
+sqlldr '\ as SYSDBA' control='controlfiles/diagnoses_icd.ctl' log=diagnoses_icd.log parallel=true direct=true multithreading=true skip_index_maintenance=true
+sqlldr '\ as SYSDBA' control='controlfiles/drgcodes.ctl' log=drgcodes.log parallel=true direct=true multithreading=true skip_index_maintenance=true
+sqlldr '\ as SYSDBA' control='controlfiles/icustays.ctl' log=icustays.log parallel=true direct=true multithreading=true skip_index_maintenance=true
+sqlldr '\ as SYSDBA' control='controlfiles/inputevents_cv.ctl' log=inputevents_cv.log parallel=true direct=true multithreading=true skip_index_maintenance=true
+sqlldr '\ as SYSDBA' control='controlfiles/inputevents_mv.ctl' log=inputevents_mv.log parallel=true direct=true multithreading=true skip_index_maintenance=true
+sqlldr '\ as SYSDBA' control='controlfiles/labevents.ctl' log=labevents.log parallel=true direct=true multithreading=true skip_index_maintenance=true
+sqlldr '\ as SYSDBA' control='controlfiles/microbiologyevents.ctl' log=microbiologyevents.log parallel=true direct=true multithreading=true skip_index_maintenance=true
+sqlldr '\ as SYSDBA' control='controlfiles/noteevents_output.ctl' log=noteevents_output.log parallel=true direct=true multithreading=true skip_index_maintenance=true
+sqlldr '\ as SYSDBA' control='controlfiles/outputevents.ctl' log=outputevents.log parallel=true direct=true multithreading=true skip_index_maintenance=true
+sqlldr '\ as SYSDBA' control='controlfiles/patients.ctl' log=patients.log parallel=true direct=true multithreading=true skip_index_maintenance=true
+sqlldr '\ as SYSDBA' control='controlfiles/prescriptions.ctl' log=prescriptions.log parallel=true direct=true multithreading=true skip_index_maintenance=true
+sqlldr '\ as SYSDBA' control='controlfiles/procedureevents_mv.ctl' log=procedureevents_mv.log parallel=true direct=true multithreading=true skip_index_maintenance=true
+sqlldr '\ as SYSDBA' control='controlfiles/procedures_icd.ctl' log=procedures_icd.log parallel=true direct=true multithreading=true skip_index_maintenance=true
+sqlldr '\ as SYSDBA' control='controlfiles/services.ctl' log=services.log parallel=true direct=true multithreading=true skip_index_maintenance=true
+sqlldr '\ as SYSDBA' control='controlfiles/transfers.ctl' log=transfers.log parallel=true direct=true multithreading=true skip_index_maintenance=true
+
+# Now, create the indexes and constraints
+
+sqlplus '\ as SYSDBA' << EOF
+WHENEVER OSERROR EXIT 9;
+WHENEVER SQLERROR EXIT SQL.SQLCODE;
+
+ALTER SESSION SET CURRENT_SCHEMA = MIMICIII;
+@oracle_add_indexes.sql
+@oracle_add_constraints.sql
+
+EOF
@@ -2,7 +2,7 @@ OPTIONS (
skip=1,
errors=0,
direct=true,
-multithreading=true
+multithreading=true
)
LOAD DATA
INFILE 'NOTEEVENTS.csv' "str '\n'"
@@ -14,7 +14,7 @@ FIELDS TERMINATED BY ','
OPTIONALLY ENCLOSED BY '"' AND '"'
TRAILING nullcols
(
-ROW_ID,
+ROW_ID,
SUBJECT_ID,
HADM_ID,
CHARTDATE DATE "YYYY-MM-DD HH24:MI:SS",
@@ -24,5 +24,5 @@ CATEGORY,
DESCRIPTION,
CGID,
ISERROR,
-TEXT
-)
+TEXT
+)
@@ -0,0 +1,28 @@
+OPTIONS (
+skip=1,
+errors=0,
+direct=true,
+multithreading=true
+)
+LOAD DATA
+INFILE 'NOTEEVENTS.csv' "str '\n'><><?~`;;`"
+BADFILE 'logfile.bad'
+DISCARDFILE 'logfile.discard'
+APPEND
+INTO TABLE noteevents
+FIELDS TERMINATED BY ','
+OPTIONALLY ENCLOSED BY '"' AND '"'
+TRAILING nullcols
+(
+ROW_ID,
+SUBJECT_ID,
+HADM_ID,
+CHARTDATE DATE "YYYY-MM-DD HH24:MI:SS",
+CHARTTIME DATE "YYYY-MM-DD HH24:MI:SS",
+STORETIME DATE "YYYY-MM-DD HH24:MI:SS",
+CATEGORY,
+DESCRIPTION,
+CGID,
+ISERROR,
+TEXT
+)
Oops, something went wrong.

0 comments on commit 70425d1

Please sign in to comment.