In [1]:
# dep: input: ../data/ANZICS_data.csv
# dep: output: ../data/cardiac_data.csv

# Extract Cardiac Arrest subset from ANZICS data

In [2]:
# Print Python version
!python -V

Python 3.6.3 :: Anaconda custom (64-bit)


## Load CSV into pandas
Import as pandas dataframe.

In [3]:
import pandas as pd

# Show the installed pandas version (displayed as this cell's output)
pd.__version__

  (fname, cnt))
  (fname, cnt))


'0.22.0'

Create a routine that reads the CSV file in chunks; loading it in one go runs out of memory and hangs the kernel.

In [4]:
def load_csv_into_pandas_using_chunks_with_filter(csv_file, chunkfilter, chunksize=20000):
    """Read a CSV file chunk by chunk, filter each chunk, and return the kept rows.

    Reading the whole file with a single ``pd.read_csv`` call timed out, so the
    file is streamed ``chunksize`` rows at a time and only the rows returned by
    ``chunkfilter`` are retained.

    Parameters
    ----------
    csv_file : path or file-like
        Passed straight through to ``pd.read_csv``.
    chunkfilter : callable
        Called once per chunk with that chunk's DataFrame; must return the
        DataFrame of rows to keep.
    chunksize : int, default 20000
        Number of rows read per chunk.

    Returns
    -------
    pandas.DataFrame
        All filtered chunks concatenated in file order, keeping the row index
        assigned by ``pd.read_csv``. An empty DataFrame if no chunk was read.
    """
    print("Loading CSV in chunks from:", csv_file)

    # Collect the filtered pieces and concatenate once at the end. Appending to
    # a growing DataFrame inside the loop re-copies every accumulated row on
    # each iteration, and DataFrame.append no longer exists in pandas >= 2.0.
    kept = []
    total_rows = 0

    reader = pd.read_csv(csv_file, chunksize=chunksize, skipinitialspace=True)
    for chunk, rows in enumerate(reader, start=1):
        rows_filtered = chunkfilter(rows)
        kept.append(rows_filtered)
        total_rows += rows_filtered.shape[0]

        # "out" reports the running size of the result, as the per-chunk
        # progress line always has.
        print("chunk:", chunk, "rows:", rows.shape, "filtered:", rows_filtered.shape,
              "out:", (total_rows, rows_filtered.shape[1]))

    # pd.concat raises on an empty list, so fall back to an empty frame when
    # the reader produced no chunks at all.
    out = pd.concat(kept) if kept else pd.DataFrame()

    print("Complete, shape:", out.shape)

    return out

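Load the ANZICS data file in chunks, keeping only the cardiac arrest rows (`ap3_subcode` 102.01, `ap3diag` 102, or `cardarrest` 1), as a pandas DataFrame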

In [5]:
# Stream the ANZICS CSV and keep only the rows matching the cardiac arrest criteria
def _cardiac_arrest_rows(df):
    """Return the rows of ``df`` that match any of the three selection criteria."""
    by_subcode = df.ap3_subcode.isin([102.01])
    by_diag = df.ap3diag == 102
    by_flag = df.cardarrest == 1
    return df[by_subcode | by_diag | by_flag]

data_extract = load_csv_into_pandas_using_chunks_with_filter('../data/ANZICS_data.csv', _cardiac_arrest_rows)

Loading CSV in chunks from: ../data/ANZICS_data.csv
chunk: 1 rows: (20000, 168) filtered: (675, 168) out: (675, 168)
chunk: 2 rows: (20000, 168) filtered: (644, 168) out: (1319, 168)
chunk: 3 rows: (20000, 168) filtered: (574, 168) out: (1893, 168)
chunk: 4 rows: (20000, 168) filtered: (566, 168) out: (2459, 168)
chunk: 5 rows: (20000, 168) filtered: (594, 168) out: (3053, 168)
chunk: 6 rows: (20000, 168) filtered: (613, 168) out: (3666, 168)
chunk: 7 rows: (20000, 168) filtered: (579, 168) out: (4245, 168)
chunk: 8 rows: (20000, 168) filtered: (593, 168) out: (4838, 168)
chunk: 9 rows: (20000, 168) filtered: (584, 168) out: (5422, 168)
chunk: 10 rows: (20000, 168) filtered: (623, 168) out: (6045, 168)
chunk: 11 rows: (20000, 168) filtered: (575, 168) out: (6620, 168)
chunk: 12 rows: (20000, 168) filtered: (555, 168) out: (7175, 168)
chunk: 13 rows: (20000, 168) filtered: (562, 168) out: (7737, 168)
chunk: 14 rows: (20000, 168) filtered: (605, 168) out: (8342, 168)
chunk: 15 rows: (200

Let's have a look at the ANZICS data

In [6]:
# Preview the first five rows of the extract
data_extract.head(n=5)

Unnamed: 0,ïsiteid,hospitalclassificationid,hospitalclassification,publicprivateid,publicprivate,patientid,icuadmityyyymm,icuadmitfinyr,icuadmityyyy,icuadmitmonth,...,anzrodisincluded,anzrodissmr,anzrodriskofdeath,apache3isincluded,apache3issmr,apache3riskofdeath,apache3score,apache2score,infectedall1,majordiag
3,47,1,Rural / Regional,4,Public,2,201710,2017-18,2017,10,...,1,1,0.032667,1,1,0.080221,61.0,19.0,0,med_other
66,47,1,Rural / Regional,4,Public,230,201710,2017-18,2017,10,...,1,1,0.215668,1,0,0.130564,53.0,25.0,0,cardiacarrest
123,108,3,Tertiary,4,Public,5838590P120,201709,2017-18,2017,9,...,1,1,0.652234,1,1,0.632856,109.0,30.0,0,cardiacarrest
136,178,3,Tertiary,4,Public,2270096,201709,2017-18,2017,9,...,1,1,0.156578,1,1,0.337196,75.0,15.0,0,cardiacarrest
147,223,3,Tertiary,4,Public,3032270,201709,2017-18,2017,9,...,1,1,0.102439,1,1,0.057196,39.0,10.0,0,cardiacarrest


And check the number of rows in the extract

In [7]:
# Despite its name, this variable holds only the row count of the extract
data_extract_shape = len(data_extract.index)
print(data_extract_shape)

48165


## Inspect the data
Let's check the value counts for our ap3 diag code/subcode features

In [8]:
# Frequency of each ap3_subcode value in the extract, with missing values counted too
data_extract.ap3_subcode.value_counts(dropna=False)

 102.01     20586
NaN         17814
 101.01       768
 107.01       630
 1207.01      435
 703.03       344
 106.01       270
 503.01       246
 203.01       201
 402.02       201
 207.01       190
 102.02       181
 1208.06      178
 206.01       162
 212.01       160
 104.01       141
 1206.03      131
 601.01       117
 109.01       116
 1405.02      114
 410.01       109
 208.01       103
 401.01       103
 1902.05      101
 1212.03       88
 407.01        84
 1403.03       82
 1401.01       82
 403.01        81
 1902.03       80
            ...  
 601.07         1
 1506.04        1
 1602.16        1
 901.06         1
 802.06         1
 703.09         1
 602.05         1
 801.01         1
 406.01         1
 901.05         1
 1602.03        1
 1208.02        1
 703.08         1
 1604.03        1
 109.04         1
 1213.02        1
 604.05         1
 1703.01        1
 409.01         1
 1101.03        1
 312.05         1
 604.04         1
 602.17         1
 1304.03        1
 1506.05  

In [9]:
# Frequency of each ap3diag value in the extract, with missing values counted too
data_extract.ap3diag.value_counts(dropna=False)

102     32388
101      1319
107      1052
1207      652
1208      579
703       527
203       513
503       488
106       444
601       403
0         387
1902      373
402       345
109       336
212       334
501       330
207       306
1602      282
211       274
1405      262
206       248
1403      244
1206      241
1408      241
104       215
602       212
401       182
1601      170
410       164
407       163
        ...  
1205       25
310        25
1803       24
801        24
802        23
1412       22
406        22
1413       22
701        20
1301       19
1211       18
1704       17
1407       16
903        16
1604       16
308        15
108        15
111        12
1101       12
1102       11
409        11
1603        9
1411        9
105         8
1605        7
902         7
2101        6
405         5
312         5
210         1
Name: ap3diag, Length: 117, dtype: int64

## Save Cardiac Arrest data to CSV
Export to CSV

In [11]:
# Write the cardiac arrest extract to disk using pandas' default to_csv settings
cardiac_csv_path = '../data/cardiac_data.csv'
data_extract.to_csv(cardiac_csv_path)