# Split the MIMIC IV dataset 
The goal of this notebook is to store smaller versions of the files in the MIMIC-IV dataset for faster prototyping.

In [1]:
import pandas as pd
import os
import random

In [2]:
# Root folders: the original (large) files and the destination for the reduced copies
input_root = '../data/real_world_data/physionet.org/files/mimiciv/3.0/'
output_root = '../data/real_world_data/physionet.org_small/files/mimiciv/3.0/'

# The data lives in two sub-folders ('icu' and 'hosp'); mirror both on the output side
input_root_icu = f'{input_root}icu/'
input_root_hosp = f'{input_root}hosp/'
output_root_icu = f'{output_root}icu/'
output_root_hosp = f'{output_root}hosp/'

# icustays is processed first because it provides the patient ids
icu_stays_name = 'icustays.csv.gz'
icu_stays_path, icu_stays_path_out = (
    os.path.join(folder, icu_stays_name)
    for folder in (input_root_icu, output_root_icu)
)

In [3]:
# Step 1: Load unique subject IDs
def load_subject_ids(file_path, subject_id_col='subject_id', chunksize=100000):
    """Collect the unique subject IDs found in a gzip-compressed CSV file.

    The file is streamed in chunks and only the ID column is parsed, so the
    other columns never have to be materialised in memory.

    Args:
        file_path: Path to the .csv.gz file.
        subject_id_col: Name of the column holding the subject IDs.
        chunksize: Number of rows read per chunk.

    Returns:
        A list of unique IDs as plain Python values (missing values are
        skipped), or None if the file could not be read (the error is printed).
    """
    subject_ids = set()  # a set de-duplicates IDs that appear in several chunks

    try:
        # usecols restricts parsing to the ID column; the context manager makes
        # sure the underlying file handle is closed even if a chunk fails.
        with pd.read_csv(
            file_path,
            compression='gzip',
            usecols=[subject_id_col],
            chunksize=chunksize,
        ) as reader:
            for chunk in reader:
                # .tolist() converts numpy scalars to built-in Python numbers
                subject_ids.update(chunk[subject_id_col].dropna().unique().tolist())

        return list(subject_ids)  # a list is easier to sample from than a set
    except Exception as e:
        print(f"Error loading subject IDs from {file_path}: {e}")
        return None

# Step 2: Select a random percentage of the subject IDs
def select_random_subject_ids(subject_ids, percentage=5, seed=None):
    """Draw a random sample (without replacement) of the given subject IDs.

    Args:
        subject_ids: Iterable of subject IDs (list, set, ...).
        percentage: Share of the IDs to keep, between 0 and 100. The sample
            size is rounded down to a whole number of IDs.
        seed: Optional seed. When given, a private random generator is used so
            the same seed yields the same sample for the same input order;
            when None, the module-level `random` state is used.

    Returns:
        A list with the sampled subject IDs.

    Raises:
        ValueError: If `percentage` is outside the range [0, 100].
    """
    if not 0 <= percentage <= 100:
        raise ValueError(f"percentage must be between 0 and 100, got {percentage}")

    # random.sample needs a sequence (sets are rejected on recent Python versions)
    population = list(subject_ids)

    # Multiply before dividing: 100 * (29 / 100) evaluates to 28.999... in
    # floating point and would be truncated to 28 instead of 29.
    sample_size = int(len(population) * percentage / 100)

    rng = random if seed is None else random.Random(seed)
    return rng.sample(population, sample_size)

# Step 3: Filter a single file by patient IDs
def load_single_csv_gz(file_path, patient_ids, patient_id_col='subject_id', chunksize=100000, max_chunks=None, verbose=True):
    """Stream a .csv.gz file in chunks and keep only the rows of the given patients.

    Args:
        file_path: Path to the gzip-compressed CSV file.
        patient_ids: Collection of IDs to keep.
        patient_id_col: Name of the column holding the patient IDs.
        chunksize: Number of rows read per chunk.
        max_chunks: Optional cap on the number of chunks to process
            (None or 0 means the whole file is read).
        verbose: When True (default), print one progress line per chunk.

    Returns:
        A DataFrame with the matching rows. An empty DataFrame is returned when
        the ID column is missing, nothing was read, or an error occurred
        (a message is printed in each of these cases).
    """
    filtered_data = []
    patient_id_found = False  # becomes True once the ID column has been seen
    chunk_count = 0

    try:
        # The context manager closes the file handle even when we stop early.
        with pd.read_csv(file_path, compression='gzip', chunksize=chunksize) as reader:
            for chunk in reader:
                chunk_count += 1
                if verbose:
                    print(f"Processing chunk {chunk_count}")

                # Every chunk has the same columns, so a missing ID column can
                # never show up later; stop instead of reading the whole file.
                if patient_id_col not in chunk.columns:
                    break

                patient_id_found = True
                filtered_data.append(chunk[chunk[patient_id_col].isin(patient_ids)])

                # Stop processing if the max_chunks limit is reached
                if max_chunks and chunk_count >= max_chunks:
                    print(f"Stopping after {chunk_count} chunks as per max_chunks limit.")
                    break

        if patient_id_found and filtered_data:
            return pd.concat(filtered_data, ignore_index=True)

        # Column not found (or the file produced no chunks at all)
        print(f"Patient ID column '{patient_id_col}' not found or no matching patient IDs.")
        return pd.DataFrame()

    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return pd.DataFrame()  # Return empty DataFrame on error


    
# Step 4: Save the filtered DataFrame to a .csv.gz file
def save_filtered_df(filtered_df, output_path):
    """Write a DataFrame to a gzip-compressed CSV file.

    Nothing is written (only a message is printed) when `filtered_df` is None
    or empty. Missing parent folders of `output_path` are created.

    Args:
        filtered_df: The DataFrame to store.
        output_path: Destination path of the .csv.gz file.
    """
    if filtered_df is None or filtered_df.empty:
        print(f"No data to save for {output_path}")
        return

    # A bare file name has an empty dirname, and os.makedirs('') raises,
    # so only create the folder when there is one.
    target_dir = os.path.dirname(output_path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    filtered_df.to_csv(output_path, compression='gzip', index=False)
    print(f"Filtered data saved to {output_path}")

### Option 1: Randomly sample a subset of subject IDs to use

In [None]:
# Seed the random module so that re-running this cell selects the same subjects
random.seed(42)

# Step 1: Load subject IDs from the file
subject_ids = load_subject_ids(icu_stays_path)

if subject_ids:
    # Step 2: Select 3% of the subject IDs randomly
    selected_subject_ids = select_random_subject_ids(subject_ids, percentage=3)
    print(f"Selected {len(selected_subject_ids)} subject IDs from {len(subject_ids)} total.")

    # Step 3: Filter the icustays file by these subject IDs
    filtered_df = load_single_csv_gz(icu_stays_path, selected_subject_ids)

    # Step 4: Save the filtered data
    save_filtered_df(filtered_df, icu_stays_path_out)

    # From here on only the sampled IDs are used. This assignment must stay
    # inside the branch: selected_subject_ids does not exist when loading failed.
    subject_ids = selected_subject_ids
else:
    print(f"No subject IDs found in {icu_stays_path}")

In [None]:
# Number of subject IDs currently held in subject_ids
len(subject_ids)

### Option 2: Load the already chosen subject ids

In [4]:
# Re-use the IDs stored in the already reduced icustays file
subject_ids = load_subject_ids(file_path=icu_stays_path_out)

In [5]:
# Number of unique subject IDs read from the reduced icustays file
len(subject_ids)

1960

## Extract the smaller dataset versions for all files

In [None]:
# Input and output location of the inputevents file (ICU folder)
file_name = 'inputevents.csv.gz'
new_file_path, new_output_path = (
    os.path.join(folder, file_name) for folder in (input_root_icu, output_root_icu)
)

In [None]:
# Load and filter the new file by the selected subject IDs.
# load_single_csv_gz returns a DataFrame; indexing it with [0] would look up a
# column labelled 0 and raise a KeyError, so the result is used directly.
filtered_df = load_single_csv_gz(new_file_path, subject_ids)

# Save the filtered data
save_filtered_df(filtered_df, new_output_path)

In [None]:
# Write the current filtered_df to the current output path
save_filtered_df(filtered_df=filtered_df, output_path=new_output_path)

In [None]:
# Input and output location of the ingredientevents file (ICU folder)
file_name = 'ingredientevents.csv.gz'
new_file_path, new_output_path = (
    os.path.join(folder, file_name) for folder in (input_root_icu, output_root_icu)
)

In [None]:
# Load and filter the new file by the selected subject IDs.
# load_single_csv_gz returns a DataFrame; indexing it with [0] would look up a
# column labelled 0 and raise a KeyError, so the result is used directly.
filtered_df = load_single_csv_gz(new_file_path, subject_ids)

# Save the filtered data
save_filtered_df(filtered_df, new_output_path)

In [None]:
# Input and output location of the outputevents file (ICU folder)
file_name = 'outputevents.csv.gz'
new_file_path, new_output_path = (
    os.path.join(folder, file_name) for folder in (input_root_icu, output_root_icu)
)

In [None]:
# Keep only the rows of the selected subjects, then write the reduced file
filtered_df = load_single_csv_gz(file_path=new_file_path, patient_ids=subject_ids)
save_filtered_df(filtered_df=filtered_df, output_path=new_output_path)

In [None]:
# Input and output location of the procedureevents file (ICU folder)
file_name = 'procedureevents.csv.gz'
new_file_path, new_output_path = (
    os.path.join(folder, file_name) for folder in (input_root_icu, output_root_icu)
)

In [None]:
# Keep only the rows of the selected subjects, then write the reduced file
filtered_df = load_single_csv_gz(file_path=new_file_path, patient_ids=subject_ids)
save_filtered_df(filtered_df=filtered_df, output_path=new_output_path)

In [6]:
# Input and output location of the chartevents file (ICU folder)
file_name = 'chartevents.csv.gz'
new_file_path, new_output_path = (
    os.path.join(folder, file_name) for folder in (input_root_icu, output_root_icu)
)

In [8]:
# Keep only the rows of the selected subjects, then write the reduced file
filtered_df = load_single_csv_gz(file_path=new_file_path, patient_ids=subject_ids)
save_filtered_df(filtered_df=filtered_df, output_path=new_output_path)

Processing chunk 1
Processing chunk 2
Processing chunk 3
Processing chunk 4
Processing chunk 5
Processing chunk 6
Processing chunk 7
Processing chunk 8
Processing chunk 9
Processing chunk 10
Processing chunk 11
Processing chunk 12
Processing chunk 13
Processing chunk 14
Processing chunk 15
Processing chunk 16
Processing chunk 17
Processing chunk 18
Processing chunk 19
Processing chunk 20
Processing chunk 21
Processing chunk 22
Processing chunk 23
Processing chunk 24
Processing chunk 25
Processing chunk 26
Processing chunk 27
Processing chunk 28
Processing chunk 29
Processing chunk 30
Processing chunk 31
Processing chunk 32
Processing chunk 33
Processing chunk 34
Processing chunk 35
Processing chunk 36
Processing chunk 37
Processing chunk 38
Processing chunk 39
Processing chunk 40
Processing chunk 41
Processing chunk 42
Processing chunk 43
Processing chunk 44
Processing chunk 45
Processing chunk 46
Processing chunk 47
Processing chunk 48
Processing chunk 49
Processing chunk 50
Processin

Processing chunk 397
Processing chunk 398
Processing chunk 399
Processing chunk 400
Processing chunk 401
Processing chunk 402
Processing chunk 403
Processing chunk 404
Processing chunk 405
Processing chunk 406
Processing chunk 407
Processing chunk 408
Processing chunk 409
Processing chunk 410
Processing chunk 411
Processing chunk 412
Processing chunk 413
Processing chunk 414
Processing chunk 415
Processing chunk 416
Processing chunk 417
Processing chunk 418
Processing chunk 419
Processing chunk 420
Processing chunk 421
Processing chunk 422
Processing chunk 423
Processing chunk 424
Processing chunk 425
Processing chunk 426
Processing chunk 427
Processing chunk 428
Processing chunk 429
Processing chunk 430
Processing chunk 431
Processing chunk 432
Processing chunk 433
Processing chunk 434
Processing chunk 435
Processing chunk 436
Processing chunk 437
Processing chunk 438
Processing chunk 439
Processing chunk 440
Processing chunk 441
Processing chunk 442
Processing chunk 443
Processing ch

Processing chunk 789
Processing chunk 790
Processing chunk 791
Processing chunk 792
Processing chunk 793
Processing chunk 794
Processing chunk 795
Processing chunk 796
Processing chunk 797
Processing chunk 798
Processing chunk 799
Processing chunk 800
Processing chunk 801
Processing chunk 802
Processing chunk 803
Processing chunk 804
Processing chunk 805
Processing chunk 806
Processing chunk 807
Processing chunk 808
Processing chunk 809
Processing chunk 810
Processing chunk 811
Processing chunk 812
Processing chunk 813
Processing chunk 814
Processing chunk 815
Processing chunk 816
Processing chunk 817
Processing chunk 818
Processing chunk 819
Processing chunk 820
Processing chunk 821
Processing chunk 822
Processing chunk 823
Processing chunk 824
Processing chunk 825
Processing chunk 826
Processing chunk 827
Processing chunk 828
Processing chunk 829
Processing chunk 830
Processing chunk 831
Processing chunk 832
Processing chunk 833
Processing chunk 834
Processing chunk 835
Processing ch

Processing chunk 1171
Processing chunk 1172
Processing chunk 1173
Processing chunk 1174
Processing chunk 1175
Processing chunk 1176
Processing chunk 1177
Processing chunk 1178
Processing chunk 1179
Processing chunk 1180
Processing chunk 1181
Processing chunk 1182
Processing chunk 1183
Processing chunk 1184
Processing chunk 1185
Processing chunk 1186
Processing chunk 1187
Processing chunk 1188
Processing chunk 1189
Processing chunk 1190
Processing chunk 1191
Processing chunk 1192
Processing chunk 1193
Processing chunk 1194
Processing chunk 1195
Processing chunk 1196
Processing chunk 1197
Processing chunk 1198
Processing chunk 1199
Processing chunk 1200
Processing chunk 1201
Processing chunk 1202
Processing chunk 1203
Processing chunk 1204
Processing chunk 1205
Processing chunk 1206
Processing chunk 1207
Processing chunk 1208
Processing chunk 1209
Processing chunk 1210
Processing chunk 1211
Processing chunk 1212
Processing chunk 1213
Processing chunk 1214
Processing chunk 1215
Processing

Processing chunk 1544
Processing chunk 1545
Processing chunk 1546
Processing chunk 1547
Processing chunk 1548
Processing chunk 1549
Processing chunk 1550
Processing chunk 1551
Processing chunk 1552
Processing chunk 1553
Processing chunk 1554
Processing chunk 1555
Processing chunk 1556
Processing chunk 1557
Processing chunk 1558
Processing chunk 1559
Processing chunk 1560
Processing chunk 1561
Processing chunk 1562
Processing chunk 1563
Processing chunk 1564
Processing chunk 1565
Processing chunk 1566
Processing chunk 1567
Processing chunk 1568
Processing chunk 1569
Processing chunk 1570
Processing chunk 1571
Processing chunk 1572
Processing chunk 1573
Processing chunk 1574
Processing chunk 1575
Processing chunk 1576
Processing chunk 1577
Processing chunk 1578
Processing chunk 1579
Processing chunk 1580
Processing chunk 1581
Processing chunk 1582
Processing chunk 1583
Processing chunk 1584
Processing chunk 1585
Processing chunk 1586
Processing chunk 1587
Processing chunk 1588
Processing

Processing chunk 1918
Processing chunk 1919
Processing chunk 1920
Processing chunk 1921
Processing chunk 1922
Processing chunk 1923
Processing chunk 1924
Processing chunk 1925
Processing chunk 1926
Processing chunk 1927
Processing chunk 1928
Processing chunk 1929
Processing chunk 1930
Processing chunk 1931
Processing chunk 1932
Processing chunk 1933
Processing chunk 1934
Processing chunk 1935
Processing chunk 1936
Processing chunk 1937
Processing chunk 1938
Processing chunk 1939
Processing chunk 1940
Processing chunk 1941
Processing chunk 1942
Processing chunk 1943
Processing chunk 1944
Processing chunk 1945
Processing chunk 1946
Processing chunk 1947
Processing chunk 1948
Processing chunk 1949
Processing chunk 1950
Processing chunk 1951
Processing chunk 1952
Processing chunk 1953
Processing chunk 1954
Processing chunk 1955
Processing chunk 1956
Processing chunk 1957
Processing chunk 1958
Processing chunk 1959
Processing chunk 1960
Processing chunk 1961
Processing chunk 1962
Processing

Processing chunk 2291
Processing chunk 2292
Processing chunk 2293
Processing chunk 2294
Processing chunk 2295
Processing chunk 2296
Processing chunk 2297
Processing chunk 2298
Processing chunk 2299
Processing chunk 2300
Processing chunk 2301
Processing chunk 2302
Processing chunk 2303
Processing chunk 2304
Processing chunk 2305
Processing chunk 2306
Processing chunk 2307
Processing chunk 2308
Processing chunk 2309
Processing chunk 2310
Processing chunk 2311
Processing chunk 2312
Processing chunk 2313
Processing chunk 2314
Processing chunk 2315
Processing chunk 2316
Processing chunk 2317
Processing chunk 2318
Processing chunk 2319
Processing chunk 2320
Processing chunk 2321
Processing chunk 2322
Processing chunk 2323
Processing chunk 2324
Processing chunk 2325
Processing chunk 2326
Processing chunk 2327
Processing chunk 2328
Processing chunk 2329
Processing chunk 2330
Processing chunk 2331
Processing chunk 2332
Processing chunk 2333
Processing chunk 2334
Processing chunk 2335
Processing

Processing chunk 2664
Processing chunk 2665
Processing chunk 2666
Processing chunk 2667
Processing chunk 2668
Processing chunk 2669
Processing chunk 2670
Processing chunk 2671
Processing chunk 2672
Processing chunk 2673
Processing chunk 2674
Processing chunk 2675
Processing chunk 2676
Processing chunk 2677
Processing chunk 2678
Processing chunk 2679
Processing chunk 2680
Processing chunk 2681
Processing chunk 2682
Processing chunk 2683
Processing chunk 2684
Processing chunk 2685
Processing chunk 2686
Processing chunk 2687
Processing chunk 2688
Processing chunk 2689
Processing chunk 2690
Processing chunk 2691
Processing chunk 2692
Processing chunk 2693
Processing chunk 2694
Processing chunk 2695
Processing chunk 2696
Processing chunk 2697
Processing chunk 2698
Processing chunk 2699
Processing chunk 2700
Processing chunk 2701
Processing chunk 2702
Processing chunk 2703
Processing chunk 2704
Processing chunk 2705
Processing chunk 2706
Processing chunk 2707
Processing chunk 2708
Processing

Processing chunk 3038
Processing chunk 3039
Processing chunk 3040
Processing chunk 3041
Processing chunk 3042
Processing chunk 3043
Processing chunk 3044
Processing chunk 3045
Processing chunk 3046
Processing chunk 3047
Processing chunk 3048
Processing chunk 3049
Processing chunk 3050
Processing chunk 3051
Processing chunk 3052
Processing chunk 3053
Processing chunk 3054
Processing chunk 3055
Processing chunk 3056
Processing chunk 3057
Processing chunk 3058
Processing chunk 3059
Processing chunk 3060
Processing chunk 3061
Processing chunk 3062
Processing chunk 3063
Processing chunk 3064
Processing chunk 3065
Processing chunk 3066
Processing chunk 3067
Processing chunk 3068
Processing chunk 3069
Processing chunk 3070
Processing chunk 3071
Processing chunk 3072
Processing chunk 3073
Processing chunk 3074
Processing chunk 3075
Processing chunk 3076
Processing chunk 3077
Processing chunk 3078
Processing chunk 3079
Processing chunk 3080
Processing chunk 3081
Processing chunk 3082
Processing

Processing chunk 3411
Processing chunk 3412
Processing chunk 3413
Processing chunk 3414
Processing chunk 3415
Processing chunk 3416
Processing chunk 3417
Processing chunk 3418
Processing chunk 3419
Processing chunk 3420
Processing chunk 3421
Processing chunk 3422
Processing chunk 3423
Processing chunk 3424
Processing chunk 3425
Processing chunk 3426
Processing chunk 3427
Processing chunk 3428
Processing chunk 3429
Processing chunk 3430
Processing chunk 3431
Processing chunk 3432
Processing chunk 3433
Processing chunk 3434
Processing chunk 3435
Processing chunk 3436
Processing chunk 3437
Processing chunk 3438
Processing chunk 3439
Processing chunk 3440
Processing chunk 3441
Processing chunk 3442
Processing chunk 3443
Processing chunk 3444
Processing chunk 3445
Processing chunk 3446
Processing chunk 3447
Processing chunk 3448
Processing chunk 3449
Processing chunk 3450
Processing chunk 3451
Processing chunk 3452
Processing chunk 3453
Processing chunk 3454
Processing chunk 3455
Processing

Processing chunk 3785
Processing chunk 3786
Processing chunk 3787
Processing chunk 3788
Processing chunk 3789
Processing chunk 3790
Processing chunk 3791
Processing chunk 3792
Processing chunk 3793
Processing chunk 3794
Processing chunk 3795
Processing chunk 3796
Processing chunk 3797
Processing chunk 3798
Processing chunk 3799
Processing chunk 3800
Processing chunk 3801
Processing chunk 3802
Processing chunk 3803
Processing chunk 3804
Processing chunk 3805
Processing chunk 3806
Processing chunk 3807
Processing chunk 3808
Processing chunk 3809
Processing chunk 3810
Processing chunk 3811
Processing chunk 3812
Processing chunk 3813
Processing chunk 3814
Processing chunk 3815
Processing chunk 3816
Processing chunk 3817
Processing chunk 3818
Processing chunk 3819
Processing chunk 3820
Processing chunk 3821
Processing chunk 3822
Processing chunk 3823
Processing chunk 3824
Processing chunk 3825
Processing chunk 3826
Processing chunk 3827
Processing chunk 3828
Processing chunk 3829
Processing

Processing chunk 4158
Processing chunk 4159
Processing chunk 4160
Processing chunk 4161
Processing chunk 4162
Processing chunk 4163
Processing chunk 4164
Processing chunk 4165
Processing chunk 4166
Processing chunk 4167
Processing chunk 4168
Processing chunk 4169
Processing chunk 4170
Processing chunk 4171
Processing chunk 4172
Processing chunk 4173
Processing chunk 4174
Processing chunk 4175
Processing chunk 4176
Processing chunk 4177
Processing chunk 4178
Processing chunk 4179
Processing chunk 4180
Processing chunk 4181
Processing chunk 4182
Processing chunk 4183
Processing chunk 4184
Processing chunk 4185
Processing chunk 4186
Processing chunk 4187
Processing chunk 4188
Processing chunk 4189
Processing chunk 4190
Processing chunk 4191
Processing chunk 4192
Processing chunk 4193
Processing chunk 4194
Processing chunk 4195
Processing chunk 4196
Processing chunk 4197
Processing chunk 4198
Processing chunk 4199
Processing chunk 4200
Processing chunk 4201
Processing chunk 4202
Processing

In [9]:
# Input and output location of the datetimeevents file (ICU folder)
file_name = 'datetimeevents.csv.gz'
new_file_path, new_output_path = (
    os.path.join(folder, file_name) for folder in (input_root_icu, output_root_icu)
)

In [12]:
# Keep only the rows of the selected subjects, then write the reduced file
filtered_df = load_single_csv_gz(file_path=new_file_path, patient_ids=subject_ids)
save_filtered_df(filtered_df=filtered_df, output_path=new_output_path)

Processing chunk 1
Processing chunk 2
Processing chunk 3
Processing chunk 4
Processing chunk 5
Processing chunk 6
Processing chunk 7
Processing chunk 8
Processing chunk 9
Processing chunk 10
Processing chunk 11
Processing chunk 12
Processing chunk 13
Processing chunk 14
Processing chunk 15
Processing chunk 16
Processing chunk 17
Processing chunk 18
Processing chunk 19
Processing chunk 20
Processing chunk 21
Processing chunk 22
Processing chunk 23
Processing chunk 24
Processing chunk 25
Processing chunk 26
Processing chunk 27
Processing chunk 28
Processing chunk 29
Processing chunk 30
Processing chunk 31
Processing chunk 32
Processing chunk 33
Processing chunk 34
Processing chunk 35
Processing chunk 36
Processing chunk 37
Processing chunk 38
Processing chunk 39
Processing chunk 40
Processing chunk 41
Processing chunk 42
Processing chunk 43
Processing chunk 44
Processing chunk 45
Processing chunk 46
Processing chunk 47
Processing chunk 48
Processing chunk 49
Processing chunk 50
Processin

In [14]:
# Input and output location of the d_items file (ICU folder)
file_name = 'd_items.csv.gz'
new_file_path, new_output_path = (
    os.path.join(folder, file_name) for folder in (input_root_icu, output_root_icu)
)

In [15]:
# Try to keep only the rows of the selected subjects, then write the reduced file
filtered_df = load_single_csv_gz(file_path=new_file_path, patient_ids=subject_ids)
save_filtered_df(filtered_df=filtered_df, output_path=new_output_path)

Processing chunk 1
Patient ID column 'subject_id' not found or no matching patient IDs.
No data to save for ../data/real_world_data/physionet.org_small/files/mimiciv/3.0/icu/d_items.csv.gz


In [16]:
# Display the result of the previous cell (load_single_csv_gz returns an empty
# DataFrame when the file has no subject_id column)
filtered_df

## Load other data that cannot be matched by subject ID

In [None]:
# d_items is stored in the ICU folder. The previously used names input_path and
# output_path are not defined in this notebook, so use the ICU roots instead.
file_name = 'd_items.csv.gz'
new_file_path = os.path.join(input_root_icu, file_name)
new_output_path = os.path.join(output_root_icu, file_name)

In [None]:
# This file has no subject_id column, so filtering by subject always yields an
# empty frame; read the table in full instead.
filtered_df = pd.read_csv(new_file_path, compression='gzip')

# Save the (unfiltered) table next to the reduced files
save_filtered_df(filtered_df, new_output_path)

In [None]:
# Display the DataFrame produced by the previous cell
filtered_df

## Load Hosp data

In [17]:
# Input and output location of the patients file (hosp folder)
file_name = 'patients.csv.gz'
new_file_path, new_output_path = (
    os.path.join(folder, file_name) for folder in (input_root_hosp, output_root_hosp)
)

In [19]:
# Keep only the rows of the selected subjects, then write the reduced file
filtered_df = load_single_csv_gz(file_path=new_file_path, patient_ids=subject_ids)
save_filtered_df(filtered_df=filtered_df, output_path=new_output_path)

Processing chunk 1
Processing chunk 2
Processing chunk 3
Processing chunk 4
Filtered data saved to ../data/real_world_data/physionet.org_small/files/mimiciv/3.0/hosp/patients.csv.gz


In [20]:
# Input and output location of the admissions file (hosp folder)
file_name = 'admissions.csv.gz'
new_file_path, new_output_path = (
    os.path.join(folder, file_name) for folder in (input_root_hosp, output_root_hosp)
)

In [21]:
# Keep only the rows of the selected subjects, then write the reduced file
filtered_df = load_single_csv_gz(file_path=new_file_path, patient_ids=subject_ids)
save_filtered_df(filtered_df=filtered_df, output_path=new_output_path)

Processing chunk 1
Processing chunk 2
Processing chunk 3
Processing chunk 4
Processing chunk 5
Processing chunk 6
Filtered data saved to ../data/real_world_data/physionet.org_small/files/mimiciv/3.0/hosp/admissions.csv.gz


In [23]:
# Input and output location of the diagnoses_icd file (hosp folder)
file_name = 'diagnoses_icd.csv.gz'
new_file_path, new_output_path = (
    os.path.join(folder, file_name) for folder in (input_root_hosp, output_root_hosp)
)

# Keep only the rows of the selected subjects, then write the reduced file
filtered_df = load_single_csv_gz(file_path=new_file_path, patient_ids=subject_ids)
save_filtered_df(filtered_df=filtered_df, output_path=new_output_path)

Processing chunk 1
Processing chunk 2
Processing chunk 3
Processing chunk 4
Processing chunk 5
Processing chunk 6
Processing chunk 7
Processing chunk 8
Processing chunk 9
Processing chunk 10
Processing chunk 11
Processing chunk 12
Processing chunk 13
Processing chunk 14
Processing chunk 15
Processing chunk 16
Processing chunk 17
Processing chunk 18
Processing chunk 19
Processing chunk 20
Processing chunk 21
Processing chunk 22
Processing chunk 23
Processing chunk 24
Processing chunk 25
Processing chunk 26
Processing chunk 27
Processing chunk 28
Processing chunk 29
Processing chunk 30
Processing chunk 31
Processing chunk 32
Processing chunk 33
Processing chunk 34
Processing chunk 35
Processing chunk 36
Processing chunk 37
Processing chunk 38
Processing chunk 39
Processing chunk 40
Processing chunk 41
Processing chunk 42
Processing chunk 43
Processing chunk 44
Processing chunk 45
Processing chunk 46
Processing chunk 47
Processing chunk 48
Processing chunk 49
Processing chunk 50
Processin

In [24]:
# Input and output location of the labevents file (hosp folder)
file_name = 'labevents.csv.gz'
new_file_path, new_output_path = (
    os.path.join(folder, file_name) for folder in (input_root_hosp, output_root_hosp)
)

# Keep only the rows of the selected subjects, then write the reduced file
filtered_df = load_single_csv_gz(file_path=new_file_path, patient_ids=subject_ids)
save_filtered_df(filtered_df=filtered_df, output_path=new_output_path)

Processing chunk 1
Processing chunk 2
Processing chunk 3
Processing chunk 4
Processing chunk 5
Processing chunk 6
Processing chunk 7
Processing chunk 8
Processing chunk 9
Processing chunk 10
Processing chunk 11


  if (await self.run_code(code, result,  async_=asy)):


Processing chunk 12
Processing chunk 13
Processing chunk 14
Processing chunk 15
Processing chunk 16
Processing chunk 17
Processing chunk 18
Processing chunk 19
Processing chunk 20
Processing chunk 21
Processing chunk 22
Processing chunk 23
Processing chunk 24
Processing chunk 25
Processing chunk 26
Processing chunk 27
Processing chunk 28
Processing chunk 29
Processing chunk 30
Processing chunk 31
Processing chunk 32
Processing chunk 33
Processing chunk 34
Processing chunk 35
Processing chunk 36
Processing chunk 37
Processing chunk 38
Processing chunk 39
Processing chunk 40
Processing chunk 41
Processing chunk 42
Processing chunk 43
Processing chunk 44
Processing chunk 45
Processing chunk 46
Processing chunk 47
Processing chunk 48
Processing chunk 49
Processing chunk 50
Processing chunk 51
Processing chunk 52
Processing chunk 53
Processing chunk 54
Processing chunk 55
Processing chunk 56
Processing chunk 57
Processing chunk 58
Processing chunk 59
Processing chunk 60
Processing chunk 61


Processing chunk 407
Processing chunk 408
Processing chunk 409
Processing chunk 410
Processing chunk 411
Processing chunk 412
Processing chunk 413
Processing chunk 414
Processing chunk 415
Processing chunk 416
Processing chunk 417
Processing chunk 418
Processing chunk 419
Processing chunk 420
Processing chunk 421
Processing chunk 422
Processing chunk 423
Processing chunk 424
Processing chunk 425
Processing chunk 426
Processing chunk 427
Processing chunk 428
Processing chunk 429
Processing chunk 430
Processing chunk 431
Processing chunk 432
Processing chunk 433
Processing chunk 434
Processing chunk 435
Processing chunk 436
Processing chunk 437
Processing chunk 438
Processing chunk 439
Processing chunk 440
Processing chunk 441
Processing chunk 442
Processing chunk 443
Processing chunk 444
Processing chunk 445
Processing chunk 446
Processing chunk 447
Processing chunk 448
Processing chunk 449
Processing chunk 450
Processing chunk 451
Processing chunk 452
Processing chunk 453
Processing ch

Processing chunk 798
Processing chunk 799
Processing chunk 800
Processing chunk 801
Processing chunk 802
Processing chunk 803
Processing chunk 804
Processing chunk 805
Processing chunk 806
Processing chunk 807
Processing chunk 808
Processing chunk 809
Processing chunk 810
Processing chunk 811
Processing chunk 812
Processing chunk 813
Processing chunk 814
Processing chunk 815
Processing chunk 816
Processing chunk 817
Processing chunk 818
Processing chunk 819
Processing chunk 820
Processing chunk 821
Processing chunk 822
Processing chunk 823
Processing chunk 824
Processing chunk 825
Processing chunk 826
Processing chunk 827
Processing chunk 828
Processing chunk 829
Processing chunk 830
Processing chunk 831
Processing chunk 832
Processing chunk 833
Processing chunk 834
Processing chunk 835
Processing chunk 836
Processing chunk 837
Processing chunk 838
Processing chunk 839
Processing chunk 840
Processing chunk 841
Processing chunk 842
Processing chunk 843
Processing chunk 844
Processing ch

Processing chunk 1180
Processing chunk 1181
Processing chunk 1182
Processing chunk 1183
Processing chunk 1184
Processing chunk 1185
Processing chunk 1186
Processing chunk 1187
Processing chunk 1188
Processing chunk 1189
Processing chunk 1190
Processing chunk 1191
Processing chunk 1192
Processing chunk 1193
Processing chunk 1194
Processing chunk 1195
Processing chunk 1196
Processing chunk 1197
Processing chunk 1198
Processing chunk 1199
Processing chunk 1200
Processing chunk 1201
Processing chunk 1202
Processing chunk 1203
Processing chunk 1204
Processing chunk 1205
Processing chunk 1206
Processing chunk 1207
Processing chunk 1208
Processing chunk 1209
Processing chunk 1210
Processing chunk 1211
Processing chunk 1212
Processing chunk 1213
Processing chunk 1214
Processing chunk 1215
Processing chunk 1216
Processing chunk 1217
Processing chunk 1218
Processing chunk 1219
Processing chunk 1220
Processing chunk 1221
Processing chunk 1222
Processing chunk 1223
Processing chunk 1224
Processing

Processing chunk 1553
Processing chunk 1554
Processing chunk 1555
Processing chunk 1556
Processing chunk 1557
Processing chunk 1558
Processing chunk 1559
Processing chunk 1560
Processing chunk 1561
Processing chunk 1562
Processing chunk 1563
Processing chunk 1564
Processing chunk 1565
Processing chunk 1566
Processing chunk 1567
Processing chunk 1568
Processing chunk 1569
Processing chunk 1570
Processing chunk 1571
Processing chunk 1572
Processing chunk 1573
Processing chunk 1574
Processing chunk 1575
Processing chunk 1576
Processing chunk 1577
Processing chunk 1578
Processing chunk 1579
Processing chunk 1580
Processing chunk 1581
Processing chunk 1582
Processing chunk 1583
Processing chunk 1584
Processing chunk 1585
Filtered data saved to ../data/real_world_data/physionet.org_small/files/mimiciv/3.0/hosp/labevents.csv.gz


In [25]:
# Input and output location of the microbiologyevents file (hosp folder)
file_name = 'microbiologyevents.csv.gz'
new_file_path, new_output_path = (
    os.path.join(folder, file_name) for folder in (input_root_hosp, output_root_hosp)
)

# Keep only the rows of the selected subjects, then write the reduced file
filtered_df = load_single_csv_gz(file_path=new_file_path, patient_ids=subject_ids)
save_filtered_df(filtered_df=filtered_df, output_path=new_output_path)

  if (await self.run_code(code, result,  async_=asy)):


Processing chunk 1
Processing chunk 2
Processing chunk 3
Processing chunk 4
Processing chunk 5
Processing chunk 6
Processing chunk 7
Processing chunk 8
Processing chunk 9
Processing chunk 10
Processing chunk 11
Processing chunk 12
Processing chunk 13
Processing chunk 14
Processing chunk 15
Processing chunk 16
Processing chunk 17
Processing chunk 18
Processing chunk 19
Processing chunk 20
Processing chunk 21
Processing chunk 22
Processing chunk 23
Processing chunk 24
Processing chunk 25
Processing chunk 26
Processing chunk 27
Processing chunk 28
Processing chunk 29
Processing chunk 30
Processing chunk 31
Processing chunk 32
Processing chunk 33
Processing chunk 34
Processing chunk 35
Processing chunk 36
Processing chunk 37
Processing chunk 38
Processing chunk 39
Processing chunk 40
Filtered data saved to ../data/real_world_data/physionet.org_small/files/mimiciv/3.0/hosp/microbiologyevents.csv.gz


In [26]:
# Input and output location of the prescriptions file (hosp folder)
file_name = 'prescriptions.csv.gz'
new_file_path, new_output_path = (
    os.path.join(folder, file_name) for folder in (input_root_hosp, output_root_hosp)
)

# Keep only the rows of the selected subjects, then write the reduced file
filtered_df = load_single_csv_gz(file_path=new_file_path, patient_ids=subject_ids)
save_filtered_df(filtered_df=filtered_df, output_path=new_output_path)

  if (await self.run_code(code, result,  async_=asy)):


Processing chunk 1
Processing chunk 2
Processing chunk 3


  if (await self.run_code(code, result,  async_=asy)):


Processing chunk 4
Processing chunk 5
Processing chunk 6
Processing chunk 7
Processing chunk 8
Processing chunk 9
Processing chunk 10
Processing chunk 11
Processing chunk 12
Processing chunk 13
Processing chunk 14
Processing chunk 15
Processing chunk 16
Processing chunk 17
Processing chunk 18
Processing chunk 19
Processing chunk 20
Processing chunk 21
Processing chunk 22


  if (await self.run_code(code, result,  async_=asy)):


Processing chunk 23
Processing chunk 24
Processing chunk 25
Processing chunk 26
Processing chunk 27
Processing chunk 28
Processing chunk 29
Processing chunk 30
Processing chunk 31
Processing chunk 32
Processing chunk 33
Processing chunk 34
Processing chunk 35
Processing chunk 36
Processing chunk 37
Processing chunk 38
Processing chunk 39
Processing chunk 40
Processing chunk 41
Processing chunk 42
Processing chunk 43
Processing chunk 44
Processing chunk 45
Processing chunk 46
Processing chunk 47
Processing chunk 48
Processing chunk 49
Processing chunk 50
Processing chunk 51
Processing chunk 52
Processing chunk 53
Processing chunk 54
Processing chunk 55
Processing chunk 56
Processing chunk 57
Processing chunk 58
Processing chunk 59
Processing chunk 60
Processing chunk 61
Processing chunk 62
Processing chunk 63
Processing chunk 64
Processing chunk 65
Processing chunk 66
Processing chunk 67
Processing chunk 68
Processing chunk 69
Processing chunk 70
Processing chunk 71
Processing chunk 72
