# Split the MIMIC IV dataset 
The goal of this notebook is to store smaller versions of the MIMIC-IV dataset files for faster prototyping.

In [None]:
import pandas as pd
import os
import random

In [None]:
# Where the full-size files live and where the reduced copies are written
input_root = '../data/real_world_data/physionet.org/files/mimiciv/3.0/'
output_root = '../data/real_world_data/physionet.org_small/files/mimiciv/3.0/'

# The data is organised in two sub-folders (icu and hosp); mirror both on the output side
input_root_icu, input_root_hosp = (input_root + sub for sub in ('icu/', 'hosp/'))
output_root_icu, output_root_hosp = (output_root + sub for sub in ('icu/', 'hosp/'))

# icustays is handled first because the patient IDs are taken from it
icu_stays_name = 'icustays.csv.gz'
icu_stays_path = os.path.join(input_root_icu, icu_stays_name)
icu_stays_path_out = os.path.join(output_root_icu, icu_stays_name)

In [None]:
# Step 1: Load unique subject IDs
def load_subject_ids(file_path, subject_id_col='subject_id', chunksize=100000):
    """Collect the distinct subject IDs stored in a gzip-compressed CSV file.

    The file is read ``chunksize`` rows at a time so it never has to fit in
    memory as a whole.

    Args:
        file_path: Path of the ``.csv.gz`` file to scan.
        subject_id_col: Name of the column holding the subject IDs.
        chunksize: Number of rows read per chunk.

    Returns:
        A list with every distinct value of ``subject_id_col`` (in no
        particular order), or ``None`` if reading fails for any reason; the
        error is printed in that case.
    """
    seen_ids = set()  # de-duplicates IDs that occur in several rows or chunks

    try:
        reader = pd.read_csv(file_path, compression='gzip', chunksize=chunksize)
        for frame in reader:
            seen_ids.update(frame[subject_id_col].unique())
    except Exception as e:
        print(f"Error loading subject IDs from {file_path}: {e}")
        return None

    # Hand back a list (not a set) so callers can sample from it directly
    return list(seen_ids)

# Step 2: Select a random percentage of the subject IDs
def select_random_subject_ids(subject_ids, percentage=5):
    """Draw a random sample containing ``percentage`` percent of the subject IDs.

    Args:
        subject_ids: Sequence of subject IDs to sample from.
        percentage: Share of the IDs to keep, in percent (default 5).

    Returns:
        A list of ``int(len(subject_ids) * percentage / 100)`` IDs drawn
        without replacement.

    Raises:
        ValueError: Propagated from ``random.sample`` when the computed sample
            size is negative or larger than the population.
    """
    # Multiply before dividing: ``n * (percentage / 100)`` can land just below
    # the exact integer (e.g. 100 * 0.29 == 28.999...) and int() would then
    # silently drop one ID.
    sample_size = int(len(subject_ids) * percentage / 100)
    return random.sample(subject_ids, sample_size)

def load_single_csv_gz(file_path, patient_ids, patient_id_col='subject_id', chunksize=100000, max_chunks=None):
    """Read a gzip-compressed CSV in chunks and keep only the rows of the given patients.

    Args:
        file_path: Path of the ``.csv.gz`` file to read.
        patient_ids: Collection of IDs; rows whose ``patient_id_col`` value is
            in this collection are kept.
        patient_id_col: Name of the column holding the patient IDs.
        chunksize: Number of rows read per chunk.
        max_chunks: Optional upper bound on the number of chunks to process;
            ``None`` (or 0) means the whole file is read.

    Returns:
        A DataFrame with the matching rows of all processed chunks. An empty
        DataFrame is returned when the ID column is missing, when the file
        yields no chunks, or when any error occurs (the error is printed).
    """
    filtered_data = []
    chunk_count = 0  # number of chunks read so far

    try:
        # The reader is used as a context manager so the underlying file is
        # closed even when the loop is left early.
        with pd.read_csv(file_path, compression='gzip', chunksize=chunksize) as reader:
            for chunk in reader:
                chunk_count += 1
                print(f"Processing chunk {chunk_count}")

                if patient_id_col not in chunk.columns:
                    # All chunks share the same header, so there is no point
                    # in decompressing the rest of the file.
                    break

                # Keep only the rows belonging to the requested patients
                filtered_data.append(chunk[chunk[patient_id_col].isin(patient_ids)])

                # Stop processing if the max_chunks limit is reached
                if max_chunks and chunk_count >= max_chunks:
                    print(f"Stopping after {chunk_count} chunks as per max_chunks limit.")
                    break

        if filtered_data:
            return pd.concat(filtered_data, ignore_index=True)

        # Nothing was collected: the column is absent or the file had no rows
        print(f"Patient ID column '{patient_id_col}' not found or no matching patient IDs.")
        return pd.DataFrame()

    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return pd.DataFrame()  # Return empty DataFrame on error


    
# Step 4: Save the filtered DataFrame to a .csv.gz file
def save_filtered_df(filtered_df, output_path):
    """Write a DataFrame to a gzip-compressed CSV file, creating the folder if needed.

    Nothing is written (only a message is printed) when ``filtered_df`` is
    ``None`` or empty.

    Args:
        filtered_df: DataFrame to store, or ``None``.
        output_path: Destination path of the ``.csv.gz`` file.
    """
    if filtered_df is None or filtered_df.empty:
        print(f"No data to save for {output_path}")
        return

    # A bare file name has no directory part; os.makedirs('') would raise,
    # so the folder is only created when there is one.
    target_dir = os.path.dirname(output_path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    filtered_df.to_csv(output_path, compression='gzip', index=False)
    print(f"Filtered data saved to {output_path}")

### Option 1: Randomly sample a subset of subject IDs to use

In [None]:
# Step 1: Load subject IDs from the file
subject_ids = load_subject_ids(icu_stays_path)

if subject_ids:
    # Step 2: Randomly select 3% of the subject IDs
    selected_subject_ids = select_random_subject_ids(subject_ids, percentage=3)
    print(f"Selected {len(selected_subject_ids)} subject IDs from {len(subject_ids)} total.")

    # Step 3: Filter the icustays file by these subject IDs
    filtered_df = load_single_csv_gz(icu_stays_path, selected_subject_ids)

    # Step 4: Save the filtered data
    save_filtered_df(filtered_df, icu_stays_path_out)

    # The following cells filter by `subject_ids`, so point it at the sample.
    # This is done inside the branch because `selected_subject_ids` only
    # exists when IDs were actually loaded.
    subject_ids = selected_subject_ids
else:
    print(f"No subject IDs found in {icu_stays_path}")

In [None]:
# Sanity check: number of subject IDs that will be used for filtering
len(subject_ids)

### Option 2: Load the previously selected subject IDs

In [None]:
# Recover the subject IDs from the reduced icustays file written earlier
subject_ids = load_subject_ids(file_path=icu_stays_path_out)

In [None]:
# Sanity check: number of subject IDs read back from the reduced icustays file
len(subject_ids)

## Extract the smaller dataset versions for all files

In [None]:
# inputevents: source archive and destination path inside the ICU folders
file_name = 'inputevents.csv.gz'
new_file_path, new_output_path = (
    os.path.join(root, file_name) for root in (input_root_icu, output_root_icu)
)

In [None]:
# Load and filter the new file by the selected subject IDs.
# load_single_csv_gz returns the DataFrame itself, so the result must not be
# indexed with [0] (that would look up a column labelled 0 and raise KeyError).
filtered_df = load_single_csv_gz(new_file_path, subject_ids)

# Save the filtered data
save_filtered_df(filtered_df, new_output_path)

In [None]:
# NOTE(review): repeats the save done at the end of the previous cell
# (same filtered_df, same new_output_path) — likely a leftover; confirm it is still needed.
save_filtered_df(filtered_df, new_output_path)

In [None]:
# ingredientevents: source archive and destination path inside the ICU folders
file_name = 'ingredientevents.csv.gz'
new_file_path, new_output_path = (
    os.path.join(root, file_name) for root in (input_root_icu, output_root_icu)
)

In [None]:
# Load and filter the new file by the selected subject IDs.
# load_single_csv_gz returns the DataFrame itself, so the result must not be
# indexed with [0] (that would look up a column labelled 0 and raise KeyError).
filtered_df = load_single_csv_gz(new_file_path, subject_ids)

# Save the filtered data
save_filtered_df(filtered_df, new_output_path)

In [None]:
# outputevents: source archive and destination path inside the ICU folders
file_name = 'outputevents.csv.gz'
new_file_path, new_output_path = (
    os.path.join(root, file_name) for root in (input_root_icu, output_root_icu)
)

In [None]:
# Keep only the rows that belong to the selected subjects ...
filtered_df = load_single_csv_gz(file_path=new_file_path, patient_ids=subject_ids)

# ... and write them to the reduced dataset
save_filtered_df(filtered_df=filtered_df, output_path=new_output_path)

In [None]:
# procedureevents: source archive and destination path inside the ICU folders
file_name = 'procedureevents.csv.gz'
new_file_path, new_output_path = (
    os.path.join(root, file_name) for root in (input_root_icu, output_root_icu)
)

In [None]:
# Keep only the rows that belong to the selected subjects ...
filtered_df = load_single_csv_gz(file_path=new_file_path, patient_ids=subject_ids)

# ... and write them to the reduced dataset
save_filtered_df(filtered_df=filtered_df, output_path=new_output_path)

In [None]:
# chartevents: source archive and destination path inside the ICU folders
file_name = 'chartevents.csv.gz'
new_file_path, new_output_path = (
    os.path.join(root, file_name) for root in (input_root_icu, output_root_icu)
)

In [None]:
# Keep only the rows that belong to the selected subjects ...
filtered_df = load_single_csv_gz(file_path=new_file_path, patient_ids=subject_ids)

# ... and write them to the reduced dataset
save_filtered_df(filtered_df=filtered_df, output_path=new_output_path)

In [None]:
# datetimeevents: source archive and destination path inside the ICU folders
file_name = 'datetimeevents.csv.gz'
new_file_path, new_output_path = (
    os.path.join(root, file_name) for root in (input_root_icu, output_root_icu)
)

In [None]:
# Keep only the rows that belong to the selected subjects ...
filtered_df = load_single_csv_gz(file_path=new_file_path, patient_ids=subject_ids)

# ... and write them to the reduced dataset
save_filtered_df(filtered_df=filtered_df, output_path=new_output_path)

In [None]:
# d_items: source archive and destination path inside the ICU folders
file_name = 'd_items.csv.gz'
new_file_path, new_output_path = (
    os.path.join(root, file_name) for root in (input_root_icu, output_root_icu)
)

In [None]:
# Try to restrict the file to the selected subjects.
# NOTE(review): d_items presumably has no subject_id column (it is treated as
# "not matchable" further down) — if so this yields an empty frame and the
# save below writes nothing; confirm.
filtered_df = load_single_csv_gz(file_path=new_file_path, patient_ids=subject_ids)

# Write the result to the reduced dataset (skipped automatically when empty)
save_filtered_df(filtered_df=filtered_df, output_path=new_output_path)

In [None]:
# Display the result of the last filtering step for inspection
filtered_df

## Load other data that cannot be matched by subject ID

In [None]:
# d_items lives in the ICU folder, so its paths are built from the ICU roots
# like every other ICU table (the names input_path / output_path used here
# before are not defined anywhere in the visible notebook).
file_name = 'd_items.csv.gz'
new_file_path = os.path.join(input_root_icu, file_name)
new_output_path = os.path.join(output_root_icu, file_name)

In [None]:
# This table cannot be matched to subjects: load_single_csv_gz returns an empty
# frame when the subject_id column is absent, so filtering would drop everything.
# Read the table unfiltered instead.
# NOTE(review): assumes d_items has no subject_id column and is small enough to
# load in one go — confirm against the dataset.
filtered_df = pd.read_csv(new_file_path, compression='gzip')

# Save the complete table alongside the reduced files
save_filtered_df(filtered_df, new_output_path)

In [None]:
# Display the loaded table for inspection
filtered_df

## Load Hosp data

In [None]:
# patients: source archive and destination path inside the hosp folders
file_name = 'patients.csv.gz'
new_file_path, new_output_path = (
    os.path.join(root, file_name) for root in (input_root_hosp, output_root_hosp)
)

In [None]:
# Restrict the file to the selected subjects and store the reduced copy
filtered_df = load_single_csv_gz(file_path=new_file_path, patient_ids=subject_ids)
save_filtered_df(filtered_df=filtered_df, output_path=new_output_path)

In [None]:
# admissions: source archive and destination path inside the hosp folders
file_name = 'admissions.csv.gz'
new_file_path, new_output_path = (
    os.path.join(root, file_name) for root in (input_root_hosp, output_root_hosp)
)

In [None]:
# Restrict the file to the selected subjects and store the reduced copy
filtered_df = load_single_csv_gz(file_path=new_file_path, patient_ids=subject_ids)
save_filtered_df(filtered_df=filtered_df, output_path=new_output_path)

In [None]:
# diagnoses_icd: resolve the hosp paths, keep the selected subjects, store the reduced copy
file_name = 'diagnoses_icd.csv.gz'
new_file_path, new_output_path = (
    os.path.join(root, file_name) for root in (input_root_hosp, output_root_hosp)
)

filtered_df = load_single_csv_gz(file_path=new_file_path, patient_ids=subject_ids)
save_filtered_df(filtered_df=filtered_df, output_path=new_output_path)

In [None]:
# labevents: resolve the hosp paths, keep the selected subjects, store the reduced copy
file_name = 'labevents.csv.gz'
new_file_path, new_output_path = (
    os.path.join(root, file_name) for root in (input_root_hosp, output_root_hosp)
)

filtered_df = load_single_csv_gz(file_path=new_file_path, patient_ids=subject_ids)
save_filtered_df(filtered_df=filtered_df, output_path=new_output_path)

In [None]:
# microbiologyevents: resolve the hosp paths, keep the selected subjects, store the reduced copy
file_name = 'microbiologyevents.csv.gz'
new_file_path, new_output_path = (
    os.path.join(root, file_name) for root in (input_root_hosp, output_root_hosp)
)

filtered_df = load_single_csv_gz(file_path=new_file_path, patient_ids=subject_ids)
save_filtered_df(filtered_df=filtered_df, output_path=new_output_path)

In [None]:
# prescriptions: resolve the hosp paths, keep the selected subjects, store the reduced copy
file_name = 'prescriptions.csv.gz'
new_file_path, new_output_path = (
    os.path.join(root, file_name) for root in (input_root_hosp, output_root_hosp)
)

filtered_df = load_single_csv_gz(file_path=new_file_path, patient_ids=subject_ids)
save_filtered_df(filtered_df=filtered_df, output_path=new_output_path)