In [1]:
import pandas as pd
import os
import random

In [2]:
# Source and destination folders for the ICU tables (hardcoded; adjust to your local checkout)
input_path = '../data/real_world_data/physionet.org/files/mimiciv/3.0/icu/'
output_path = '../data/real_world_data/physionet.org_small/files/mimiciv/3.0/icu/'  # the reduced copy is written here

# icustays is the table the subject IDs are read from; build its input and output locations
file_name = 'icustays.csv.gz'
file_path, file_path_out = (
    os.path.join(input_path, file_name),
    os.path.join(output_path, file_name),
)

In [None]:
# Step 1: Load unique subject IDs
def load_subject_ids(file_path, subject_id_col='subject_id', chunksize=100000):
    """Collect the distinct values of one ID column from a gzip-compressed CSV.

    The file is read in chunks so that it never has to fit in memory at once,
    and only the requested column is parsed.

    Args:
        file_path: Path of the ``.csv.gz`` file to read.
        subject_id_col: Name of the column holding the IDs.
        chunksize: Number of rows per chunk handed to ``pd.read_csv``.

    Returns:
        A list of the distinct IDs (in no particular order), or ``None`` if the
        file could not be read or the column is missing. In that case the error
        is printed rather than raised, so callers should check for a falsy result.
    """
    subject_ids = set()  # A set de-duplicates IDs that appear in several chunks

    try:
        # Only the ID column is needed, so do not parse the other columns.
        # The context manager closes the file even if a chunk fails to parse.
        with pd.read_csv(
            file_path,
            compression='gzip',
            usecols=[subject_id_col],
            chunksize=chunksize,
        ) as reader:
            for chunk in reader:
                subject_ids.update(chunk[subject_id_col].unique())

        return list(subject_ids)  # A list is easier to sample from than a set
    except Exception as e:
        # Deliberately best-effort: report the problem and let the caller decide.
        print(f"Error loading subject IDs from {file_path}: {e}")
        return None

# Step 2: Select a random percentage of subject IDs
def select_random_subject_ids(subject_ids, percentage=5, seed=None):
    """Draw a random sample (without replacement) of the given subject IDs.

    Args:
        subject_ids: Collection of IDs to sample from. Any iterable is accepted;
            it is copied into a list because ``random.sample`` requires a sequence.
        percentage: Share of the IDs to keep, in percent (default 5). The sample
            size is rounded down to a whole number of IDs.
        seed: Optional seed. When given, a private ``random.Random(seed)`` is used
            so the same IDs are drawn on every call; when ``None`` the module-level
            generator is used.

    Returns:
        A list with ``floor(len(subject_ids) * percentage / 100)`` distinct IDs.

    Raises:
        ValueError: If the resulting sample size is negative or larger than the
            number of IDs (raised by ``random.sample``).
    """
    pool = list(subject_ids)

    # Multiply before dividing: 100 * (29 / 100) evaluates to 28.999999999999996,
    # which would truncate to 28 instead of 29.
    sample_size = int(len(pool) * percentage / 100)

    rng = random if seed is None else random.Random(seed)
    return rng.sample(pool, sample_size)

def load_single_csv_gz(file_path, patient_ids, patient_id_col='subject_id', chunksize=100000, return_all=True):
    """Read a gzip-compressed CSV in chunks and keep the rows belonging to ``patient_ids``.

    Args:
        file_path: Path of the ``.csv.gz`` file to read.
        patient_ids: List-like of ID values to keep (passed to ``Series.isin``).
        patient_id_col: Name of the ID column to filter on.
        chunksize: Number of rows per chunk handed to ``pd.read_csv``.
        return_all: When True (the default) every chunk is also retained and the
            complete table is returned as the second element. This keeps the whole
            file in memory; pass False for tables that are too large for that.

    Returns:
        A tuple ``(filtered_df, all_data_df)``:

        * ``filtered_df`` holds the rows whose ``patient_id_col`` is in
          ``patient_ids``; it is an empty DataFrame when the file has no such column.
        * ``all_data_df`` holds the complete table, or an empty DataFrame when
          ``return_all`` is False.

        ``(None, None)`` is returned if reading fails; the error is printed
        rather than raised.
    """
    filtered_chunks = []
    all_chunks = []  # Only filled when return_all is True
    patient_id_found = False  # Becomes True once a chunk contains patient_id_col
    preview_shown = False

    try:
        # The context manager closes the file even if a chunk fails to parse.
        with pd.read_csv(file_path, compression='gzip', chunksize=chunksize) as reader:
            for chunk in reader:
                # Preview the first rows once; printing this for every chunk
                # floods the notebook output on files with many chunks.
                if not preview_shown:
                    print(chunk.head())
                    preview_shown = True

                if return_all:
                    all_chunks.append(chunk)

                if patient_id_col in chunk.columns:
                    patient_id_found = True
                    filtered_chunks.append(chunk[chunk[patient_id_col].isin(patient_ids)])

        # Always bind all_data_df, also when nothing was collected.
        if all_chunks:
            all_data_df = pd.concat(all_chunks, ignore_index=True)
        else:
            all_data_df = pd.DataFrame()

        if patient_id_found and filtered_chunks:
            return pd.concat(filtered_chunks, ignore_index=True), all_data_df

        # No ID column to filter on: empty filtered frame, but still hand back the table.
        return pd.DataFrame(), all_data_df

    except Exception as e:
        # Deliberately best-effort: report the problem and let the caller decide.
        print(f"Error processing {file_path}: {e}")
        return None, None

    
# Step 4: Save the filtered DataFrame to a .csv.gz file
def save_filtered_df(filtered_df, output_path):
    """Write a DataFrame to a gzip-compressed CSV, creating the target folder if needed.

    Nothing is written when ``filtered_df`` is ``None`` or empty; a message is
    printed in either case.

    Args:
        filtered_df: The pandas DataFrame to save (may be ``None``).
        output_path: Destination path of the ``.csv.gz`` file.
    """
    if filtered_df is None or filtered_df.empty:
        print(f"No data to save for {output_path}")
        return

    # os.path.dirname returns '' for a bare file name, and os.makedirs('') raises,
    # so only create the folder when the path actually has a directory part.
    target_dir = os.path.dirname(output_path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    filtered_df.to_csv(output_path, compression='gzip', index=False)
    print(f"Filtered data saved to {output_path}")
        
        
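# Step 3 (earlier version, kept commented out): load a .csv.gz file and filter by the selected patient IDs.
# Superseded by the active load_single_csv_gz above, which returns a (filtered, all_data) tuple instead of a single DataFrame.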
# def load_single_csv_gz(file_path, patient_ids, patient_id_col='subject_id', chunksize=100000):
#    """Loads a single .csv.gz file in chunks and filters by patient_ids."""
#    filtered_data = []
#    
#    try:
#        # Read the file in chunks and filter by patient_ids
#        for chunk in pd.read_csv(file_path, compression='gzip', chunksize=chunksize):
#            filtered_chunk = chunk[chunk[patient_id_col].isin(patient_ids)]
#            filtered_data.append(filtered_chunk)
#        
#        if filtered_data:
#            # Concatenate all filtered chunks into one DataFrame
#            return pd.concat(filtered_data, ignore_index=True)
#        else:
#            return pd.DataFrame()  # Return an empty DataFrame if no matches found
#    except Exception as e:
#        print(f"Error processing {file_path}: {e}")
#        return None

## Step 4 (earlier version, kept commented out; the active save_filtered_df is defined above): save the filtered DataFrame to a .csv.gz file
#def save_filtered_df(filtered_df, output_path):
#    """Saves a pandas DataFrame to a .csv.gz file."""
#    if filtered_df is not None and not filtered_df.empty:
#        
#        # Ensure the target folder exists
#       os.makedirs(os.path.dirname(output_path), exist_ok=True)
#        
#        filtered_df.to_csv(output_path, compression='gzip', index=False)
#        print(f"Filtered data saved to {output_path}")
#    else:
#        print(f"No data to save for {output_path}")



In [None]:
# Collect every distinct subject ID found in the icustays file configured above
subject_ids = load_subject_ids(file_path=file_path)

### Randomly select a subset of subject IDs to use

In [None]:
# Step 1: Load subject IDs from the file
subject_ids = load_subject_ids(file_path)

if subject_ids:
    # Fix the seed so that re-running this cell draws the same subset of subjects.
    SAMPLE_SEED = 42
    random.seed(SAMPLE_SEED)

    # Step 2: Select 5% of the subject IDs randomly
    selected_subject_ids = select_random_subject_ids(subject_ids, percentage=5)
    print(f"Selected {len(selected_subject_ids)} subject IDs from {len(subject_ids)} total.")

    # Step 3: Filter the icustays file by these subject IDs.
    # load_single_csv_gz returns a (filtered, all_data) tuple; only the filtered
    # frame is saved, and passing the tuple itself to save_filtered_df would fail
    # on its `.empty` check.
    filtered_df = load_single_csv_gz(file_path, selected_subject_ids)[0]

    # Step 4: Save the filtered data.
    # Use the path computed in the configuration cell: file_name is reassigned by
    # later cells, so rebuilding the path from it here is wrong on a re-run.
    output_file = file_path_out
    save_filtered_df(filtered_df, output_file)
else:
    print(f"No subject IDs found in {file_path}")

### Alternatively: Load the already chosen subject ids

In [None]:
# Read the subject IDs back from the reduced icustays file written earlier,
# instead of drawing a new random sample
subject_ids = load_subject_ids(file_path=file_path_out)

In [None]:
# Every ID found in the reduced file counts as selected (same list object, no copy)
selected_subject_ids = subject_ids

## Extract the smaller dataset versions for all files

In [None]:
# Table to extract next, with its source and destination locations
file_name = 'inputevents.csv.gz'
new_file_path, new_output_path = (
    os.path.join(input_path, file_name),
    os.path.join(output_path, file_name),
)

In [None]:
# Load and filter the new file by the selected subject IDs.
# load_single_csv_gz returns a (filtered, all_data) tuple, so take element 0;
# passing the tuple itself to save_filtered_df fails on its `.empty` check.
filtered_df = load_single_csv_gz(new_file_path, selected_subject_ids)[0]

# Save the filtered data
save_filtered_df(filtered_df, new_output_path)

In [None]:
# Table to extract next, with its source and destination locations
file_name = 'ingredientevents.csv.gz'
new_file_path, new_output_path = (
    os.path.join(input_path, file_name),
    os.path.join(output_path, file_name),
)

In [None]:
# Load and filter the new file by the selected subject IDs.
# load_single_csv_gz returns a (filtered, all_data) tuple, so take element 0;
# passing the tuple itself to save_filtered_df fails on its `.empty` check.
filtered_df = load_single_csv_gz(new_file_path, selected_subject_ids)[0]

# Save the filtered data
save_filtered_df(filtered_df, new_output_path)

In [None]:
# Table to extract next, with its source and destination locations
file_name = 'outputevents.csv.gz'
new_file_path, new_output_path = (
    os.path.join(input_path, file_name),
    os.path.join(output_path, file_name),
)

In [None]:
# Load and filter the new file by the selected subject IDs.
# load_single_csv_gz returns a (filtered, all_data) tuple, so take element 0;
# passing the tuple itself to save_filtered_df fails on its `.empty` check.
filtered_df = load_single_csv_gz(new_file_path, selected_subject_ids)[0]

# Save the filtered data
save_filtered_df(filtered_df, new_output_path)

In [None]:
# Table to extract next, with its source and destination locations
file_name = 'procedureevents.csv.gz'
new_file_path, new_output_path = (
    os.path.join(input_path, file_name),
    os.path.join(output_path, file_name),
)

In [None]:
# Load and filter the new file by the selected subject IDs.
# load_single_csv_gz returns a (filtered, all_data) tuple, so take element 0;
# passing the tuple itself to save_filtered_df fails on its `.empty` check.
filtered_df = load_single_csv_gz(new_file_path, selected_subject_ids)[0]

# Save the filtered data
save_filtered_df(filtered_df, new_output_path)

In [None]:
# Table to extract next, with its source and destination locations
file_name = 'chartevents.csv.gz'
new_file_path, new_output_path = (
    os.path.join(input_path, file_name),
    os.path.join(output_path, file_name),
)

In [None]:
# Load and filter the new file by the selected subject IDs.
# load_single_csv_gz returns a (filtered, all_data) tuple, so take element 0;
# passing the tuple itself to save_filtered_df fails on its `.empty` check.
filtered_df = load_single_csv_gz(new_file_path, selected_subject_ids)[0]

# Save the filtered data
save_filtered_df(filtered_df, new_output_path)

In [None]:
# Table to extract next, with its source and destination locations
file_name = 'datetimeevents.csv.gz'
new_file_path, new_output_path = (
    os.path.join(input_path, file_name),
    os.path.join(output_path, file_name),
)

In [None]:
# Load and filter the new file by the selected subject IDs.
# load_single_csv_gz returns a (filtered, all_data) tuple, so take element 0;
# passing the tuple itself to save_filtered_df fails on its `.empty` check.
filtered_df = load_single_csv_gz(new_file_path, selected_subject_ids)[0]

# Save the filtered data
save_filtered_df(filtered_df, new_output_path)

In [None]:
# Table to extract next, with its source and destination locations
file_name = 'd_items.csv.gz'
new_file_path, new_output_path = (
    os.path.join(input_path, file_name),
    os.path.join(output_path, file_name),
)

In [None]:
# Load and filter the new file by the selected subject IDs.
# load_single_csv_gz returns a (filtered, all_data) tuple, so take element 0;
# passing the tuple itself to save_filtered_df fails on its `.empty` check.
# NOTE(review): if this table has no subject_id column, element 0 is an empty
# DataFrame and nothing is saved; the complete table is element 1 of the tuple.
# Confirm which of the two should be written for d_items.
filtered_df = load_single_csv_gz(new_file_path, selected_subject_ids)[0]

# Save the filtered data
save_filtered_df(filtered_df, new_output_path)

## Load other data that cannot be matched to subject IDs

In [None]:
# Source and destination locations for d_items
# NOTE(review): the same three values are already set for d_items in an earlier cell.
file_name = 'd_items.csv.gz'
new_file_path, new_output_path = (
    os.path.join(input_path, file_name),
    os.path.join(output_path, file_name),
)

In [None]:
# Despite its name, filtered_df holds the whole (filtered, all_data) tuple here,
# because the result of load_single_csv_gz is not indexed.
filtered_df = load_single_csv_gz(file_path=new_file_path, patient_ids=selected_subject_ids)

# Saving is disabled for this table:
# save_filtered_df(filtered_df, new_output_path)

In [None]:
# Display the (filtered, all_data) tuple assigned in the previous cell
filtered_df

## Load Hosp data

In [None]:
# Source and destination folders for the hosp tables (hardcoded; adjust to your local checkout)
output_path_hosp = '../data/real_world_data/physionet.org_small/files/mimiciv/3.0/hosp/'  # the reduced copy is written here
input_path_hosp = '../data/real_world_data/physionet.org/files/mimiciv/3.0/hosp/'
# file_path = os.path.join(input_path, file_name)


In [None]:
# Hosp table to extract next, with its source and destination locations
file_name = 'patients.csv.gz'
new_file_path, new_output_path = (
    os.path.join(input_path_hosp, file_name),
    os.path.join(output_path_hosp, file_name),
)

In [None]:
# Element 0 of the returned tuple is the frame restricted to the selected subjects
filtered_df = load_single_csv_gz(file_path=new_file_path, patient_ids=selected_subject_ids)[0]
save_filtered_df(filtered_df=filtered_df, output_path=new_output_path)

In [None]:
# Hosp table to extract next, with its source and destination locations
file_name = 'admissions.csv.gz'
new_file_path, new_output_path = (
    os.path.join(input_path_hosp, file_name),
    os.path.join(output_path_hosp, file_name),
)

In [None]:
# Element 0 of the returned tuple is the frame restricted to the selected subjects
filtered_df = load_single_csv_gz(file_path=new_file_path, patient_ids=selected_subject_ids)[0]
save_filtered_df(filtered_df=filtered_df, output_path=new_output_path)

In [None]:
# Hosp table to extract, with its source and destination locations
file_name = 'diagnoses_icd.csv.gz'
new_file_path, new_output_path = (
    os.path.join(input_path_hosp, file_name),
    os.path.join(output_path_hosp, file_name),
)

# Element 0 of the returned tuple is the frame restricted to the selected subjects
filtered_df = load_single_csv_gz(file_path=new_file_path, patient_ids=selected_subject_ids)[0]
save_filtered_df(filtered_df=filtered_df, output_path=new_output_path)

In [None]:
# NOTE(review): this cell repeats the diagnoses_icd extraction of the previous cell
# and rewrites the same output file. Confirm whether a different table was intended.
file_name = 'diagnoses_icd.csv.gz'
new_file_path, new_output_path = (
    os.path.join(input_path_hosp, file_name),
    os.path.join(output_path_hosp, file_name),
)

# Element 0 of the returned tuple is the frame restricted to the selected subjects
filtered_df = load_single_csv_gz(file_path=new_file_path, patient_ids=selected_subject_ids)[0]
save_filtered_df(filtered_df=filtered_df, output_path=new_output_path)

In [None]:
# Hosp table to extract, with its source and destination locations
file_name = 'labevents.csv.gz'
new_file_path, new_output_path = (
    os.path.join(input_path_hosp, file_name),
    os.path.join(output_path_hosp, file_name),
)

# Element 0 of the returned tuple is the frame restricted to the selected subjects
filtered_df = load_single_csv_gz(file_path=new_file_path, patient_ids=selected_subject_ids)[0]
save_filtered_df(filtered_df=filtered_df, output_path=new_output_path)

In [None]:
# Hosp table to extract, with its source and destination locations
file_name = 'microbiologyevents.csv.gz'
new_file_path, new_output_path = (
    os.path.join(input_path_hosp, file_name),
    os.path.join(output_path_hosp, file_name),
)

# Element 0 of the returned tuple is the frame restricted to the selected subjects
filtered_df = load_single_csv_gz(file_path=new_file_path, patient_ids=selected_subject_ids)[0]
save_filtered_df(filtered_df=filtered_df, output_path=new_output_path)

In [None]:
# Hosp table to extract, with its source and destination locations
file_name = 'prescriptions.csv.gz'
new_file_path, new_output_path = (
    os.path.join(input_path_hosp, file_name),
    os.path.join(output_path_hosp, file_name),
)

# Element 0 of the returned tuple is the frame restricted to the selected subjects
filtered_df = load_single_csv_gz(file_path=new_file_path, patient_ids=selected_subject_ids)[0]
save_filtered_df(filtered_df=filtered_df, output_path=new_output_path)