# Load and inspect the raw MIMIC-IV data.
This notebook loads and inspects the raw MIMIC-IV data files. The loaded tables are then saved as uncompressed CSV files for further processing in subsequent notebooks.


In [2]:
import os
import pandas as pd

In [None]:
# Define the data directory
data_dir = "data"

# os.walk() ignores listing errors by default, so a missing directory would
# produce no output at all; report it explicitly instead.
if not os.path.isdir(data_dir):
    print(f"Data directory not found: {data_dir!r}")

# List all files in the data directory and its subdirectories
for root, dirs, files in os.walk(data_dir):
    # Sorting `dirs` in place fixes the order in which subdirectories are
    # visited, so the listing does not depend on filesystem ordering.
    dirs.sort()
    print(f"Directory: {root}")
    for filename in sorted(files):
        print(f"  - {filename}")

In [None]:
# Table name -> stem of its gzip-compressed CSV under <data_dir>/hosp
table_stems = {
    "admissions": "admissions",
    "patients": "patients",
    "diagnoses": "diagnoses_icd",
    "labevents": "labevents",
    "procedures": "procedures_icd",
    "prescriptions": "prescriptions",
}

# File paths for key tables, keyed by table name
file_paths = {
    table: f"{data_dir}/hosp/{stem}.csv.gz"
    for table, stem in table_stems.items()
}

# Read the tables in declaration order and bind each DataFrame to its own name
(
    admissions,
    patients,
    diagnoses,
    labevents,
    procedures,
    prescriptions,
) = (pd.read_csv(path) for path in file_paths.values())

## Summarize each table's structure and count missing values per column.

In [None]:
def inspect_data(name, df):
    """Print a short summary of a DataFrame.

    The summary consists of the dataset title, the shape, the column index,
    the first rows (``df.head()``), and the number of null values per column.

    Args:
        name: Label printed in the title line.
        df: The pandas DataFrame to summarize.
    """
    print(f"\n{name} Dataset:")
    print(f"Shape: {df.shape}")

    # Each remaining section is a heading line followed by the printed object.
    sections = (
        ("Columns:", df.columns),
        ("\nSample Data:", df.head()),
        ("\nMissing Values:", df.isna().sum()),
    )
    for heading, content in sections:
        print(heading)
        print(content)

# Inspect each dataset, in the same order the tables were loaded
for label, table in (
    ("Admissions", admissions),
    ("Patients", patients),
    ("Diagnoses", diagnoses),
    ("Lab Events", labevents),
    ("Procedures", procedures),
    ("Prescriptions", prescriptions),
):
    inspect_data(label, table)

## Save the loaded data as processed versions for further use.

In [None]:
# Save loaded tables as uncompressed .csv for faster access in future steps
output_dir = "processed_data"
os.makedirs(output_dir, exist_ok=True)

# Each table is written to <output_dir>/<stem>.csv without the index column
for stem, frame in (
    ("admissions", admissions),
    ("patients", patients),
    ("diagnoses", diagnoses),
    ("labevents", labevents),
    ("procedures", procedures),
    ("prescriptions", prescriptions),
):
    frame.to_csv(f"{output_dir}/{stem}.csv", index=False)

print("Data saved to processed_data directory.")