In [1]:
# MIMIC IV Nth attempt 
import pandas as pd
import os

# Step 1: Define Data Path for Local Execution
# Modify this path to the location of your MIMIC-IV data on your hard drive
data_path = "D:/mimic-iv-3.1"

In [3]:
import pandas as pd
import os
import psutil
import dask.dataframe as dd

# Step 1: Define Data Path for Local Execution
data_path = "D:/mimic-iv-3.1"

# Step 2: Check memory usage before loading data
def check_memory():
    print(f"Memory Usage: {psutil.virtual_memory().percent}%")

check_memory()

Memory Usage: 30.4%


In [5]:
# Step 3: Load MIMIC-IV Tables with optimized memory handling
# Load small tables directly
admissions = pd.read_csv(os.path.join(data_path, "hosp/admissions.csv.gz"), 
                          usecols=["subject_id", "hadm_id", "admittime", "dischtime"], 
                          low_memory=False)

patients = pd.read_csv(os.path.join(data_path, "hosp/patients.csv.gz"), 
                        usecols=["subject_id", "gender", "anchor_age", "anchor_year", "dod"], 
                        low_memory=False)

icustays = pd.read_csv(os.path.join(data_path, "icu/icustays.csv.gz"), 
                        usecols=["subject_id", "hadm_id", "stay_id", "intime", "outtime"], 
                        low_memory=False)

diagnoses = pd.read_csv(os.path.join(data_path, "hosp/diagnoses_icd.csv.gz"), 
                         usecols=["subject_id", "hadm_id", "icd_code"], 
                         low_memory=False)

In [7]:
# Load large tables in chunks
chunk_size = 250000  # Adjust based on system memory

prescriptions_iter = pd.read_csv(os.path.join(data_path, "hosp/prescriptions.csv.gz"), 
                                  usecols=["subject_id", "hadm_id", "drug"], 
                                  dtype=str, 
                                  low_memory=False, 
                                  chunksize=chunk_size)
prescriptions = pd.concat(prescriptions_iter)


In [9]:
chunk_size = 250000  # Adjust based on system memory
lab_events_iter = pd.read_csv(os.path.join(data_path, "hosp/labevents.csv.gz"), 
                               usecols=["subject_id", "hadm_id", "itemid", "valuenum"], 
                               low_memory=False, 
                               chunksize=chunk_size)
lab_events = pd.concat(lab_events_iter)

In [11]:
chunk_size = 250000  # Adjust based on system memory
chartevents_iter = pd.read_csv(os.path.join(data_path, "icu/chartevents.csv.gz"), 
                                usecols=["subject_id", "hadm_id", "itemid", "valuenum"], 
                                low_memory=False, 
                                chunksize=chunk_size)
chartevents = pd.concat(chartevents_iter)

In [12]:
# Step 4: Check memory usage after loading data
check_memory()

# Step 5: Save Processed Data for ML Modeling
output_path = "D:/MIMIC-IV-Data-Pipeline/processed_data"
os.makedirs(output_path, exist_ok=True)

admissions.to_csv(os.path.join(output_path, "admissions_processed.csv.gz"), index=False, compression='gzip')
patients.to_csv(os.path.join(output_path, "patients_processed.csv.gz"), index=False, compression='gzip')
icustays.to_csv(os.path.join(output_path, "icustays_processed.csv.gz"), index=False, compression='gzip')
diagnoses.to_csv(os.path.join(output_path, "diagnoses_processed.csv.gz"), index=False, compression='gzip')
prescriptions.to_csv(os.path.join(output_path, "prescriptions_processed.csv.gz"), index=False, compression='gzip')
lab_events.to_csv(os.path.join(output_path, "lab_events_processed.csv.gz"), index=False, compression='gzip')
chartevents.to_csv(os.path.join(output_path, "chartevents_processed.csv.gz"), index=False, compression='gzip')

print("✅ Processed dataset saved successfully!")

Memory Usage: 59.8%
✅ Processed dataset saved successfully!


In [15]:
# Load item labels to be merged later
d_items = pd.read_csv(os.path.join(data_path, "icu/d_items.csv.gz"), usecols=["itemid", "label"], low_memory=False)
d_labitems = pd.read_csv(os.path.join(data_path, "hosp/d_labitems.csv.gz"), usecols=["itemid", "label"], low_memory=False)
d_procedures = pd.read_csv(os.path.join(data_path, "hosp/d_icd_procedures.csv.gz"), usecols=["icd_code", "long_title"], low_memory=False)

# Step 4a: Check memory usage after loading data
check_memory()

Memory Usage: 74.6%


In [17]:
# Step 5a: Merge Data for Delirium Prediction
core_data = pd.merge(admissions, patients, on="subject_id", how="inner")
core_data = pd.merge(core_data, icustays, on=["subject_id", "hadm_id"], how="left")
core_data = pd.merge(core_data, diagnoses, on=["subject_id", "hadm_id"], how="left")

In [19]:
# Identify Delirium Cases Using ICD Codes
delirium_icd_codes = ['F05', '293.0', '293.1']  # ICD-10 & ICD-9 codes
core_data["delirium"] = core_data["icd_code"].isin(delirium_icd_codes).astype(int)

# Comorbidity Identification
comorbidity_count = diagnoses.groupby(["subject_id", "hadm_id"])['icd_code'].nunique().reset_index()
comorbidity_count.rename(columns={'icd_code': 'num_comorbidities'}, inplace=True)
core_data = pd.merge(core_data, comorbidity_count, on=["subject_id", "hadm_id"], how="left")

In [21]:
# Merge ICU procedures
diagnoses_labeled = pd.merge(diagnoses, d_procedures, on="icd_code", how="left")
primary_procedure = diagnoses_labeled.groupby(["subject_id", "hadm_id"])['long_title'].first().reset_index()
primary_procedure.rename(columns={'long_title': 'primary_procedure'}, inplace=True)
core_data = pd.merge(core_data, primary_procedure, on=["subject_id", "hadm_id"], how="left")

# Merge lab events with labels
lab_events_labeled = pd.merge(lab_events, d_labitems, on="itemid", how="left")
lab_events_median = lab_events_labeled.groupby(["subject_id", "hadm_id", "label"])["valuenum"].median().reset_index()
lab_events_pivot = lab_events_median.pivot(index=["subject_id", "hadm_id"], columns="label", values="valuenum").reset_index()
core_data = pd.merge(core_data, lab_events_pivot, on=["subject_id", "hadm_id"], how="left")

✅ Delirium prediction dataset saved successfully!


In [30]:

# Step 6: Save Processed Data for ML Modeling
output_path = "D:/MIMIC-IV-Data-Pipeline/processed_data"
os.makedirs(output_path, exist_ok=True)
core_data.to_csv(os.path.join(output_path, "delirium_prediction_data.csv.gz"), index=False, compression='gzip')

print("✅ Delirium prediction dataset saved successfully!")

✅ Delirium prediction dataset saved successfully!


In [5]:
import pandas as pd

file_path = "D:/MIMIC-IV-Data-Pipeline/processed_data/delirium_prediction_data.csv.gz"

df_sample = pd.read_csv(file_path, compression="gzip", nrows=5)  # Load first 5 rows to inspect
print(df_sample.dtypes)


subject_id           int64
hadm_id              int64
admittime           object
dischtime           object
gender              object
                    ...   
pH                 float64
pO2                float64
proBNP, Pleural    float64
tacroFK            float64
wbcp               float64
Length: 728, dtype: object


In [None]:
#EDA redone 
import pandas as pd

file_path = "D:/MIMIC-IV-Data-Pipeline/processed_data/delirium_prediction_data.csv.gz"
print("File Exists:", os.path.exists(file_path))

# Load dataset correctly
df = pd.read_csv(file_path, compression="gzip")

print("✅ File loaded successfully! Shape:", df.shape)
print("Columns:", df.columns)

File Exists: True


In [11]:
#EDA
#Install necessary packages (if not already installed)
!pip install pandas matplotlib seaborn

# Import libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
 



In [9]:
df_sample = pd.read_csv(file_path, compression="gzip", nrows=5)
print(df_sample.head())


   subject_id   hadm_id            admittime            dischtime gender  \
0    10000032  22595853  2180-05-06 22:23:00  2180-05-07 17:15:00      F   
1    10000032  22595853  2180-05-06 22:23:00  2180-05-07 17:15:00      F   
2    10000032  22595853  2180-05-06 22:23:00  2180-05-07 17:15:00      F   
3    10000032  22595853  2180-05-06 22:23:00  2180-05-07 17:15:00      F   
4    10000032  22595853  2180-05-06 22:23:00  2180-05-07 17:15:00      F   

   anchor_age  anchor_year         dod  stay_id  intime  ...  \
0          52         2180  2180-09-09      NaN     NaN  ...   
1          52         2180  2180-09-09      NaN     NaN  ...   
2          52         2180  2180-09-09      NaN     NaN  ...   
3          52         2180  2180-09-09      NaN     NaN  ...   
4          52         2180  2180-09-09      NaN     NaN  ...   

   dRVVT - Confirmation  dRVVT - Normalized Ratio  dRVVT - Screen  eAG pCO2  \
0                   NaN                       NaN             NaN  NaN  NaN   


In [15]:
# Define the dataset path  
file_path = "D:/MIMIC-IV-Data-Pipeline/processed_data/delirium_prediction_data.csv.gz"

# Load the dataset
df = pd.read_csv(file_path)  #gives memory error

  df = pd.read_csv(file_path)


MemoryError: Unable to allocate 35.2 GiB for an array with shape (715, 6599888) and data type float64

In [None]:
df = pd.read_csv(
    file_path, compression="gzip", usecols=selected_columns, low_memory=False,
    dtype={"delirium": "int8", "anchor_age": "int16", "num_comorbidities": "int8"}
)
chunk_size = 20000  # Adjust based on system capacity
df_chunks = pd.read_csv(file_path, compression="gzip", usecols=selected_columns, chunksize=chunk_size)

df = pd.concat(df_chunks)  # Combine chunks after processing


In [21]:
import pandas as pd

file_path = "D:/MIMIC-IV-Data-Pipeline/processed_data/delirium_prediction_data.csv.gz"

df_sample = pd.read_csv(file_path, compression="gzip", nrows=5)  # Load only first 5 rows
print(df_sample.columns)


Index(['subject_id', 'hadm_id', 'admittime', 'dischtime', 'gender',
       'anchor_age', 'anchor_year', 'dod', 'stay_id', 'intime',
       ...
       'dRVVT - Confirmation', 'dRVVT - Normalized Ratio', 'dRVVT - Screen',
       'eAG', 'pCO2', 'pH', 'pO2', 'proBNP, Pleural', 'tacroFK', 'wbcp'],
      dtype='object', length=728)


In [13]:
# 🔹 Basic Data Overview
print("✅ Dataset Loaded Successfully!")
print(f"Shape of dataset: {df.shape}")
print("\nMissing Values:")
print(df.isnull().sum())
print("\nData Types:")
print(df.dtypes)
print("\nUnique Values Per Column:")
print(df.nunique())

# 🔹 Summary Statistics
print("\nSummary Statistics:")
print(df.describe())

# 🔹 Class Distribution (Delirium vs. Non-Delirium)
plt.figure(figsize=(6,4))
sns.countplot(x="delirium", data=df, palette="coolwarm")
plt.title("Distribution of Delirium Cases")
plt.xlabel("Delirium (0 = No, 1 = Yes)")
plt.ylabel("Count")
plt.show()


✅ Dataset Loaded Successfully!


NameError: name 'df' is not defined

In [None]:
# 🔹 Histograms for Numeric Features
df.hist(figsize=(12, 10), bins=30)
plt.suptitle("Feature Distributions", fontsize=16)
plt.show()

# 🔹 Boxplots for Key Clinical Features
clinical_features = ["anchor_age", "num_comorbidities"]
plt.figure(figsize=(12, 6))
for i, feature in enumerate(clinical_features, 1):
    plt.subplot(1, len(clinical_features), i)
    sns.boxplot(x="delirium", y=feature, data=df)
    plt.title(f"{feature} vs Delirium")

plt.tight_layout()
plt.show()

# 🔹 Feature Correlation Heatmap
plt.figure(figsize=(12, 8))
correlation_matrix = df.corr()
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap="coolwarm", linewidths=0.5)
plt.title("Feature Correlation Heatmap")
plt.show()

print("✅ EDA Completed Successfully!")

In [None]:
# DO NOT RUN THIS CELL YET 
#Import libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Define dataset path (Update if necessary)
file_path = "D:/MIMIC-IV-Data-Pipeline/processed_data/delirium_prediction_data.csv.gz"

# Load dataset
df = pd.read_csv(file_path)

# Create output directory
output_dir = "D:/MIMIC-IV-Data-Pipeline/EDA_Results"
os.makedirs(output_dir, exist_ok=True)

# ✅ Save Basic Data Overview
eda_summary = {
    "Shape": df.shape,
    "Missing Values": df.isnull().sum(),
    "Data Types": df.dtypes,
    "Unique Values": df.nunique()
}
eda_summary_df = pd.DataFrame(eda_summary)
eda_summary_df.to_csv(os.path.join(output_dir, "eda_summary.csv"))

# ✅ Save Summary Statistics
df.describe().to_csv(os.path.join(output_dir, "summary_statistics.csv"))

# ✅ Save Class Distribution
class_distribution = df["delirium"].value_counts(normalize=True)
class_distribution.to_csv(os.path.join(output_dir, "class_distribution.csv"))

# ✅ Save Feature Correlation Matrix
correlation_matrix = df.corr()
correlation_matrix.to_csv(os.path.join(output_dir, "feature_correlations.csv"))

# 🔹 Save Plots
# Class Distribution
plt.figure(figsize=(6,4))
sns.countplot(x="delirium", data=df, palette="coolwarm")
plt.title("Distribution of Delirium Cases")
plt.xlabel("Delirium (0 = No, 1 = Yes)")
plt.ylabel("Count")
plt.savefig(os.path.join(output_dir, "class_distribution.png"))
plt.close()

# Histograms for Numeric Features
df.hist(figsize=(12, 10), bins=30)
plt.suptitle("Feature Distributions", fontsize=16)
plt.savefig(os.path.join(output_dir, "feature_histograms.png"))
plt.close()

# Boxplots for Key Clinical Features
clinical_features = ["anchor_age", "num_comorbidities"]
plt.figure(figsize=(12, 6))
for i, feature in enumerate(clinical_features, 1):
    plt.subplot(1, len(clinical_features), i)
    sns.boxplot(x="delirium", y=feature, data=df)
    plt.title(f"{feature} vs Delirium")
plt.tight_layout()
plt.savefig(os.path.join(output_dir, "boxplots.png"))
plt.close()

# Correlation Heatmap
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap="coolwarm", linewidths=0.5)
plt.title("Feature Correlation Heatmap")
plt.savefig(os.path.join(output_dir, "correlation_heatmap.png"))
plt.close()

print(f"✅ EDA completed! Results saved in: {output_dir}")
