# Data Preprocessing

In [None]:
import os
import sys

# Folder that holds the input pickles read below (df.pkl, lab_events_grouped.pkl)
data_folder = "./data"

# Add the directory three levels above the working directory to sys.path.
# This has to happen before the `examples...` import below so that it can be resolved.
project_root = os.path.abspath(os.path.join(os.getcwd(), "../../.."))
sys.path.append(project_root)

from examples.minv.mimic.utils.preprocess_mimic_data import preprocess_data, extract_features_and_split

input_path = os.path.join(data_folder, "df.pkl")
lab_events_path = os.path.join(data_folder, "lab_events_grouped.pkl")

# List of all possible continuous columns in MIMIC
# Note: Not all of these columns are guaranteed to be present in the dataset after feature selection
continuous_col_names = ['length_of_stay', 'num_procedures', 'num_medications', 'BMI',
       'BMI (kg/m2)', 'Height', 'Height (Inches)', 'Weight', 'Weight (Lbs)',
       'eGFR', 'systolic', 'diastolic']

# Creates processed_data.pkl; the returned path is handed to the feature-selection step below
processed_path = preprocess_data(input_path, lab_events_path, continuous_col_names, mean_imputation=True)

# Feature selection
desired_num_unique_classes = 35
# Creates private_df.pkl and public_df.pkl
extract_features_and_split(processed_path, desired_num_unique_classes, print_classification_reports=False)


# Target Model Training

In [None]:
import os
import sys
import yaml
import warnings
from pytorch_tabular import TabularModel
from pytorch_tabular.models import CategoryEmbeddingModelConfig, TabNetModelConfig, GANDALFConfig
from pytorch_tabular.config import DataConfig, OptimizerConfig, TrainerConfig
import pandas as pd

# Suppress warnings, pytorch_tabular is very verbose
warnings.filterwarnings("ignore")

# Redefine variables in case upper cell is not run
# Folder that holds the dataset files
data_folder = "./data"

# Directory three levels above the working directory, appended to sys.path
project_root = os.path.abspath(os.path.join(os.getcwd(), "../../.."))
sys.path.append(project_root)

# List of all possible continuous columns in MIMIC
# (filtered further down to the columns actually present in the loaded dataframe)
continuous_col_names = ['length_of_stay', 'num_procedures', 'num_medications', 'BMI',
       'BMI (kg/m2)', 'Height', 'Height (Inches)', 'Weight', 'Weight (Lbs)',
       'eGFR', 'systolic', 'diastolic']


# Read the training settings and the audit settings from their YAML files
with open('train_config.yaml', 'r') as train_cfg_file:
    train_config = yaml.safe_load(train_cfg_file)

with open('audit.yaml', 'r') as audit_cfg_file:
    audit_config = yaml.safe_load(audit_cfg_file)

# Number of classes configured for the PLG-MI attack entry of the audit
plgmi_settings = audit_config["audit"]["attack_list"]["plgmi"]
num_classes = plgmi_settings["num_classes"]

# Locate the private dataframe inside the configured data directory
data_section = train_config["data"]
path = os.path.join(os.getcwd(), data_section["data_dir"])
data_dir = data_section["data_dir"] + "/private_df.pkl"

# NOTE: despite its name, `data_dir` is the path of the pickle file itself
df = pd.read_pickle(data_dir)

# Build df_train / df_test from the private dataframe.
#
# df_train is made of one guaranteed row per identity plus a random top-up so that
# its size is f_train * len(df) rows in total; df_test is everything else.

# Reset index to have a clean, sequential integer index
df = df.reset_index(drop=True)

# Keep only the continuous columns that are actually present in the dataframe
continuous_col_names = [col for col in continuous_col_names if col in df.columns]
# Categorical column names are all columns that are not continuous
categorical_col_names = [col for col in df.columns if col not in continuous_col_names]
# Remove the target column
categorical_col_names.remove("identity")

# Ensure df_train contains at least one sample for every class:
df_train_min = df.groupby("identity").head(1)  # first row of each identity, on the clean index
remaining_df = df.drop(df_train_min.index)      # rows still available for random sampling

# Determine how many additional rows are needed to reach the requested training share.
# f_train is a fraction of the WHOLE dataframe, so the top-up is computed as a row count
# rather than passed as `frac` to remaining_df.sample (which would be relative to
# remaining_df and undershoot the target). The count is clamped so that a small f_train
# (fewer rows than classes) yields no top-up instead of a negative sample size.
desired_frac = train_config["data"]["f_train"]
n_train_target = int(round(desired_frac * len(df)))
n_train_remaining = min(max(n_train_target - len(df_train_min), 0), len(remaining_df))
# Share of remaining_df that is drawn; kept under its original name for later cells
frac_remaining = n_train_remaining / len(remaining_df) if len(remaining_df) else 0.0
df_train_remaining = remaining_df.sample(n=n_train_remaining, random_state=123)

# Merge the guaranteed and random samples.
# Note: we keep the original indices here (do not use ignore_index) so that we can compute df_test correctly.
train_indices = df_train_min.index.union(df_train_remaining.index)
df_train = df.loc[train_indices]

# Create df_test by taking the rest of the samples
df_test = df.drop(train_indices)
# Keep only test rows whose identity also occurs in df_train
df_test = df_test[df_test["identity"].isin(df_train["identity"])]
df_test = df_test.reset_index(drop=True)

# Prints
print("Number of unique classes in df_train: ", df_train["identity"].nunique())
print("Shape of df_train: ", df_train.shape)
print("Shape of df_test: ", df_test.shape)


# Architecture of the target model: a category-embedding classifier
model_config = CategoryEmbeddingModelConfig(
    task="classification",
    layers="2048-1024-512-256",
    activation="ReLU",
    learning_rate=1e-3,
)

# Alternative architecture, left here for experimentation:
# model_config = GANDALFConfig(
# task="classification",
# gflu_stages=16,
# gflu_dropout=0.1,
# embedding_dropout=0.1,
# learning_rate=1e-3,
# )

# Column roles: "identity" is the label, the rest is split into continuous/categorical
data_config = DataConfig(
    target=["identity"],
    continuous_cols=continuous_col_names,
    categorical_cols=categorical_col_names,
    normalize_continuous_features=False,
)

# Training loop settings
trainer_config = TrainerConfig(
    auto_lr_find=False,
    batch_size=256,
    max_epochs=100,
    early_stopping="train_loss_0",
)

# Optimizer left at the library defaults
optimizer_config = OptimizerConfig()

tabular_model = TabularModel(
    model_config=model_config,
    data_config=data_config,
    trainer_config=trainer_config,
    optimizer_config=optimizer_config,
)

# Train, then score on the held-out rows
tabular_model.fit(train=df_train)  # Defaults 80% train, 20% val split
results = tabular_model.evaluate(df_test)

# Predictions are requested on the features only (label column dropped)
test_features = df_test.drop(columns=["identity"])
pred_df = tabular_model.predict(test_features)

# Persist the trained model to ./target/
tabular_model.save_model("./target/")

# Plot training/val accuracy

In [None]:
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator
import matplotlib.pyplot as plt

# Read the scalars logged by the trainer and plot training vs. validation accuracy.
log_dir = tabular_model.trainer.logger.log_dir

# Glob pattern of the TensorBoard event files inside the log directory.
# Kept for reference only: the accumulator below is given the directory itself.
event_file = f"{log_dir}/events.out.tfevents.*"

# Load the logged events from the log directory
event_acc = EventAccumulator(log_dir)
event_acc.Reload()

# List all available scalar metrics
print("Available metrics:", event_acc.Tags()["scalars"])
# Extract training, validation and test accuracy
train_acc = [scalar.value for scalar in event_acc.Scalars("train_accuracy")]
val_acc = [scalar.value for scalar in event_acc.Scalars("valid_accuracy")]
test_acc = [scalar.value for scalar in event_acc.Scalars("test_accuracy")]

print("Test accuracy: ", test_acc[0])

# Plot the metrics. The series are accuracies, so legend, axis and title say so
# (they were previously labelled as losses).
plt.figure(figsize=(10, 6))
plt.plot(train_acc, label="Training Accuracy")
plt.plot(val_acc, label="Validation Accuracy")
plt.xlabel("Epoch")
plt.ylabel("Accuracy")
plt.title("Training and Validation Accuracy Over Epochs")
plt.legend()
plt.show()


# LeakPro

In [None]:
import os
import sys
import warnings

# Suppress warnings, pytorch_tabular is very verbose
warnings.filterwarnings("ignore")
# Add the directory three levels above the working directory to sys.path
# before the project imports below (presumably so they can be resolved)
project_root = os.path.abspath(os.path.join(os.getcwd(), "../../.."))
sys.path.append(project_root)

from leakpro import LeakPro
from examples.minv.mimic.mimic_plgmi_handler import Mimic_InputHandler
# Audit configuration file, the same audit.yaml the training cell reads
config_path = "audit.yaml"

# Initialize the LeakPro object
leakpro = LeakPro(Mimic_InputHandler, config_path)

# Run the audit
# NOTE: this rebinds `results`, which the training cell used for the evaluate() output
results = leakpro.run_audit(return_results=True)

In [None]:
# Only useful when the gower distance test was part of the audit:
# prints the `best_rows` attribute of the first audit result
first_result = results[0]
print(first_result.best_rows)

# Plots

In [None]:
import plotly.io as pio
pio.renderers.default = 'notebook'

# Display every numerical plot first, then every categorical plot,
# looking each one up by its key in the corresponding collection
audit_result = results[0]
for plot_group in (audit_result.numerical_plots, audit_result.categorical_plots):
    for plot_name in plot_group:
        plot_group[plot_name].show()

In [None]:
# Show the 'Column Pair Trends' details of the quality report of the first audit result
quality_report = results[0].results['quality_report']
quality_report.get_details(property_name='Column Pair Trends')

In [None]:
import plotly.io as pio
pio.renderers.default = 'notebook'

# Render the "Column Shapes" figure of the quality report of the first audit result
quality_report = results[0].results['quality_report']
fig = quality_report.get_visualization(property_name="Column Shapes")
fig.show()