In [None]:
# Seed every random number generator this notebook draws from so that
# Restart Kernel -> Run All reproduces the same numbers.
import random

import numpy as np
import torch

SEED = 42

random.seed(SEED)
# NumPy's global RNG is used by np.random.choice (bootstrap sampling) and by
# scikit-learn estimators created without an explicit random_state, so it has
# to be seeded alongside `random` and torch.
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

In [None]:
import numpy as np
import pandas as pd
import shap
import tqdm

from sklearn.base import clone
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.model_selection import GroupShuffleSplit
import sklearn.metrics as skmetrics
from sklearn.calibration import CalibrationDisplay
from sklearn.utils import resample

from skopt import BayesSearchCV
import matplotlib.pyplot as plt
import seaborn as sns

from process_fe import create_feature_engineering_datasets
from data import data_to_array_dict, get_data_date_split, get_data_date_id_split, get_feature_colnames
from utils import stratification
from plotting import paper_theme, ReliabilityDisplay, ShapDisplay, risk_feature_plot
import metrics
from shap_calculator import calc_shap_df

from tqdm_style import tqdm_style
from sklearn.metrics import precision_score, recall_score, f1_score, average_precision_score
import numpy as np
from sklearn.model_selection import StratifiedKFold, ParameterGrid
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from torch.utils.data import DataLoader, Subset
import torch
import torch.nn as nn
import torch.optim as optim

## Fixing the training set and predicting on the last year

In [None]:
# Experiment configuration.
# N_DAYS and DATES_SPLIT are forwarded to get_data_date_split() further down.
N_DAYS = 3
THRESHOLDS = [0.3, 0.8]
DATES_SPLIT = dict(
    date_train_start="2021-06-28",
    date_train_end="2023-01-01",
    date_test_end="2024-01-01",
)

In [None]:
# Human-readable display names keyed by raw feature column name.
col_nice_names = {
    "awake_freq": "Night time Awake Frequency",
    "bathroom_daytime_freq": "Daytime Bathroom Frequency",
    "bathroom_daytime_freq_ma": "Daytime Bathroom Frequency MA",
    "bathroom_daytime_freq_ma_delta": "Daytime Bathroom Frequency MA Delta",
    "bathroom_freq": "Bathroom Frequency",
    "bathroom_nighttime_freq": "Night time Bathroom Frequency",
    "bathroom_nighttime_freq_ma": "Night time Bathroom Frequency MA",
    "bathroom_nighttime_freq_ma_delta": "Night time Bathroom Frequency MA Delta",
    "bathroom_relative_transition_time_delta_mean": "Mean Relative Bathroom Transition Time Delta",
    "bathroom_relative_transition_time_delta_std": "STD Relative Bathroom Transition Time Delta",
    "bedroom_freq": "Bedroom Frequency",
    "daily_entropy": "Daily Entropy",
    "hallway_freq": "Hallway Frequency",
    "heart_rate_mean": "Mean Night Time Heart Rate",
    "heart_rate_std": "STD Night Time Heart Rate",
    "kitchen_freq": "Kitchen Frequency",
    "lounge_freq": "Lounge Frequency",
    "previous_uti": "Number of Previous UTIs",
    "respiratory_rate_mean": "Mean Night Time Respiratory Rate",
    "respiratory_rate_std": "STD Night Time Respiratory Rate",
}

## An MLP

In [None]:
# Build the feature-engineering dataset used by every later cell.
# NOTE(review): reload=False presumably reuses a cached copy instead of
# recomputing the features — confirm against process_fe.
fe_data = create_feature_engineering_datasets(reload=False)

In [None]:
# Quick look at which columns the engineered dataset exposes.
print(fe_data.columns)

In [None]:

# Split the engineered data into train/test portions using the configured dates;
# the third returned value is not used in this notebook.
data_train, data_test, _ = get_data_date_split(
    fe_data,
    dates_split=DATES_SPLIT,
    n_days=N_DAYS,
    impute=True,
)

# Training arrays.
X_train = data_train['X']
y_train = data_train['y']
ids_train = data_train["id"]
sample_weight = data_train['sample_weight']

# Test arrays.
X_test = data_test['X']
y_test = data_test['y']
ids_test = data_test["id"]
dates_test = data_test['date']

In [None]:
# Report how many participant IDs occur in both the train and the test split.
set_ids_train, set_ids_test = set(ids_train), set(ids_test)
common_ids = set_ids_train & set_ids_test

if not common_ids:
    print("IDs in train and test datasets are unique to each other.")
else:
    print(f"There are {len(common_ids)} common IDs between train and test datasets.")

In [None]:
# Participants to exclude from the test split.
ids_to_remove = ['AboZyUBeiQW3nVCcbXGpay', 'NZjrVTZQR1w9LPJMt26MbG', 'XVb8nztyc2LYPCAewZq11S', 'XdbAAiDw1vd3Bjbo9EVo1B']

# True for every test row whose participant ID is NOT in the exclusion list.
indices_to_keep = np.isin(data_test['id'], ids_to_remove, invert=True)

# Apply the same row mask to every per-row array stored in data_test (in place).
for key in ('X', 'y', 'id'):
    data_test[key] = data_test[key][indices_to_keep]
if 'date' in data_test:
    data_test['date'] = data_test['date'][indices_to_keep]
else:
    data_test['date'] = None

# Refresh the convenience names so they point at the filtered arrays.
X_test = data_test['X']
y_test = data_test['y']
ids_test = data_test["id"]
dates_test = data_test['date']

In [None]:
# Wrap the training IDs in a Series so pandas can count the distinct values.
ids_train_series = pd.Series(ids_train)
unique_count = ids_train_series.nunique()

print("Number of unique elements in ids_train:", unique_count)

In [None]:
# Count the distinct participant IDs in the test split.
# The series gets its own name: storing test IDs in `ids_train_series` would
# overwrite the training series with data its name does not describe.
ids_test_series = pd.Series(ids_test)

unique_count = ids_test_series.nunique()

print("Number of unique elements in ids_test:", unique_count)

In [None]:
# Participants that occur in the test split but never in the training split.
set_ids_train, set_ids_test = set(ids_train), set(ids_test)
unique_ids_in_test = set_ids_test.difference(set_ids_train)

print("Unique IDs in the test set that are not in the train set:")
print(unique_ids_in_test)

In [None]:
# NOTE(review): this empty list is not read anywhere in the visible part of the
# notebook — confirm whether it is still needed.
ids_to_remove1 = []

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import StandardScaler
import numpy as np

def flatten(x):
    """Reshape `x` to 2-D: keep the first axis, merge all remaining axes into one."""
    n_rows = x.shape[0]
    return x.reshape(n_rows, -1)

# Collapse each sample to a single feature vector.
X_train_flattened, X_test_flattened = flatten(X_train), flatten(X_test)

# Standardise with statistics estimated on the training split only, then apply
# the same transform to the test split.
scaler = StandardScaler().fit(X_train_flattened)
X_train_scaled = scaler.transform(X_train_flattened)
X_test_scaled = scaler.transform(X_test_flattened)

# float32 tensors for the training inputs, targets and sample weights.
X_train_torch, y_train_torch, sample_weight_torch = (
    torch.tensor(values, dtype=torch.float32)
    for values in (X_train_scaled, y_train, sample_weight)
)

In [None]:
# float32 tensors for the test inputs and targets.
X_test_torch, y_test_torch = (
    torch.tensor(values, dtype=torch.float32) for values in (X_test_scaled, y_test)
)

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import StandardScaler
import numpy as np

In [None]:
class MLP(nn.Module):
    """Three-layer perceptron (Linear-ReLU-Linear-ReLU-Linear) that returns raw logits."""

    def __init__(self, input_size, hidden_size1, hidden_size2, output_size):
        super().__init__()
        self.layer1 = nn.Linear(input_size, hidden_size1)
        self.relu = nn.ReLU()
        self.layer2 = nn.Linear(hidden_size1, hidden_size2)
        self.layer3 = nn.Linear(hidden_size2, output_size)

    def forward(self, x):
        """Return unnormalised class scores; no softmax is applied here."""
        hidden = self.relu(self.layer1(x))
        hidden = self.relu(self.layer2(hidden))
        return self.layer3(hidden)
    


In [None]:
# Baseline network: one input per flattened feature, two output logits.
mlp_model = MLP(
    input_size=X_train_scaled.shape[1],
    hidden_size1=1000,
    hidden_size2=10,
    output_size=2,
)

In [None]:
def accuracy(predicted, labels):
    """Return the fraction of positions where `predicted` equals `labels` (as a tensor)."""
    n_correct = (predicted == labels).float().sum()
    return n_correct / len(labels)


In [None]:
# Cross-entropy on the raw logits, optimised with Adam.
criterion = nn.CrossEntropyLoss()
optimizer_mlp = optim.Adam(mlp_model.parameters(), lr=0.001, betas=(0.9, 0.999))

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, average_precision_score

In [None]:
# Full-batch training of `mlp_model`: one optimizer step per epoch.
# Per-epoch training metrics are collected for the plots further down.
epoch_losses = []
epoch_accuracies = []
epoch_precisions = []
epoch_recalls = []
epoch_f1s = []
epoch_avg_precisions = []

# Per-sample losses grouped by participant, recorded once at SNAPSHOT_EPOCH
# and used later to cluster participants by loss.
patient_ids = np.unique(ids_train)  # Assuming ids_train is a NumPy array
losses_per_patient = {patient_id: [] for patient_id in patient_ids}

num_epochs = 20
SNAPSHOT_EPOCH = 2  # zero-indexed, i.e. the third epoch

# Targets as a NumPy array for the scikit-learn metrics; constant across epochs.
y_true_numpy = y_train_torch.cpu().numpy()

for epoch in range(num_epochs):
    mlp_model.train()
    optimizer_mlp.zero_grad()
    outputs = mlp_model(X_train_torch)
    predicted = torch.argmax(outputs, dim=1)

    # Calculate training accuracy
    correct = (predicted.squeeze() == y_train_torch).float().sum()
    train_accuracy = correct / len(y_train_torch)

    # Calculate loss
    loss = criterion(outputs, y_train_torch.long())
    loss.backward()
    optimizer_mlp.step()

    if epoch == SNAPSHOT_EPOCH:
        # state_dict() hands back references to the live parameter tensors, so
        # they must be cloned; otherwise later optimizer steps overwrite the
        # "snapshot" and it ends up holding the final weights.
        model_state = {
            name: tensor.detach().clone()
            for name, tensor in mlp_model.state_dict().items()
        }
        # Per-sample losses of the just-updated model. They are bookkeeping only,
        # so no autograd graph is built, and they use their own variable so the
        # epoch-level `loss` recorded below is not overwritten.
        with torch.no_grad():
            snapshot_outputs = mlp_model(X_train_torch)
            for i, sample_label in enumerate(y_train_torch):
                sample_loss = criterion(
                    snapshot_outputs[i].unsqueeze(0), sample_label.unsqueeze(0).long()
                )
                losses_per_patient[ids_train[i]].append(sample_loss.item())

    # Store metrics for this epoch
    epoch_losses.append(loss.item())
    epoch_accuracies.append(train_accuracy.item() * 100)

    predicted_numpy = predicted.cpu().numpy()
    precision = precision_score(y_true_numpy, predicted_numpy)
    recall = recall_score(y_true_numpy, predicted_numpy)
    f1 = f1_score(y_true_numpy, predicted_numpy)

    # Logits have shape [n_samples, 2]; column 1 is the positive class.
    probabilities = torch.softmax(outputs, dim=1)
    positive_class_probs_numpy = probabilities[:, 1].detach().cpu().numpy()

    # Calculate average precision score
    average_precision = average_precision_score(y_true_numpy, positive_class_probs_numpy)

    epoch_precisions.append(precision)
    epoch_recalls.append(recall)
    epoch_f1s.append(f1)
    epoch_avg_precisions.append(average_precision)

    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item():.4f}, Accuracy: {train_accuracy.item() * 100:.2f}%')
    print(f'Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}, Average Precision: {average_precision:.4f}')

In [None]:
# One figure per training curve: loss first, then accuracy.
for curve_values, curve_label, curve_title, curve_ylabel in (
    (epoch_losses, 'Training Loss', 'Training Loss Over Epochs', 'Loss'),
    (epoch_accuracies, 'Training Accuracy', 'Training Accuracy Over Epochs', 'Accuracy (%)'),
):
    plt.figure(figsize=(10, 5))
    plt.plot(curve_values, label=curve_label)
    plt.title(curve_title)
    plt.xlabel('Epochs')
    plt.ylabel(curve_ylabel)
    plt.legend()
    plt.show()

In [None]:
# Presentation version of the training-loss curve.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(epoch_losses, label='Training Loss', color='deepskyblue')
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')

# Tick every second epoch.
ax.set_xticks(range(0, len(epoch_losses), 2))

# Legend sits above the axes; dashed grey grid behind the curve.
ax.legend(loc='upper center', bbox_to_anchor=(0.5, 1.1))
ax.grid(True, linestyle='--', color='grey')

# Hide the top and right spines.
for side in ('top', 'right'):
    ax.spines[side].set_visible(False)

plt.show()

In [None]:
def evaluate_model_at_epoch(mlp_model, X_test_torch, y_test_torch, ids_test, criterion, target_epoch_model_state):
    """Load a saved state into `mlp_model` and return each participant's mean test loss.

    :param mlp_model: Model whose weights are replaced by `target_epoch_model_state`
        and which is left in eval mode.
    :param X_test_torch: Test inputs, one row per sample.
    :param y_test_torch: Test targets, indexed in parallel with the inputs.
    :param ids_test: Participant ID of each test sample, indexed in parallel.
    :param criterion: Loss applied to one sample at a time.
    :param target_epoch_model_state: State dict to load before evaluating.
    :return: List of ``(participant_id, average_loss)`` tuples in order of first
        appearance of each participant.
    """
    mlp_model.load_state_dict(target_epoch_model_state)
    mlp_model.eval()

    # participant ID -> list of that participant's per-sample losses
    losses_per_patient_test = {}
    with torch.no_grad():
        outputs = mlp_model(X_test_torch)
        for i, target in enumerate(y_test_torch):
            sample_loss = criterion(outputs[i].unsqueeze(0), target.unsqueeze(0).long()).item()
            losses_per_patient_test.setdefault(ids_test[i], []).append(sample_loss)

    return [
        (patient_id, sum(patient_losses) / len(patient_losses))
        for patient_id, patient_losses in losses_per_patient_test.items()
        if patient_losses  # skip empty lists to avoid dividing by zero
    ]

# Average test loss per participant, using the weights saved during training.
target_epoch_test_participant_losses = evaluate_model_at_epoch(
    mlp_model,
    X_test_torch,
    y_test_torch,
    ids_test,
    criterion,
    model_state,
)

print("Test participants' average losses at target epoch:")
for participant_id, avg_loss in target_epoch_test_participant_losses:
    print(f"Participant {participant_id}: Average Loss = {avg_loss:.4f}")


In [None]:
# NOTE(review): `gender_map` is not defined in the visible part of this notebook;
# presumably a DataFrame with 'patient_id' and 'Gender PwD' columns — confirm it
# is loaded in an earlier cell so Restart & Run All works.
gender_map['gender_encoded'] = gender_map['Gender PwD'].map({'Male': 0, 'Female': 1})

# participant ID -> encoded gender
patient_gender_dict = dict(zip(gender_map['patient_id'], gender_map['gender_encoded']))

# One code per sample. Participants missing from the lookup fall back to 0,
# which is the same code as 'Male'.
gender_train = np.array([patient_gender_dict.get(pid, 0) for pid in ids_train]).astype(int)
gender_test = np.array([patient_gender_dict.get(pid, 0) for pid in ids_test]).astype(int)

In [None]:
import statistics

# One mean loss per participant, in the iteration order of losses_per_patient.
# Participants with no recorded losses contribute 0.
average_losses = [
    sum(patient_losses) / len(patient_losses) if patient_losses else 0
    for patient_losses in losses_per_patient.values()
]

mean_average_losses = statistics.mean(average_losses)
std_dev_average_losses = statistics.stdev(average_losses)
mean_plus1_sd = mean_average_losses + std_dev_average_losses

print("Mean of Average Losses:", mean_average_losses)
print("Standard Deviation of Average Losses:", std_dev_average_losses)
print("Mean + 1 SD:", mean_plus1_sd)

# Histogram of the per-participant averages with reference lines.
plt.figure(figsize=(10, 6))
plt.hist(average_losses, bins=15, color='steelblue', alpha=0.75)
plt.title('Histogram of Average Losses at Epoch 3 for All Patients')
plt.xlabel('Average Loss')
plt.ylabel('Number of Patients')
plt.grid(True)

# Dashed markers at the mean (red) and one standard deviation above it (blue).
for line_position, line_color in ((mean_average_losses, 'red'), (mean_plus1_sd, 'blue')):
    plt.axvline(line_position, color=line_color, linestyle='dashed', linewidth=2)

plt.legend(['Mean', 'Mean + 1 SD'])

plt.show()

In [None]:
from sklearn.cluster import KMeans
import numpy as np
import matplotlib.pyplot as plt

# Column vector of per-participant average losses (KMeans expects 2-D input).
average_losses_array = np.array(average_losses).reshape(-1, 1)

# Elbow method: record the within-cluster sum of squares for k = 1..9.
# random_state and n_init are pinned so the curve is identical on every run and
# does not depend on the scikit-learn version's default number of restarts.
sum_of_squared_distances = []
K = range(1, 10)
for k in K:
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans = kmeans.fit(average_losses_array)
    sum_of_squared_distances.append(kmeans.inertia_)

# Plot the elbow method graph
plt.figure(figsize=(10, 6))
plt.plot(K, sum_of_squared_distances, 'bx-')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Sum of Squared Distances')
plt.title('Elbow Method For Optimal k')
plt.grid(True)
plt.show()

In [None]:
# Cluster participants by their average training loss.
optimal_k = 3  # Replace with the actual optimal k determined from the elbow method
# KMeans label ids (0/1/2) are arbitrary and depend on the random initialisation.
# Later cells refer to specific ids, so the seed is pinned to keep them stable
# between runs.
kmeans = KMeans(n_clusters=optimal_k, n_init=10, random_state=42)
kmeans.fit(average_losses_array)
labels = kmeans.labels_

# Two columns: average loss, cluster label.
clustered_losses = np.concatenate((average_losses_array, labels.reshape(-1, 1)), axis=1)

# Overlaid histograms, one per cluster.
plt.figure(figsize=(10, 6))
colors = ['lightseagreen', 'cornflowerblue', 'slateblue']
for cluster in range(optimal_k):
    cluster_data = clustered_losses[clustered_losses[:, 1] == cluster][:, 0]
    plt.hist(cluster_data, bins=15, alpha=0.75, label=f'Cluster {cluster + 1}', color=colors[cluster])

plt.xlabel('Average Loss')
plt.ylabel('Number of Patients')
plt.grid(True, linestyle='--', linewidth=0.5)

# Adding a legend to identify the clusters
plt.legend(loc='upper center', bbox_to_anchor=(0.5, 1.10), ncol=3)

# Remove the top and right spines
ax = plt.gca()
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)

plt.show()

In [None]:
# Number of training participants assigned KMeans label 2.
cluster_2_count = np.count_nonzero(labels == 2)
print(f"Number of patients in cluster 2: {cluster_2_count}")

In [None]:
# Column vector of the test participants' average losses.
new_losses_array = np.array(
    [test_avg_loss for _, test_avg_loss in target_epoch_test_participant_losses]
).reshape(-1, 1)

# Assign each test participant to one of the clusters fitted on the training losses.
new_labels = kmeans.predict(new_losses_array)

# (participant ID, average loss, cluster label) triples.
new_clustered_results = [
    (pid, test_avg_loss, predicted_cluster)
    for (pid, test_avg_loss), predicted_cluster in zip(target_epoch_test_participant_losses, new_labels)
]

print("New clustered results (Participant ID, Average Loss, Cluster Label):")
for participant_id, avg_loss, cluster_label in new_clustered_results:
    print(f"Participant {participant_id}: Average Loss = {avg_loss:.4f}, Cluster = {cluster_label}")

In [None]:
# Sample counts per encoded gender in each split.
for split_message, split_gender in (
    ("Gender distribution in training data:", gender_train),
    ("Gender distribution in testing data:", gender_test),
):
    print(split_message, np.unique(split_gender, return_counts=True))

In [None]:
import warnings

# Participant ID sets for the two splits and their overlap.
set_ids_train, set_ids_test = set(ids_train), set(ids_test)
common_ids = set_ids_test & set_ids_train

print(f"Number of common IDs between train and test dataset: {len(common_ids)}")
if common_ids:
    print(f"Common IDs: {common_ids}")

# Participants that only occur in the test split.
unique_ids_in_test = set_ids_test - set_ids_train

print("Unique IDs in the test set that are not in the train set:")
print(unique_ids_in_test)

# participant ID -> cluster predicted from the participant's test loss
new_clustered_dict = {pid: cluster for pid, _, cluster in new_clustered_results}

# One cluster label per test sample: the training label for participants seen in
# training, the predicted cluster for test-only participants, -1 otherwise.
# NOTE(review): `loss_labels` is not defined in the visible part of this
# notebook; presumably a dict mapping participant ID to training cluster label.
loss_label_test = []
for patient_id in ids_test:
    if patient_id in common_ids:
        loss_label_test.append(loss_labels.get(patient_id, -1))
        continue
    if patient_id in unique_ids_in_test:
        loss_label_test.append(new_clustered_dict.get(patient_id, -1))
        continue
    loss_label_test.append(-1)
    warnings.warn(f"Patient ID {patient_id} not found in either training set or new clustered results.")

print("First few loss labels for test set:", loss_label_test[:10])

In [None]:
# Counter is imported here because this cell runs before the only other import
# of it in the notebook; without it a fresh Restart & Run All stops with NameError.
from collections import Counter

# Participants that only occur in the training split.
unique_ids_in_train = set_ids_train - set_ids_test

# Step 1: count participants per cluster for each ID group.
# A label of -1 means "no cluster known" and is left out of the counts.
common_ids_count = Counter(
    cluster_label
    for cluster_label in (loss_labels.get(patient_id, -1) for patient_id in common_ids)
    if cluster_label != -1
)
unique_ids_in_test_count = Counter(
    cluster_label
    for cluster_label in (new_clustered_dict.get(patient_id, -1) for patient_id in unique_ids_in_test)
    if cluster_label != -1
)
unique_ids_in_train_count = Counter(
    cluster_label
    for cluster_label in (loss_labels.get(patient_id, -1) for patient_id in unique_ids_in_train)
    if cluster_label != -1
)

# Step 2: Prepare data for plotting (a Counter returns 0 for clusters it never saw).
id_groups = ['Unique IDs in Train', 'Common IDs', 'Unique IDs in Test']
common_counts = [common_ids_count[cluster] for cluster in range(3)]
unique_test_counts = [unique_ids_in_test_count[cluster] for cluster in range(3)]
unique_train_counts = [unique_ids_in_train_count[cluster] for cluster in range(3)]

counts = [unique_train_counts, common_counts, unique_test_counts]

colors = ['lightseagreen', 'cornflowerblue', 'slateblue']

# Step 3: grouped bar chart, one bar per cluster within each ID group.
bar_width = 0.25
index = np.arange(len(id_groups))

fig, ax = plt.subplots()
for cluster in range(3):
    counts_for_cluster = [unique_train_counts[cluster], common_counts[cluster], unique_test_counts[cluster]]
    ax.bar(index + cluster * bar_width, counts_for_cluster, bar_width, label=f'Cluster {cluster + 1}', color=colors[cluster])

ax.set_xlabel('ID Groupings')
ax.set_ylabel('Number of Patients')
ax.set_xticks(index + bar_width)
ax.set_xticklabels(id_groups)
ax.grid(True, linestyle='--', linewidth=0.5)
ax.legend(loc='upper center', bbox_to_anchor=(0.5, 1.1), ncol=3)

# Remove the top and right spines
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)

plt.show()

In [None]:
# Participant ID sets for the two splits and their overlap.
set_ids_train = set(ids_train)
set_ids_test = set(ids_test)
common_ids = set_ids_test.intersection(set_ids_train)

print(f"Number of common IDs between train and test dataset: {len(common_ids)}")
if common_ids:
    print(f"Common IDs: {common_ids}")

# One cluster label per test sample. Participants seen in training reuse their
# training label; test-only participants use the cluster predicted from their
# test loss (new_clustered_dict). Looking everything up in loss_labels alone
# would give every test-only participant -1, which LossLabelMLP.forward rejects
# with ValueError, and would discard the predicted clusters.
loss_label_test = [
    loss_labels.get(patient_id, -1)
    if patient_id in common_ids
    else new_clustered_dict.get(patient_id, -1)
    for patient_id in ids_test
]

# Print the first few labels for verification
print("First few loss labels for test set:", loss_label_test[:10])

In [None]:
# Sample counts per loss-cluster label in each split.
# NOTE(review): `loss_label_train` is not defined in the visible part of this
# notebook — confirm it is created in an earlier cell.
for split_message, split_loss_labels in (
    ("Loss distribution in training data:", loss_label_train),
    ("Loss distribution in test data:", loss_label_test),
):
    print(split_message, np.unique(split_loss_labels, return_counts=True))

In [None]:
from torch.utils.data import Dataset
import pickle

class CustomDataset(Dataset):
    """Per-sample dataset bundling features, targets, cluster labels, gender and IDs."""

    def __init__(self, features, labels,loss_labels, gender, patient_id, sample_weight=None):
        """
        Store every input, converting the numeric ones to tensors.

        :param features: Input features; stored as a float32 tensor.
        :param labels: Target labels; stored as a long tensor.
        :param loss_labels: Loss-cluster label per sample; stored as a long tensor.
        :param gender: Encoded gender per sample; stored as a long tensor.
        :param patient_id: Per-sample participant IDs; stored as given and indexed directly.
        :param sample_weight: Optional per-sample weights; stored as a float32 tensor.
        """
        self.features = torch.tensor(features, dtype=torch.float32)
        self.labels, self.loss_labels, self.gender = (
            torch.tensor(values, dtype=torch.long)
            for values in (labels, loss_labels, gender)
        )

        self.patient_id = patient_id

        self.sample_weight = (
            None if sample_weight is None
            else torch.tensor(sample_weight, dtype=torch.float32)
        )

    def __len__(self):
        """Number of samples, taken from the label tensor."""
        return len(self.labels)

    def __getitem__(self, index):
        """
        Return one sample.

        With sample weights: (features, label, loss_label, gender, sample_weight, patient_id).
        Without sample weights: (features, label, loss_label, patient_id) — note that
        gender is not part of this shorter tuple.
        """
        head = (self.features[index], self.labels[index], self.loss_labels[index])
        if self.sample_weight is None:
            return head + (self.patient_id[index],)
        return head + (self.gender[index], self.sample_weight[index], self.patient_id[index])


In [None]:

# Instantiate datasets
train_dataset = CustomDataset(X_train_scaled, y_train, loss_label_train, gender_train, ids_train, sample_weight)

# `sample_weight` was read from data_train, so it does not line up with the test
# rows. The test set gets neutral weights of 1 instead; passing an array (rather
# than None) keeps the 6-element sample tuple that the evaluation loops unpack.
test_sample_weight = np.ones(len(y_test), dtype=np.float32)
test_dataset = CustomDataset(X_test_scaled, y_test, loss_label_test, gender_test, ids_test, test_sample_weight)

In [None]:
from torch.utils.data import DataLoader

# Mini-batch size shared by both loaders.
batch_size = 128  # You can adjust this based on your system's capabilities

# Training batches are reshuffled on every pass; test batches keep dataset order.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=batch_size)


In [None]:
from collections import Counter

def check_loss_label_distribution(dataset):
    """Count how often each loss label occurs in `dataset`.

    Every item of `dataset` must be a 6-element tuple whose third element is a
    tensor holding the loss label.
    """
    return Counter(
        loss_label.item()
        for _, _, loss_label, _, _, _ in dataset
    )

# Count how many training samples carry each loss-cluster label.
train_loss_label_distribution = check_loss_label_distribution(train_dataset)
print("Loss label distribution in training dataset:", train_loss_label_distribution)

# Final layer separated

In [None]:
class LossLabelMLP(nn.Module):
    """MLP with a shared two-layer trunk (with dropout) and one output head per loss cluster.

    Each sample is routed to the head that matches its loss label (0, 1 or 2).
    """

    def __init__(self, input_size, hidden_size1, hidden_size2, output_size, dropout_rate):
        super().__init__()
        # Shared trunk.
        self.layer1 = nn.Linear(input_size, hidden_size1)
        self.relu = nn.ReLU()
        self.dropout1 = nn.Dropout(dropout_rate)
        self.layer2 = nn.Linear(hidden_size1, hidden_size2)
        self.dropout2 = nn.Dropout(dropout_rate)
        # One final layer per loss cluster.
        self.layer3_cluster0 = nn.Linear(hidden_size2, output_size)
        self.layer3_cluster1 = nn.Linear(hidden_size2, output_size)
        self.layer3_cluster2 = nn.Linear(hidden_size2, output_size)

    def forward(self, x, loss_labels):
        """Return stacked per-sample logits; raise ValueError for a label other than 0, 1 or 2."""
        hidden = self.dropout1(self.relu(self.layer1(x)))
        hidden = self.dropout2(self.relu(self.layer2(hidden)))

        heads = (self.layer3_cluster0, self.layer3_cluster1, self.layer3_cluster2)
        outputs = []
        for x_i, loss_label_i in zip(hidden, loss_labels):
            for cluster, head in enumerate(heads):
                if loss_label_i == cluster:
                    outputs.append(head(x_i))
                    break
            else:
                # No head matched this sample's label.
                raise ValueError(f"Unexpected loss_label: {loss_label_i}")

        return torch.stack(outputs)


In [None]:
def _binary_metrics(labels, preds):
    """Return (accuracy, precision, recall) for binary labels and predictions."""
    return (
        accuracy_score(labels, preds),
        precision_score(labels, preds, average='binary', zero_division=0),
        recall_score(labels, preds, average='binary', zero_division=0),
    )


def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=10):
    """Train `model` and report validation metrics after every epoch.

    Batches must be the 6-element tuples produced by CustomDataset when sample
    weights are present: (features, labels, loss_labels, gender, sample_weight,
    patient_id). Only features, labels, loss_labels and gender are used.

    :param model: Module called as ``model(features, loss_labels)``.
    :param train_loader: Loader for the training batches.
    :param val_loader: Loader for the validation batches.
    :param criterion: Loss called as ``criterion(outputs, labels)``.
    :param optimizer: Optimizer over the model's parameters.
    :param num_epochs: Number of passes over `train_loader`; must be at least 1.
    :return: Metrics of the LAST epoch as a 10-tuple: val_loss, accuracy,
        precision, sensitivity, then accuracy/precision/sensitivity for
        gender == 0 (male) and for gender == 1 (female).
    :raises ValueError: If `num_epochs` is smaller than 1.
    """
    if num_epochs < 1:
        # Without at least one epoch there are no metrics to return.
        raise ValueError("num_epochs must be at least 1")

    for epoch in range(num_epochs):
        # ---- training pass ----
        model.train()
        running_loss = 0.0
        for features, labels, loss_labels, _gender, _sample_weight, _patient_id in train_loader:
            optimizer.zero_grad()
            outputs = model(features, loss_labels)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

        # Mean training loss per batch, kept separate from the validation loss.
        train_loss = running_loss / len(train_loader)

        # ---- validation pass ----
        model.eval()
        val_running_loss = 0.0
        all_labels = []
        all_preds = []
        all_genders = []
        with torch.no_grad():
            for features, labels, loss_labels, gender, _sample_weight, _patient_id in val_loader:
                outputs = model(features, loss_labels)
                loss = criterion(outputs, labels)
                val_running_loss += loss.item()
                _, predicted = torch.max(outputs.data, 1)
                all_labels.extend(labels.cpu().numpy())
                all_preds.extend(predicted.cpu().numpy())
                all_genders.extend(gender.cpu().numpy())

        val_loss = val_running_loss / len(val_loader)
        all_labels = np.array(all_labels)
        all_preds = np.array(all_preds)
        all_genders = np.array(all_genders)

        accuracy, precision, sensitivity = _binary_metrics(all_labels, all_preds)

        # Gender codes: 0 = male, 1 = female.
        male_indices = all_genders == 0
        female_indices = all_genders == 1

        male_accuracy, male_precision, male_sensitivity = _binary_metrics(
            all_labels[male_indices], all_preds[male_indices]
        )
        female_accuracy, female_precision, female_sensitivity = _binary_metrics(
            all_labels[female_indices], all_preds[female_indices]
        )

        print(f'Epoch {epoch+1}/{num_epochs}, '
              f'Train Loss: {train_loss:.4f}, '
              f'Val Loss: {val_loss:.4f}, '
              f'Overall - Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Sensitivity: {sensitivity:.4f}, '
              f'Male - Accuracy: {male_accuracy:.4f}, Precision: {male_precision:.4f}, Sensitivity: {male_sensitivity:.4f}, '
              f'Female - Accuracy: {female_accuracy:.4f}, Precision: {female_precision:.4f}, Sensitivity: {female_sensitivity:.4f}')

    return val_loss, accuracy, precision, sensitivity, male_accuracy, male_precision, male_sensitivity, female_accuracy, female_precision, female_sensitivity

In [None]:
def cross_validate_and_tune(train_dataset, num_unique_ids, param_grid, num_folds=10, num_epochs=10):
    """Grid-search `lr` and `dropout_rate` for LossLabelMLP with stratified K-fold CV.

    For every hyperparameter combination a fresh model is trained on each fold,
    and the combination with the lowest validation loss averaged over ALL folds
    is selected.

    The fold indices come from the notebook-level `X_train_scaled` / `y_train`,
    so `train_dataset` must be row-aligned with them.
    NOTE(review): folds are stratified by label only, so samples of one
    participant can land in both the training and validation part of a fold —
    confirm whether a grouped split is wanted.

    :param train_dataset: Dataset indexed by the fold indices.
    :param num_unique_ids: Not used by this function; kept for call compatibility.
    :param param_grid: Mapping with 'lr' and 'dropout_rate' value lists.
    :param num_folds: Number of CV folds.
    :param num_epochs: Epochs per fold, forwarded to train_model.
    :return: Best hyperparameter dict, or None if no combination was evaluated.
    """
    best_hyperparams = None
    best_val_loss = float('inf')
    best_metrics = None

    # Order matches the values train_model returns after val_loss.
    metric_names = (
        'accuracy', 'precision', 'sensitivity',
        'male_accuracy', 'male_precision', 'male_sensitivity',
        'female_accuracy', 'female_precision', 'female_sensitivity',
    )

    skf = StratifiedKFold(n_splits=num_folds)

    for params in ParameterGrid(param_grid):
        print(f"Testing hyperparameters: {params}")

        fold_metrics = {name: [] for name in metric_names}
        fold_val_losses = []

        for fold, (train_index, val_index) in enumerate(skf.split(X_train_scaled, y_train)):
            print(f'Fold {fold+1}/{num_folds} with params: {params}')

            train_subset = Subset(train_dataset, train_index)
            val_subset = Subset(train_dataset, val_index)
            train_loader = DataLoader(train_subset, batch_size=128, shuffle=True)
            val_loader = DataLoader(val_subset, batch_size=128, shuffle=False)

            model = LossLabelMLP(input_size=X_train_scaled.shape[1], hidden_size1=30, hidden_size2=10, output_size=2, dropout_rate=params['dropout_rate'])
            criterion = nn.CrossEntropyLoss()
            optimizer = optim.Adam(model.parameters(), lr=params['lr'])

            val_loss, *metric_values = train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs)

            fold_val_losses.append(val_loss)
            for name, value in zip(metric_names, metric_values):
                fold_metrics[name].append(value)

        # Model selection uses the mean over every fold, not just the last one.
        avg_val_loss = np.mean(fold_val_losses)
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            best_hyperparams = params
            best_metrics = fold_metrics

        # Print the metrics for each combination of hyperparameters at the end of 10 folds
        print(f'Hyperparameters: {params}')
        for metric in fold_metrics:
            print(f'{metric}: Mean = {np.mean(fold_metrics[metric]):.4f}, Std = {np.std(fold_metrics[metric]):.4f}')

    # Print the best hyperparameters and corresponding metrics
    print("Best Hyperparameters:", best_hyperparams)
    if best_metrics is not None:  # stays None when the grid is empty or every loss is NaN
        for metric in best_metrics:
            print(f'{metric}: Mean = {np.mean(best_metrics[metric]):.4f}, Std = {np.std(best_metrics[metric]):.4f}')

    return best_hyperparams

# Perform cross-validation and find the best hyperparameters
param_grid = {'lr': [0.001, 0.005, 0.01], 'dropout_rate': [0, 0.2, 0.5]}
# ids_train holds one ID per sample, so the number of participants is the number
# of distinct values, not the array length.
num_unique_ids = len(np.unique(ids_train))

best_hyperparams = cross_validate_and_tune(train_dataset, num_unique_ids, param_grid, num_folds=10, num_epochs=10)

In [None]:
def bootstrap_sample(dataset, patient_ids, proportion=0.8):
    """Draw a per-participant bootstrap sample of `dataset`.

    For every distinct ID in `patient_ids`, ``int(proportion * n)`` row indices are
    drawn with replacement from that participant's ``n`` rows using NumPy's global
    RNG. Returns a ``Subset`` of `dataset` over all drawn indices, participants in
    sorted-ID order.
    """
    draws_per_patient = []
    for pid in np.unique(patient_ids):
        rows_of_pid = np.nonzero(patient_ids == pid)[0]
        n_draws = int(proportion * len(rows_of_pid))
        draws_per_patient.append(np.random.choice(rows_of_pid, size=n_draws, replace=True))

    sampled_indices = [row for draw in draws_per_patient for row in draw]
    return Subset(dataset, sampled_indices)

# Function to evaluate the model
def evaluate_model(model, dataloader, criterion):
    """Score ``model`` on ``dataloader`` overall and per gender code.

    The model is switched to eval mode and run without gradient tracking.
    Predictions are the arg-max over dimension 1 of the model output.

    Args:
        model: Module called as ``model(features, loss_labels)``.
        dataloader: Yields 6-tuples ``(features, labels, loss_labels, gender,
            patient_id, sample_weight)``; the last two are not used here.
        criterion: Accepted for signature compatibility; not used.

    Returns:
        dict: ``accuracy``, ``precision`` and ``sensitivity`` for all samples,
        plus the same three metrics prefixed with ``male_`` (gender == 0) and
        ``female_`` (gender == 1).
    """
    model.eval()
    label_buf, pred_buf, gender_buf = [], [], []
    with torch.no_grad():
        for features, labels, loss_labels, gender, _patient_id, _sample_weight in dataloader:
            logits = model(features, loss_labels)
            batch_preds = torch.max(logits, 1).indices
            label_buf.extend(labels.cpu().numpy())
            pred_buf.extend(batch_preds.cpu().numpy())
            gender_buf.extend(gender.cpu().numpy())

    y_true = np.array(label_buf)
    y_pred = np.array(pred_buf)
    genders = np.array(gender_buf)

    # Overall first, then the two gender subgroups, so the resulting dict keeps
    # the accuracy / precision / sensitivity ordering within each group.
    groups = (
        ('', slice(None)),
        ('male_', genders == 0),
        ('female_', genders == 1),
    )
    report = {}
    for prefix, selector in groups:
        truth = y_true[selector]
        guess = y_pred[selector]
        report[prefix + 'accuracy'] = accuracy_score(truth, guess)
        report[prefix + 'precision'] = precision_score(truth, guess, average='binary', zero_division=0)
        report[prefix + 'sensitivity'] = recall_score(truth, guess, average='binary', zero_division=0)
    return report

# Bootstrap sampling and training the best model
num_bootstrap_samples = 5
# One list per metric, ordered overall -> male -> female.
bootstrap_results = {
    f'{group}{score}': []
    for group in ('', 'male_', 'female_')
    for score in ('accuracy', 'precision', 'sensitivity')
}

for _ in range(num_bootstrap_samples):
    # Bootstrap sampling 80% of data points for each patient
    bootstrap_subset = bootstrap_sample(train_dataset, ids_train, proportion=0.8)
    bootstrap_loader = DataLoader(bootstrap_subset, batch_size=128, shuffle=True)

    # Fresh model / optimizer per bootstrap round, using the tuned hyperparameters
    model = LossLabelMLP(
        input_size=X_train_scaled.shape[1],
        hidden_size1=30,
        hidden_size2=10,
        output_size=2,
        dropout_rate=best_hyperparams['dropout_rate'],
    )
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=best_hyperparams['lr'])

    # The unshuffled test loader serves both as the per-epoch monitoring set
    # during training and as the final evaluation set.
    test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False)

    # Train with the best hyperparameters
    train_model(model, bootstrap_loader, test_loader, criterion, optimizer)

    # Evaluate on the full test dataset
    results = evaluate_model(model, test_loader, criterion)

    for metric in bootstrap_results:
        bootstrap_results[metric].append(results[metric])

# Print bootstrap results
for metric in bootstrap_results:
    print(f'{metric.capitalize()}: Mean = {np.mean(bootstrap_results[metric]):.4f}, Std = {np.std(bootstrap_results[metric]):.4f}')

# Fully separated model: one independent MLP per loss-label cluster

In [None]:
class LossLabelMLP(nn.Module):
    """Three independent MLPs, one per loss-label cluster (0, 1, 2).

    Each sample is routed through the sub-network whose index equals its loss
    label; the sub-networks share no weights (only the stateless ReLU module).
    Every sub-network is ``Linear -> ReLU -> Dropout -> Linear -> ReLU ->
    Dropout -> Linear``.

    Args:
        input_size: Number of input features.
        hidden_size1: Width of the first hidden layer.
        hidden_size2: Width of the second hidden layer.
        output_size: Number of outputs (logits) per sample.
        dropout_rate: Dropout probability applied after each hidden layer.
    """

    # Number of per-cluster sub-networks defined in __init__.
    NUM_CLUSTERS = 3

    def __init__(self, input_size, hidden_size1, hidden_size2, output_size, dropout_rate):
        super().__init__()

        # Attribute names and creation order are kept exactly as before so
        # that state_dict keys and seeded weight initialisation are unchanged.

        # Define layers for cluster 0
        self.layer1_cluster0 = nn.Linear(input_size, hidden_size1)
        self.relu = nn.ReLU()
        self.dropout1_cluster0 = nn.Dropout(dropout_rate)
        self.layer2_cluster0 = nn.Linear(hidden_size1, hidden_size2)
        self.dropout2_cluster0 = nn.Dropout(dropout_rate)
        self.layer3_cluster0 = nn.Linear(hidden_size2, output_size)

        # Define layers for cluster 1
        self.layer1_cluster1 = nn.Linear(input_size, hidden_size1)
        self.dropout1_cluster1 = nn.Dropout(dropout_rate)
        self.layer2_cluster1 = nn.Linear(hidden_size1, hidden_size2)
        self.dropout2_cluster1 = nn.Dropout(dropout_rate)
        self.layer3_cluster1 = nn.Linear(hidden_size2, output_size)

        # Define layers for cluster 2
        self.layer1_cluster2 = nn.Linear(input_size, hidden_size1)
        self.dropout1_cluster2 = nn.Dropout(dropout_rate)
        self.layer2_cluster2 = nn.Linear(hidden_size1, hidden_size2)
        self.dropout2_cluster2 = nn.Dropout(dropout_rate)
        self.layer3_cluster2 = nn.Linear(hidden_size2, output_size)

    def _forward_cluster(self, x, cluster_id):
        """Run ``x`` through the sub-network belonging to ``cluster_id``."""
        suffix = f"cluster{cluster_id}"
        x = getattr(self, f"layer1_{suffix}")(x)
        x = self.relu(x)
        x = getattr(self, f"dropout1_{suffix}")(x)
        x = getattr(self, f"layer2_{suffix}")(x)
        x = self.relu(x)
        x = getattr(self, f"dropout2_{suffix}")(x)
        return getattr(self, f"layer3_{suffix}")(x)

    def forward(self, x, loss_labels):
        """Route every sample through the sub-network selected by its label.

        Samples are processed in one batched call per cluster instead of one
        Python-level call per sample. In eval mode the result matches the
        per-sample computation; in train mode dropout masks are drawn per
        cluster batch rather than per sample, so the individual random draws
        differ.

        Args:
            x: Tensor whose first dimension indexes samples.
            loss_labels: One cluster label (0, 1 or 2) per sample.

        Returns:
            Tensor with the same leading dimension as ``x`` and a last
            dimension of ``output_size``, in the original sample order.

        Raises:
            ValueError: If the number of labels differs from the number of
                samples, or a label is not 0, 1 or 2.
        """
        loss_labels = torch.as_tensor(loss_labels, device=x.device).reshape(-1)
        if loss_labels.shape[0] != x.shape[0]:
            raise ValueError(
                f"Got {x.shape[0]} samples but {loss_labels.shape[0]} loss labels"
            )

        masks = [loss_labels == cluster_id for cluster_id in range(self.NUM_CLUSTERS)]
        known = torch.stack(masks).any(dim=0)
        if not bool(known.all()):
            raise ValueError(f"Unexpected loss_label: {loss_labels[~known][0]}")

        outputs = None
        for cluster_id, mask in enumerate(masks):
            if not bool(mask.any()):
                continue
            cluster_out = self._forward_cluster(x[mask], cluster_id)
            if outputs is None:
                # Allocate lazily so dtype/device/trailing shape follow the
                # actual sub-network output.
                outputs = cluster_out.new_empty((x.shape[0],) + tuple(cluster_out.shape[1:]))
            # Scatter results back to their original positions; autograd
            # tracks this indexed assignment.
            outputs[mask] = cluster_out

        if outputs is None:
            # Empty batch: nothing was routed anywhere.
            outputs = x.new_empty(
                (0,) + tuple(x.shape[1:-1]) + (self.layer3_cluster0.out_features,)
            )
        return outputs

In [None]:
# Define the training function
def _binary_scores(y_true, y_pred):
    """Return (accuracy, precision, sensitivity); undefined ratios count as 0."""
    return (
        accuracy_score(y_true, y_pred),
        precision_score(y_true, y_pred, average='binary', zero_division=0),
        recall_score(y_true, y_pred, average='binary', zero_division=0),
    )


def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=10):
    """Train ``model`` and print validation metrics after every epoch.

    Args:
        model: Module called as ``model(features, loss_labels)``.
        train_loader: Yields 6-tuples ``(features, labels, loss_labels, gender,
            patient_id, sample_weight)``. ``patient_id`` and ``sample_weight``
            are unpacked but not used by the training step.
        val_loader: Same batch layout; scored after each epoch.
        criterion: Loss called as ``criterion(outputs, labels)``.
        optimizer: Optimizer over ``model``'s parameters.
        num_epochs: Number of passes over ``train_loader``; must be >= 1.

    Returns:
        tuple: ``(val_loss, accuracy, precision, sensitivity, male_accuracy,
        male_precision, male_sensitivity, female_accuracy, female_precision,
        female_sensitivity)`` measured on ``val_loader`` after the final epoch.
        ``male`` / ``female`` are the samples with gender code 0 / 1.

    Raises:
        ValueError: If ``num_epochs`` is smaller than 1 (there would be no
            metrics to return).
    """
    if num_epochs < 1:
        raise ValueError(f"num_epochs must be at least 1, got {num_epochs}")

    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        for features, labels, loss_labels, gender, patient_id, sample_weight in train_loader:
            optimizer.zero_grad()
            outputs = model(features, loss_labels)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

        # Mean per-batch training loss. This used to be written to `val_loss`
        # and then overwritten below, so it was never reported.
        train_loss = running_loss / len(train_loader)

        model.eval()
        val_running_loss = 0.0
        all_labels = []
        all_preds = []
        all_genders = []
        with torch.no_grad():
            for features, labels, loss_labels, gender, patient_id, sample_weight in val_loader:
                outputs = model(features, loss_labels)
                loss = criterion(outputs, labels)
                val_running_loss += loss.item()
                _, predicted = torch.max(outputs.data, 1)
                all_labels.extend(labels.cpu().numpy())
                all_preds.extend(predicted.cpu().numpy())
                all_genders.extend(gender.cpu().numpy())

        val_loss = val_running_loss / len(val_loader)
        all_labels = np.array(all_labels)
        all_preds = np.array(all_preds)
        all_genders = np.array(all_genders)

        accuracy, precision, sensitivity = _binary_scores(all_labels, all_preds)

        male_indices = all_genders == 0
        female_indices = all_genders == 1

        male_accuracy, male_precision, male_sensitivity = _binary_scores(
            all_labels[male_indices], all_preds[male_indices]
        )
        female_accuracy, female_precision, female_sensitivity = _binary_scores(
            all_labels[female_indices], all_preds[female_indices]
        )

        print(f'Epoch {epoch+1}/{num_epochs}, '
              f'Train Loss: {train_loss:.4f}, '
              f'Val Loss: {val_loss:.4f}, '
              f'Overall - Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Sensitivity: {sensitivity:.4f}, '
              f'Male - Accuracy: {male_accuracy:.4f}, Precision: {male_precision:.4f}, Sensitivity: {male_sensitivity:.4f}, '
              f'Female - Accuracy: {female_accuracy:.4f}, Precision: {female_precision:.4f}, Sensitivity: {female_sensitivity:.4f}')

    return val_loss, accuracy, precision, sensitivity, male_accuracy, male_precision, male_sensitivity, female_accuracy, female_precision, female_sensitivity

In [None]:
def cross_validate_and_tune(train_dataset, num_unique_ids, param_grid, num_folds=10, num_epochs=10):
    """Grid-search ``lr`` / ``dropout_rate`` with stratified k-fold CV.

    For every combination in ``param_grid`` a fresh ``LossLabelMLP`` is trained
    on each fold with ``train_model``; the combination with the lowest
    validation loss averaged over all folds is selected.

    Fold indices come from ``StratifiedKFold.split(X_train_scaled, y_train)``,
    i.e. this function reads the notebook globals ``X_train_scaled`` and
    ``y_train`` and expects them to be row-aligned with ``train_dataset``.

    NOTE(review): folds are stratified by label only, so samples of one
    patient can end up in both the training and validation part of a fold.

    Args:
        train_dataset: Dataset indexed by the fold indices.
        num_unique_ids: Not used by this function; kept for call compatibility.
        param_grid: Mapping accepted by ``ParameterGrid``; each combination
            must provide ``'lr'`` and ``'dropout_rate'``.
        num_folds: Number of stratified folds.
        num_epochs: Epochs per fold, forwarded to ``train_model``.

    Returns:
        dict: The hyperparameter combination with the lowest mean validation
        loss.

    Raises:
        RuntimeError: If no combination could be selected (empty grid, or
            every mean validation loss was NaN/inf).
    """
    best_hyperparams = None
    best_val_loss = float('inf')
    best_metrics = None

    skf = StratifiedKFold(n_splits=num_folds)

    # Same order as the scores returned by train_model after val_loss.
    metric_names = (
        'accuracy', 'precision', 'sensitivity',
        'male_accuracy', 'male_precision', 'male_sensitivity',
        'female_accuracy', 'female_precision', 'female_sensitivity',
    )

    for params in ParameterGrid(param_grid):
        print(f"Testing hyperparameters: {params}")

        fold_metrics = {name: [] for name in metric_names}
        fold_val_losses = []

        for fold, (train_index, val_index) in enumerate(skf.split(X_train_scaled, y_train)):
            print(f'Fold {fold+1}/{num_folds} with params: {params}')

            train_subset = Subset(train_dataset, train_index)
            val_subset = Subset(train_dataset, val_index)
            train_loader = DataLoader(train_subset, batch_size=128, shuffle=True)
            val_loader = DataLoader(val_subset, batch_size=128, shuffle=False)

            model = LossLabelMLP(input_size=X_train_scaled.shape[1], hidden_size1=30, hidden_size2=10, output_size=2, dropout_rate=params['dropout_rate'])
            criterion = nn.CrossEntropyLoss()
            optimizer = optim.Adam(model.parameters(), lr=params['lr'])

            val_loss, *scores = train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs)

            fold_val_losses.append(val_loss)
            for name, score in zip(metric_names, scores):
                fold_metrics[name].append(score)

        # Average over every fold. Previously only the last fold's loss was
        # used, so selection ignored the other folds.
        avg_val_loss = np.mean(fold_val_losses)
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            best_hyperparams = params
            best_metrics = fold_metrics

        # Print the metrics for each combination of hyperparameters at the end of 10 folds
        print(f'Hyperparameters: {params}')
        for metric in fold_metrics:
            print(f'{metric}: Mean = {np.mean(fold_metrics[metric]):.4f}, Std = {np.std(fold_metrics[metric]):.4f}')

    if best_hyperparams is None:
        raise RuntimeError(
            "No hyperparameter combination could be selected: the grid was empty "
            "or every mean validation loss was NaN/inf"
        )

    # Print the best hyperparameters and corresponding metrics
    print("Best Hyperparameters:", best_hyperparams)
    for metric in best_metrics:
        print(f'{metric}: Mean = {np.mean(best_metrics[metric]):.4f}, Std = {np.std(best_metrics[metric]):.4f}')

    return best_hyperparams

# Perform cross-validation and find the best hyperparameters
param_grid = {'lr': [0.001, 0.005, 0.01], 'dropout_rate': [0, 0.2, 0.5]}
# ids_train is used elsewhere in this notebook as one patient ID per training
# sample (bootstrap_sample indexes the dataset with it), so the number of
# distinct patients is the count of unique values, not the array length.
num_unique_ids = len(np.unique(ids_train))

best_hyperparams = cross_validate_and_tune(train_dataset, num_unique_ids, param_grid, num_folds=10, num_epochs=10)

In [None]:
# Function to perform bootstrap sampling
def bootstrap_sample(dataset, patient_ids, proportion=0.8, rng=None):
    """Draw a per-patient bootstrap resample of ``dataset``.

    For every distinct value in ``patient_ids``, ``int(proportion * n)`` of
    that patient's ``n`` positions are drawn with replacement.

    Args:
        dataset: Indexable dataset; position ``i`` must correspond to
            ``patient_ids[i]``.
        patient_ids: One patient identifier per dataset position (list,
            NumPy array, pandas Series, ...).
        proportion: Fraction of each patient's samples to draw. Patients for
            which ``int(proportion * n)`` is 0 contribute no samples.
        rng: Optional ``numpy.random.Generator`` (or ``RandomState``) used for
            the draws, so a resample can be reproduced. When ``None`` the
            global ``np.random`` state is used, exactly as before.

    Returns:
        torch.utils.data.Subset: ``dataset`` restricted to the sampled
        positions (duplicates included), grouped by patient in ``np.unique``
        order.
    """
    # Coerce to an array so that `== pid` is always element-wise; with a plain
    # Python list the comparison collapses to a single bool and no position
    # would be selected.
    patient_ids = np.asarray(patient_ids)
    sampler = np.random if rng is None else rng

    sampled_indices = []
    for pid in np.unique(patient_ids):
        pid_indices = np.flatnonzero(patient_ids == pid)
        sample_size = int(proportion * len(pid_indices))
        drawn = sampler.choice(pid_indices, size=sample_size, replace=True)
        # Plain Python ints are accepted by every dataset's __getitem__,
        # NumPy integer scalars are not guaranteed to be.
        sampled_indices.extend(drawn.tolist())
    return Subset(dataset, sampled_indices)

# Function to evaluate the model
def evaluate_model(model, dataloader, criterion):
    """Score ``model`` on ``dataloader`` overall and per gender code.

    The model is switched to eval mode and run without gradient tracking.
    Predictions are the arg-max over dimension 1 of the model output.

    Args:
        model: Module called as ``model(features, loss_labels)``.
        dataloader: Yields 6-tuples ``(features, labels, loss_labels, gender,
            patient_id, sample_weight)``; the last two are not used here.
        criterion: Accepted for signature compatibility; not used.

    Returns:
        dict: ``accuracy``, ``precision`` and ``sensitivity`` for all samples,
        plus the same three metrics prefixed with ``male_`` (gender == 0) and
        ``female_`` (gender == 1).
    """
    model.eval()
    label_buf, pred_buf, gender_buf = [], [], []
    with torch.no_grad():
        for features, labels, loss_labels, gender, _patient_id, _sample_weight in dataloader:
            logits = model(features, loss_labels)
            batch_preds = torch.max(logits, 1).indices
            label_buf.extend(labels.cpu().numpy())
            pred_buf.extend(batch_preds.cpu().numpy())
            gender_buf.extend(gender.cpu().numpy())

    y_true = np.array(label_buf)
    y_pred = np.array(pred_buf)
    genders = np.array(gender_buf)

    # Overall first, then the two gender subgroups, so the resulting dict keeps
    # the accuracy / precision / sensitivity ordering within each group.
    groups = (
        ('', slice(None)),
        ('male_', genders == 0),
        ('female_', genders == 1),
    )
    report = {}
    for prefix, selector in groups:
        truth = y_true[selector]
        guess = y_pred[selector]
        report[prefix + 'accuracy'] = accuracy_score(truth, guess)
        report[prefix + 'precision'] = precision_score(truth, guess, average='binary', zero_division=0)
        report[prefix + 'sensitivity'] = recall_score(truth, guess, average='binary', zero_division=0)
    return report

# Bootstrap sampling and training the best model
num_bootstrap_samples = 5
# One list per metric, ordered overall -> male -> female.
bootstrap_results = {
    f'{group}{score}': []
    for group in ('', 'male_', 'female_')
    for score in ('accuracy', 'precision', 'sensitivity')
}

for _ in range(num_bootstrap_samples):
    # Bootstrap sampling 80% of data points for each patient
    bootstrap_subset = bootstrap_sample(train_dataset, ids_train, proportion=0.8)
    bootstrap_loader = DataLoader(bootstrap_subset, batch_size=128, shuffle=True)

    # Fresh model / optimizer per bootstrap round, using the tuned hyperparameters
    model = LossLabelMLP(
        input_size=X_train_scaled.shape[1],
        hidden_size1=30,
        hidden_size2=10,
        output_size=2,
        dropout_rate=best_hyperparams['dropout_rate'],
    )
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=best_hyperparams['lr'])

    # The unshuffled test loader serves both as the per-epoch monitoring set
    # during training and as the final evaluation set.
    test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False)

    # Train with the best hyperparameters
    train_model(model, bootstrap_loader, test_loader, criterion, optimizer)

    # Evaluate on the full test dataset
    results = evaluate_model(model, test_loader, criterion)

    for metric in bootstrap_results:
        bootstrap_results[metric].append(results[metric])

# Print bootstrap results
for metric in bootstrap_results:
    print(f'{metric.capitalize()}: Mean = {np.mean(bootstrap_results[metric]):.4f}, Std = {np.std(bootstrap_results[metric]):.4f}')