In [None]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

df_encoded = pd.read_pickle("df_encoded.pkl")

# Drop 'ID' column if it exists
id_column = "ID" if "ID" in df_encoded.columns else None
if id_column:
    df_encoded = df_encoded.drop(columns=[id_column])
    
"""Remove these columns:
CREATININE 
BMC_GLUCOSE 
BMC_ALT(SGPT) 
BMC_AST(SGOT)
"""

columns_to_remove = ['min_BMC_ALT(SGPT)', 'min_BMC_AST(SGOT)', 'min_BMC_GLUCOSE', 
    'min_CREATININE', 'max_BMC_ALT(SGPT)', 'max_BMC_AST(SGOT)', 
    'max_BMC_GLUCOSE', 'max_CREATININE', 'mean_BMC_ALT(SGPT)', 
    'mean_BMC_AST(SGOT)', 'mean_BMC_GLUCOSE', 'mean_CREATININE']

df_encoded = df_encoded.drop(columns=columns_to_remove, errors="ignore")
print("Columns removed:", columns_to_remove)


df_complete = df_encoded.copy()

# Remove rows where any of the specified variables have a 1
variables_to_check_1 = [
    'PRIMARY_RACE_Unknown', 
    'LANGUAGE_Unknown',
    'PRIMARY_ETHNICITY_Unknown',
    'D_Insur_at_pull_Unknown'
]

df_complete = df_complete[~df_complete[variables_to_check_1].eq(1).any(axis=1)]



variables_to_check_0 = [
    
    'min_BMI', 'min_HEIGHT', 'min_PULSE', 'min_WEIGHT', 
    'max_BMI', 'max_HEIGHT', 'max_PULSE', 'max_WEIGHT', 
    'mean_BMI', 'mean_HEIGHT', 'mean_PULSE', 'mean_WEIGHT',
    'SYSTOLIC_BP_min', 'SYSTOLIC_BP_max', 'SYSTOLIC_BP_mean',
    'DIASTOLIC_BP_min', 'DIASTOLIC_BP_max', 'DIASTOLIC_BP_mean'
]

df_complete=  df_complete[~df_complete[variables_to_check_0].eq(0).any(axis=1)]

df_complete = df_complete.drop(columns=variables_to_check_1, errors='ignore')
df_complete = df_complete.astype({col: int for col in df_complete.select_dtypes(include=['bool']).columns})
print(df_complete.dtypes)


scaler = MinMaxScaler()

# Normalize all features
df_complete = pd.DataFrame(scaler.fit_transform(df_complete), columns=df_complete.columns)

print(f"Number of rows in df_encoded: {len(df_encoded)}")
print(f"Number of rows in df_complete: {len(df_complete)}")


Columns removed: ['min_BMC_ALT(SGPT)', 'min_BMC_AST(SGOT)', 'min_BMC_GLUCOSE', 'min_CREATININE', 'max_BMC_ALT(SGPT)', 'max_BMC_AST(SGOT)', 'max_BMC_GLUCOSE', 'max_CREATININE', 'mean_BMC_ALT(SGPT)', 'mean_BMC_AST(SGOT)', 'mean_BMC_GLUCOSE', 'mean_CREATININE']
demo_age        int64
RPL_THEME1    float64
GENDER_F        int64
GENDER_M        int64
GENDER_U        int64
               ...   
F41.8           int64
F41.0           int64
F33.2           int64
F20.89          int64
F34.1           int64
Length: 163, dtype: object
Number of rows in df_encoded: 5664
Number of rows in df_complete: 4205


In [None]:
continuous_columns = []
binary_columns = []

# Iterate over each column in the DataFrame
for column in df_complete.columns:
    unique_values = df_complete[column].dropna().unique()
    
    if len(unique_values) == 2:
        binary_columns.append(column)
    elif pd.api.types.is_numeric_dtype(df_complete[column]) and len(unique_values) > 2:
        continuous_columns.append(column)

print("Continuous Columns:", continuous_columns)
print("Binary Columns:", binary_columns)

binary_indices = [df_complete.columns.get_loc(col) for col in binary_columns]
continuous_indices = [df_complete.columns.get_loc(col) for col in continuous_columns]


Continuous Columns: ['demo_age', 'RPL_THEME1', 'min_CLOZAPINE', 'min_OLANZAPINE', 'min_RISPERIDONE', 'max_CLOZAPINE', 'max_OLANZAPINE', 'max_RISPERIDONE', 'mean_CLOZAPINE', 'mean_OLANZAPINE', 'mean_RISPERIDONE', 'min_BMI', 'min_HEIGHT', 'min_PULSE', 'min_WEIGHT', 'max_BMI', 'max_HEIGHT', 'max_PULSE', 'max_WEIGHT', 'mean_BMI', 'mean_HEIGHT', 'mean_PULSE', 'mean_WEIGHT', 'SYSTOLIC_BP_min', 'SYSTOLIC_BP_max', 'SYSTOLIC_BP_mean', 'DIASTOLIC_BP_min', 'DIASTOLIC_BP_max', 'DIASTOLIC_BP_mean']
Binary Columns: ['GENDER_F', 'GENDER_M', 'GENDER_U', 'PRIMARY_RACE_American Indian / Native American', 'PRIMARY_RACE_Asian', 'PRIMARY_RACE_Asian Indian', 'PRIMARY_RACE_Black / African American', 'PRIMARY_RACE_Hispanic or Latino', 'PRIMARY_RACE_Middle Eastern', 'PRIMARY_RACE_Native Hawaiian / Pacific Islander', 'PRIMARY_RACE_Other', 'PRIMARY_RACE_White', 'LANGUAGE_Afrikaans', 'LANGUAGE_Albanian', 'LANGUAGE_American Sign Language', 'LANGUAGE_American Sign language & Certified Deaf Interpreter', 'LANGUAGE_A

In [21]:
# Calculate and display variance of each feature in df_encoded
feature_variance = df_complete.var()
high_variance_features = {feature: var for feature, var in feature_variance.items() if var < 0.01}
high_variance_features_list = list(high_variance_features.keys())

# print(high_variance_features_list)
# Function to print entire dataframe
def print_full_dataframe(df):
    with pd.option_context('display.max_rows', None, 'display.max_columns', None):
        print(df)
    

# Example usage
print_full_dataframe(high_variance_features)

print(f'Number of all features: {len(df_complete.columns.tolist())}')
print(f'Number of all features with variance > 0.01: {len(high_variance_features_list)}')



{'GENDER_U': 0.00023781212841854937, 'PRIMARY_RACE_American Indian / Native American': 0.005205732381028882, 'PRIMARY_RACE_Asian Indian': 0.004027419670524987, 'PRIMARY_RACE_Middle Eastern': 0.006146346099236216, 'PRIMARY_RACE_Native Hawaiian / Pacific Islander': 0.0007130969768896847, 'LANGUAGE_Afrikaans': 0.00023781212841854937, 'LANGUAGE_Albanian': 0.0016623090403680998, 'LANGUAGE_American Sign Language': 0.0004755111207151106, 'LANGUAGE_American Sign language & Certified Deaf Interpreter': 0.0004755111207151106, 'LANGUAGE_Amharic / Ethiopia': 0.0016623090403680995, 'LANGUAGE_Arabic': 0.0014251757286814775, 'LANGUAGE_Bassa / Liberia': 0.00023781212841854932, 'LANGUAGE_Bengali / Hindi / Urdu': 0.0, 'LANGUAGE_Bosnian / Croatian / Yulo': 0.0, 'LANGUAGE_Brazilian Portuguese': 0.0007130969768896846, 'LANGUAGE_Chinese / Cantonese': 0.0007130969768896847, 'LANGUAGE_Chinese / Mandarin': 0.0004755111207151106, 'LANGUAGE_French': 0.00047551112071511056, 'LANGUAGE_Fulani / Cameroon': 0.0002378

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold
from pyampute.ampute import MultivariateAmputation
from sklearn.metrics import mean_squared_error, log_loss
from tqdm import tqdm


import warnings
warnings.filterwarnings("ignore")

# Ensure reproducibility
np.random.seed(42)
torch.manual_seed(42)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")  # Check if using GPU

missingness_levels = [0.1, 0.2, 0.3, 0.4, 0.5]
n_repeats = 5
n_splits = 5  

# Store results
results = []

# ðŸ”¹ Define NAA Model
class NAAutoencoder(nn.Module):
    def __init__(self, input_dim):
        super(NAAutoencoder, self).__init__()

        hidden_dim1 = int(input_dim * 1.5)  
        hidden_dim2 = int(input_dim * 2)    
        hidden_dim3 = int(input_dim * 2.5) 

        # Encoder
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, hidden_dim1),
            nn.Sigmoid(),
            nn.Linear(hidden_dim1, hidden_dim2),
            nn.ReLU(),
            nn.Linear(hidden_dim2, hidden_dim3),
            nn.ReLU()
        )

        # Decoder
        self.decoder = nn.Sequential(
            nn.Linear(hidden_dim3, hidden_dim2),
            nn.ReLU(),
            nn.Linear(hidden_dim2, hidden_dim1),
            nn.ReLU(),
            nn.Linear(hidden_dim1, input_dim),
            nn.Sigmoid()
        )

    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded

class RMSELoss(nn.Module):
    def __init__(self):
        super(RMSELoss, self).__init__()

    def forward(self, outputs, targets):
        return torch.sqrt(torch.mean((outputs - targets) ** 2))  # RMSE

def train_naa(train_data, test_data, epochs=50, batch_size=32, learning_rate=0.001):
    knn_imputer = KNNImputer(n_neighbors=5)  # Pre-imputation step
    train_data_knn = knn_imputer.fit_transform(train_data)
    test_data_knn = knn_imputer.transform(test_data)

    train_tensor = torch.tensor(train_data_knn, dtype=torch.float32).to(device)
    test_tensor = torch.tensor(test_data_knn, dtype=torch.float32).to(device)

    train_loader = DataLoader(TensorDataset(train_tensor), batch_size=batch_size, shuffle=True)

    input_dim = train_data.shape[1]
    model = NAAutoencoder(input_dim).to(device)  
    criterion = RMSELoss().to(device)  
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    model.train()
    for epoch in range(epochs):
        for batch in train_loader:
            batch_data = batch[0].to(device)  
            optimizer.zero_grad()
            output = model(batch_data)
            loss = criterion(output, batch_data)
            loss.backward()
            optimizer.step()


    model.eval()
    with torch.no_grad():
        reconstructed_test_data = model(test_tensor).cpu().numpy()  

    missing_mask = np.isnan(test_data)  

    # Replace only missing values with imputed values while keeping original non-missing values
    imputed_test_data = test_data.copy()
    imputed_test_data[missing_mask] = reconstructed_test_data[missing_mask]

    return imputed_test_data

def missforest_imputer():
    return IterativeImputer(
        estimator=RandomForestRegressor(n_estimators=10, random_state=42),
        max_iter=10,
        random_state=42
    )


for missing_prob in missingness_levels:
    results = {"MissForest": []}

    results_per_n = []
    for _ in range(n_repeats):
        amputer = MultivariateAmputation(patterns=[{"mechanism": "MCAR", "prob": missing_prob, "incomplete_vars": high_variance_features_list}])
        df_missing = amputer.fit_transform(df_complete)

        id_column = "ID" if "ID" in df_missing.columns else None
        df_features = df_missing.drop(columns=[id_column]) if id_column else df_missing

        kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
        
        for train_index, test_index in kf.split(df_features):
            train_df, test_df = df_features.iloc[train_index], df_features.iloc[test_index]
            train_complete, test_complete = df_complete.iloc[train_index], df_complete.iloc[test_index]

            data_range = test_complete.max() - test_complete.min()
            data_range[data_range == 0] = 1  # Avoid division by zero

            imputers = {
                "MissForest": missforest_imputer()
            }

            rmse_results = {imputer_name: [] for imputer_name in imputers.keys()}
            bce_results = {imputer_name: [] for imputer_name in imputers.keys()}
            combined_results = {imputer_name: [] for imputer_name in imputers.keys()}



            for imputer_name, imputer in imputers.items():
                imputed_train = pd.DataFrame(imputer.fit_transform(train_df), columns=train_df.columns)
                imputed_test = pd.DataFrame(imputer.transform(test_df), columns=test_df.columns)

                # Convert to NumPy arrays
                test_complete_np = test_complete.to_numpy()
                imputed_test_np = imputed_test.to_numpy()

                missing_mask = test_df.isnull().to_numpy()

                # Evaluate RMSE for Continuous Variables
                continuous_rmse = []
                for col in continuous_columns:
                    col_index = train_df.columns.get_loc(col)
                    col_mask = missing_mask[:, col_index]

                    # Skip if no imputed values for this column
                    if np.sum(col_mask) == 0:
                        continue
                    rmse = mean_squared_error(
                        test_complete_np[col_mask, col_index],
                        imputed_test_np[col_mask, col_index],
                        squared=False
                    )
                    continuous_rmse.append(rmse)
                
                # Average RMSE across all continuous columns
                avg_rmse = np.mean(continuous_rmse)
                rmse_results[imputer_name].append(avg_rmse)
                
                # Evaluate BCE for binary variables
                binary_bce = []
                for col in binary_columns:
                    col_index = train_df.columns.get_loc(col)
                    col_mask = missing_mask[:, col_index]

                    if np.sum(col_mask) == 0:
                        continue
                    
                    pred_probs = np.clip(imputed_test_np[col_mask, col_index], 1e-10, 1 - 1e-10)

                    # Checks if there are more than one unique value 
                    if len(np.unique(test_complete_np[col_mask, col_index])) > 1:
                        bce = log_loss(
                            test_complete_np[col_mask, col_index],
                            pred_probs
                        )
                        binary_bce.append(bce)
                    
                # Average BCE across all binary columns
                avg_bce = np.mean(binary_bce)
                bce_results[imputer_name].append(avg_bce)

                # Calculate Combined Metric
                combined_metric = avg_rmse + avg_bce
                combined_results[imputer_name].append(combined_metric)
        
        for method in rmse_results:
            mean_rmse_per_k_fold = np.mean(rmse_results[method])
            std_rmse_per_k_fold = np.std(rmse_results[method])
            
            mean_bce_per_k_fold = np.mean(bce_results[method])
            std_bce_per_k_fold = np.std(bce_results[method])
            
            mean_combined_per_k_fold = np.mean(combined_results[method])
            std_combined_per_k_fold = np.std(combined_results[method])
            
            results_per_n.append({
                "Missingness": missing_prob,
                "Method": method,
                "Mean RMSE": mean_rmse_per_k_fold,
                "Std RMSE": std_rmse_per_k_fold,
                "Mean BCE": mean_bce_per_k_fold,
                "Std BCE": std_bce_per_k_fold,
                "Mean Combined": mean_combined_per_k_fold,
                "Std Combined": std_combined_per_k_fold
            })

    df_results = pd.DataFrame(results_per_n)

    averaged_results = df_results.groupby(["Missingness", "Method"]).agg({
        "Mean RMSE": ["mean", "std"],
        "Mean BCE": ["mean", "std"],
        "Mean Combined": ["mean", "std"]
    }).reset_index()

    # Calculate Error Ranges
    averaged_results['Error Range RMSE'] = 1.96 * (averaged_results[('Mean RMSE', 'std')] / np.sqrt(n_repeats))
    averaged_results['Error Range BCE'] = 1.96 * (averaged_results[('Mean BCE', 'std')] / np.sqrt(n_repeats))
    averaged_results['Error Range Combined'] = 1.96 * (averaged_results[('Mean Combined', 'std')] / np.sqrt(n_repeats))

    averaged_results.columns = ['Missingness', 'Method', 
                                'Mean RMSE', 'Std RMSE', 'Error Range RMSE',
                                'Mean BCE', 'Std BCE', 'Error Range BCE',
                                'Mean Combined', 'Std Combined', 'Error Range Combined']

    print("="*50)
    print(f"Missingness Level: {missing_prob * 100}%")
    print(averaged_results)


TRAIN AND INFERENCE ON I-NAA MODEL 

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold
from pyampute.ampute import MultivariateAmputation
from sklearn.metrics import mean_squared_error, log_loss
from tqdm import tqdm
import random
import os


import warnings
warnings.filterwarnings("ignore")

np.random.seed(42)
torch.manual_seed(42)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")  # Check if using GPU

missingness_levels = [0.1, 0.2, 0.3]

n_repeats = 5  
n_splits = 5 

results = []

class INA_Autoencoder(nn.Module):
    def __init__(self, input_dim):
        super(INA_Autoencoder, self).__init__()

        # Undercomplete Representation
        hidden_dim1 = int(input_dim * 0.75)  
        hidden_dim2 = int(input_dim * 0.60)  
        hidden_dim3 = int(input_dim * 0.5) 
        

        # Encoder
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, hidden_dim1),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(hidden_dim1, hidden_dim2),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(hidden_dim2, hidden_dim3),
            nn.ReLU(),
            nn.Dropout(0.2)
        )

        # Decoder
        self.decoder = nn.Sequential(
            nn.Linear(hidden_dim3, hidden_dim2),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(hidden_dim2, hidden_dim1),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(hidden_dim1, input_dim),
            nn.Sigmoid() 
        )

    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded

class CustomLoss(nn.Module):
    def __init__(self, binary_indices, continuous_indices):
        super(CustomLoss, self).__init__()
        self.binary_indices = binary_indices
        self.continuous_indices = continuous_indices
        self.bce = nn.BCELoss() 

    def forward(self, outputs, targets):
        binary_outputs = outputs[:, self.binary_indices]
        binary_targets = targets[:, self.binary_indices]

        continuous_outputs = outputs[:, self.continuous_indices]
        continuous_targets = targets[:, self.continuous_indices]

        bce_loss = self.bce(binary_outputs, binary_targets)
        rmse_loss = torch.sqrt(torch.mean((continuous_outputs - continuous_targets) ** 2))
        total_loss = bce_loss + rmse_loss

        return total_loss

def train_inaa(train_data, test_data, binary_indices, continuous_indices, epochs=50, batch_size=32, learning_rate=0.001):
    k = random.randint(3, 8)
    knn_imputer = KNNImputer(n_neighbors=k)
    train_data_knn = knn_imputer.fit_transform(train_data)
    test_data_knn = knn_imputer.transform(test_data)

    train_tensor = torch.tensor(train_data_knn, dtype=torch.float32).to(device)
    test_tensor = torch.tensor(test_data_knn, dtype=torch.float32).to(device)

    train_loader = DataLoader(TensorDataset(train_tensor), batch_size=batch_size, shuffle=True)

    input_dim = train_data.shape[1]
    model = INA_Autoencoder(input_dim).to(device)  
    criterion = CustomLoss(binary_indices, continuous_indices).to(device)  
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    model.train()
    for epoch in range(epochs):
        running_loss = 0.0
        for batch in train_loader:
            batch_data = batch[0].to(device) 
            optimizer.zero_grad()
            output = model(batch_data)
            loss = criterion(output, batch_data)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

    model.eval()
    with torch.no_grad():
        reconstructed_test_data = model(test_tensor).cpu().numpy()  

    missing_mask = np.isnan(test_data)  # True where values were missing
    imputed_test_data = test_data.copy()
    imputed_test_data[missing_mask] = reconstructed_test_data[missing_mask]

    return imputed_test_data

class RMSELoss(nn.Module):
    def __init__(self):
        super(RMSELoss, self).__init__()

    def forward(self, outputs, targets):
        return torch.sqrt(torch.mean((outputs - targets) ** 2))  # RMSE


# Directory to save models
model_save_dir = "best_inaa_models"
os.makedirs(model_save_dir, exist_ok=True)

best_models = {}

for missing_prob in missingness_levels:
    print(f'Training on: {missing_prob * 100}%')
    best_combined_score = float('inf')  
    best_model_state = None

    rmse_results = {}
    results_per_n = []
    for _ in range(n_repeats):
        amputer = MultivariateAmputation(patterns=[{"mechanism": "MAR", "prob": missing_prob, "incomplete_vars": high_variance_features_list}])
        df_missing = amputer.fit_transform(df_complete)

        id_column = "ID" if "ID" in df_missing.columns else None
        df_features = df_missing.drop(columns=[id_column]) if id_column else df_missing

        kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
        
        for train_index, test_index in kf.split(df_features):
            train_df, test_df = df_features.iloc[train_index], df_features.iloc[test_index]
            train_complete, test_complete = df_complete.iloc[train_index], df_complete.iloc[test_index]

            data_range = test_complete.max() - test_complete.min()
            data_range[data_range == 0] = 1 

            rmse_results["INAA"] = []
            bce_results["INAA"] = []
            combined_results["INAA"] = []

            # Train INAA
            model = INA_Autoencoder(train_df.shape[1]).to(device)  # Initialize model
            imputed_test_inaa = train_inaa(train_df.values, test_df.values, binary_indices, continuous_indices)
            imputed_test_inaa_np = imputed_test_inaa
            missing_mask = test_df.isnull().to_numpy()

            continuous_rmse_inaa = []

            for col in continuous_columns:
                col_index = train_df.columns.get_loc(col)
                col_mask = missing_mask[:, col_index]
                if np.sum(col_mask) == 0:
                        continue

                rmse_inaa = mean_squared_error(
                    test_complete_np[col_mask, col_index],
                    imputed_test_inaa_np[col_mask, col_index],
                    squared=False
                )
         
                continuous_rmse_inaa.append(rmse_inaa)

            avg_rmse_inaa = np.mean(continuous_rmse_inaa)
            rmse_results["INAA"].append(avg_rmse_inaa)

            binary_bce_inaa = []
            for col in binary_columns:
                col_index = train_df.columns.get_loc(col)
                col_mask = missing_mask[:, col_index]

                if np.sum(col_mask) == 0:
                        continue
                
                pred_probs_inaa = np.clip(imputed_test_inaa_np[col_mask, col_index], 1e-10, 1 - 1e-10)
                
                if len(np.unique(test_complete_np[col_mask, col_index])) > 1:
                    bce_inaa = log_loss(
                        test_complete_np[col_mask, col_index],
                        pred_probs_inaa
                    )
                    binary_bce_inaa.append(bce_inaa)

            avg_bce_inaa = np.mean(binary_bce_inaa)
            bce_results["INAA"].append(avg_bce_inaa)

            combined_metric_inaa = np.sqrt((avg_rmse_inaa * avg_rmse_inaa) + (avg_bce_inaa * avg_bce_inaa))
            combined_results["INAA"].append(combined_metric_inaa)

            if combined_metric_inaa < best_combined_score:
                best_combined_score = combined_metric_inaa
                best_model_state = model.state_dict()

    # Save the best model for the missingness level
    if best_model_state:
        model_path = os.path.join(model_save_dir, f"best_INAA_MAR_{int(missing_prob*100)}.pth")
        torch.save(best_model_state, model_path)
        best_models[missing_prob] = model_path
        print(f"âœ… Best I-NAA model for {int(missing_prob*100)}% missingness saved at: {model_path}")   

        # Combine and Print Results
        for method in rmse_results:
            mean_rmse_per_k_fold = np.mean(rmse_results[method])
            std_rmse_per_k_fold = np.std(rmse_results[method])
            
            mean_bce_per_k_fold = np.mean(bce_results[method])
            std_bce_per_k_fold = np.std(bce_results[method])
            
            mean_combined_per_k_fold = np.mean(combined_results[method])
            std_combined_per_k_fold = np.std(combined_results[method])
            
            results_per_n.append({
                "Missingness": missing_prob,
                "Method": method,
                "Mean RMSE": mean_rmse_per_k_fold,
                "Std RMSE": std_rmse_per_k_fold,
                "Mean BCE": mean_bce_per_k_fold,
                "Std BCE": std_bce_per_k_fold,
                "Mean Combined": mean_combined_per_k_fold,
                "Std Combined": std_combined_per_k_fold
            })

    df_results = pd.DataFrame(results_per_n)

    averaged_results = df_results.groupby(["Missingness", "Method"]).agg({
        "Mean RMSE": ["mean", "std"],
        "Mean BCE": ["mean", "std"],
        "Mean Combined": ["mean", "std"]
    }).reset_index()

    averaged_results['Error Range RMSE'] = 1.96 * (averaged_results[('Mean RMSE', 'std')] / np.sqrt(n_repeats))
    averaged_results['Error Range BCE'] = 1.96 * (averaged_results[('Mean BCE', 'std')] / np.sqrt(n_repeats))
    averaged_results['Error Range Combined'] = 1.96 * (averaged_results[('Mean Combined', 'std')] / np.sqrt(n_repeats))

    averaged_results.columns = ['Missingness', 'Method', 
                                'Mean RMSE', 'Std RMSE', 'Error Range RMSE',
                                'Mean BCE', 'Std BCE', 'Error Range BCE',
                                'Mean Combined', 'Std Combined', 'Error Range Combined']


print("\nSummary of saved best I-NAA models per missingness level:")
for missing_prob, path in best_models.items():
    print(f"- {int(missing_prob*100)}% missingness: {path}")
    