In [None]:
import pandas as pd

# Load the previously encoded dataset from its pickle file.
df_encoded = pd.read_pickle("df_encoded_files/df_encoded.pkl")

# Store every boolean column as an integer column (True -> 1, False -> 0).
bool_columns = df_encoded.select_dtypes(include=['bool']).columns
df_encoded = df_encoded.astype(dict.fromkeys(bool_columns, int))

# Race indicator columns that may have been loaded as object dtype
# (for example "True"/"False" strings) instead of 0/1 integers.
columns_to_convert = [
    'PRIMARY_RACE_American Indian / Native American',
    'PRIMARY_RACE_Asian',
    'PRIMARY_RACE_Asian Indian',
    'PRIMARY_RACE_Black / African American',
    'PRIMARY_RACE_Middle Eastern',
    'PRIMARY_RACE_Native Hawaiian / Pacific Islander',
    'PRIMARY_RACE_White'
]

# Only touch the columns that are actually present in this dataset.
existing_columns = [col for col in columns_to_convert if col in df_encoded.columns]

# Normalise the indicators to integer 0/1 BEFORE anything sums or filters on them.
# The lookup also accepts values that are already numeric: a column that was bool
# and has already been cast to int stringifies to "0"/"1", which a true/false-only
# mapping would miss and silently turn into all zeros.
# Anything unrecognised (including missing values) is treated as 0.
flag_lookup = {'false': 0, 'true': 1, '0': 0, '1': 1, '0.0': 0, '1.0': 1}
df_encoded[existing_columns] = df_encoded[existing_columns].apply(
    lambda col: col.astype(str).str.strip().str.lower().map(flag_lookup).fillna(0).astype(int)
)

print("✅ Specified object-type columns successfully converted to 0 and 1.")
print(df_encoded[existing_columns].dtypes)

# A row with no race indicator set is flagged as "Unknown".
# This runs after the normalisation above so the row sum is a numeric count.
# NOTE(review): race_columns is every column starting with "PRIMARY_RACE" except the
# Unknown flag; any such column not listed in columns_to_convert is summed as-is.
race_columns = [col for col in df_encoded.columns if col.startswith("PRIMARY_RACE") and col != "PRIMARY_RACE_Unknown"]
df_encoded.loc[df_encoded[race_columns].sum(axis=1) == 0, "PRIMARY_RACE_Unknown"] = 1

# Index labels of the rows whose ethnicity / race is flagged as unknown.
unknown_ethnicity_list = df_encoded.index[df_encoded['PRIMARY_ETHNICITY_Unknown'] == 1].tolist()
unknown_race_list = df_encoded.index[df_encoded['PRIMARY_RACE_Unknown'] == 1].tolist()

# Store missing RPL_THEME1 values as 0; the imputation cell later selects rows where
# RPL_THEME1 is 0 (or NaN) as the ones to impute.
# Assign the result back instead of calling fillna(inplace=True) on the selected
# column: the chained in-place form acts on a temporary object under pandas
# Copy-on-Write and would leave df_encoded unchanged.
df_encoded["RPL_THEME1"] = df_encoded["RPL_THEME1"].fillna(0)

✅ Specified object-type columns successfully converted to 0 and 1.
PRIMARY_RACE_American Indian / Native American     int64
PRIMARY_RACE_Asian                                 int64
PRIMARY_RACE_Asian Indian                          int64
PRIMARY_RACE_Black / African American              int64
PRIMARY_RACE_Middle Eastern                        int64
PRIMARY_RACE_Native Hawaiian / Pacific Islander    int64
PRIMARY_RACE_White                                 int64
dtype: object


In [None]:
from sklearn.impute import KNNImputer

# Vital-sign summary columns whose gaps are filled with a k-nearest-neighbours imputer.
columns_to_impute = [
    # BMI / height / pulse / weight: minimum, maximum and mean
    "min_BMI", "min_HEIGHT", "min_PULSE", "min_WEIGHT",
    "max_BMI", "max_HEIGHT", "max_PULSE", "max_WEIGHT",
    "mean_BMI", "mean_HEIGHT", "mean_PULSE", "mean_WEIGHT",
    # blood pressure: minimum, maximum and mean
    "SYSTOLIC_BP_min", "SYSTOLIC_BP_max", "SYSTOLIC_BP_mean",
    "DIASTOLIC_BP_min", "DIASTOLIC_BP_max", "DIASTOLIC_BP_mean",
]

# Run the imputer on a working copy so df_encoded is only changed by the final step.
knn_imputer = KNNImputer(n_neighbors=5)
df_knn_impute = df_encoded.copy()
vitals_filled = knn_imputer.fit_transform(df_knn_impute[columns_to_impute])
df_knn_impute[columns_to_impute] = vitals_filled

# Keep every value that was already present; take the imputed value only where
# the original cell was missing.
vitals_original = df_encoded[columns_to_impute]
df_encoded[columns_to_impute] = vitals_original.where(
    df_encoded.notna(),
    df_knn_impute[columns_to_impute],
)

In [None]:
import torch
import numpy as np
import pandas as pd
import torch
import torch.nn as nn

import warnings
import torch

warnings.simplefilter(action='ignore', category=FutureWarning)


class INA_Autoencoder(nn.Module):
    """Symmetric, undercomplete autoencoder built from fully connected layers.

    The encoder narrows the input through three Linear -> ReLU -> Dropout(0.2)
    stages of width 75%, 60% and 50% of ``input_dim`` (each truncated to an int).
    The decoder mirrors those widths back up to ``input_dim`` and finishes with
    a Sigmoid, so every output value lies between 0 and 1.

    Args:
        input_dim: Number of features in each input row.
    """

    def __init__(self, input_dim):
        super().__init__()

        # Layer widths, all smaller than the input (undercomplete representation).
        width_outer = int(input_dim * 0.75)
        width_middle = int(input_dim * 0.60)
        width_bottleneck = int(input_dim * 0.5)

        def dense_stage(n_in, n_out):
            # One hidden stage: affine map, ReLU, then 20% dropout.
            return [nn.Linear(n_in, n_out), nn.ReLU(), nn.Dropout(0.2)]

        # Encoder: input -> outer -> middle -> bottleneck.
        # The module order fixes the state_dict key indices, so it must not change.
        self.encoder = nn.Sequential(
            *dense_stage(input_dim, width_outer),
            *dense_stage(width_outer, width_middle),
            *dense_stage(width_middle, width_bottleneck),
        )

        # Decoder: bottleneck -> middle -> outer -> input, squashed by a Sigmoid.
        self.decoder = nn.Sequential(
            *dense_stage(width_bottleneck, width_middle),
            *dense_stage(width_middle, width_outer),
            nn.Linear(width_outer, input_dim),
            nn.Sigmoid(),
        )

    @staticmethod
    def _report_nan(tensor, message):
        """Print ``message`` when ``tensor`` contains at least one NaN."""
        if torch.isnan(tensor).any():
            print(message)

    def forward(self, x):
        """Encode and decode ``x``, printing a notice if NaNs show up at any stage.

        Args:
            x: Input tensor whose last dimension equals ``input_dim``.

        Returns:
            The reconstruction produced by the decoder.
        """
        self._report_nan(x, "NaN detected in input")

        latent = self.encoder(x)
        self._report_nan(latent, "NaN detected in encoding")

        reconstruction = self.decoder(latent)
        self._report_nan(reconstruction, "NaN detected in output")

        return reconstruction


# Columns left out of the autoencoder's feature matrix: the "_Unknown" indicator
# columns (and, below, the 'ID' column). They are never modified in df_encoded.
unknown_columns = [
    'PRIMARY_ETHNICITY_Unknown',
    'PRIMARY_RACE_Unknown',
    'LANGUAGE_Unknown',
    'D_Insur_at_pull_Unknown',
]

# Build the feature matrix as float64 up front. The reconstructed rows written back
# below are floats; writing them into integer columns relies on silent upcasting,
# which pandas has deprecated and which the FutureWarning filter would otherwise hide.
df_impute = (
    df_encoded
    .drop(columns=unknown_columns + ['ID'], errors='ignore')
    .astype("float64")
)

# Path of the trained I-NAA checkpoint.
model_path = "best_inaa_models/best_INAA_MAR_30.pth"

# Resolve the device once and reuse it for loading, the model and the inputs.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# The network's input size must equal the number of feature columns in df_impute,
# otherwise load_state_dict rejects the checkpoint.
input_dim = df_impute.shape[1]
model = INA_Autoencoder(input_dim)
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a trusted
# source; consider weights_only=True if the installed torch version supports it.
model.load_state_dict(torch.load(model_path, map_location=device))
model.to(device)
model.eval()  # disables dropout for inference

# Rows to impute: RPL_THEME1 is NaN or 0 (0 is the placeholder written for missing values).
# NOTE(review): a genuine RPL_THEME1 of exactly 0 is also selected and overwritten - confirm.
missing_rows = (df_encoded['RPL_THEME1'] == 0) | df_encoded['RPL_THEME1'].isna()
input_data = torch.tensor(df_impute.loc[missing_rows].values, dtype=torch.float32).to(device)

with torch.no_grad():
    imputed_data = model(input_data).cpu().numpy()  # Move back to CPU

# Replace the selected rows of the working frame with the model's reconstruction.
df_impute.loc[missing_rows, df_impute.columns] = imputed_data

print("Imputation complete! _Unknown columns are preserved.")

# Only the reconstructed RPL_THEME1 value is copied back into the main dataset.
df_encoded.loc[missing_rows, "RPL_THEME1"] = df_impute.loc[missing_rows, "RPL_THEME1"]

In [None]:
# Persist the fully imputed dataset as a pickle in the working directory.
imputed_output_path = "df_encoded_imputed_final.pkl"
df_encoded.to_pickle(imputed_output_path)