In [2]:
import pyro
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from pyro.infer import SVI, Trace_ELBO
from pyro.optim import ClippedAdam
from pyro_mixture_model import MixtureModel, MixtureModelGuide

In [4]:
# Load the dataset from a Parquet file; the path is relative to the
# notebook's working directory.
df = pd.read_parquet("data/CaseRigshospitalet_summed.parquet")


In [5]:
def summarize_columns(df):
    """Print a per-column overview of ``df`` followed by its total size.

    The ``embedding`` column is left out of both the table and the total.
    For every remaining column the table lists its name, dtype, number of
    distinct values (as counted by ``Series.unique``) and deep memory usage
    in whole mebibytes (floor division). The total is printed as a
    fractional number of mebibytes.

    Args:
        df: DataFrame to describe.

    Returns:
        None. The summary is written to stdout.
    """
    bytes_per_mb = 1024 ** 2
    kept_columns = [name for name in df.columns if name != "embedding"]

    # One record per summarized column, in the frame's column order.
    records = []
    for name in kept_columns:
        series = df[name]
        records.append((
            name,
            series.dtype,
            len(series.unique()),
            series.memory_usage(deep=True) // bytes_per_mb,
        ))

    overview = pd.DataFrame(records, columns=['name', 'dtype', 'unique', 'size (MB)'])
    print(overview)

    total_mb = df[kept_columns].memory_usage(deep=True).sum() / bytes_per_mb
    print('Total size (excluding "embedding"):', total_mb, 'MB')

# Print the per-column summary and total memory footprint of the loaded frame
summarize_columns(df)


                           name     dtype  unique  size (MB)
0                    Patient ID  category  325666         30
1           Aktionsdiagnosekode  category    8125          1
2  totalDiagnoseKontaktVarighed   float32    7831          1
3                antalKontakter     int64     142          3
4                antalDiagnoser     int64      17          3
5                         alder   Float64   13870          4
6                        gender  category       4          0
7                    civilStand  category       9          0
8          distanceToHospitalKM   float64    1688          3
Total size (excluding "embedding"): 50.16899013519287 MB


In [6]:
def make_demographic_features(df):
    """Build a float32 feature matrix from the demographic columns of ``df``.

    ``gender`` and ``civilStand`` are one-hot encoded with the first level
    dropped. The output columns are ``alder``, ``distanceToHospitalKM`` and
    then every column of the encoded frame whose name starts with
    ``gender_`` or ``civilStand_``, in frame order. Any selected column that
    is not numeric is reported on stdout and coerced with ``pd.to_numeric``
    (unparseable entries become NaN); all missing values are then replaced
    by 0.

    Args:
        df: DataFrame containing at least ``alder``, ``distanceToHospitalKM``,
            ``gender`` and ``civilStand``. The caller's frame is not modified
            because encoding produces a new frame.

    Returns:
        ``np.ndarray`` of dtype float32 with one row per row of ``df``.
    """
    encoded = pd.get_dummies(df, columns=["gender", "civilStand"], drop_first=True)

    # Continuous columns first, then the generated indicator columns.
    dummy_prefixes = ("gender_", "civilStand_")
    demographic_columns = ["alder", "distanceToHospitalKM"]
    demographic_columns.extend(
        name for name in encoded.columns if name.startswith(dummy_prefixes)
    )

    for name in demographic_columns:
        if pd.api.types.is_numeric_dtype(encoded[name]):
            continue
        print(f"Column {name} is not numeric. Converting to numeric.")
        # Invalid entries turn into NaN here and are zero-filled below.
        encoded[name] = pd.to_numeric(encoded[name], errors="coerce")

    features = encoded[demographic_columns].fillna(0)
    return features.values.astype(np.float32)


In [7]:
# Seed NumPy's global RNG so the row sample drawn below is reproducible
np.random.seed(42)

# The first row's embedding defines the reference length; every embedding is
# assumed to share it.
embedding_length = len(df["embedding"].iloc[0])


def _fix_embedding_length(vector):
    """Return ``vector`` unchanged if it has the reference length, else a zero list."""
    if len(vector) == embedding_length:
        return vector
    return [0.0] * embedding_length


# Replace wrongly sized embeddings with zeros, then stack into a 2D float32 array
df["embedding"] = df["embedding"].apply(_fix_embedding_length)
embeddings = np.array(df["embedding"].tolist(), dtype=np.float32)
embedding_tensor = torch.from_numpy(embeddings)

# One demographic feature row per data row
# (alder, distanceToHospitalKM, gender and civilStand indicators)
demographic_data = torch.from_numpy(make_demographic_features(df))

# Targets: contact duration as float32, contact count as int16
y_time = torch.from_numpy(df["totalDiagnoseKontaktVarighed"].values.astype(np.float32))
y_count = torch.from_numpy(df["antalKontakter"].values.astype(np.int16))

# Draw a random sample of rows without replacement
total_samples = 1000
all_indices = np.arange(len(embedding_tensor))
selected_indices = np.random.choice(all_indices, size=total_samples, replace=False)
df_subset = df.iloc[selected_indices]

# Split on patient ID rather than on rows, so that no patient ends up in both
# the training and the test set
unique_patient_ids = df_subset["Patient ID"].unique()
train_patient_ids, test_patient_ids = train_test_split(
    unique_patient_ids, test_size=0.2, random_state=42
)
sampled_patient_ids = df_subset["Patient ID"]
train_mask = sampled_patient_ids.isin(train_patient_ids)
test_mask = sampled_patient_ids.isin(test_patient_ids)

# Restrict every tensor to the sampled rows, then apply the boolean row masks
train_rows, test_rows = train_mask.values, test_mask.values
sampled_tensors = tuple(
    tensor[selected_indices]
    for tensor in (embedding_tensor, demographic_data, y_time, y_count)
)
x_emb_train, d_demo_train, v_time_train, a_count_train = (
    tensor[train_rows] for tensor in sampled_tensors
)
x_emb_test, d_demo_test, v_time_test, a_count_test = (
    tensor[test_rows] for tensor in sampled_tensors
)

In [13]:
# Start from an empty parameter store so that re-running this cell trains from
# scratch instead of silently resuming from parameters left in the kernel by
# an earlier run. Without this, the seed below does not make runs reproducible.
pyro.clear_param_store()
pyro.set_rng_seed(0)

# Setup SVI: ClippedAdam optimizer and a Trace_ELBO loss over the model/guide pair
optimizer = ClippedAdam({"lr": 1e-4})
svi = SVI(MixtureModel, MixtureModelGuide, optimizer, loss=Trace_ELBO())

# Example training loop
def train(num_steps=100000, log_every=1000):
    """Run SVI on the training tensors and return the loss history.

    Uses the module-level ``svi`` object and the training tensors
    ``x_emb_train``, ``d_demo_train``, ``v_time_train`` and ``a_count_train``.

    Before each step the Pyro parameter store is scanned for NaN values. If
    any parameter contains NaN, the affected parameters are printed once and
    training stops: further optimisation steps cannot recover from NaN
    parameters and would only repeat the same diagnostic on every step.

    Args:
        num_steps: Maximum number of SVI steps to take.
        log_every: Print the loss every ``log_every`` steps (step 0 included).

    Returns:
        List with the loss returned by ``svi.step`` for every completed step.
    """
    losses = []
    for step in range(num_steps):
        # Collect all offending parameters first so each is reported exactly once.
        nan_params = [
            (name, value)
            for name, value in pyro.get_param_store().items()
            if torch.isnan(value).any()
        ]
        if nan_params:
            for name, value in nan_params:
                print(f"Parameter {name} contains NaN values: {value}")
            print(f"Stopping training at step {step} because of NaN parameters.")
            break

        loss = svi.step(x_emb_train, d_demo_train, v_time_train, a_count_train)
        losses.append(loss)

        if step % log_every == 0:
            print(f"Step {step} : loss = {loss}")

    return losses

# Run the optimisation; lower the step count for a quick smoke test.
train(100000)  # adjust steps
# After training, inspect the pyro.param values for the cluster parameters and q_alpha


Step 0 : loss = 5442133063.051021
Step 1000 : loss = 97970580.06409946
Step 2000 : loss = 1153587331.6007795
Step 3000 : loss = 767687.1374959529
Step 4000 : loss = 152972324.3349263
Step 5000 : loss = 70246080.5972772
Step 6000 : loss = 842245.0994409717


KeyboardInterrupt: 