In [1]:
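# --- Optional: pin SymPy to 1.12 ---
# NOTE(review): the two pip commands below are active (not commented out), so they
# run every time this cell is executed. The recorded output of this cell shows pip
# reporting that torch 2.8.0+cu126 requires sympy>=1.13.3, i.e. this pin conflicts
# with that torch build. Comment both lines out unless SymPy 1.12 is really required.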
!pip uninstall -y sympy
!pip install sympy==1.12

# ========================
# Standard Library
# ========================
import os
import random
from copy import deepcopy
from collections import defaultdict

# ========================
# Core Scientific Stack
# ========================
import numpy as np
import pandas as pd
from scipy import stats

# ========================
# Machine Learning Utilities
# ========================
from sklearn.model_selection import train_test_split

# ========================
# Deep Learning (PyTorch)
# ========================
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader

# ========================
# Visualization
# ========================
import matplotlib.pyplot as plt
import seaborn as sns  # used for heatmaps; remove if you want matplotlib-only

Found existing installation: sympy 1.13.3
Uninstalling sympy-1.13.3:
  Successfully uninstalled sympy-1.13.3
Collecting sympy==1.12
  Downloading sympy-1.12-py3-none-any.whl.metadata (12 kB)
Downloading sympy-1.12-py3-none-any.whl (5.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.7/5.7 MB[0m [31m46.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sympy
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torch 2.8.0+cu126 requires sympy>=1.13.3, but you have sympy 1.12 which is incompatible.[0m[31m
[0mSuccessfully installed sympy-1.12


In [2]:
def seed_all(seed: int = 42, deterministic: bool = True):
    """Seed the Python, NumPy and PyTorch random generators and set the cuDNN flags.

    Args:
        seed: Value passed to every seeding call and written, as a string, to
            the ``PYTHONHASHSEED`` environment variable.
        deterministic: If truthy, cuDNN is put in deterministic mode with
            benchmarking disabled; otherwise both flags are flipped.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Same seed for every generator: stdlib, NumPy, torch CPU, torch CUDA (current + all devices).
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # The two cuDNN switches are always set to opposite values.
    want_deterministic = bool(deterministic)
    torch.backends.cudnn.deterministic = want_deterministic
    torch.backends.cudnn.benchmark = not want_deterministic

In [3]:
# Re-seed every RNG with seed_all's defaults (seed=42, deterministic=True) at the top of this cell.
seed_all()

# === Categorical Mapping Utilities ===
def BinCat2Num(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with the known categorical columns integer-encoded.

    Only the columns listed below are touched, and only when they are present
    in ``df``; every other column is copied through unchanged.

    Args:
        df: Frame whose categorical columns hold the text labels listed below.

    Returns:
        A new DataFrame; the input is not modified.

    Raises:
        ValueError: If a handled column contains a value that has no code
            (missing values count as unmapped as well).
    """
    # The position of a label inside its list is the integer code it receives.
    ordered_labels = {
        "Gender": ["Male", "Female"],
        "Ethnic": ["Asian", "African", "Caucasian", "Other"],
        "Base Drug Combo": [
            "FTC + TDF", "3TC + ABC", "FTC + TAF",
            "DRV + FTC + TDF", "FTC + RTVB + TDF", "Other",
        ],
        "Comp. INI": ["DTG", "RAL", "EVG", "Not Applied"],
        "Comp. NNRTI": ["NVP", "EFV", "RPV", "Not Applied"],
        "Extra PI": ["DRV", "RTVB", "LPV", "RTV", "ATV", "Not Applied"],
        # Spelling is intentional here: it must match the label written by the data-loading cell.
        "Extra pk-En": ["Fasle", "True"],
    }

    encoded = df.copy()
    for column, labels in ordered_labels.items():
        if column not in encoded.columns:
            continue
        label_to_code = {label: code for code, label in enumerate(labels)}
        codes = encoded[column].map(label_to_code)
        unmapped = codes.isnull()
        if unmapped.any():
            bad_vals = encoded[column][unmapped].unique()
            raise ValueError(f"Unmapped values in column '{column}': {bad_vals}")
        encoded[column] = codes.astype(int)
    return encoded

def BinCat2Str(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with the known integer-coded columns turned back into labels.

    Only the columns listed below are touched, and only when present in ``df``.
    A code without a label is not an error here: ``Series.map`` leaves a
    missing value in its place.

    Args:
        df: Frame whose categorical columns hold integer codes.

    Returns:
        A new DataFrame; the input is not modified.
    """
    # The position of a label inside its list is the integer code it decodes from.
    ordered_labels = {
        "Gender": ["Male", "Female"],
        "Ethnic": ["Asian", "African", "Caucasian", "Other"],
        "Base Drug Combo": [
            "FTC + TDF", "3TC + ABC", "FTC + TAF",
            "DRV + FTC + TDF", "FTC + RTVB + TDF", "Other",
        ],
        "Comp. INI": ["DTG", "RAL", "EVG", "Not Applied"],
        "Comp. NNRTI": ["NVP", "EFV", "RPV", "Not Applied"],
        "Extra PI": ["DRV", "RTVB", "LPV", "RTV", "ATV", "Not Applied"],
        # Spelling is intentional here: it must match the label written by the data-loading cell.
        "Extra pk-En": ["Fasle", "True"],
    }

    decoded = df.copy()
    for column, labels in ordered_labels.items():
        if column not in decoded.columns:
            continue
        code_to_label = dict(enumerate(labels))
        decoded[column] = decoded[column].map(code_to_label)
    return decoded


In [4]:
# Re-seed every RNG with seed_all's defaults (seed=42, deterministic=True) at the top of this cell.
seed_all()

# === Box-Cox Transformation Parameters ===
def compute_boxcox_params(df: pd.DataFrame,
                          columns: list = ["VL", "CD4", "Rel CD4"],
                          eps: float = 1e-3) -> dict:
    """Fit a Box-Cox transform per column and record its normalisation constants.

    Args:
        df: Frame containing every column named in ``columns``.
        columns: Columns to fit. Missing values are dropped before fitting.
        eps: Constant added to the data before the fit.

    Returns:
        ``{column: {"lambda", "min", "max"}}`` where ``lambda`` is the fitted
        Box-Cox exponent and ``min`` is the minimum of the transformed data.
        Despite its name, ``max`` holds the *range* of the transformed data
        (maximum minus minimum), i.e. the divisor for min-max scaling.
    """
    def _fit_column(series: pd.Series) -> dict:
        # Shift first, then let SciPy pick lambda and transform in one call.
        shifted = series.dropna().astype(float) + eps
        transformed, fitted_lambda = stats.boxcox(shifted)
        lowest = transformed.min()
        return {
            "lambda": fitted_lambda,
            "min": lowest,
            "max": transformed.max() - lowest,
        }

    return {name: _fit_column(df[name]) for name in columns}

# === Apply Box-Cox + Min-Max Normalization ===
def apply_boxcox_minmax_transform(df: pd.DataFrame,
                                   params: dict,
                                   columns: list = ["VL", "CD4", "Rel CD4"],
                                   eps: float = 1e-3) -> pd.DataFrame:
    """Apply a pre-fitted Box-Cox transform followed by min-max scaling.

    For every column in ``columns`` the non-missing values are shifted by
    ``eps``, Box-Cox transformed with the stored lambda, then scaled as
    ``(value - params[col]["min"]) / params[col]["max"]``. Missing values stay
    missing and all other columns are copied through unchanged.

    Args:
        df: Input frame; it is not modified.
        params: Mapping ``{column: {"lambda", "min", "max"}}``. The ``"max"``
            entry is used as the divisor, so it is expected to hold the range
            of the transformed training data rather than its maximum.
        columns: Columns to transform; each must exist in ``df`` and ``params``.
        eps: Shift added before the transform; use the value used when fitting.

    Returns:
        A new DataFrame in which the transformed columns have float dtype.

    Raises:
        KeyError: If a column is missing from ``df`` or ``params``.
    """
    df_transformed = df.copy()
    for col in columns:
        col_params = params[col]
        # Cast to float *before* writing the results back. Writing floats into an
        # integer column through .loc relies on implicit upcasting, which pandas
        # has deprecated (FutureWarning today, an error in later releases).
        values = df_transformed[col].astype(float)
        mask = values.notna()
        shifted = values[mask].to_numpy() + eps
        boxcox_transformed = stats.boxcox(shifted, lmbda=col_params["lambda"])
        values.loc[mask] = (boxcox_transformed - col_params["min"]) / col_params["max"]
        # Assigning to an existing column keeps its position in the frame.
        df_transformed[col] = values
    return df_transformed

# === Inverse Box-Cox for PyTorch Tensors ===
def inverse_boxcox_torch(data: torch.Tensor, lmbda: float, eps: float = 1e-3) -> torch.Tensor:
    """Undo a Box-Cox transform that was applied to ``value + eps``.

    Args:
        data: Box-Cox transformed values.
        lmbda: Box-Cox exponent used in the forward transform.
        eps: Shift that was added before the forward transform; it is
            subtracted again from the result.

    Returns:
        Tensor of the same shape as ``data`` holding the restored values.
    """
    if lmbda == 0:
        # lambda == 0 is the pure log transform, so the inverse is exp.
        restored = torch.exp(data)
    else:
        # (lambda * y + 1) ** (1 / lambda), evaluated through log/exp.
        restored = torch.exp(torch.log(lmbda * data + 1) / lmbda)
    return restored - eps

# === Backtransform a Tensor to Original Values ===
def backtransform_art_tensor(tensor: torch.Tensor,
                              feature_names: list,
                              transform_params: dict,
                              real_columns: list = ["VL", "CD4", "Rel CD4"],
                              eps: float = 1e-3) -> pd.DataFrame:
    """Map the scaled real-valued features of a tensor back to original units.

    For every name in ``real_columns`` the min-max scaling is undone with
    ``value * p["max"] + p["min"]`` and the result is passed through
    ``inverse_boxcox_torch``. Other features are returned unchanged.

    Args:
        tensor: Values whose *last* axis enumerates the features in the order
            given by ``feature_names``. Any leading axes (for example batch
            and time) are flattened into rows of the returned frame.
        feature_names: Column names, one per entry of the last axis. Each name
            in ``real_columns`` must appear here exactly as spelled.
        transform_params: ``{column: {"lambda", "min", "max"}}`` where ``"max"``
            is used as the scaling multiplier.
        real_columns: Features to back-transform.
        eps: Shift used by the forward Box-Cox transform. The default equals
            the default of the forward helpers.

    Returns:
        DataFrame with one column per feature name.

    Raises:
        ValueError: If the last axis of ``tensor`` does not match
            ``len(feature_names)`` or a real column is not in ``feature_names``.
        KeyError: If a real column has no entry in ``transform_params``.
    """
    n_features = len(feature_names)
    if tensor.dim() == 0 or tensor.shape[-1] != n_features:
        raise ValueError(
            f"Last tensor dimension must equal len(feature_names)={n_features}, "
            f"got shape {tuple(tensor.shape)}"
        )

    # Work on a private, graph-free copy so the caller's tensor is never modified.
    restored = tensor.detach().clone()
    for col in real_columns:
        idx = feature_names.index(col)
        p = transform_params[col]
        # Index the last axis (not axis 1) so 3-D (batch, time, feature) input
        # addresses the feature rather than a time step.
        unscaled = restored[..., idx] * p['max'] + p['min']
        restored[..., idx] = inverse_boxcox_torch(unscaled, p['lambda'], eps)

    rows = restored.reshape(-1, n_features)
    return pd.DataFrame(rows.cpu().numpy(), columns=feature_names)


In [5]:
# === Set Seed for Reproducibility ===
seed_all()

# === Step 0–1: Load and Preprocess Raw Data ===
# Download the CSV, discard the three "(M)" columns, then swap the integer codes
# of the categorical columns for their text labels, all in one chained expression.
raw_url = "https://figshare.com/ndownloader/files/40584980"
All_Data = (
    pd.read_csv(raw_url)
    .drop(columns=['VL (M)', 'CD4 (M)', 'Drug (M)'])
    .replace({
        "Gender": {1: "Male", 2: "Female"},
        "Ethnic": {1: "Asian", 2: "African", 3: "Caucasian", 4: "Other"},
        "Base Drug Combo": {
            0: "FTC + TDF", 1: "3TC + ABC", 2: "FTC + TAF",
            3: "DRV + FTC + TDF", 4: "FTC + RTVB + TDF", 5: "Other",
        },
        "Comp. INI": {0: "DTG", 1: "RAL", 2: "EVG", 3: "Not Applied"},
        "Comp. NNRTI": {0: "NVP", 1: "EFV", 2: "RPV", 3: "Not Applied"},
        "Extra PI": {0: "DRV", 1: "RTVB", 2: "LPV", 3: "RTV", 4: "ATV", 5: "Not Applied"},
        "Extra pk-En": {0: "Fasle", 1: "True"},
    })
)

In [6]:
# === Step 2–4: Transform Dataset ===
# errors="ignore" keeps this cell re-runnable: All_Data is rebound to the reduced
# frame, so on a second execution the two identifier columns are already gone and
# a plain drop would raise KeyError.
All_Data = All_Data.drop(columns=['PatientID', 'Timestep'], errors="ignore")
# Text labels -> integer codes for the categorical columns.
ART_Data_Num = BinCat2Num(All_Data)
# Fit the Box-Cox parameters on the numeric frame, then apply them to the same frame.
art_transformation_params = compute_boxcox_params(ART_Data_Num)
ART_Data_Transformed = apply_boxcox_minmax_transform(ART_Data_Num, art_transformation_params)

In [7]:
###===>>>++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# Copyright (c) 2021. by Nicholas Kuo & Sebastiano Babieri, UNSW.                     +
# All rights reserved. This file is part of the Health Gym, and is released under the +
# "MIT License Agreement". Please see the LICENSE file that should have been included +
# as part of this package.                                                            +
###===###++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

###===>>>
# This is the 3rd of all files for WGAN on Sepsis

###===>>>
import  numpy               as  np
import  pandas              as  pd

import  itertools
import  random

import  torch
import  torch.utils.data    as  utils

# Default layout of the data: time steps per sequence, features per time step,
# and the number of sequences in the data set this notebook loads.
Cur_Len   = 60
Feats_Len = 10
Pat_Len   = 8916

###===>>>
def Execute_C003(
        df, Hyper001_BatchSize,
        Cur_Len = Cur_Len, Feats_Len = Feats_Len, Pat_Len = Pat_Len):
    """Build the shuffled training loader and a tensor of all batched data.

    The frame's values are regrouped into an array of shape
    ``(-1, Cur_Len, Feats_Len)`` and wrapped, together with a constant
    sequence-length tensor, in a ``TensorDataset``.

    Args:
        df: DataFrame with ``Feats_Len`` numeric columns whose row count is a
            multiple of ``Cur_Len``. Presumably every consecutive ``Cur_Len``
            rows belong to one patient -- confirm against the data source.
        Hyper001_BatchSize: Batch size of the loader.
        Cur_Len: Number of time steps per sequence.
        Feats_Len: Number of features per time step.
        Pat_Len: Kept for backward compatibility only. The number of sequences
            is now taken from the data itself, because the two tensors of a
            ``TensorDataset`` must agree in their first dimension and a
            hard-coded count breaks on any other data set.

    Returns:
        ``(trn_loader, All_Trainable_Data)``. The loader shuffles and drops the
        last incomplete batch. ``All_Trainable_Data`` is the result of one pass
        over the loader with the batches concatenated along ``dim=1``.

    Raises:
        RuntimeError: From ``torch.cat`` when the loader yields no batch, i.e.
            when there are fewer sequences than ``Hyper001_BatchSize``.
    """
    ###===>>>
    data = df.values.reshape((-1, Cur_Len, Feats_Len))
    n_sequences = data.shape[0]

    data = utils.TensorDataset(
                    torch.from_numpy(data).float(),
                    # One constant length entry per sequence actually present.
                    torch.full((n_sequences, 1, 1), Cur_Len),
                )

    trn_loader = utils.DataLoader(
            data, batch_size=Hyper001_BatchSize, shuffle=True, drop_last=True
        )

    ###===>>>
    # One pass over the loader; the sequence-length entries are not needed here.
    batches = [x for x, _ in trn_loader]

    #---
    # NOTE(review): dim=1 joins the batches along the time axis, giving
    # (batch_size, n_batches * Cur_Len, Feats_Len). Stacking samples would be
    # dim=0. Left unchanged because callers may depend on this shape -- confirm.
    All_Trainable_Data = torch.cat(batches, dim = 1)

    ###===###
    return trn_loader, All_Trainable_Data

In [8]:
# Batch size handed to the DataLoader that Execute_C003 builds.
Hyper001_BatchSize  = 32

# Train_Loader is the shuffled loader; All_Trainable_Data is the tensor Execute_C003
# obtains by concatenating every batch of one pass over that loader.
Train_Loader, All_Trainable_Data = Execute_C003(ART_Data_Transformed, Hyper001_BatchSize)

In [9]:
# === Feature Schema ===
# One schema row per input feature. Each feature occupies exactly one input column,
# so "index", "index_start" and "index_end" follow directly from the position in the
# list: (position, position, position + 1).
# Note: the names use underscores and therefore differ from the data-frame column
# labels used earlier in this notebook (for example "Rel CD4", "Comp. INI").
dtype = pd.DataFrame(
    [
        [position, name, kind, num_classes, embedding_size, position, position + 1]
        for position, (name, kind, num_classes, embedding_size) in enumerate([
            # name,             type,   num_classes, embedding_size
            ("VL",              "real", 1,           1),
            ("CD4",             "real", 1,           1),
            ("Rel_CD4",         "real", 1,           1),
            ("Gender",          "bin",  2,           2),
            ("Ethnic",          "cat",  4,           4),
            ("Base_Drug_Combo", "cat",  6,           4),
            ("Comp_INI",        "cat",  4,           4),
            ("Comp_NNRTI",      "cat",  4,           4),
            ("Extra_PI",        "cat",  6,           4),
            ("Extra_pk_En",     "bin",  2,           2),
        ])
    ],
    columns=[
        "index", "name", "type", "num_classes",
        "embedding_size", "index_start", "index_end",
    ],
)

In [10]:
# Display the feature-schema table as this cell's output.
dtype

Unnamed: 0,index,name,type,num_classes,embedding_size,index_start,index_end
0,0,VL,real,1,1,0,1
1,1,CD4,real,1,1,1,2
2,2,Rel_CD4,real,1,1,2,3
3,3,Gender,bin,2,2,3,4
4,4,Ethnic,cat,4,4,4,5
5,5,Base_Drug_Combo,cat,6,4,5,6
6,6,Comp_INI,cat,4,4,6,7
7,7,Comp_NNRTI,cat,4,4,7,8
8,8,Extra_PI,cat,6,4,8,9
9,9,Extra_pk_En,bin,2,2,9,10


In [11]:
# === Embedding Module ===
class ARTFeatureEmbedding(nn.Module):
    """Embed every input feature separately and concatenate the results.

    The layout comes from ``feature_df``, one row per feature, with the columns
    ``name``, ``type``, ``num_classes``, ``embedding_size``, ``index_start``
    and ``index_end``:

    * ``type == "real"``: the feature goes through ``nn.Linear(1, embedding_size)``.
    * ``type in ("cat", "bin")``: the feature is cast to integer, clamped to
      ``[0, num_classes - 1]`` and looked up in
      ``nn.Embedding(num_classes, embedding_size)``.

    Attributes:
        feature_df: The schema passed to the constructor.
        embeddings: ``nn.ModuleDict`` keyed by feature name.
    """

    def __init__(self, feature_df):
        """Create one layer per schema row.

        Raises:
            ValueError: If a row has a ``type`` other than "real", "cat" or
                "bin". Such rows used to be skipped silently and only failed
                later, inside ``forward``, with a ``KeyError``.
        """
        super().__init__()
        self.feature_df = feature_df
        self.embeddings = nn.ModuleDict()
        # Plain-Python copy of the layout so forward() does not have to walk the
        # DataFrame (slow pandas row iteration) on every call.
        self._layout = []
        for row in feature_df.to_dict("records"):
            name = row["name"]
            ftype = row["type"]
            in_size = int(row["num_classes"])
            out_size = int(row["embedding_size"])
            if ftype == "real":
                self.embeddings[name] = nn.Linear(1, out_size)
            elif ftype in ("cat", "bin"):
                self.embeddings[name] = nn.Embedding(in_size, out_size)
            else:
                raise ValueError(
                    f"Unsupported feature type {ftype!r} for feature {name!r}; "
                    "expected 'real', 'cat' or 'bin'"
                )
            self._layout.append(
                (name, ftype, int(row["index_start"]), int(row["index_end"]))
            )
        # Smallest input width that covers every configured slice.
        self._required_width = max((end for _, _, _, end in self._layout), default=0)

    def forward(self, x):
        """Embed a 3-D input tensor.

        Args:
            x: Tensor that unpacks as ``(B, T, D)``. Each feature is read from
                ``x[:, :, index_start:index_end]``.

        Returns:
            Tensor of shape ``(B, T, sum of embedding sizes)``, features in
            schema order.

        Raises:
            ValueError: If ``D`` is smaller than the largest ``index_end``.
        """
        B, T, D = x.shape
        if D < self._required_width:
            raise ValueError(
                f"Input has {D} features but the schema needs at least {self._required_width}"
            )
        out_feats = []
        for name, ftype, start, end in self._layout:
            layer = self.embeddings[name]
            x_f = x[:, :, start:end]
            if ftype == "real":
                # reshape instead of view: the slice of a non-contiguous input
                # (e.g. after transpose) cannot be viewed as (B*T, 1).
                x_f = layer(x_f.reshape(B * T, 1)).view(B, T, -1)
            else:
                x_f = x_f.squeeze(-1).long()
                # Out-of-range codes are clamped to the nearest valid index
                # rather than raising inside the embedding lookup.
                x_f = torch.clamp(x_f, 0, layer.num_embeddings - 1)
                x_f = layer(x_f)
            out_feats.append(x_f)
        return torch.cat(out_feats, dim=-1)

In [26]:
# Instantiate the embedding module from the feature schema held in `dtype`.
embedder = ARTFeatureEmbedding(dtype)

In [27]:
# Display the module repr: one Linear or Embedding layer per schema row.
embedder

ARTFeatureEmbedding(
  (embeddings): ModuleDict(
    (VL): Linear(in_features=1, out_features=1, bias=True)
    (CD4): Linear(in_features=1, out_features=1, bias=True)
    (Rel_CD4): Linear(in_features=1, out_features=1, bias=True)
    (Gender): Embedding(2, 2)
    (Ethnic): Embedding(4, 4)
    (Base_Drug_Combo): Embedding(6, 4)
    (Comp_INI): Embedding(4, 4)
    (Comp_NNRTI): Embedding(4, 4)
    (Extra_PI): Embedding(6, 4)
    (Extra_pk_En): Embedding(2, 2)
  )
)

In [25]:
# Take one batch from the training loader; the second element of each item (the
# constant sequence-length tensor built in Execute_C003) is discarded.
# NOTE(review): the output recorded under this cell shows two torch.Size lines that
# this statement does not print, and the execution counter is out of order with the
# neighbouring cells -- re-run the notebook top to bottom to refresh the outputs.
batch, _ = next(iter(Train_Loader))

torch.Size([32, 60, 10])
torch.Size([32, 60, 27])


In [28]:
# Keep only the first sequence of the batch and restore a leading batch axis of size 1.
cur_data_BatchLoc0 = batch[0].unsqueeze(0)

In [29]:
# Show the shape of the single-sequence tensor.
cur_data_BatchLoc0.shape

torch.Size([1, 60, 10])

In [18]:
# Run the single sequence through the embedding module.
Embedded_Using_Embedder = embedder(cur_data_BatchLoc0)

# Display the embedded tensor as this cell's output.
Embedded_Using_Embedder

tensor([[[ 0.0056,  0.2027, -0.1840,  ..., -0.0467,  1.3187,  0.5707],
         [-0.0064,  0.2843, -0.2765,  ..., -0.0467,  1.3187,  0.5707],
         [ 0.0063,  0.2593, -0.2513,  ..., -0.0467,  1.3187,  0.5707],
         ...,
         [ 0.0908,  0.1820, -0.1616,  ..., -0.0467,  1.3187,  0.5707],
         [ 0.0902,  0.1820, -0.1573,  ..., -0.0467,  1.3187,  0.5707],
         [ 0.0883,  0.1780, -0.1372,  ..., -0.0467,  1.3187,  0.5707]]],
       grad_fn=<CatBackward0>)

In [19]:
# Walk-through of ARTFeatureEmbedding.__init__: rebuild the per-feature layers by
# hand and print what is created for each schema row.
# Note: this fills a brand-new ModuleDict with freshly initialised layers; it is
# not the `embedder.embeddings` dictionary of the module created earlier.
feature_df = dtype
embeddings = nn.ModuleDict()
print("###===###")
for _, row in feature_df.iterrows():
    name = row["name"]
    ftype = row["type"]
    in_size = int(row["num_classes"])
    out_size = int(row["embedding_size"])
    # Real-valued features get a 1 -> out_size linear layer,
    # categorical/binary features get an embedding table.
    if ftype == "real":
        embeddings[name] = nn.Linear(1, out_size)
    elif ftype in ["cat", "bin"]:
        embeddings[name] = nn.Embedding(in_size, out_size)
    #---
    print(f"name:  {name}")
    print(f"ftype: {ftype}")
    print(f"(in_size, out_size): ({in_size}, {out_size})")
    print(f"embedding operation: {embeddings[name]}")
    print("#---")

###===###
name:  VL
ftype: real
(in_size, out_size): (1, 1)
embedding operation: Linear(in_features=1, out_features=1, bias=True)
#---
name:  CD4
ftype: real
(in_size, out_size): (1, 1)
embedding operation: Linear(in_features=1, out_features=1, bias=True)
#---
name:  Rel_CD4
ftype: real
(in_size, out_size): (1, 1)
embedding operation: Linear(in_features=1, out_features=1, bias=True)
#---
name:  Gender
ftype: bin
(in_size, out_size): (2, 2)
embedding operation: Embedding(2, 2)
#---
name:  Ethnic
ftype: cat
(in_size, out_size): (4, 4)
embedding operation: Embedding(4, 4)
#---
name:  Base_Drug_Combo
ftype: cat
(in_size, out_size): (6, 4)
embedding operation: Embedding(6, 4)
#---
name:  Comp_INI
ftype: cat
(in_size, out_size): (4, 4)
embedding operation: Embedding(4, 4)
#---
name:  Comp_NNRTI
ftype: cat
(in_size, out_size): (4, 4)
embedding operation: Embedding(4, 4)
#---
name:  Extra_PI
ftype: cat
(in_size, out_size): (6, 4)
embedding operation: Embedding(6, 4)
#---
name:  Extra_pk_En
fty

In [20]:
# Walk-through of ARTFeatureEmbedding.forward: embed the first feature by hand with
# the layers of `embedder` and compare against the module's own output.
x          = cur_data_BatchLoc0
embeddings = embedder.embeddings
print("###===###")
B, T, D = x.shape
out_feats = []
# Column of the embedded output at which the current feature starts. This is the
# running sum of the embedding widths of all earlier features. The schema row index
# is only the same number while every earlier feature has width 1, so indexing the
# embedded tensor with row_idx would go wrong as soon as the `break` is removed.
emb_offset = 0
for row_idx, row in feature_df.iterrows():
    name = row["name"]
    ftype = row["type"]
    start = int(row["index_start"])
    end = int(row["index_end"])
    x_f = x[:, :, start:end]
    if ftype == "real":
        x_f = embeddings[name](x_f.view(B * T, 1)).view(B, T, -1)
    else:
        x_f = x_f.squeeze(-1).long()
        num_classes = embeddings[name].num_embeddings
        x_f = torch.clamp(x_f, 0, num_classes - 1)
        x_f = embeddings[name](x_f)

    #---
    print("#---")
    print("Doing it explicitly will get you")
    print(x_f[0, :, 0])
    print("\n")
    print("which is identical to what we had previously")
    print(Embedded_Using_Embedder[0, :, emb_offset])
    # Back the printed claim with an actual check instead of relying on eyeballing.
    assert torch.allclose(x_f[0, :, 0], Embedded_Using_Embedder[0, :, emb_offset])
    emb_offset += x_f.shape[-1]
    # Only the first feature is demonstrated.
    break

###===###
#---
Doing it explicitly will get you
tensor([ 0.0056, -0.0064,  0.0063,  0.0018, -0.0119, -0.0087,  0.0345,  0.0193,
         0.0365,  0.0471,  0.0336,  0.0404,  0.0450,  0.0464,  0.0176,  0.0458,
         0.0480,  0.0484,  0.0428,  0.0175,  0.0472,  0.0454,  0.0231,  0.0463,
         0.0426,  0.0449,  0.0464,  0.0472,  0.0479,  0.0480,  0.1153,  0.0466,
         0.0481,  0.0401,  0.0213,  0.0971,  0.0885,  0.0916,  0.1113,  0.0976,
         0.1077,  0.0908,  0.0914,  0.1047,  0.0913,  0.1020,  0.0912,  0.1074,
         0.0911,  0.0911,  0.0901,  0.1035,  0.0911,  0.0910,  0.0912,  0.0903,
         0.1127,  0.0908,  0.0902,  0.0883], grad_fn=<SelectBackward0>)


which is identical to what we had previously
tensor([ 0.0056, -0.0064,  0.0063,  0.0018, -0.0119, -0.0087,  0.0345,  0.0193,
         0.0365,  0.0471,  0.0336,  0.0404,  0.0450,  0.0464,  0.0176,  0.0458,
         0.0480,  0.0484,  0.0428,  0.0175,  0.0472,  0.0454,  0.0231,  0.0463,
         0.0426,  0.0449,  0.0464