In [1]:
# ========================
# Standard Library
# ========================
import os
import random
import itertools
from copy import deepcopy
from collections import defaultdict
from typing import Dict, List, Tuple, Optional

# ========================
# Core Scientific Stack
# ========================
import numpy as np
import pandas as pd
from pandas.api.types import CategoricalDtype
from scipy import stats
from scipy.special import boxcox as sp_boxcox

# ========================
# Machine Learning Utilities
# ========================
from sklearn.model_selection import train_test_split

# ========================
# Deep Learning (PyTorch)
# ========================
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch import autograd
from torch.utils.data import DataLoader, TensorDataset

# ========================
# Visualization
# ========================
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
def seed_all(seed: int = 42, deterministic: bool = True) -> None:
    """Seed Python, NumPy and PyTorch RNGs and set cuDNN / TF32 switches.

    Args:
        seed: Value passed to `random.seed`, `np.random.seed`,
            `torch.manual_seed` (and `torch.cuda.manual_seed_all` when CUDA is
            available). It is also written to the PYTHONHASHSEED environment
            variable.
        deterministic: Stored in `torch.backends.cudnn.deterministic`; its
            negation is stored in `torch.backends.cudnn.benchmark`.

    When CUDA is available, TF32 is additionally switched off for both matmul
    and cuDNN.
    """
    import os, random, numpy as np, torch

    cuda_present = torch.cuda.is_available()

    # Standard library RNG + hash seed environment variable
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)

    # NumPy's legacy global RNG
    np.random.seed(seed)

    # PyTorch: CPU generator always, CUDA generators only if a device exists
    torch.manual_seed(seed)
    if cuda_present:
        torch.cuda.manual_seed_all(seed)

    # cuDNN: deterministic mode and autotuning are set to opposite values
    cudnn = torch.backends.cudnn
    cudnn.deterministic = deterministic
    cudnn.benchmark = not deterministic

    # TF32 is turned off on GPU
    if cuda_present:
        torch.backends.cuda.matmul.allow_tf32 = False
        cudnn.allow_tf32 = False


In [3]:
# Reset the Python, NumPy and PyTorch RNGs to the default seed (42) at the top of this cell.
seed_all()

# === Categorical Mapping Utilities (simple & tidy) ===
# Both lookup tables are generated from one ordered label list per column:
# a label's position in its tuple is its integer code (0..K-1).
BINCAT_STR2INT = {
    column: {label: code for code, label in enumerate(labels)}
    for column, labels in (
        ("Gender", ("Male", "Female")),
        ("Ethnic", ("Asian", "African", "Caucasian", "Other")),
        (
            "Base Drug Combo",
            ("FTC + TDF", "3TC + ABC", "FTC + TAF",
             "DRV + FTC + TDF", "FTC + RTVB + TDF", "Other"),
        ),
        ("Extra PI", ("DRV", "RTVB", "LPV", "RTV", "ATV", "Not Applied")),
        ("Extra pk-En", ("False", "True")),
    )
}

# Exact inverse of BINCAT_STR2INT (integer code -> label), column by column.
BINCAT_INT2STR = {
    column: {code: label for label, code in str2int.items()}
    for column, str2int in BINCAT_STR2INT.items()
}

def BinCat2Num(df: pd.DataFrame, allow_already_int: bool = True) -> pd.DataFrame:
    """
    Return a copy of `df` in which every column listed in BINCAT_STR2INT is
    replaced by its integer code (0..K-1). Columns absent from `df` are skipped.

    Args:
        df: Input frame; it is not modified.
        allow_already_int: When True, a column that already has an integer
            dtype is kept untouched after checking that all of its non-null
            values are valid codes for that column.

    Raises:
        ValueError: If an integer column holds a code outside the mapping, or
            if any value of a column fails to map (this includes missing
            values, which map to NaN).
    """
    encoded = df.copy()
    present_cols = [name for name in BINCAT_STR2INT if name in encoded.columns]

    for col in present_cols:
        lookup = BINCAT_STR2INT[col]
        values = encoded[col]

        # Already-coded column: validate the codes and leave the data alone.
        if allow_already_int and pd.api.types.is_integer_dtype(values):
            seen_codes = set(pd.Series(values.dropna().unique(), dtype=int))
            bad = seen_codes.difference(lookup.values())
            if bad:
                raise ValueError(f"Unexpected integer codes in '{col}': {sorted(bad)}")
            continue

        # Label column: translate, and refuse to continue if anything is left unmapped.
        translated = values.map(lookup)
        unmapped = translated.isnull()
        if unmapped.any():
            bad_vals = values[unmapped].unique()
            raise ValueError(f"Unmapped values in column '{col}': {bad_vals}")
        encoded[col] = translated.astype(int)

    return encoded

def BinCat2Str(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a copy of `df` with integer-coded categorical columns turned back
    into string labels via BINCAT_INT2STR. Columns that are not present in
    `df` are ignored; the input frame itself is left untouched.
    """
    decoded = df.copy()
    to_decode = [name for name in BINCAT_INT2STR if name in decoded.columns]
    for name in to_decode:
        decoded[name] = decoded[name].map(BINCAT_INT2STR[name])
    return decoded


In [4]:
# Reset the Python, NumPy and PyTorch RNGs to the default seed (42) at the top of this cell.
seed_all()

# ========================
# Fit Box–Cox parameters
# ========================
def compute_boxcox_params(
    df: pd.DataFrame,
    columns: List[str] = ("VL", "CD4"),
    eps: float = 1e-3,
) -> Dict[str, dict]:
    """
    Estimate a Box–Cox λ for each requested column and record how to rescale
    the transformed values to [0,1].

    For every column the non-null values are cast to float and shifted by
    `eps`; λ is then the maximum-likelihood estimate from `scipy.stats.boxcox`.

    Args:
        df: Frame holding the columns to fit.
        columns: Names of the columns to fit (each must exist in `df`).
        eps: Constant added to the data before fitting.

    Returns:
        Mapping column -> {"lambda", "min", "range", "eps"}, where "min" and
        "range" describe the Box–Cox-transformed training values. Columns that
        cannot be fitted (no values, non-finite values, non-positive values
        after the shift, or zero spread) get the neutral entry
        λ=1, min=0, range=1.
    """
    fitted: Dict[str, dict] = {}

    for col in columns:
        shifted = df[col].dropna().astype(float).to_numpy() + eps  # shift towards positivity

        # `and` short-circuits, so min/ptp are never evaluated on an empty array.
        fit_possible = (
            shifted.size != 0
            and bool(np.all(np.isfinite(shifted)))
            and np.min(shifted) > 0
            and np.ptp(shifted) != 0
        )

        if fit_possible:
            transformed, lam = stats.boxcox(shifted)  # (transformed data, MLE λ)
            lowest = float(np.min(transformed))
            # A zero spread would later divide by zero; fall back to 1.0.
            spread = float(np.max(transformed) - lowest) or 1.0
            fitted[col] = {"lambda": float(lam), "min": lowest, "range": spread, "eps": eps}
        else:
            fitted[col] = {"lambda": 1.0, "min": 0.0, "range": 1.0, "eps": eps}

    return fitted


# ========================
# Box–Cox + Min–Max → [0,1]
# ========================
def apply_boxcox_minmax_transform(
    df: pd.DataFrame,
    params: Dict[str, dict],
    columns: List[str] = ("VL", "CD4"),
) -> pd.DataFrame:
    """
    Apply Box–Cox with the fitted λ, then scale via the stored min/range.

    Args:
        df: Input frame; it is not modified.
        params: Per-column dict with keys "lambda", "min", "range" and "eps"
            (the structure produced by `compute_boxcox_params`).
        columns: Columns to transform. Names missing from `df` are skipped;
            columns with no non-null value are returned unchanged.

    Returns:
        A copy of `df` in which every transformed column is float64. Missing
        values stay missing (as NaN). Values land in [0,1] only for data within
        the range seen when `params` was fitted.

    Raises:
        KeyError: If a column present in `df` has no entry in `params`.
    """
    out = df.copy()
    for col in columns:
        if col not in out.columns:
            continue
        p = params[col]
        mask = out[col].notna().to_numpy()
        if not mask.any():
            continue

        x = out.loc[mask, col].astype(float).to_numpy() + p["eps"]
        bc = sp_boxcox(x, p["lambda"])  # fixed-λ forward transform

        # Build the result as an explicit float column instead of writing floats
        # into the existing column through .loc: when the source column has an
        # integer dtype, that in-place write relies on silent upcasting, which
        # pandas deprecated in 2.1.
        scaled = np.full(len(out), np.nan, dtype=float)
        scaled[mask] = (bc - p["min"]) / p["range"]
        out[col] = scaled
    return out


# ========================
# Inverse Box–Cox (Torch)
# ========================
def inverse_boxcox_torch(bc: torch.Tensor, lmbda: float, eps: float = 1e-3) -> torch.Tensor:
    """
    Map Box–Cox-transformed values back to the original scale and remove the
    `eps` shift.

    For λ == 0 the inverse is exp(bc). Otherwise it is (λ·bc + 1)^(1/λ), with
    the base clamped from below at 1e-12 before exponentiation.
    """
    if lmbda != 0.0:
        # Clamp keeps the base strictly positive before the power.
        radicand = torch.clamp(lmbda * bc + 1.0, min=1e-12)
        shifted = torch.pow(radicand, 1.0 / lmbda)
    else:
        shifted = torch.exp(bc)
    return shifted - eps


# ========================
# Back-transform features
# ========================
def backtransform_art_tensor(
    tensor: torch.Tensor,
    feature_names: List[str],
    transform_params: Dict[str, dict],
    real_columns: List[str] = ("VL", "CD4"),
) -> pd.DataFrame:
    """
    Convert the real-valued features of `tensor` from scaled Box–Cox space
    back to the original scale and return everything as a DataFrame.

    Steps per real column: unscale to Box–Cox (x * range + min) → inverse
    Box–Cox → subtract eps. All other features are copied through unchanged.

    Args:
        tensor: Tensor whose LAST dimension enumerates the features in the
            order given by `feature_names`. Any number of leading dimensions
            is accepted (e.g. (rows, features) or (batch, time, features));
            they are flattened into rows of the result in row-major order.
        feature_names: One name per entry of the last dimension.
        transform_params: Per-column dict with keys "lambda", "min", "range"
            and "eps".
        real_columns: Features to back-transform. Names that do not occur in
            `feature_names` are skipped.

    Returns:
        DataFrame with one column per feature and one row per flattened
        leading index. The input tensor is not modified.

    Raises:
        ValueError: If the last dimension of `tensor` does not match
            `len(feature_names)`.
    """
    feature_names = list(feature_names)
    n_features = len(feature_names)
    if tensor.dim() == 0 or tensor.shape[-1] != n_features:
        raise ValueError(
            f"Last tensor dimension must equal len(feature_names)={n_features}, "
            f"got tensor of shape {tuple(tensor.shape)}"
        )

    x = tensor.detach().clone()  # work on a private copy
    idx_map = {n: i for i, n in enumerate(feature_names)}

    for col in real_columns:
        if col not in idx_map:
            continue
        i = idx_map[col]
        p = transform_params[col]
        # Index the feature axis with `...` so that sequence-shaped input
        # (batch, time, features) is handled the same way as a 2-D matrix.
        bc = x[..., i] * p["range"] + p["min"]
        x[..., i] = inverse_boxcox_torch(bc, p["lambda"], p["eps"])

    # A DataFrame needs 2-D data: collapse every leading dimension into rows.
    flat = x.reshape(-1, n_features)
    return pd.DataFrame(flat.cpu().numpy(), columns=feature_names)

In [5]:
# Reset the Python, NumPy and PyTorch RNGs to the default seed (42) at the top of this cell.
seed_all()

# === Load and Preprocess Raw Data ===
# Source CSV and the columns that are excluded at read time.
raw_url = "https://figshare.com/ndownloader/files/40584980"
DROP_COLS = ["VL (M)", "CD4 (M)", "Drug (M)"]

# 1) Read only the columns we need
# First request: header only (nrows=0) to learn the column names.
# Second request: the actual data, restricted to the kept columns.
# NOTE(review): this contacts `raw_url` twice; a callable `usecols` would need one read.
_all_cols = pd.read_csv(raw_url, nrows=0).columns.tolist()
usecols = [c for c in _all_cols if c not in DROP_COLS]
All_Data = pd.read_csv(raw_url, usecols=usecols)

# 2) Map numeric codes -> human-readable labels (with typo fixed)
# Note that "Gender" and "Ethnic" are 1-based in the raw file while the
# remaining columns are 0-based.
NUM2STR = {
    "Gender":          {1: "Male", 2: "Female"},
    "Ethnic":          {1: "Asian", 2: "African", 3: "Caucasian", 4: "Other"},
    "Base Drug Combo": {
        0: "FTC + TDF", 1: "3TC + ABC", 2: "FTC + TAF",
        3: "DRV + FTC + TDF", 4: "FTC + RTVB + TDF", 5: "Other"
    },
    "Comp. INI":       {0: "DTG", 1: "RAL", 2: "EVG", 3: "Not Applied"},
    "Comp. NNRTI":     {0: "NVP", 1: "EFV", 2: "RPV", 3: "Not Applied"},
    "Extra PI":        {0: "DRV", 1: "RTVB", 2: "LPV", 3: "RTV", 4: "ATV", 5: "Not Applied"},
    "Extra pk-En":     {0: "False", 1: "True"},   # <-- fixed typo
}

# Replace codes by labels column by column. Any value without a mapping
# (missing values included, since they map to NaN) aborts with a ValueError.
# Successfully mapped columns are stored with pandas' `category` dtype.
for col, mapping in NUM2STR.items():
    if col in All_Data.columns:
        mapped = All_Data[col].map(mapping)
        if mapped.isnull().any():
            bad = All_Data.loc[mapped.isnull(), col].unique()
            raise ValueError(f"Unmapped codes in '{col}': {bad}")
        All_Data[col] = mapped.astype("category")

# Force the continuous measurements to numeric; unparseable entries become NaN
# (errors="coerce") instead of raising.
for cont in ("VL", "CD4"):
    if cont in All_Data.columns:
        All_Data[cont] = pd.to_numeric(All_Data[cont], errors="coerce")


In [6]:
# Feature schema, one row per input variable, written column-by-column.
#   type           : "real" (continuous), "bin" or "cat" (one-hot encoded)
#   num_classes    : number of one-hot slots (1 for real features)
#   index_start/end: index_end - index_start equals num_classes in every row;
#                    presumably the half-open slice of the expanded feature
#                    vector — confirm against the consumer of this table.
dtype = pd.DataFrame({
    "name": [
        "VL", "CD4", "Gender", "Ethnic",
        "Base_Drug_Combo", "Extra_PI", "Extra_pk_En",
    ],
    "type":           ["real", "real", "bin", "cat", "cat", "cat", "bin"],
    "num_classes":    [1, 1, 2, 4, 6, 6, 2],
    "embedding_size": [1, 1, 2, 4, 4, 4, 2],
    "index_start":    [0, 1, 2, 4, 8, 14, 20],
    "index_end":      [1, 2, 4, 8, 14, 20, 22],
})

In [16]:
# Display the feature schema table defined above.
dtype

Unnamed: 0,name,type,num_classes,embedding_size,index_start,index_end
0,VL,real,1,1,0,1
1,CD4,real,1,1,1,2
2,Gender,bin,2,2,2,4
3,Ethnic,cat,4,4,4,8
4,Base_Drug_Combo,cat,6,4,8,14
5,Extra_PI,cat,6,4,14,20
6,Extra_pk_En,bin,2,2,20,22


In [8]:
# Reset the Python, NumPy and PyTorch RNGs to the default seed (42) at the top of this cell.
seed_all()

def Execute_C003(
    df: pd.DataFrame,   batch_size: int,  cur_len: int = 10) -> Tuple[DataLoader, torch.Tensor]:
    """
    Cut a row-per-timestep frame into fixed-length sequences and wrap them in
    a shuffled DataLoader.

    Consecutive groups of `cur_len` rows of `df` form one sequence, so the
    rows must already be ordered sequence by sequence.

    Args:
        df: Purely numeric frame (converted to float32); its row count must
            be a multiple of `cur_len`.
        batch_size: Batch size of the loader. Incomplete final batches are
            dropped (`drop_last=True`).
        cur_len: Number of rows per sequence.

    Returns:
        (loader, all_trainable): `loader` yields `(x, lengths)` with `x` of
        shape (batch_size, cur_len, n_features) and `lengths` filled with
        `cur_len`. `all_trainable` is the concatenation of one full pass over
        the loader, i.e. shuffled and without the dropped remainder; it has
        shape (0, cur_len, n_features) when there are fewer sequences than
        `batch_size`.

    Raises:
        ValueError: If `cur_len` is not positive or the number of rows is not
            a multiple of `cur_len`.
    """
    if cur_len <= 0:
        raise ValueError(f"cur_len must be a positive integer, got {cur_len}")

    x_np = df.to_numpy(dtype=np.float32, copy=False)
    n_rows, feats_len = x_np.shape
    if n_rows % cur_len != 0:
        # Fail with an explicit message instead of an opaque reshape error.
        raise ValueError(
            f"Number of rows ({n_rows}) is not a multiple of cur_len ({cur_len}); "
            "cannot form equal-length sequences."
        )
    n_seq = n_rows // cur_len
    x_np = x_np.reshape(n_seq, cur_len, feats_len)
    x_t = torch.from_numpy(x_np)
    lengths = torch.full((n_seq,), cur_len, dtype=torch.long)

    dataset = TensorDataset(x_t, lengths)
    trn_loader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=True, drop_last=True,)

    # One pass over the loader (this consumes the torch RNG, as before).
    all_batches = [xb for xb, _ in trn_loader]
    if all_batches:
        all_trainable = torch.cat(all_batches, dim=0)
    else:
        # drop_last removed every batch (n_seq < batch_size); torch.cat would
        # raise on an empty list, so return an empty tensor of the right shape.
        all_trainable = x_t.new_empty((0, cur_len, feats_len))
    return trn_loader, all_trainable


In [9]:
seed_all()

# Integer-encode the categorical columns, then switch to the underscore-style
# column names used by the `dtype` schema table.
Sub_Data = BinCat2Num(All_Data).rename(
    columns={
        "Base Drug Combo": "Base_Drug_Combo",
        "Extra PI": "Extra_PI",
        "Extra pk-En": "Extra_pk_En",
    }
)

# Names of the continuous features, in schema order, and their Box–Cox
# parameters fitted on all rows of Sub_Data.
REAL_COLS = dtype.loc[dtype["type"] == "real", "name"].tolist()
art_transformation_params = compute_boxcox_params(Sub_Data, columns=REAL_COLS)

def expand_feature_names_from_dtype(dtype_df: pd.DataFrame):
    """
    List the column names of the expanded (one-hot) design matrix described
    by `dtype_df`, in schema order.

    A row of type "real" contributes its own name; any other row contributes
    "<name>_0" … "<name>_{num_classes-1}".
    """
    expanded = []
    for spec in dtype_df.to_dict("records"):
        base = spec["name"]
        if spec["type"] != "real":
            n_slots = int(spec["num_classes"])
            expanded.extend(f"{base}_{j}" for j in range(n_slots))
        else:
            expanded.append(base)
    return expanded

# Column names of the expanded design matrix for the notebook-level `dtype` schema.
FEATURE_NAMES = expand_feature_names_from_dtype(dtype)

def design_from_dtype(df: pd.DataFrame, dtype_df: pd.DataFrame, params: dict) -> pd.DataFrame:
    """
    Build the model design matrix described by `dtype_df` from `df`.

    Real features are Box–Cox transformed and min–max scaled with `params`;
    every other feature is one-hot encoded into `num_classes` int8 columns
    named "<name>_<code>".

    Args:
        df: Frame containing one column per row of `dtype_df`, with the
            categorical columns already integer-coded (0..num_classes-1).
        dtype_df: Schema with at least the columns "name", "type" and
            "num_classes".
        params: Box–Cox parameters for the real columns (see
            `apply_boxcox_minmax_transform`).

    Returns:
        DataFrame indexed like `df` whose columns follow the row order of
        `dtype_df`. A categorical value that is missing or outside
        0..num_classes-1 yields an all-zero one-hot block for that row.
    """
    # Derive blocks and output column order from the schema that was passed
    # in, rather than from a notebook-level global, so the result always
    # matches `dtype_df`.
    real_cols: List[str] = []
    cat_blocks: List[Tuple[str, int]] = []
    feature_names: List[str] = []
    for spec in dtype_df.to_dict("records"):
        name = spec["name"]
        if spec["type"] == "real":
            real_cols.append(name)
            feature_names.append(name)
        else:
            k = int(spec["num_classes"])
            cat_blocks.append((name, k))
            feature_names.extend(f"{name}_{j}" for j in range(k))

    # With no real columns there is nothing to transform.
    if real_cols:
        X_real = apply_boxcox_minmax_transform(df[real_cols].copy(), params, columns=real_cols)
    else:
        X_real = df[[]].copy()

    parts = [X_real]
    for name, k in cat_blocks:
        # Fixing the categories guarantees exactly k dummy columns, in code
        # order, even for codes that never occur in `df`.
        codes = pd.Categorical(df[name].astype("Int64"), categories=list(range(k)))
        dummies = pd.get_dummies(codes, prefix=name, prefix_sep="_", dtype=np.int8)
        # get_dummies on a bare Categorical returns a fresh 0..n-1 index;
        # restore df's index so the column-wise concat aligns row by row.
        dummies.index = df.index
        parts.append(dummies)

    return pd.concat(parts, axis=1).reindex(columns=feature_names, fill_value=0)


In [12]:
# Reset the Python, NumPy and PyTorch RNGs to the default seed (42) at the top of this cell.
seed_all()
def slice_by_len(df: pd.DataFrame, L: int) -> pd.DataFrame:
    """
    Return the rows of `df` whose 'Timestep' is strictly less than `L`,
    with the index renumbered from 0.

    Raises:
        KeyError: If `df` has no 'Timestep' column.
    """
    if "Timestep" in df.columns:
        within_horizon = df["Timestep"] < L
        return df.loc[within_horizon].reset_index(drop=True)
    raise KeyError("Expected 'Timestep' column in input dataframe.")

# Build the design matrix for rows with Timestep < 10 (used for the preview below).
dfL = slice_by_len(Sub_Data, 10)
XL = design_from_dtype(dfL, dtype, art_transformation_params)

In [15]:
# Show the first 20 rows of the design matrix.
XL.head(20)

Unnamed: 0,VL,CD4,Gender_0,Gender_1,Ethnic_0,Ethnic_1,Ethnic_2,Ethnic_3,Base_Drug_Combo_0,Base_Drug_Combo_1,...,Base_Drug_Combo_4,Base_Drug_Combo_5,Extra_PI_0,Extra_PI_1,Extra_PI_2,Extra_PI_3,Extra_PI_4,Extra_PI_5,Extra_pk_En_0,Extra_pk_En_1
0,0.583964,0.696381,1,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,1,1,0
1,0.582164,0.63962,1,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,1,1,0
2,0.580872,0.639069,1,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,1,1,0
3,0.579139,0.68216,1,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,1,1,0
4,0.581044,0.67419,1,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,1,1,0
5,0.578952,0.633937,1,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,1,1,0
6,0.576747,0.675619,1,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,1,1,0
7,0.573253,0.689904,1,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,1,1,0
8,0.575278,0.631985,1,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,1,1,0
9,0.574505,0.628342,1,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,1,1,0


In [10]:
# Reset the Python, NumPy and PyTorch RNGs to the default seed (42) at the top of this cell.
seed_all()

# Sequence lengths to build loaders for, and the batch size shared by all of them.
CURRICULUM_LENGTHS = [10, 20, 30, 40, 50, 60]
BATCH_SIZE = 256

# Maps each sequence length L to a shuffled DataLoader built from the rows
# with Timestep < L, cut into sequences of L rows each.
# Note: `dfL` and `XL` are overwritten on every iteration and end up holding
# the data for the last length in the list.
only10_60 = {}
for L in CURRICULUM_LENGTHS:
    dfL = slice_by_len(Sub_Data, L)
    XL = design_from_dtype(dfL, dtype, art_transformation_params)
    loader_L, _ = Execute_C003(XL, batch_size=BATCH_SIZE, cur_len=L)
    only10_60[L] = loader_L


In [11]:
# Sanity check: draw one batch from every loader and print its shape
# next to the sequence length it was built for.
for L in CURRICULUM_LENGTHS:
    xb, _ = next(iter(only10_60[L]))
    print("#---" * 5)
    print(L, xb.shape)

#---#---#---#---#---
10 torch.Size([256, 10, 22])
#---#---#---#---#---
20 torch.Size([256, 20, 22])
#---#---#---#---#---
30 torch.Size([256, 30, 22])
#---#---#---#---#---
40 torch.Size([256, 40, 22])
#---#---#---#---#---
50 torch.Size([256, 50, 22])
#---#---#---#---#---
60 torch.Size([256, 60, 22])
