In [None]:
import pandas as pd
import numpy as np
import pyreadr
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import precision_recall_curve, auc, roc_curve
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from datetime import timedelta
import random
import os
import boto3
import tempfile
from settings import settings

In [None]:
os.environ['AWS_ACCESS_KEY_ID'] = settings.AWS_ACCESS_KEY_ID
os.environ['AWS_SECRET_ACCESS_KEY'] = settings.AWS_SECRET_ACCESS_KEY
os.environ['AWS_DEFAULT_REGION'] = settings.AWS_DEFAULT_REGION


# Define S3 info
bucket_name = 'kehmisjan2025'
file_key = 'targets_apr23.rds'

# Initialize boto3 client
s3 = boto3.client('s3')

# Download to a temporary file
with tempfile.NamedTemporaryFile(suffix=".rds") as tmp_file:
    s3.download_fileobj(bucket_name, file_key, tmp_file)
    tmp_file.seek(0)  # go back to beginning
    result = pyreadr.read_r(tmp_file.name)  # returns a dictionary

# Extract the data frame
iit_data = next(iter(result.values()))  # assumes only one object inside

In [6]:
iit_data['NAD'] = pd.to_datetime(iit_data['NAD'], format='%Y-%m-%d')
start_exclude = pd.Timestamp('2024-10-01')
end_exclude = pd.Timestamp('2024-12-31')

# Filter out records from Sept through Dec 2024
iit_data = iit_data[~((iit_data['NAD'] >= start_exclude) & (iit_data['NAD'] <= end_exclude))]
iit_data['is_friday'] = iit_data['Day'].apply(lambda x: 1 if x == "Fri" else 0)
print(iit_data['VisitDate'].min(),iit_data['VisitDate'].max())
print(iit_data['NAD'].min(),iit_data['NAD'].max())

iit_data = iit_data.drop(columns=[
    'OptimizedHIVRegimen', 'Drug', 'VisitDate', 'WHO_Missing', 'Type',
    'most_recent_cd4', 'regimen_switch', 'AHD', 'NAD_Imputation_Flag',
    'BMI_Missing', 'TimeatFacility', 'Adherence_Missing', 'Facility_type_category',
    'Pregnant_Missing', 'Breastfeeding_Missing', 'Month', 'Day'
    # 'lastvd' to 'months_since_restart' would go here
    # 'Month', 'Day' handled below
    
])
selected_columns= ['num_late_last3', 'num_late14_last3', 'num_late30_last3',
       'num_late_last5', 'num_late14_last5', 'num_late30_last5',
       'num_late_last10', 'num_late14_last10', 'num_late30_last10']
iit_data[selected_columns] = iit_data[selected_columns].apply(pd.to_numeric, errors='coerce')
# Pregnant: Yes -> 1, No -> 0, else NA
iit_data['Pregnant'] = iit_data['Pregnant'].map({'Yes': 1, 'No': 0}).astype('Int64')

# Breastfeeding: Yes -> 1, No -> 0, else NA
iit_data['Breastfeeding'] = iit_data['Breastfeeding'].map({'Yes': 1, 'No': 0}).astype('Int64')

# ARTAdherence: good -> 1, poor/fair -> 0, else NA
iit_data['ARTAdherence'] = iit_data['ARTAdherence'].map({
    'good': 1,
    'poor': 0,
    'fair': 0
}).astype('Int64')

# Sex: Male -> 1, else 0
iit_data['Sex'] = (iit_data['Sex'] == 'Male').astype('Int64')

# Emr: KenyaEMR -> 1, else 0
iit_data['Emr'] = (iit_data['Emr'] == 'KenyaEMR').astype('Int64')  # assuming there is an 'Emr' column

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iit_data['is_friday'] = iit_data['Day'].apply(lambda x: 1 if x == "Fri" else 0)


2021-01-04 2024-09-28
2022-01-01 00:00:00 2024-09-30 00:00:00


In [7]:
def encode_xgboost(dataset):
    # List of categorical variables to be encoded
    categorical_columns = [ 'BMI', 'WHOStage','most_recent_vl', 'MaritalStatus', 'EducationLevel','DifferentiatedCare',
       'Occupation', 'VisitBy','TCAReason', 'cascade_status', 'Kephlevel','Ownertype'] 
    
    # One-hot encoding the categorical columns
    ohe = pd.get_dummies(dataset[categorical_columns], drop_first=True, dtype=int)
    
    # Concatenate the original dataset (excluding categorical columns) with the one-hot encoded columns
    dataset_encoded = pd.concat([dataset.drop(columns=categorical_columns), ohe], axis=1)
    
    return dataset_encoded

In [None]:
# def encode_xgboost(dataset):
#     # List of categorical variables to be encoded
#     categorical_columns = ['most_recent_vl','DifferentiatedCare',
#         'VisitBy','cascade_status', 'Kephlevel','Ownertype'] 
    
#     # One-hot encoding the categorical columns
#     ohe = pd.get_dummies(dataset[categorical_columns], drop_first=True, dtype=int)
    
#     # Concatenate the original dataset (excluding categorical columns) with the one-hot encoded columns
#     dataset_encoded = pd.concat([dataset.drop(columns=categorical_columns), ohe], axis=1)
    
#     return dataset_encoded

In [None]:
# iit_data= iit_data.drop([ 'BMI', 'WHOStage','MaritalStatus', 'EducationLevel','Occupation','TCAReason', 'num_late30_last3',
#        'lateness_last5', 'num_late_last5', 'num_late14_last5',
#        'num_late30_last5', 'lateness_last10', 'num_late_last10',
#        'num_late14_last10', 'num_late30_last10','men_knowledge', 'women_knowledge',
#        'men_heardaids', 'men_highrisksex', 'men_highrisksex_multi',
#        'men_sexnotwithpartner', 'men_sexpartners', 'men_nevertested',
#        'men_testedrecent', 'men_sti', 'women_heardaids', 'women_highrisksex',
#        'women_highrisksex_multi', 'women_sexnotwithpartner',
#        'women_sexpartners', 'women_nevertested', 'women_testedrecent'], axis=1)

In [None]:
import torch
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# -----------------------------
# Step 1: Temporal Split Function
# -----------------------------
def preprocess_split(df, start_date, end_date):
    split = df[(df["NAD"] >= start_date) & (df["NAD"] <= end_date)].copy()
    # Drop columns if they exist
    for col in ["SiteCode", "key", "NAD"]:
        if col in split.columns:
            split = split.drop(columns=col)
    X = split.drop(columns=["iit"])
    y = split["iit"]
    return X, y

# -----------------------------
# Step 2: Apply Temporal Splits
# -----------------------------
X_train_df, y_train = preprocess_split(iit_data, "2024-01-01", "2024-05-31")
X_val_df, y_val = preprocess_split(iit_data, "2024-06-01", "2024-06-30")
X_testnear_df, y_testnear = preprocess_split(iit_data, "2024-07-01", "2024-07-30")
X_test_df, y_test = preprocess_split(iit_data, "2024-07-01", "2024-09-30")

# -----------------------------
# Step 3: Impute Missing Values
# -----------------------------
numeric_cols = X_train_df.select_dtypes(include=np.number).columns.tolist()
categorical_cols = X_train_df.select_dtypes(exclude=np.number).columns.tolist()

imputer = ColumnTransformer([
    ('num', SimpleImputer(strategy='mean'), numeric_cols),
    ('cat', SimpleImputer(strategy='most_frequent'), categorical_cols)
])

X_train_imputed = imputer.fit_transform(X_train_df)
X_val_imputed = imputer.transform(X_val_df)
X_testnear_imputed = imputer.transform(X_testnear_df)
X_test_imputed = imputer.transform(X_test_df)

# -----------------------------
# Step 4: Rebuild DataFrames
# -----------------------------
def rebuild_df(data, num_cols, cat_cols):
    return pd.DataFrame(data, columns=num_cols + cat_cols)

X_train_df = rebuild_df(X_train_imputed, numeric_cols, categorical_cols)
X_val_df = rebuild_df(X_val_imputed, numeric_cols, categorical_cols)
X_testnear_df = rebuild_df(X_testnear_imputed, numeric_cols, categorical_cols)
X_test_df = rebuild_df(X_test_imputed, numeric_cols, categorical_cols)

# -----------------------------
# Step 5: One-Hot Encode 
# -----------------------------
X_train_encoded = encode_xgboost(X_train_df)
X_val_encoded = encode_xgboost(X_val_df)
X_testnear_encoded = encode_xgboost(X_testnear_df)
X_test_encoded = encode_xgboost(X_test_df)

# -----------------------------
# Step 6: Align One-Hot Encoded Columns
# -----------------------------
all_cols = X_train_encoded.columns
X_val_encoded = X_val_encoded.reindex(columns=all_cols, fill_value=0)
X_testnear_encoded = X_testnear_encoded.reindex(columns=all_cols, fill_value=0)
X_test_encoded = X_test_encoded.reindex(columns=all_cols, fill_value=0)

# -----------------------------
# Step 7: Scale Numeric Columns
# -----------------------------
# Identify numeric columns from the encoded DataFrame that were originally numeric
final_numeric_cols = [col for col in numeric_cols if col in X_train_encoded.columns]

scaler = StandardScaler()
X_train_encoded[final_numeric_cols] = scaler.fit_transform(X_train_encoded[final_numeric_cols])
X_val_encoded[final_numeric_cols] = scaler.transform(X_val_encoded[final_numeric_cols])
X_testnear_encoded[final_numeric_cols] = scaler.transform(X_testnear_encoded[final_numeric_cols])
X_test_encoded[final_numeric_cols] = scaler.transform(X_test_encoded[final_numeric_cols])

# -----------------------------
# Step 8: Convert to PyTorch Tensors
# -----------------------------
X_train = torch.tensor(X_train_encoded.values, dtype=torch.float32)
y_train = torch.tensor(y_train.values, dtype=torch.float32)

X_val = torch.tensor(X_val_encoded.values, dtype=torch.float32)
y_val = torch.tensor(y_val.values, dtype=torch.float32)

X_testnear = torch.tensor(X_testnear_encoded.values, dtype=torch.float32)
y_testnear = torch.tensor(y_testnear.values, dtype=torch.float32)

X_test = torch.tensor(X_test_encoded.values, dtype=torch.float32)
y_test = torch.tensor(y_test.values, dtype=torch.float32)



  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


In [None]:
import torch
import torch.nn as nn
from sklearn.metrics import roc_auc_score, average_precision_score

# Model definition
class TabularTransformer(nn.Module):
    def __init__(self, num_features, dim=4, heads=1, depth=1, dropout=0.1):
        super().__init__()
        self.embedding = nn.Linear(num_features, dim) #Projects the original tabular input into a dense vector space of size dim.
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=dim, nhead=heads, dropout=dropout, batch_first=True
        ) #Transformer encoder layers
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=depth)
        self.classifier = nn.Sequential(
            nn.Linear(dim, 16), # linear embedding layer
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(16, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        x = self.embedding(x)
        x = x.unsqueeze(1)  # Add sequence dimension for transformer
        x = self.transformer(x)
        x = x.mean(dim=1)  # Global average pooling
        return self.classifier(x).squeeze()


In [7]:
from torch.utils.data import TensorDataset, DataLoader

# Instantiate model
model = TabularTransformer(num_features=X_train.shape[1])
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
# Create datasets
train_dataset = TensorDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

# Modify training loop
for epoch in range(10):
    model.train()
    epoch_loss = 0.0
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        y_pred = model(batch_X)
        loss = criterion(y_pred, batch_y)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {epoch_loss / len(train_loader):.4f}")



Epoch 1, Loss: 0.1229
Epoch 2, Loss: 0.1189
Epoch 3, Loss: 0.1185
Epoch 4, Loss: 0.1184
Epoch 5, Loss: 0.1183
Epoch 6, Loss: 0.1182
Epoch 7, Loss: 0.1181
Epoch 8, Loss: 0.1181
Epoch 9, Loss: 0.1180
Epoch 10, Loss: 0.1180


In [None]:
# Evaluation function 
def evaluate(model, X, y, label=""):
    model.eval()
    with torch.no_grad():
        preds = model(X)
        preds_np = preds.numpy()
        y_np = y.numpy()

        roc_auc = roc_auc_score(y_np, preds_np)
        auc_pr = average_precision_score(y_np, preds_np)

        print(f"{label} ROC-AUC: {roc_auc:.2f}")
        print(f"{label} AUC-PR: {auc_pr:.2f}")

# Evaluate on validation set (June)
evaluate(model, X_testnear, y_testnear, label="test near")

# Evaluate on test set (July–Sept)
evaluate(model, X_test, y_test, label="Test")

test near ROC-AUC: 0.77
test near AUC-PR: 0.13
Test ROC-AUC: 0.75
Test AUC-PR: 0.12


Reshaping the dataset

In [None]:
# import torch
# import torch.nn as nn
# import pandas as pd
# import numpy as np
# from sklearn.preprocessing import StandardScaler
# from sklearn.impute import SimpleImputer
# from sklearn.metrics import roc_auc_score, average_precision_score
# from torch.utils.data import Dataset, DataLoader
# from collections import defaultdict
# from sklearn.compose import ColumnTransformer
# # -------------------------------
# # Step 1: Preprocessing - Impute, Encode, and Scale
# # -------------------------------

# def preprocess_features(df, target_col="iit"):
#     features = df.drop(columns=[target_col, "SiteCode", "NAD", "key"], errors="ignore")
#     target = df[target_col]

#     numeric_cols = features.select_dtypes(include=np.number).columns.tolist()
#     categorical_cols = features.select_dtypes(exclude=np.number).columns.tolist()

#     # Impute
#     imputer = ColumnTransformer([
#         ('num', SimpleImputer(strategy='mean'), numeric_cols),
#         ('cat', SimpleImputer(strategy='most_frequent'), categorical_cols)
#     ])
#     imputed = imputer.fit_transform(features)
#     imputed_df = pd.DataFrame(imputed, columns=numeric_cols + categorical_cols)

#     # One-hot encode
#     encoded_df = pd.get_dummies(imputed_df, columns=categorical_cols)

#     # Scale numeric
#     scaler = StandardScaler()
#     encoded_df[numeric_cols] = scaler.fit_transform(encoded_df[numeric_cols])

#     return encoded_df, target, numeric_cols, encoded_df.columns

# # -------------------------------
# # Step 2: Convert to Sequences
# # -------------------------------
# #Builds sequences of visits per patient (key), sorted by date (NAD).
# # Extracts features as a sequence



# def create_sequences(df, feature_cols, label_col="iit", max_len=None): 
#     df = df.sort_values(["key", "NAD"])
#     grouped = df.groupby("key")

#     sequences = []
#     labels = []

#     for key, group in grouped:
#         feats = group[feature_cols].values
#         label = group[label_col].values[-1]  
#         if max_len:
#             if len(feats) < max_len:
#                 pad_len = max_len - len(feats)
#                 padding = np.zeros((pad_len, feats.shape[1]))
#                 feats = np.vstack([padding, feats])
#             else:
#                 feats = feats[-max_len:]
#         sequences.append(feats)
#         labels.append(label)

#     return torch.tensor(sequences, dtype=torch.float32), torch.tensor(labels, dtype=torch.float32)

# # -------------------------------
# # Step 3: Transformer Model for Sequences
# # -------------------------------

# class VisitTransformer(nn.Module):
#     def __init__(self, input_dim, model_dim=32, heads=2, depth=2, dropout=0.1):
#         super().__init__()
#         self.embedding = nn.Linear(input_dim, model_dim)
#         encoder_layer = nn.TransformerEncoderLayer(d_model=model_dim, nhead=heads, dropout=dropout, batch_first=True)
#         self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=depth)
#         self.classifier = nn.Sequential(
#             nn.Linear(model_dim, 16),
#             nn.ReLU(),
#             nn.Dropout(dropout),
#             nn.Linear(16, 1),
#             nn.Sigmoid()
#         )

#     def forward(self, x):
#         x = self.embedding(x)  # shape: [batch, seq_len, model_dim]
#         x = self.transformer(x)  # same shape
#         x = x.mean(dim=1)  # average over sequence
#         return self.classifier(x).squeeze()

# # -------------------------------
# # Step 4: Dataset Wrapper
# # -------------------------------

# class SequenceDataset(Dataset):
#     def __init__(self, X_seq, y):
#         self.X = X_seq
#         self.y = y

#     def __len__(self):
#         return len(self.X)

#     def __getitem__(self, idx):
#         return self.X[idx], self.y[idx]

# # -------------------------------
# # Step 5: Training the Model
# # -------------------------------

# def train_model(model, dataloader, val_loader, epochs=10):
#     optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
#     criterion = nn.BCELoss()

#     for epoch in range(epochs):
#         model.train()
#         total_loss = 0
#         for X_batch, y_batch in dataloader:
#             optimizer.zero_grad()
#             preds = model(X_batch)
#             loss = criterion(preds, y_batch)
#             loss.backward()
#             optimizer.step()
#             total_loss += loss.item()

#         print(f"Epoch {epoch+1}, Loss: {total_loss/len(dataloader):.4f}")
#         evaluate(model, val_loader)

# # -------------------------------
# # Step 6: Evaluation
# # -------------------------------

# def evaluate(model, loader):
#     model.eval()
#     all_preds, all_labels = [], []
#     with torch.no_grad():
#         for X, y in loader:
#             preds = model(X)
#             all_preds.extend(preds.cpu().numpy())
#             all_labels.extend(y.cpu().numpy())

#     roc = roc_auc_score(all_labels, all_preds)
#     pr = average_precision_score(all_labels, all_preds)
#     print(f"ROC-AUC: {roc:.3f}, AUC-PR: {pr:.3f}")
# # -------------------------------
# # Step 7: Execution
# # -------------------------------

# # Preprocess full dataset
# full_features, full_target, num_cols, feat_cols = preprocess_features(iit_data)

# # Append back identifiers
# iit_data_proc = pd.concat([iit_data[["key", "NAD", "iit"]].reset_index(drop=True), full_features.reset_index(drop=True)], axis=1)

# # Split into train/val/test
# train_df = iit_data_proc[(iit_data_proc["NAD"] >= "2024-01-01") & (iit_data_proc["NAD"] <= "2024-05-31")]
# val_df = iit_data_proc[(iit_data_proc["NAD"] >= "2024-06-01") & (iit_data_proc["NAD"] <= "2024-06-30")]
# test_df = iit_data_proc[(iit_data_proc["NAD"] >= "2024-07-01") & (iit_data_proc["NAD"] <= "2024-09-30")]

# # Define max sequence length
# MAX_SEQ_LEN = 10  # Adjust based on your data distribution

# # Create sequences
# X_train_seq, y_train_seq = create_sequences(train_df, feat_cols, max_len=MAX_SEQ_LEN)
# X_val_seq, y_val_seq = create_sequences(val_df, feat_cols, max_len=MAX_SEQ_LEN)
# X_test_seq, y_test_seq = create_sequences(test_df, feat_cols, max_len=MAX_SEQ_LEN)

# # DataLoaders
# train_loader = DataLoader(SequenceDataset(X_train_seq, y_train_seq), batch_size=64, shuffle=True)
# val_loader = DataLoader(SequenceDataset(X_val_seq, y_val_seq), batch_size=64)
# test_loader = DataLoader(SequenceDataset(X_test_seq, y_test_seq), batch_size=64)

# # Initialize and train model
# model = VisitTransformer(input_dim=X_train_seq.shape[2])
# train_model(model, train_loader, val_loader, epochs=10)

# # Final evaluation
# evaluate(model, test_loader)



In [None]:
# # -------------------------------
# # Step 7: Execution
# # -------------------------------

# # Preprocess full dataset
# full_features, full_target, num_cols, feat_cols = preprocess_features(iit_data)

# # Append back identifiers
# iit_data_proc = pd.concat([iit_data[["key", "NAD", "iit"]].reset_index(drop=True), full_features.reset_index(drop=True)], axis=1)

# # Split into train/val/test
# train_df = iit_data_proc[(iit_data_proc["NAD"] >= "2024-01-01") & (iit_data_proc["NAD"] <= "2024-05-31")]
# val_df = iit_data_proc[(iit_data_proc["NAD"] >= "2024-06-01") & (iit_data_proc["NAD"] <= "2024-06-30")]
# test_df = iit_data_proc[(iit_data_proc["NAD"] >= "2024-07-01") & (iit_data_proc["NAD"] <= "2024-09-30")]

# # Define max sequence length
# MAX_SEQ_LEN = 10  # Adjust based on your data distribution

# # Create sequences
# X_train_seq, y_train_seq = create_sequences(train_df, feat_cols, max_len=MAX_SEQ_LEN)
# X_val_seq, y_val_seq = create_sequences(val_df, feat_cols, max_len=MAX_SEQ_LEN)
# X_test_seq, y_test_seq = create_sequences(test_df, feat_cols, max_len=MAX_SEQ_LEN)

# # DataLoaders
# train_loader = DataLoader(SequenceDataset(X_train_seq, y_train_seq), batch_size=64, shuffle=True)
# val_loader = DataLoader(SequenceDataset(X_val_seq, y_val_seq), batch_size=64)
# test_loader = DataLoader(SequenceDataset(X_test_seq, y_test_seq), batch_size=64)

# # Initialize and train model
# model = VisitTransformer(input_dim=X_train_seq.shape[2])
# train_model(model, train_loader, val_loader, epochs=10)

# # Final evaluation
# evaluate(model, test_loader)

In [9]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score, average_precision_score
from torch.utils.data import Dataset, DataLoader
from sklearn.compose import ColumnTransformer
from datetime import datetime

# -------------------------------
# Step 1: Preprocessing
# -------------------------------
def preprocess_features(df, target_col="iit"):
    features = df.drop(columns=[target_col, "SiteCode", "NAD", "key"], errors="ignore")
    target = df[target_col]

    numeric_cols = features.select_dtypes(include=np.number).columns.tolist()
    categorical_cols = features.select_dtypes(exclude=np.number).columns.tolist()

    imputer = ColumnTransformer([
        ('num', SimpleImputer(strategy='mean'), numeric_cols),
        ('cat', SimpleImputer(strategy='most_frequent'), categorical_cols)
    ])
    imputed = imputer.fit_transform(features)
    imputed_df = pd.DataFrame(imputed, columns=numeric_cols + categorical_cols)

    encoded_df = pd.get_dummies(imputed_df, columns=categorical_cols)

    scaler = StandardScaler()
    encoded_df[numeric_cols] = scaler.fit_transform(encoded_df[numeric_cols])

    return encoded_df, target, numeric_cols, encoded_df.columns

# -------------------------------
# Step 2: Build Sequences
# -------------------------------
def create_sequences(df, feature_cols, label_col="label_for_eval", max_len=10):
    sequences, labels = [], []
    for key, group in df.groupby("key"):
        group = group.sort_values("NAD")
        features = group[feature_cols].values
        labels_seq = group[label_col].values

        for i in range(1, len(features) + 1):
            seq = features[max(0, i - max_len):i]
            label = labels_seq[i - 1]

            if np.isnan(label):
                continue

            if len(seq) < max_len:
                pad_len = max_len - len(seq)
                seq = np.pad(seq, ((pad_len, 0), (0, 0)), mode="constant")
            sequences.append(seq)
            labels.append(label)

    return torch.tensor(sequences, dtype=torch.float32), torch.tensor(labels, dtype=torch.float32)

# -------------------------------
# Step 3: Transformer Model
# -------------------------------
class VisitTransformer(nn.Module):
    def __init__(self, input_dim, model_dim=32, heads=2, depth=2, dropout=0.1):
        super().__init__()
        self.embedding = nn.Linear(input_dim, model_dim)
        encoder_layer = nn.TransformerEncoderLayer(d_model=model_dim, nhead=heads, dropout=dropout, batch_first=True)
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=depth)
        self.classifier = nn.Sequential(
            nn.Linear(model_dim, 16),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(16, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        x = self.embedding(x)
        x = self.transformer(x)
        x = x.mean(dim=1)
        return self.classifier(x).squeeze()

# -------------------------------
# Step 4: Dataset Wrapper
# -------------------------------
class SequenceDataset(Dataset):
    def __init__(self, X_seq, y):
        self.X = X_seq
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

# -------------------------------
# Step 5: Training & Evaluation
# -------------------------------
def train_model(model, dataloader, val_loader, epochs=10):
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    criterion = nn.BCELoss()

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for X_batch, y_batch in dataloader:
            optimizer.zero_grad()
            preds = model(X_batch)
            loss = criterion(preds, y_batch)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        print(f"Epoch {epoch+1}, Loss: {total_loss/len(dataloader):.4f}")
        evaluate(model, val_loader)

def evaluate(model, loader):
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for X, y in loader:
            preds = model(X)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(y.cpu().numpy())

    roc = roc_auc_score(all_labels, all_preds)
    pr = average_precision_score(all_labels, all_preds)
    print(f"ROC-AUC: {roc:.3f}, AUC-PR: {pr:.3f}")

# -------------------------------
# Step 6: Helper for History + Labels
# -------------------------------
def get_split_data(df, label_start, label_end):
    label_start = pd.to_datetime(label_start)
    label_end = pd.to_datetime(label_end)

    target_keys = df[(df["NAD"] >= label_start) & (df["NAD"] <= label_end)]["key"].unique()
    history = df[(df["key"].isin(target_keys)) & (df["NAD"] <= label_end)].copy()

    label_visits = df[(df["key"].isin(target_keys)) & (df["NAD"] >= label_start) & (df["NAD"] <= label_end)][["key", "NAD"]]
    label_visits["label_marker"] = 1

    history = pd.merge(history, label_visits, on=["key", "NAD"], how="left")
    history["label_marker"] = history["label_marker"].fillna(0)
    history["label_for_eval"] = history["iit"].where(history["label_marker"] == 1, np.nan)

    return history

# -------------------------------
# Step 7: Run Everything
# -------------------------------
# Preprocess
full_features, full_target, num_cols, feat_cols = preprocess_features(iit_data)
iit_data_proc = pd.concat([iit_data[["key", "NAD", "iit"]].reset_index(drop=True),
                           full_features.reset_index(drop=True)], axis=1)
iit_data_proc["NAD"] = pd.to_datetime(iit_data_proc["NAD"])

MAX_SEQ_LEN = 10

# ---- Train ----
train_df = iit_data_proc[
    (iit_data_proc["NAD"] >= pd.to_datetime("2023-08-01")) &
    (iit_data_proc["NAD"] <= pd.to_datetime("2024-05-31"))
].copy()
train_df["label_for_eval"] = train_df["iit"]
X_train_seq, y_train_seq = create_sequences(train_df, feat_cols, max_len=MAX_SEQ_LEN)

# ---- Validation ----
val_df = get_split_data(iit_data_proc, "2024-06-01", "2024-06-30")
X_val_seq, y_val_seq = create_sequences(val_df, feat_cols, max_len=MAX_SEQ_LEN)

# ---- Test ----
test_df = get_split_data(iit_data_proc, "2024-07-01", "2024-09-30")
X_test_seq, y_test_seq = create_sequences(test_df, feat_cols, max_len=MAX_SEQ_LEN)

# ---- Loaders ----
train_loader = DataLoader(SequenceDataset(X_train_seq, y_train_seq), batch_size=64, shuffle=True)
val_loader = DataLoader(SequenceDataset(X_val_seq, y_val_seq), batch_size=64)
test_loader = DataLoader(SequenceDataset(X_test_seq, y_test_seq), batch_size=64)

# ---- Model ----
model = VisitTransformer(input_dim=X_train_seq.shape[2])
train_model(model, train_loader, val_loader, epochs=10)

# ---- Final Eval ----
print("\nFinal Evaluation on Test Set:")
evaluate(model, test_loader)


  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return torch.tensor(sequences, dtype=torch.float32), torch.tensor(labels, dtype=torch.float32)


Epoch 1, Loss: 0.1244
ROC-AUC: 0.750, AUC-PR: 0.099
Epoch 2, Loss: 0.1223
ROC-AUC: 0.762, AUC-PR: 0.117
Epoch 3, Loss: 0.1216
ROC-AUC: 0.748, AUC-PR: 0.113
Epoch 4, Loss: 0.1214
ROC-AUC: 0.750, AUC-PR: 0.104
Epoch 5, Loss: 0.1212
ROC-AUC: 0.752, AUC-PR: 0.120
Epoch 6, Loss: 0.1208
ROC-AUC: 0.754, AUC-PR: 0.126
Epoch 7, Loss: 0.1206
ROC-AUC: 0.752, AUC-PR: 0.122
Epoch 8, Loss: 0.1204
ROC-AUC: 0.757, AUC-PR: 0.127
Epoch 9, Loss: 0.1203
ROC-AUC: 0.757, AUC-PR: 0.129
Epoch 10, Loss: 0.1202
ROC-AUC: 0.748, AUC-PR: 0.122

Final Evaluation on Test Set:
ROC-AUC: 0.726, AUC-PR: 0.091
