In [1]:
import sys
import os
import logging
import pandas as pd
import datasets
from pprint import pprint
# ---------------------- Workspace Setup ----------------------
# Resolve the workspace root from the current directory, switch into it, and
# make the pipeline code importable.
KEY = '1-CGMLSM'
# Everything before the first occurrence of KEY in the cwd is taken as the
# workspace root; if KEY does not occur, the cwd itself is used unchanged.
WORKSPACE_PATH = os.getcwd().split(KEY)[0]
print(WORKSPACE_PATH)
os.chdir(WORKSPACE_PATH)

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO, format='[%(levelname)s:%(asctime)s:(%(filename)s@%(lineno)d %(name)s)]: %(message)s')

# Directory layout, all relative to WORKSPACE_PATH (the cwd after the chdir above).
SPACE = {
    'DATA_RAW': '_Data/0-Data_Raw',
    'DATA_RFT': '_Data/1-Data_RFT',
    'DATA_CASE': '_Data/2-Data_CASE',
    'DATA_CFDATA': '_Data/3-Data_CFDATA',
    'DATA_SPLIT': '_Data/4-Data_Split',
    'DATA_HFDATA': '_Data/5-Data_HFData',
    'DATA_EXTERNAL': 'code/external',
    'CODE_FN': 'code/pipeline',
    'MODEL_ROOT': './_Model',
}
# Fail early if the notebook was started outside the expected workspace.
assert os.path.exists(SPACE['CODE_FN']), f'{SPACE["CODE_FN"]} not found'
print(SPACE['CODE_FN'])
# Guarded so that re-running this cell does not pile up duplicate sys.path entries.
if SPACE['CODE_FN'] not in sys.path:
    sys.path.append(SPACE['CODE_FN'])

  from .autonotebook import tqdm as notebook_tqdm


/Users/floydluo/Desktop/cgmlsm-dev/
code/pipeline


In [2]:
from nn.cgmlsm.configuration_cgmlsm import CgmLsmConfig
from nn.cgmlsm.modeling_cgmlsm import CgmLsmLMHeadModel

In [4]:
# NOTE(review): in the visible part of this notebook, `Name_to_Data` is only
# defined by the commented-out code below, yet later cells read it
# (`Name_to_Data.keys()`, `Name_to_Data['In-Train']`). On a fresh kernel those
# cells would raise NameError unless this cell is re-enabled or `Name_to_Data`
# is built elsewhere — TODO confirm the intended data source.
# HFDataName = 'CgmLsm_WellDoc_ds0p10'
# path = os.path.join(SPACE['DATA_HFDATA'], HFDataName)
# split_to_dataset = datasets.load_from_disk(path)
# remove_unused_columns = True # if using the processed dataset, set to True.
# print(split_to_dataset)
# Name_to_Data = {i: {'ds_tfm': split_to_dataset[i]} for i in split_to_dataset}
# # exit()

In [1]:
import sys
import os
import logging
import pandas as pd
import datasets
import numpy as np
import torch
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import (
    RobertaConfig,
    RobertaForSequenceClassification,
    PreTrainedTokenizerFast,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.pre_tokenizers import Whitespace
from pprint import pprint
from transformers import (
    Trainer,
    TrainingArguments,
    default_data_collator,
    set_seed,
)
# Calculate AUC (Area Under the ROC Curve)
# For multi-class, we use one-vs-rest approach
from sklearn.preprocessing import label_binarize
from sklearn.metrics import roc_auc_score

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# ---------------------- CF Vocab Setup ----------------------
# Maps each CF name to its vocabulary: an index->token list and the inverse dict.
CF_to_CFvocab = {}  # data_config['CF_to_CFvocab']

# HM5MinStep: one "HH:MM" token for every 5-minute slot of a 24-hour day.
CFName = 'HM5MinStep'
interval_delta = pd.Timedelta(minutes=5)
idx2tkn = [
    f'{stamp.hour:02d}:{stamp.minute:02d}'
    for stamp in (
        pd.Timestamp('2022-01-01 00:00:00') + interval_delta * step
        for step in range(24 * 12)
    )
]
tkn2idx = dict(zip(idx2tkn, range(len(idx2tkn))))
CF_to_CFvocab[CFName] = {'idx2tkn': idx2tkn, 'tkn2idx': tkn2idx}

# CGMValue: three special tokens, seven "Other_*" slots, then the integer
# strings "10".."400" (so each numeric token sits at its own value as index).
CFName = 'CGMValue'
idx2tkn = ["PAD", "UNKNOWN", "MASK"]
idx2tkn.extend(f'Other_{i}' for i in range(0, 7))
idx2tkn.extend(str(i) for i in range(10, 401))
tkn2idx = dict(zip(idx2tkn, range(len(idx2tkn))))
CF_to_CFvocab[CFName] = {'idx2tkn': idx2tkn, 'tkn2idx': tkn2idx}

# ---------------------- Tokenizer Setup ----------------------
# Word-level tokenizer built directly on the CGMValue vocabulary: token ids
# are the positions of the tokens in the vocabulary list.
idx2tkn = CF_to_CFvocab['CGMValue']['idx2tkn']
vocab_dict = dict(zip(idx2tkn, range(len(idx2tkn))))

# Backend tokenizer: tokens missing from the vocabulary map to "UNKNOWN".
wordlevel = WordLevel(vocab=vocab_dict, unk_token="UNKNOWN")
tokenizer_backend = Tokenizer(wordlevel)
tokenizer_backend.pre_tokenizer = Whitespace()

# Wrap the backend for use with the transformers API, registering the
# special tokens that already exist in the vocabulary.
tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer_backend,
    unk_token="UNKNOWN",
    pad_token="PAD",
    mask_token="MASK",
)

In [6]:
# Tokenizer vocabulary size; the same value is passed as `vocab_size` to CgmLsmConfig.
len(tokenizer)

401

In [7]:
# Name of this run; also used as the sub-directory of MODEL_ROOT for outputs.
model_name = 'cgmlsm_pretrain_welldoc_v2024'

# Time-of-day ("HH:MM") token list from the HM5MinStep vocabulary.
hm_idx2tkn = CF_to_CFvocab['HM5MinStep']['idx2tkn']

# Build the model from a config whose vocabulary size matches the tokenizer;
# all other config fields keep the CgmLsmConfig defaults.
config = CgmLsmConfig(vocab_size=len(tokenizer))
model = CgmLsmLMHeadModel(config)
model

CgmLsmLMHeadModel(
  (transformer): CgmLsmModel(
    (wte): Embedding(401, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x AttnBlock(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=401, bias=False)
)

In [8]:

def compute_metrics(pred):
    """Compute next-token loss, perplexity and accuracy for an evaluation pass.

    Args:
        pred: object exposing ``predictions`` (logits with the vocabulary on the
            last axis and the sequence on the second-to-last axis, or a tuple
            whose first element is those logits) and ``label_ids`` (token ids
            with the same leading axes; positions equal to -100 are ignored).

    Returns:
        dict with float entries ``next_token_loss`` (mean cross-entropy over
        non-ignored positions), ``perplexity`` (``exp`` of that loss) and
        ``accuracy`` (fraction of non-ignored positions where the arg-max
        prediction equals the label). If every position is ignored, the loss
        and accuracy are NaN, as there is nothing to average over.
    """
    raw_logits = pred.predictions
    # Some models return several outputs; the Trainer then passes a tuple and
    # the logits are its first element.
    if isinstance(raw_logits, tuple):
        raw_logits = raw_logits[0]

    # as_tensor reuses the NumPy buffer instead of copying the (potentially
    # very large) logits array; nothing below mutates it.
    logits = torch.as_tensor(raw_logits)
    # CrossEntropyLoss expects int64 class indices.
    labels = torch.as_tensor(pred.label_ids).long()

    # Shift for next-token prediction: position t predicts the token at t + 1.
    shift_logits = logits[..., :-1, :]
    shift_labels = labels[..., 1:]

    # Flatten to (num_positions, vocab) and (num_positions,).
    shift_logits = shift_logits.reshape(-1, shift_logits.size(-1))
    shift_labels = shift_labels.reshape(-1)

    # Mean cross-entropy, skipping positions labelled -100.
    loss_fct = torch.nn.CrossEntropyLoss(ignore_index=-100)
    next_token_loss = loss_fct(shift_logits, shift_labels)

    # Accuracy over the same non-ignored positions.
    predictions = torch.argmax(shift_logits, dim=-1)
    mask = shift_labels != -100
    correct = (predictions == shift_labels) & mask
    accuracy = correct.sum().float() / mask.sum().float()

    perplexity = torch.exp(next_token_loss)

    return {
        'next_token_loss': next_token_loss.item(),
        'perplexity': perplexity.item(),
        'accuracy': accuracy.item()
    }

In [9]:
# ---------------------- Training Arguments ----------------------
# Hyper-parameters for the Trainer run; checkpoints and logs are written
# under MODEL_ROOT/<model_name>.
training_args = TrainingArguments(
    output_dir=os.path.join(SPACE['MODEL_ROOT'], model_name),

    do_train=True,
    do_eval=True,

    num_train_epochs=2,  # two passes over the training set
    per_device_train_batch_size=128,
    per_device_eval_batch_size=64,
    gradient_accumulation_steps=2,  # effective train batch size per device = 128 * 2 = 256

    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_steps=1000,
    max_grad_norm=1.0,

    logging_steps=1,  # log at every step

    # Evaluation settings
    eval_strategy="steps",
    eval_steps=200,  # evaluate every 200 steps

    # Checkpointing. NOTE(review): per the transformers docs a float save_steps
    # below 1 is a fraction of total training steps (0.1 -> roughly every 10%),
    # not "once per epoch" — confirm this is the intended cadence.
    save_strategy="steps",
    save_steps=0.1,
    save_total_limit=10,

    # No best model logic for now
    # load_best_model_at_end=True,
    # metric_for_best_model="perplexity",
    # greater_is_better=False,

    report_to="wandb",
    prediction_loss_only=False,
    remove_unused_columns=False,
    dataloader_drop_last=True,

    dataloader_num_workers=8,  # number of data-loading worker processes
)


In [10]:
# Show the names of the available dataset splits.
print(list(Name_to_Data.keys()))

['In-Train', 'In-Valid_T1D', 'In-Valid_T2D', 'In-Test_T1D', 'In-Test_T2D', 'Out_T1D', 'Out_T2D']


In [None]:
eval_set_size = 1042  # maximum number of examples kept per evaluation split
random_seed = 42      # fixed seed so the same subset is drawn on every run


def _subsample_split(split_name, max_rows=eval_set_size, seed=random_seed):
    """Return a seeded random subset of `Name_to_Data[split_name]['ds_tfm']`.

    The split is shuffled with `seed` and truncated to at most `max_rows`
    rows. Capping at the split length avoids the out-of-range error that
    `select(range(max_rows))` raises when a split has fewer rows than requested.
    """
    ds = Name_to_Data[split_name]['ds_tfm']
    n_rows = min(max_rows, len(ds))
    return ds.shuffle(seed=seed).select(range(n_rows))


# Training uses the full In-Train split.
ds_tfm_train = Name_to_Data['In-Train']['ds_tfm']

# Evaluation key -> source split name (insertion order is the evaluation order).
_EVAL_SPLIT_NAMES = {
    'valid_t1d': 'In-Valid_T1D',
    'valid_t2d': 'In-Valid_T2D',
    'testid_t1d': 'In-Test_T1D',
    'testid_t2d': 'In-Test_T2D',
    'testod_t1d': 'Out_T1D',
    'testod_t2d': 'Out_T2D',
}
eval_dict = {
    eval_key: _subsample_split(split_name)
    for eval_key, split_name in _EVAL_SPLIT_NAMES.items()
}

# Keep the individual names available for cells that reference them directly.
ds_tfm_valid_t1d = eval_dict['valid_t1d']
ds_tfm_valid_t2d = eval_dict['valid_t2d']
ds_tfm_testid_t1d = eval_dict['testid_t1d']
ds_tfm_testid_t2d = eval_dict['testid_t2d']
ds_tfm_testod_t1d = eval_dict['testod_t1d']
ds_tfm_testod_t2d = eval_dict['testod_t2d']

print(ds_tfm_train)
print(eval_dict)