In [1]:
from huggingface_hub import login
login()   # interactive prompt: paste your Hugging Face access token when asked

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [2]:
!pip install --upgrade datasets -q

In [3]:
# Optional clean-up: delete previously cached copies of this dataset so the
# load below does not reuse them (custom Colab dir first, then the default cache).
!rm -rf /content/hf_cache
!rm -rf ~/.cache/huggingface/datasets/nico8771___swda_processed # Clear default cache too if needed

from datasets import load_dataset
import os

# Load the dataset from the Hugging Face Hub using the default cache location.
dataset = load_dataset(
    "nico8771/swda_processed"
    # cache_dir is intentionally not passed
)

# Show the available splits, their columns and their row counts.
print(dataset)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Generating train split:   0%|          | 0/192386 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3272 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/4078 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['caller', 'text', 'act_tag', 'conversation_no', 'speaker_change'],
        num_rows: 192386
    })
    validation: Dataset({
        features: ['caller', 'text', 'act_tag', 'conversation_no', 'speaker_change'],
        num_rows: 3272
    })
    test: Dataset({
        features: ['caller', 'text', 'act_tag', 'conversation_no', 'speaker_change'],
        num_rows: 4078
    })
})


In [4]:
from collections import Counter
import pandas as pd
import torch

# ------------------------------------------------------------------
# 1.  Per-split class statistics
# ------------------------------------------------------------------
def split_stats(ds_dict, label_key="act_tag"):
    """Tabulate label frequencies for every split and for all splits combined.

    Args:
        ds_dict: Mapping of split name -> dataset-like object; indexing the
            object with ``label_key`` must yield an iterable of labels.
        label_key: Name of the label column to count.

    Returns:
        pandas.DataFrame with one row per (split, label) pair and the columns
        ``split``, ``label``, ``count`` and ``pct`` (percentage within that
        split). Rows for the pseudo-split ``"ALL"`` follow the per-split rows
        and hold the counts summed over every split.
    """
    def _as_rows(split_name, counts):
        # One record per label; percentages are relative to this split's total.
        denom = sum(counts.values())
        return [
            {"split": split_name, "label": label,
             "count": n,
             "pct": 100.0 * n / denom}
            for label, n in counts.items()
        ]

    records = []
    overall = Counter()
    for split_name, split_ds in ds_dict.items():
        per_split = Counter(split_ds[label_key])
        records.extend(_as_rows(split_name, per_split))
        overall.update(per_split)

    # Global totals across every split.
    records.extend(_as_rows("ALL", overall))

    return pd.DataFrame(records)

# Build the per-split / global label table and peek at its first rows.
stats_df = split_stats(dataset)
print(stats_df.head())          # sanity check

# ------------------------------------------------------------------
# 2.  Inverse-√freq class-weight tensor
# ------------------------------------------------------------------
# Distinct labels in sorted order; position i of the weight tensor below
# belongs to label_list[i].
label_list = sorted(set(stats_df["label"].tolist()))

# Global ("ALL") count of every label, arranged in label_list order.
freq = torch.tensor(
    stats_df[stats_df["split"] == "ALL"]
    .set_index("label")["count"]
    .reindex(label_list)
    .to_numpy(),
    dtype=torch.float,
)

# weight = 1 / sqrt(count), then rescaled so that the weights average to 1.
class_weights = 1.0 / freq.sqrt()
class_weights = class_weights / class_weights.mean()
print(class_weights)                           # plug into CrossEntropyLoss

   split  label  count        pct
0  train     20    857   0.445459
1  train     31    631   0.327986
2  train     38  25087  13.039930
3  train      5  10770   5.598120
4  train     37  72549  37.710124
tensor([0.1671, 0.7659, 2.1470, 0.8733, 0.6582, 0.1953, 2.0193, 0.7545, 1.1103,
        1.4313, 0.1053, 0.7857, 0.2986, 2.0291, 0.6674, 0.6346, 0.5698, 1.1929,
        2.3169, 0.4053, 0.6946, 1.3729, 2.3317, 0.5881, 0.7076, 1.1850, 0.5556,
        1.2220, 0.3741, 1.9635, 0.8588, 0.8040, 1.4245, 0.4633, 2.3024, 0.2996,
        0.5901, 0.0751, 0.1267, 2.0291, 1.9038])


In [5]:
from collections import defaultdict
from datasets import Dataset, DatasetDict
from tqdm.auto import tqdm

# Maximum number of utterances per chunk: chunk_conversation slices each
# conversation into windows of this size, and collate_fn pads every chunk up to it.
CHUNK_SIZE = 64

# 1) Build a conversation‐level Dataset: one row per conversation
def make_conversation_dataset(split_ds):
    """Regroup an utterance-level split into one row per conversation.

    Args:
        split_ds: Iterable of utterance rows; each row must provide the keys
            ``caller``, ``text``, ``act_tag``, ``speaker_change`` and
            ``conversation_no``.

    Returns:
        A ``Dataset`` with one row per distinct ``conversation_no``. The four
        per-utterance fields become lists holding that conversation's values
        in the order the rows were encountered; conversations appear in order
        of first occurrence.
    """
    per_utt_keys = ("caller", "text", "act_tag", "speaker_change")

    # conversation id -> {field: [values in encounter order]}
    grouped = defaultdict(lambda: {key: [] for key in per_utt_keys})
    for utt in tqdm(split_ds, desc="Grouping into conversations", leave=False):
        bucket = grouped[utt["conversation_no"]]
        for key in per_utt_keys:
            bucket[key].append(utt[key])

    # Attach the id to every bucket and hand the records to `datasets`.
    return Dataset.from_list(
        [{**bucket, "conversation_no": conv_id} for conv_id, bucket in grouped.items()]
    )

# 2) Chunk each conversation‐row into windows of up to CHUNK_SIZE
def chunk_conversation(batch, chunk_size=None):
    """Split conversation rows into consecutive windows of utterances.

    Written for ``Dataset.map(..., batched=True)``: every column of ``batch``
    is a list with one entry per conversation. All conversations in the batch
    are processed, so the result is correct for any ``batch_size`` (the
    previous version only looked at the first row of the batch).

    Args:
        batch: Dict with the columns ``text``, ``caller``, ``act_tag`` and
            ``speaker_change`` (each a list of per-conversation lists) and
            ``conversation_no`` (a list of conversation ids).
        chunk_size: Maximum number of utterances per chunk. ``None`` (default)
            falls back to the module-level ``CHUNK_SIZE``.

    Returns:
        Dict with the same five columns, holding one entry per chunk. The last
        chunk of a conversation may be shorter than ``chunk_size``; a
        conversation without utterances produces no chunk.

    Raises:
        ValueError: If the effective chunk size is not positive.
    """
    if chunk_size is None:
        chunk_size = CHUNK_SIZE
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    per_utt_keys = ("text", "caller", "act_tag", "speaker_change")
    out = {
        "text": [], "caller": [], "act_tag": [],
        "speaker_change": [], "conversation_no": [],
    }

    for conv_idx, cid in enumerate(batch["conversation_no"]):
        n_utts = len(batch["text"][conv_idx])
        for start in range(0, n_utts, chunk_size):
            end = start + chunk_size
            for key in per_utt_keys:
                out[key].append(batch[key][conv_idx][start:end])
            # Every chunk remembers which conversation it was cut from.
            out["conversation_no"].append(cid)

    return out

# 3) Apply to each split
raw = dataset  # alias for the utterance-level DatasetDict loaded earlier

# Stage 1: regroup every split so that one row holds one whole conversation.
conv_ds = {
    split: make_conversation_dataset(raw[split])
    for split in ("train", "validation", "test")
}

# Stage 2: cut each conversation row into chunks. The conversation-level
# columns are dropped so that only the columns returned by chunk_conversation
# remain in the result.
chunked = DatasetDict()
for split in conv_ds:
    chunked[split] = conv_ds[split].map(
        chunk_conversation,
        batched=True,
        batch_size=1,
        num_proc=4,
        remove_columns=conv_ds[split].column_names,
        desc=f"Chunking {split}",
    )


Grouping into conversations:   0%|          | 0/192386 [00:00<?, ?it/s]

Grouping into conversations:   0%|          | 0/3272 [00:00<?, ?it/s]

Grouping into conversations:   0%|          | 0/4078 [00:00<?, ?it/s]

Chunking train (num_proc=4):   0%|          | 0/1115 [00:00<?, ? examples/s]

Chunking validation (num_proc=4):   0%|          | 0/21 [00:00<?, ? examples/s]

Chunking test (num_proc=4):   0%|          | 0/19 [00:00<?, ? examples/s]

In [6]:
import torch
from transformers import AutoTokenizer

# Tokenizer loaded from the "roberta-base" checkpoint; collate_fn uses it to
# tokenize every utterance of a batch in a single call.
tokenizer    = AutoTokenizer.from_pretrained("roberta-base")
PAD_LABEL_ID = -100  # label value collate_fn writes into padded utterance slots


def collate_fn(batch):
    """Collate a list of chunk examples into fixed-shape tensors.

    Every chunk is padded to ``CHUNK_SIZE`` utterances (empty strings for
    text, 0 for ``speaker_change``, ``PAD_LABEL_ID`` for labels). All
    utterances of the batch are then tokenized together with the module-level
    ``tokenizer`` and reshaped to one row of utterances per chunk.

    Args:
        batch: List of examples, each with the list-valued keys ``text``,
            ``speaker_change`` and ``act_tag``.

    Returns:
        Dict of tensors:
            ``input_ids`` / ``attention_mask``: [bs, CHUNK_SIZE, seq_len]
            ``speaker_change`` / ``utt_mask`` / ``labels``: [bs, CHUNK_SIZE]
        where ``utt_mask`` is 1 for real utterances and 0 for padded ones.
    """
    bs = len(batch)

    # Number of padding utterances each chunk needs (derived from its texts).
    pad_counts = [CHUNK_SIZE - len(ex["text"]) for ex in batch]

    padded_texts = [ex["text"] + [""] * k for ex, k in zip(batch, pad_counts)]
    padded_speaker = [ex["speaker_change"] + [0] * k for ex, k in zip(batch, pad_counts)]
    padded_labels = [ex["act_tag"] + [PAD_LABEL_ID] * k for ex, k in zip(batch, pad_counts)]
    utt_masks = [[1] * len(ex["text"]) + [0] * k for ex, k in zip(batch, pad_counts)]

    # Flatten to bs * CHUNK_SIZE utterances and tokenize them in one call,
    # padding to the longest utterance of this batch.
    flat_utts = [utt for chunk in padded_texts for utt in chunk]
    enc = tokenizer(
        flat_utts,
        padding="longest",
        truncation=True,
        return_tensors="pt"
    )

    # Restore the [bs, CHUNK_SIZE, seq_len] layout.
    seq_len = enc["input_ids"].size(-1)
    target_shape = (bs, CHUNK_SIZE, seq_len)

    return {
        "input_ids":      enc["input_ids"].view(*target_shape),
        "attention_mask": enc["attention_mask"].view(*target_shape),
        "speaker_change": torch.tensor(padded_speaker, dtype=torch.long),
        "utt_mask":       torch.tensor(utt_masks, dtype=torch.long),
        "labels":         torch.tensor(padded_labels, dtype=torch.long),
    }


In [7]:
from torch.utils.data import DataLoader

# Training batches: chunks are reshuffled every epoch.
train_loader = DataLoader(
    chunked["train"],
    collate_fn=collate_fn,
    batch_size=2,           # chunks per batch; raise it if GPU memory allows
    num_workers=2,          # worker processes that run collate_fn
    shuffle=True,
)

# Validation batches: fixed order.
valid_loader = DataLoader(
    chunked["validation"],
    collate_fn=collate_fn,
    batch_size=2,
    num_workers=2,
    shuffle=False,
)


In [8]:
# model.py
import torch
import torch.nn as nn
from transformers import AutoModel
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

class DASeq2SeqClassifier(nn.Module):
    """
    Seq-to-seq dialogue-act (DA) tagger over a chunk of utterances.

    Pipeline:
      1. RoBERTa encodes every utterance; the first-token hidden state plus a
         speaker-change embedding is the utterance vector.
      2. A bidirectional GRU contextualises the utterance vectors.
      3. A GRU decoder emits one label per utterance, fed with the previous
         label embedding and the projected encoder output of that position.

    Behaviour:
      • when ``labels`` is given: teacher forcing / scheduled sampling and a
        token-level weighted cross-entropy loss
      • in eval mode: beam-search decoding of the label sequence

    Note: the loss weights are read from the notebook-level ``class_weights``
    tensor, which therefore has to exist before the model is constructed.
    """

    def __init__(
        self,
        num_labels: int,
        hidden_size: int = 128,
        pad_label_id: int = -100,
        bos_label_id: int = 0,
        teacher_forcing_ratio: float = 1,
        beam_size: int = 5,

    ):
        """
        Args:
            num_labels: Number of dialogue-act classes.
            hidden_size: Hidden size of the decoder GRU and of each direction
                of the encoder GRU.
            pad_label_id: Label value marking padded utterances; ignored by
                the loss and used to fill padded positions of ``pred_seq``.
            bos_label_id: Label id fed to the decoder at the first step.
            teacher_forcing_ratio: Probability (per sequence and step) of
                feeding the gold previous label instead of the model's own
                previous prediction.
            beam_size: Number of hypotheses kept during beam search.
        """
        super().__init__()
        self.roberta = AutoModel.from_pretrained("roberta-base")
        enc_dim = self.roberta.config.hidden_size

        # Two entries: speaker_change is expected to be 0 or 1.
        self.turn_emb = nn.Embedding(2, enc_dim)

        self.encoder_gru = nn.GRU(
            input_size=enc_dim,
            hidden_size=hidden_size,
            bidirectional=True,
            batch_first=True,
        )

        self.ctx_proj = nn.Linear(hidden_size * 2, hidden_size)
        self.init_dec = nn.Linear(hidden_size * 2, hidden_size)

        self.label_emb = nn.Embedding(num_labels, hidden_size)
        # Decoder input = [previous-label embedding ; projected context].
        self.decoder_gru = nn.GRU(
            input_size=hidden_size * 2,
            hidden_size=hidden_size,
            batch_first=True,
        )
        self.classifier = nn.Linear(hidden_size, num_labels)

        self.loss_fct = nn.CrossEntropyLoss(weight=class_weights, ignore_index=pad_label_id)

        self.pad_label_id = pad_label_id
        self.bos_label_id = bos_label_id
        self.teacher_forcing_ratio = teacher_forcing_ratio
        self.beam_size = beam_size

    def forward(
        self,
        input_ids,
        attention_mask,
        speaker_change,
        utt_mask,
        labels=None,
    ):
        """
        Args:
            input_ids: [bs, chunk, seq_len] token ids of every utterance.
            attention_mask: [bs, chunk, seq_len] token-level attention mask.
            speaker_change: [bs, chunk] indices into ``turn_emb``.
            utt_mask: [bs, chunk], 1 for real utterances and 0 for padding;
                its row sums are used as sequence lengths.
            labels: Optional [bs, chunk] gold labels (``pad_label_id`` on
                padded positions).

        Returns:
            Dict with
              ``logits`` [bs, chunk, num_labels] and ``loss`` (scalar) when
                ``labels`` is given;
              ``pred_seq`` [bs, chunk] when the module is in eval mode. Padded
                positions of ``pred_seq`` hold ``pad_label_id``.
        """
        bs, cs, sl = input_ids.size()

        # Encode every utterance independently; keep the first-token state.
        flat = input_ids.view(bs * cs, sl)
        flat_mask = attention_mask.view(bs * cs, sl)
        h = self.roberta(flat, attention_mask=flat_mask).last_hidden_state[:, 0]
        utt_emb = h.view(bs, cs, -1) + self.turn_emb(speaker_change)

        # Contextualise over the real utterances only (packed sequence).
        lengths = utt_mask.sum(dim=1).cpu()
        packed = pack_padded_sequence(utt_emb, lengths, batch_first=True, enforce_sorted=False)
        enc_out, _ = self.encoder_gru(packed)
        enc_out, _ = pad_packed_sequence(enc_out, batch_first=True, total_length=cs)

        ctx = self.ctx_proj(enc_out)                                       # [bs,cs,hidden]
        init_state = torch.tanh(self.init_dec(enc_out[:, 0])).unsqueeze(0)  # [1,bs,hidden]

        output = {}

        # Loss branch: runs whenever gold labels are supplied (train or eval).
        if labels is not None:
            logits = self._teacher_forced_logits(ctx, init_state, labels)
            output['logits'] = logits
            output['loss'] = self.loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))

        # Decoding branch: eval mode only. It always starts from the untouched
        # initial decoder state, so the result does not depend on whether the
        # loss branch ran before it.
        if not self.training:
            output['pred_seq'] = self._beam_search(ctx, init_state, lengths)

        return output

    def _teacher_forced_logits(self, ctx, init_state, labels):
        """Run the decoder with teacher forcing / scheduled sampling; return [bs,cs,num_labels] logits."""
        bs, cs = ctx.shape[:2]
        max_label = self.label_emb.num_embeddings - 1

        bos = torch.full((bs, 1), self.bos_label_id, device=labels.device, dtype=torch.long)
        shifted = torch.cat([bos, labels[:, :-1]], dim=1)   # gold label of the previous step
        prev = bos.squeeze(1)                                # model's own previous prediction
        state = init_state                                   # local copy: init_state stays intact

        logits_seq = []
        for t in range(cs):
            use_gold = torch.rand(bs, device=labels.device) < self.teacher_forcing_ratio
            inp = torch.where(use_gold, shifted[:, t], prev)
            # clamp maps pad_label_id (negative) onto a valid embedding row.
            emb = self.label_emb(inp.clamp(0, max_label))
            dec_in = torch.cat([emb, ctx[:, t]], dim=-1).unsqueeze(1)
            out, state = self.decoder_gru(dec_in, state)
            logit = self.classifier(out.squeeze(1))
            logits_seq.append(logit)
            prev = logit.argmax(-1)
        return torch.stack(logits_seq, dim=1)

    def _beam_search(self, ctx, init_state, lengths):
        """Beam-search the label sequence of every chunk; return [bs,cs] predictions (pad_label_id on padding)."""
        bs, cs = ctx.shape[:2]
        device = ctx.device
        max_label = self.label_emb.num_embeddings - 1
        # topk cannot return more entries than there are labels.
        width = min(self.beam_size, self.classifier.out_features)

        preds = torch.full((bs, cs), self.pad_label_id, device=device, dtype=torch.long)

        # Decoding produces integer ids only, so no autograd graph is needed.
        with torch.no_grad():
            for i in range(bs):
                # Decode the real utterances only; padded steps would add
                # meaningless scores to every hypothesis.
                n_steps = int(lengths[i])
                if n_steps == 0:
                    continue

                # Hypothesis = (log-prob sum, decoder state, previous label, labels so far)
                beams = [(0.0,
                          init_state[:, i:i + 1].clone(),
                          torch.full((), self.bos_label_id, device=device, dtype=torch.long),
                          [])]
                for t in range(n_steps):
                    candidates = []
                    for lp_sum, h_prev, prev_lbl, seq in beams:
                        emb = self.label_emb(prev_lbl.clamp(0, max_label)).unsqueeze(0)
                        dec_in = torch.cat([emb, ctx[i, t].unsqueeze(0)], dim=-1).unsqueeze(1)
                        out, h_new = self.decoder_gru(dec_in, h_prev)
                        logit = self.classifier(out.squeeze(1))
                        lp = torch.log_softmax(logit, dim=-1).squeeze(0)
                        top_lp, top_idx = lp.topk(width)
                        # One host transfer per expansion instead of one per candidate.
                        for lval, idx_val, idx in zip(top_lp.tolist(), top_idx.tolist(), top_idx):
                            candidates.append((lp_sum + lval, h_new, idx, seq + [idx_val]))
                    candidates.sort(key=lambda c: c[0], reverse=True)
                    beams = candidates[:width]

                best_seq = max(beams, key=lambda c: c[0])[3]
                preds[i, :len(best_seq)] = torch.tensor(best_seq, device=device)
        return preds


In [None]:
import os
import time
import torch
from tqdm.auto import tqdm
from sklearn.metrics import accuracy_score
from transformers import AutoConfig
from huggingface_hub import hf_hub_download, upload_file
from torch.nn.utils import clip_grad_norm_


# ── 1) Settings ─────────────────────────────────────────────────────────────────
# repo_id    = "nico8771/swda-bigruberta-beam-lr4-nodecay"
# Hub repository that receives the checkpoints, and the local staging directory.
repo_id = "nico8771/swda-bigruberta-beam-lr4-256"
save_dir = "./da_model"
os.makedirs(save_dir, exist_ok=True)

# Checkpoint file to look for on the Hub when resuming.
ckpt_fname = "checkpoint_epoch06.pt"

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Hyper-parameters of this training session.
num_epochs = 1               # epochs to run on top of the resumed epoch
lr = 1e-4
weight_decay = 1e-5
max_grad_norm = 5.0          # gradient-norm clipping threshold
teacher_forcing_ratio = 1

# ── 2) Download & load existing checkpoint (if any) ──────────────────────────────
# Only a checkpoint that is genuinely missing on the Hub means "start from
# scratch". Every other failure (network outage, authentication problem,
# corrupted file, unexpected checkpoint layout) must stop the run: silently
# restarting at epoch 1 would later overwrite existing checkpoints on the Hub.
from huggingface_hub.utils import (
    EntryNotFoundError,
    LocalEntryNotFoundError,
    RepositoryNotFoundError,
)

try:
    ckpt_path = hf_hub_download(
        repo_id=repo_id,
        filename=ckpt_fname,
        repo_type="model",
        revision="main",
    )
except (EntryNotFoundError, RepositoryNotFoundError) as err:
    # LocalEntryNotFoundError means the Hub could not be reached and the file
    # is not cached locally - that is a connectivity problem, not a missing file.
    if isinstance(err, LocalEntryNotFoundError):
        raise
    # no checkpoint on HF Hub: start from scratch
    ckpt = None
    start_epoch = 1
    print("No existing checkpoint found; starting from epoch 1")
else:
    # Load on the CPU first; the model is moved to the target device later.
    ckpt = torch.load(ckpt_path, map_location="cpu")
    start_epoch = ckpt["epoch"] + 1
    print(f"Resuming from epoch {start_epoch}")

# ── 3) Rebuild & load your model ─────────────────────────────────────────────────

# Instantiate the tagger with this run's label count and hidden size.
model = DASeq2SeqClassifier(
    hidden_size=256,
    num_labels=41,
    teacher_forcing_ratio=teacher_forcing_ratio,
)

# Restore the saved weights when a checkpoint was downloaded above.
if ckpt is not None:
    model.load_state_dict(ckpt["model_state"])

# Move all parameters and buffers to the training device.
model.to(device)

# ── 4) Select the trainable parameters ──────────────────────────────────────────
# NOTE: the previous version set requires_grad=True on *every* RoBERTa
# parameter right after freezing the embeddings, which undid that freeze and
# made `n_layers_to_unfreeze` a no-op. The order below is freeze first, then
# selectively unfreeze, so RoBERTa's embeddings now stay frozen.

# Total number of encoder layers
total_layers = len(model.roberta.encoder.layer)  # 12 for roberta-base
n_layers_to_unfreeze = 12

# Start from a fully frozen RoBERTa (embeddings, encoder layers and pooler) ...
model.roberta.requires_grad_(False)

# ... then re-enable the top `n_layers_to_unfreeze` transformer layers.
# max(0, ...) keeps the range valid if the setting exceeds the layer count.
if n_layers_to_unfreeze > 0:
    for i in range(max(0, total_layers - n_layers_to_unfreeze), total_layers):
        for param in model.roberta.encoder.layer[i].parameters():
            param.requires_grad = True

# The speaker-change embedding is always trained.
model.turn_emb.weight.requires_grad = True

# The optimizer only receives the parameters that are currently trainable.
trainable = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.AdamW(trainable, lr=lr, weight_decay=weight_decay)


# ── 5) (Re)load optimizer state if checkpoint existed ───────────────────────────
# if ckpt is not None and "optim_state" in ckpt:
#     optimizer.load_state_dict(ckpt["optim_state"])

# ── 6) Training + Validation loops ──────────────────────────────────────────────
for epoch in range(start_epoch, start_epoch + num_epochs):

    # n_layers_to_unfreeze += 3

    # if n_layers_to_unfreeze > 0:
    #     for i in range(total_layers - n_layers_to_unfreeze, total_layers):
    #         for param in model.roberta.encoder.layer[i].parameters():
    #             param.requires_grad = True

    # trainable = [p for p in model.parameters() if p.requires_grad]
    # optimizer = torch.optim.AdamW(trainable, lr=lr, weight_decay=weight_decay)

    # ---- Train ----
    model.train()
    train_losses = []
    t0 = time.time()
    for batch in tqdm(train_loader, desc=f"[Epoch {epoch}] Train"):
        batch = {k: v.to(device) for k, v in batch.items() if torch.is_tensor(v)}
        optimizer.zero_grad()
        out  = model(
            batch["input_ids"],
            batch["attention_mask"],
            batch["speaker_change"],
            batch["utt_mask"],
            labels=batch["labels"],
        )
        loss = out["loss"]
        loss.backward()
        # Clip the global gradient norm of the trainable parameters.
        clip_grad_norm_(trainable, max_grad_norm)
        optimizer.step()
        train_losses.append(loss.item())
    # nan (instead of a ZeroDivisionError) if the loader yielded no batch.
    avg_train_loss = sum(train_losses) / len(train_losses) if train_losses else float("nan")
    train_time = time.time() - t0

    # ---- Validate ----
    model.eval()
    val_losses = []
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in tqdm(valid_loader, desc=f"[Epoch {epoch}] Validation"):
            # Same tensor-only device transfer as in the training loop.
            batch = {k: v.to(device) for k, v in batch.items() if torch.is_tensor(v)}

            # Pass 1 (with labels): token-level validation loss.
            out = model(batch["input_ids"], batch["attention_mask"],
                        batch["speaker_change"], batch["utt_mask"],
                        labels=batch["labels"])
            val_losses.append(out["loss"].item())

            # Pass 2 (without labels): beam-search predictions for accuracy.
            bs_out = model(batch["input_ids"], batch["attention_mask"],
                           batch["speaker_change"], batch["utt_mask"], labels=None)
            preds = bs_out["pred_seq"].view(-1).cpu().tolist()
            gold = batch["labels"].view(-1).cpu().tolist()

            # Score real utterances only; padded slots carry pad_label_id.
            for p, l in zip(preds, gold):
                if l != model.pad_label_id:
                    all_preds.append(p)
                    all_labels.append(l)

    # Computed once after the loop (previously recomputed on every batch and
    # undefined when the validation loader was empty).
    avg_token_loss = sum(val_losses) / len(val_losses) if val_losses else float("nan")
    val_acc = accuracy_score(all_labels, all_preds) if all_labels else float("nan")
    print(
      f"Epoch {epoch:02d} • "
      f"time {train_time:.1f}s • "
      f"train_loss {avg_train_loss:.4f} • "
      f"val_loss   {avg_token_loss:.4f} • "
      f"val_acc={val_acc:.4f}"
    )

    # ── 7) Save & upload this epoch’s checkpoint ─────────────────────────────────
    new_ckpt = {
      "epoch":       epoch,
      "model_state": model.state_dict(),
      "optim_state": optimizer.state_dict(),
    }
    new_fname = f"checkpoint_epoch{epoch:02d}.pt"
    local_path = os.path.join(save_dir, new_fname)
    torch.save(new_ckpt, local_path)

    # The commit message records the metrics and settings of this epoch;
    # the layer total comes from the model instead of a hard-coded 12.
    upload_file(
      path_or_fileobj=local_path,
      path_in_repo=new_fname,
      repo_id=repo_id,
      repo_type="model",
      commit_message=(
        f"Ep {epoch:02d}: "
        f"tr_loss {avg_train_loss:.4f}, "
        f"val_loss {avg_token_loss:.4f}, "
        f"val_acc {val_acc:.4f},"
        f"Roberta {total_layers - n_layers_to_unfreeze}/{total_layers} Frozen,"
        f"train time {train_time:.1f}s,"
        f"tf: {teacher_forcing_ratio},"
        f"lr: {lr}"
      ),
      token=True
    )
    print(f"✅ Uploaded checkpoint {new_fname}")

Resuming from epoch 7


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[Epoch 7] Train:   0%|          | 0/1777 [00:00<?, ?it/s]