## TUT1-Task 2

In [27]:
# Environment check: print the installed PyTorch version and whether the
# MPS backend is reported as available on this machine.
import torch
print(torch.__version__)
print(torch.backends.mps.is_available())

2.8.0
True


### 1. Load & split data 

In [28]:
# Load both corpora from the HuggingFace hub. Only the "train" and "test"
# splits of each are used further down in this notebook.
from datasets import load_dataset
imdb = load_dataset("imdb")                       # IMDB (train/test)
ag   = load_dataset("ag_news")                    # AG_NEWS (train/test)
# Uncomment to inspect the available splits and columns:
# print(imdb)
# print(ag)

In [29]:
# Split each dataset into train / validation / test.
# The validation set is carved out of the original training split (10%,
# fixed seed so the split is the same on every run); the original "test"
# split is kept untouched for the final evaluation.

# IMDB
imdb_split = imdb["train"].train_test_split(test_size=0.1, seed=42)
imdb_train = imdb_split["train"]
imdb_val   = imdb_split["test"]   # the held-out 10% acts as validation data
imdb_test  = imdb["test"]

# AG_NEWS
ag_split = ag["train"].train_test_split(test_size=0.1, seed=42)
ag_train = ag_split["train"]
ag_val   = ag_split["test"]       # the held-out 10% acts as validation data
ag_test  = ag["test"]

### 2. Vocab & Dataloader

In [30]:
# Tokenize: "I'm loving it in 2024!!!" -> ["i'm", "loving", "it", "in", "2024"]
import re

# A token is either a run of ASCII letters, optionally followed by one or more
# apostrophe + letters groups (e.g. "i'm", "rock'n'roll"), or a run of digits.
# Everything else (punctuation, whitespace, non-ASCII characters) is dropped.
TOKEN_RE = re.compile(r"[A-Za-z]+(?:'[A-Za-z]+)*|[0-9]+")

def tokenize(text: str, lowercase: bool = True):
    """Split ``text`` into a list of word / number tokens.

    Args:
        text: Raw input string.
        lowercase: When True (default) the text is lower-cased before
            matching, so "Apple" and "apple" map to the same token.

    Returns:
        List of matched tokens in order of appearance (possibly empty).
    """
    normalized = text.lower() if lowercase else text
    return TOKEN_RE.findall(normalized)

In [31]:
# Build Vocab
from collections import Counter
from typing import Dict, List, Tuple

PAD_TOKEN = '<pad>'
UNK_TOKEN = '<unk>'

def build_vocab(
    dataset,
    text_key: str = "text",
    max_vocab: int = 30000,
    min_freq: int = 2,
    add_bos_eos: bool = False,
) -> Tuple[Dict[str, int], List[str]]:
    """Build a vocabulary from a dataset (normally the train split).

    Only the data passed in is counted, so pass the training split to avoid
    leaking validation/test information into the vocabulary.

    Args:
        dataset: Iterable of examples, each indexable by ``text_key``
            (e.g. a HuggingFace ``Dataset`` split).
        text_key: Name of the text field.
        max_vocab: Upper bound on the vocabulary size, special tokens included.
        min_freq: Minimum frequency; rarer tokens are left out and will be
            encoded as ``<unk>``.
        add_bos_eos: Whether to reserve ``<bos>``/``<eos>`` ids as well.

    Returns:
        token2id: dict mapping token -> id, with ``<pad>`` = 0, ``<unk>`` = 1
            and, optionally, ``<bos>`` = 2, ``<eos>`` = 3.
        id2token: list whose index is the id and whose value is the token.
    """
    counter = Counter()

    # 1) Count token frequencies over the given dataset.
    for ex in dataset:
        tokens = tokenize(ex[text_key])
        if add_bos_eos:
            tokens = ["<bos>", *tokens, "<eos>"]
        counter.update(tokens)

    # 2) Special tokens always take the first ids.
    #    (The original referenced the undefined names PAD / UNK here, which
    #    raised NameError; the module constants are PAD_TOKEN / UNK_TOKEN.)
    specials = [PAD_TOKEN, UNK_TOKEN] + (["<bos>", "<eos>"] if add_bos_eos else [])
    token2id: Dict[str, int] = {tok: i for i, tok in enumerate(specials)}

    # 3) Add regular tokens from most to least frequent, subject to
    #    min_freq / max_vocab.
    for tok, freq in counter.most_common():
        if freq < min_freq:
            break                 # everything after this is rarer still
        if tok in token2id:
            continue              # already present as a special token
        if len(token2id) >= max_vocab:
            break
        token2id[tok] = len(token2id)

    # 4) Reverse table. Ids were handed out sequentially in insertion order,
    #    so the dict's key order is exactly the id order.
    id2token: List[str] = list(token2id)

    return token2id, id2token


In [34]:
# Encode function
def encode(
    text: str,
    token2id: dict,
    max_len: int = 256,
    add_bos_eos: bool = False
):
    """Convert one piece of text into a list of token ids.

    Args:
        text: Input text.
        token2id: Vocabulary (token -> id).
        max_len: Maximum number of ids kept; longer sequences are truncated
            from the right.
        add_bos_eos: Whether to wrap the tokens in ``<bos>`` / ``<eos>``
            before the lookup (truncation happens afterwards).

    Returns:
        List of ids with length <= ``max_len``. Tokens missing from the
        vocabulary map to the id of ``<unk>``.
    """
    words = tokenize(text)
    if add_bos_eos:
        words = ["<bos>"] + words + ["<eos>"]

    # Out-of-vocabulary tokens fall back to whatever "<unk>" maps to.
    fallback = token2id.get("<unk>")
    return [token2id.get(word, fallback) for word in words][:max_len]


In [43]:
# Padding & Collate
def collate_fn(batch, token2id, max_len=256):
    """Turn a list of dataset examples into padded tensors.

    Args:
        batch: List of dicts with a "text" and a "label" entry.
        token2id: Vocabulary (token -> id); must contain "<pad>" and "<unk>".
        max_len: Truncation length passed on to ``encode``.

    Returns:
        padded:  LongTensor [B, T] of ids, right-padded with the <pad> id
                 up to the longest sequence in the batch.
        lengths: LongTensor [B] with the unpadded length of each sequence
                 (always >= 1).
        labels:  LongTensor [B] of class labels.
    """
    pad_id = token2id["<pad>"]
    unk_id = token2id["<unk>"]

    # 1. Encode every example. A text that yields no tokens (empty string,
    #    punctuation only, ...) would have length 0, which
    #    pack_padded_sequence rejects; represent it by a single <unk>.
    encoded = [
        encode(ex["text"], token2id, max_len=max_len) or [unk_id]
        for ex in batch
    ]
    lengths = [len(seq) for seq in encoded]
    batch_max = max(lengths) if lengths else 1   # longest sequence in this batch

    # 2. Right-pad every sequence to the batch maximum.
    padded = [seq + [pad_id] * (batch_max - len(seq)) for seq in encoded]

    # 3. Collect the labels.
    labels = [ex["label"] for ex in batch]

    # 4. Convert to tensors.
    padded = torch.tensor(padded, dtype=torch.long)
    lengths = torch.tensor(lengths, dtype=torch.long)
    labels = torch.tensor(labels, dtype=torch.long)

    return padded, lengths, labels

from torch.utils.data import DataLoader
from functools import partial

def make_loader(ds, token2id, batch_size=64, max_len=256, shuffle=True):
    """Wrap a dataset in a ``DataLoader`` that batches through ``collate_fn``.

    Args:
        ds: Map-style dataset of examples.
        token2id: Vocabulary bound into the collate function.
        batch_size: Number of examples per batch.
        max_len: Truncation length bound into the collate function.
        shuffle: Whether the loader reshuffles the data every epoch.

    Returns:
        A ``DataLoader`` over ``ds``.
    """
    # Bind the vocabulary and truncation length so the DataLoader can call the
    # collate function with the batch as its only argument.
    batch_collator = partial(collate_fn, token2id=token2id, max_len=max_len)
    return DataLoader(
        ds,
        batch_size=batch_size,
        shuffle=shuffle,
        collate_fn=batch_collator,
    )


### 3. Model

#### RNN

In [37]:
import torch.nn as nn

class RNNClassifier(nn.Module):
    """Minimal RNN text classifier.

    Pipeline: Embedding -> (bi)RNN -> final hidden state of the last layer
    -> Dropout -> Linear classification head.
    """

    def __init__(
        self,
        vocab_size: int,             # size of the vocabulary, len(token2id)
        num_labels: int,             # number of classes
        pad_id: int = 0,             # id of <pad>
        emb_dim: int = 128,          # embedding dimension
        hidden_dim: int = 256,       # RNN hidden size
        num_layers: int = 1,         # number of stacked RNN layers
        bidirectional: bool = True,  # run the RNN in both directions
        dropout: float = 0.2         # dropout probability
    ):
        super().__init__()

        # Token ids -> vectors; the <pad> row is kept at zero via padding_idx.
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=emb_dim,
            padding_idx=pad_id,
        )

        # Inter-layer dropout inside nn.RNN is only applied when there is more
        # than one layer, so it is switched off for single-layer models.
        rnn_dropout = dropout if num_layers > 1 else 0.0
        self.rnn = nn.RNN(
            input_size=emb_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,        # tensors are laid out as [B, T, *]
            bidirectional=bidirectional,
            dropout=rnn_dropout,
        )

        # Classification head; a bidirectional encoder doubles the feature size.
        num_directions = 2 if bidirectional else 1
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_dim * num_directions, num_labels)

        # Kept for debugging / introspection.
        self.pad_id = pad_id
        self.bidirectional = bidirectional
        self.num_layers = num_layers
        self.hidden_dim = hidden_dim

    def forward(self, x, lengths):
        """Compute class logits for a padded batch.

        Args:
            x:       LongTensor [B, T] of padded token ids.
            lengths: LongTensor [B] with the true length of every sequence.

        Returns:
            FloatTensor [B, num_labels] of unnormalised logits.
        """
        embedded = self.embedding(x)  # [B, T, E]

        # Pack so the RNN skips padded steps. enforce_sorted=False lets the
        # batch arrive in any length order; lengths must live on the CPU.
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded, lengths.cpu(), batch_first=True, enforce_sorted=False
        )

        # h_n: [num_layers * num_directions, B, H]
        _, h_n = self.rnn(packed)

        if not self.bidirectional:
            sentence = h_n[-1]                                   # [B, H]
        else:
            # The last two slices are the forward and backward states of the
            # top layer; concatenate them along the feature dimension.
            forward_state, backward_state = h_n[-2], h_n[-1]
            sentence = torch.cat([forward_state, backward_state], dim=1)  # [B, 2H]

        return self.fc(self.dropout(sentence))                   # [B, num_labels]


#### LSTM


In [39]:
import torch
import torch.nn as nn

class LSTMClassifier(nn.Module):
    """LSTM text classifier with the same interface as ``RNNClassifier``.

    Pipeline: Embedding -> (bi)LSTM -> final hidden state(s) of the last
    layer -> Dropout -> Linear classification head.
    """

    def __init__(
        self,
        vocab_size: int,
        num_labels: int,
        pad_id: int = 0,
        emb_dim: int = 128,
        hidden_dim: int = 256,
        num_layers: int = 1,
        bidirectional: bool = True,
        dropout: float = 0.2
    ):
        super().__init__()
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=emb_dim,
            padding_idx=pad_id,
        )

        # nn.LSTM applies dropout between stacked layers only, so it is
        # disabled for a single-layer model.
        inter_layer_dropout = dropout if num_layers > 1 else 0.0
        self.lstm = nn.LSTM(
            input_size=emb_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=bidirectional,
            dropout=inter_layer_dropout,
        )

        # A bidirectional encoder yields twice as many features.
        feature_dim = 2 * hidden_dim if bidirectional else hidden_dim
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(feature_dim, num_labels)
        self.pad_id = pad_id
        self.bidirectional = bidirectional

    def forward(self, x, lengths):
        """Compute class logits for a padded batch.

        Args:
            x:       [B, T] padded token ids.
            lengths: [B] true length of every sequence.

        Returns:
            logits of shape [B, num_labels].
        """
        embedded = self.embedding(x)  # [B, T, E]

        # Packing makes the LSTM ignore the padded positions.
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded, lengths.cpu(), batch_first=True, enforce_sorted=False
        )

        # Only the final hidden state is needed;
        # h_n: [num_layers * num_directions, B, H]
        _, (h_n, _) = self.lstm(packed)

        if not self.bidirectional:
            sentence = h_n[-1]                                  # [B, H]
        else:
            # Forward and backward states of the top layer, concatenated.
            sentence = torch.cat((h_n[-2], h_n[-1]), dim=1)     # [B, 2H]

        dropped = self.dropout(sentence)
        return self.fc(dropped)                                 # [B, num_labels]


#### Transformer Encoder

In [57]:
import torch
import torch.nn as nn

class TransformerEncoderClassifier(nn.Module):
    """Encoder-only Transformer text classifier.

    Pipeline: token embedding + learned position embedding ->
    TransformerEncoder (padding masked out) -> mean over the real tokens ->
    Linear classification head.

    Args:
        vocab_size: Size of the vocabulary.
        num_labels: Number of output classes.
        pad_id: Id of the padding token (its embedding row stays zero).
        emb_dim: Model / embedding dimension (must be divisible by ``nhead``).
        nhead: Number of attention heads.
        num_layers: Number of encoder layers.
        dim_feedforward: Hidden size of the position-wise feed-forward block.
        dropout: Dropout probability inside the encoder layers.
        max_len: Largest sequence length the position embedding supports;
            inputs longer than this fail in the position lookup.
    """

    def __init__(self, vocab_size, num_labels, pad_id,
                 emb_dim=128, nhead=4, num_layers=2,
                 dim_feedforward=512, dropout=0.1, max_len=512):
        super().__init__()
        self.pad_id = pad_id
        self.emb = nn.Embedding(vocab_size, emb_dim, padding_idx=pad_id)
        self.pos_emb = nn.Embedding(max_len, emb_dim)

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=emb_dim, nhead=nhead,
            dim_feedforward=dim_feedforward, dropout=dropout,
            batch_first=True
        )
        # enable_nested_tensor=False keeps the encoder output as a regular
        # [B, T, E] tensor, so it always lines up with the padding mask below.
        self.encoder = nn.TransformerEncoder(
            encoder_layer, num_layers=num_layers, enable_nested_tensor=False
        )
        self.fc = nn.Linear(emb_dim, num_labels)

    def forward(self, x, lengths):
        """Compute class logits for a padded batch.

        Args:
            x:       LongTensor [B, T] of padded token ids.
            lengths: LongTensor [B] with the true length of every sequence.

        Returns:
            FloatTensor [B, num_labels] of unnormalised logits. The result
            for a sequence does not depend on how much padding follows it.
        """
        batch, seq_len = x.size()
        positions = torch.arange(seq_len, device=x.device)          # [T]

        # Treat at least one position as real: a row that is masked entirely
        # would turn the attention softmax into NaN.
        valid_len = lengths.to(x.device).clamp(min=1)                # [B]
        # True marks padding, the convention of src_key_padding_mask.
        pad_mask = positions.unsqueeze(0) >= valid_len.unsqueeze(1)  # [B, T]

        out = self.emb(x) + self.pos_emb(positions).unsqueeze(0)     # [B, T, E]
        # Without the mask every token would also attend to padded positions.
        out = self.encoder(out, src_key_padding_mask=pad_mask)       # [B, T, E]

        # Mean-pool over real tokens only; a plain mean(dim=1) would average
        # in the padded positions and dilute short sequences.
        out = out.masked_fill(pad_mask.unsqueeze(-1), 0.0)
        token_count = (~pad_mask).sum(dim=1, keepdim=True).clamp(min=1)
        pooled = out.sum(dim=1) / token_count.to(out.dtype)          # [B, E]
        return self.fc(pooled)


### 4. Calling functions

In [58]:
# Fix the random seed so weight initialisation, dropout and DataLoader
# shuffling are repeatable when the notebook is run top to bottom.
SEED = 42
torch.manual_seed(SEED)

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device)

# Vocabularies are built from the training splits only.
imdb_token2id, imdb_id2token = build_vocab(imdb_train, max_vocab=30000, min_freq=2)
ag_token2id,   ag_id2token   = build_vocab(ag_train,   max_vocab=30000, min_freq=2)
PAD_IMDB = imdb_token2id["<pad>"]
PAD_AG   = ag_token2id["<pad>"]

# Loader settings: one batch size, a separate truncation length per corpus.
BATCH_SIZE   = 64
MAX_LEN_IMDB = 256
MAX_LEN_AG   = 128

# Only the training loaders shuffle; validation/test keep dataset order.
imdb_train_loader = make_loader(imdb_train, imdb_token2id, batch_size=BATCH_SIZE, max_len=MAX_LEN_IMDB, shuffle=True)
imdb_val_loader   = make_loader(imdb_val,   imdb_token2id, batch_size=BATCH_SIZE, max_len=MAX_LEN_IMDB, shuffle=False)
imdb_test_loader  = make_loader(imdb_test,  imdb_token2id, batch_size=BATCH_SIZE, max_len=MAX_LEN_IMDB, shuffle=False)

ag_train_loader = make_loader(ag_train, ag_token2id, batch_size=BATCH_SIZE, max_len=MAX_LEN_AG, shuffle=True)
ag_val_loader   = make_loader(ag_val,   ag_token2id, batch_size=BATCH_SIZE, max_len=MAX_LEN_AG, shuffle=False)
ag_test_loader  = make_loader(ag_test,  ag_token2id, batch_size=BATCH_SIZE, max_len=MAX_LEN_AG, shuffle=False)

Device: cpu


#### Training/validation/testing

In [59]:
import torch, torch.nn as nn
from sklearn.metrics import confusion_matrix, classification_report

# Training hyper-parameters shared by all experiments below.
EPOCHS = 3            # number of passes over the training set in train_and_validate
LR_RNN_LSTM = 2e-3    # learning rate passed for the RNN and LSTM runs
LR_TRANS    = 1e-3    # learning rate passed for the Transformer runs

def train_and_validate(model, train_loader, val_loader, lr, save_name):
    """Train ``model`` for ``EPOCHS`` epochs, validating after each one.

    Uses AdamW and cross-entropy loss on the global ``device``. After every
    epoch the average loss and accuracy on both loaders are printed; after
    the last epoch the model's ``state_dict`` is written to ``save_name``.

    Args:
        model: Module called as ``model(x, lengths)`` and returning logits.
        train_loader: Iterable of ``(x, lengths, y)`` training batches.
        val_loader: Iterable of ``(x, lengths, y)`` validation batches.
        lr: Learning rate for AdamW.
        save_name: Path the final weights are saved to.

    Returns:
        The trained model (moved to ``device``).
    """
    model = model.to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()

    for ep in range(1, EPOCHS + 1):
        # ---- training pass ----
        model.train()
        train_loss_sum = train_correct = train_seen = 0
        for tokens, seq_lens, targets in train_loader:
            tokens = tokens.to(device)
            seq_lens = seq_lens.to(device)
            targets = targets.to(device)

            logits = model(tokens, seq_lens)
            loss = criterion(logits, targets)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Weight the batch-mean loss by the batch size so the epoch
            # average is correct even when the last batch is smaller.
            n = targets.size(0)
            train_loss_sum += loss.item() * n
            train_correct += (logits.argmax(1) == targets).sum().item()
            train_seen += n

        # ---- validation pass (no gradients) ----
        model.eval()
        val_loss_sum = val_correct = val_seen = 0
        with torch.no_grad():
            for tokens, seq_lens, targets in val_loader:
                tokens = tokens.to(device)
                seq_lens = seq_lens.to(device)
                targets = targets.to(device)

                logits = model(tokens, seq_lens)
                loss = criterion(logits, targets)

                n = targets.size(0)
                val_loss_sum += loss.item() * n
                val_correct += (logits.argmax(1) == targets).sum().item()
                val_seen += n

        # Printed as loss/accuracy for train, then for validation.
        print(f"[{model.__class__.__name__}] Epoch {ep}: train {train_loss_sum/train_seen:.4f}/{train_correct/train_seen:.4f} | val {val_loss_sum/val_seen:.4f}/{val_correct/val_seen:.4f}")

    torch.save(model.state_dict(), save_name)
    print("Saved:", save_name)
    return model

@torch.no_grad()
def test_and_report(model, test_loader, label_names=None):
    """Evaluate ``model`` on ``test_loader`` and print the results.

    Runs the model in eval mode without gradients on the global ``device``,
    then prints a confusion matrix and a classification report.

    Args:
        model: Module called as ``model(x, lengths)`` and returning logits.
        test_loader: Iterable of ``(x, lengths, y)`` batches.
        label_names: Optional class names used in the classification report.
    """
    model.eval()
    predictions = []
    targets = []
    for tokens, seq_lens, labels in test_loader:
        tokens = tokens.to(device)
        seq_lens = seq_lens.to(device)
        batch_pred = model(tokens, seq_lens).argmax(1)
        # Labels are never moved to the device; predictions come back to the CPU.
        predictions.extend(batch_pred.cpu().tolist())
        targets.extend(labels.tolist())

    print("Confusion Matrix:\n", confusion_matrix(targets, predictions))
    print("\nClassification Report:\n", classification_report(targets, predictions, target_names=label_names))


#### IMDB：RNN → LSTM → Transformer

In [60]:
# Settings shared by every IMDB run.
imdb_label_names = ["neg", "pos"]
imdb_common_kwargs = dict(vocab_size=len(imdb_token2id), num_labels=2, pad_id=PAD_IMDB)
# Hyper-parameters shared by the two recurrent baselines.
imdb_recurrent_kwargs = dict(
    emb_dim=128, hidden_dim=256, num_layers=1, bidirectional=True, dropout=0.2
)

# IMDB + RNN
imdb_rnn = RNNClassifier(**imdb_common_kwargs, **imdb_recurrent_kwargs)
imdb_rnn = train_and_validate(
    imdb_rnn, imdb_train_loader, imdb_val_loader,
    lr=LR_RNN_LSTM, save_name="RNNClassifier_imdb.pth",
)
test_and_report(imdb_rnn, imdb_test_loader, label_names=imdb_label_names)

# IMDB + LSTM
imdb_lstm = LSTMClassifier(**imdb_common_kwargs, **imdb_recurrent_kwargs)
imdb_lstm = train_and_validate(
    imdb_lstm, imdb_train_loader, imdb_val_loader,
    lr=LR_RNN_LSTM, save_name="LSTMClassifier_imdb.pth",
)
test_and_report(imdb_lstm, imdb_test_loader, label_names=imdb_label_names)

# IMDB + Transformer (encoder-only)
imdb_trans = TransformerEncoderClassifier(
    **imdb_common_kwargs,
    emb_dim=128, nhead=4, num_layers=2, dim_feedforward=512, dropout=0.1, max_len=1024,
)
imdb_trans = train_and_validate(
    imdb_trans, imdb_train_loader, imdb_val_loader,
    lr=LR_TRANS, save_name="TransformerEncoderClassifier_imdb.pth",
)
test_and_report(imdb_trans, imdb_test_loader, label_names=imdb_label_names)


[RNNClassifier] Epoch 1: train 0.7239/0.5185 | val 0.7169/0.4928
[RNNClassifier] Epoch 2: train 0.7252/0.5064 | val 0.7246/0.5160
[RNNClassifier] Epoch 3: train 0.7180/0.5192 | val 0.7028/0.5348
Saved: RNNClassifier_imdb.pth
Confusion Matrix:
 [[6142 6358]
 [5789 6711]]

Classification Report:
               precision    recall  f1-score   support

         neg       0.51      0.49      0.50     12500
         pos       0.51      0.54      0.52     12500

    accuracy                           0.51     25000
   macro avg       0.51      0.51      0.51     25000
weighted avg       0.51      0.51      0.51     25000

[LSTMClassifier] Epoch 1: train 0.5998/0.6652 | val 0.5197/0.7388
[LSTMClassifier] Epoch 2: train 0.4061/0.8115 | val 0.3445/0.8548
[LSTMClassifier] Epoch 3: train 0.2377/0.9061 | val 0.3078/0.8692
Saved: LSTMClassifier_imdb.pth
Confusion Matrix:
 [[10981  1519]
 [ 2444 10056]]

Classification Report:
               precision    recall  f1-score   support

         neg      

#### AG_NEWS：RNN → LSTM → Transformer

In [61]:
label4 = ["World", "Sports", "Business", "Sci/Tech"]

# Settings shared by every AG_NEWS run.
ag_common_kwargs = dict(vocab_size=len(ag_token2id), num_labels=4, pad_id=PAD_AG)
# Hyper-parameters shared by the two recurrent baselines.
ag_recurrent_kwargs = dict(
    emb_dim=128, hidden_dim=256, num_layers=1, bidirectional=True, dropout=0.2
)

# AG + RNN
ag_rnn = RNNClassifier(**ag_common_kwargs, **ag_recurrent_kwargs)
ag_rnn = train_and_validate(
    ag_rnn, ag_train_loader, ag_val_loader,
    lr=LR_RNN_LSTM, save_name="RNNClassifier_ag.pth",
)
test_and_report(ag_rnn, ag_test_loader, label_names=label4)

# AG + LSTM
ag_lstm = LSTMClassifier(**ag_common_kwargs, **ag_recurrent_kwargs)
ag_lstm = train_and_validate(
    ag_lstm, ag_train_loader, ag_val_loader,
    lr=LR_RNN_LSTM, save_name="LSTMClassifier_ag.pth",
)
test_and_report(ag_lstm, ag_test_loader, label_names=label4)

# AG + Transformer (encoder-only)
ag_trans = TransformerEncoderClassifier(
    **ag_common_kwargs,
    emb_dim=128, nhead=4, num_layers=2, dim_feedforward=512, dropout=0.1, max_len=512,
)
ag_trans = train_and_validate(
    ag_trans, ag_train_loader, ag_val_loader,
    lr=LR_TRANS, save_name="TransformerEncoderClassifier_ag.pth",
)
test_and_report(ag_trans, ag_test_loader, label_names=label4)


[RNNClassifier] Epoch 1: train 0.7775/0.7011 | val 0.7952/0.6930
[RNNClassifier] Epoch 2: train 0.4563/0.8468 | val 0.3587/0.8858
[RNNClassifier] Epoch 3: train 0.5065/0.8218 | val 0.7166/0.7636
Saved: RNNClassifier_ag.pth
Confusion Matrix:
 [[1482  217  123   78]
 [ 344 1503   27   26]
 [ 175   39 1486  200]
 [ 179   75  332 1314]]

Classification Report:
               precision    recall  f1-score   support

       World       0.68      0.78      0.73      1900
      Sports       0.82      0.79      0.81      1900
    Business       0.76      0.78      0.77      1900
    Sci/Tech       0.81      0.69      0.75      1900

    accuracy                           0.76      7600
   macro avg       0.77      0.76      0.76      7600
weighted avg       0.77      0.76      0.76      7600

[LSTMClassifier] Epoch 1: train 0.4287/0.8478 | val 0.2889/0.9048
[LSTMClassifier] Epoch 2: train 0.2126/0.9289 | val 0.2697/0.9119
[LSTMClassifier] Epoch 3: train 0.1359/0.9533 | val 0.2610/0.9183
Saved: 