## CS310 Natural Language Processing
## Assignment 1. Neural Text Classification

**Total points**: 50

You should roughtly follow the structure of the notebook. Add additional cells if you feel needed. 

You can (and you should) re-use the code from Lab 2. 

Make sure your code is readable and well-structured.

### 0. Import Necessary Libraries

In [324]:
import torch
import numpy as np
import torch.nn as nn
import json
import re
from collections import Counter
from torchtext.vocab import build_vocab_from_iterator
import time
from torch.utils.data.dataset import random_split
from torchtext.data.functional import to_map_style_dataset
from collections import defaultdict
from torch.utils.data import Dataset 

import jieba

### 1. Data Processing

In [325]:
# 基本分词器
def basic_tokenizer(text):
    # 仅保留中文字符
    tokens=re.findall(r'[\u4e00-\u9fa5]',text)
    return tokens

# 改进分词器
def improved_tokenizer(text):
    reg=r'[\u4e00-\u9fa5]|[a-zA-Z]+|[0-9]+|[^\w\s]'
    tokens = re.findall(reg, text)
    return tokens

# 读取数据
def build_vocab_from_file(file_path, tokenizer):
    word_freq = defaultdict(int)
    
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            data = json.loads(line)
            sentence = data['sentence']
            tokens = tokenizer(sentence)
            for token in tokens:
                word_freq[token] += 1  # 更新每个token的出现频率
    
    # 构建词汇表，初始化未知词为 <unk>
    vocab = {'<unk>': 0}
    vocab.update({token: idx + 1 for idx, (token, freq) in enumerate(word_freq.items())})
    
    return vocab

vocab = build_vocab_from_file('train.jsonl', improved_tokenizer)

In [326]:
class TextDataset(Dataset):
    def __init__(self, file_path, tokenizer, vocab):
        self.tokenizer = tokenizer
        self.data = []
        self.vocab = vocab
        self._prepare_data(file_path)

    def _prepare_data(self, file_path):
        self.data = []

        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                item = json.loads(line)
                sentence = item['sentence']
        
                label = item['label'][0]  # 标签是一个长度为1的列表

                tokens = self.tokenizer(sentence)
                token_ids = list(map(lambda token: self.vocab.get(token, self.vocab['<unk>']), tokens))  # 使用 map 处理 token

                self.data.append((token_ids, label))


    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

# 加载数据集
train_dataset = TextDataset('train.jsonl', improved_tokenizer, vocab)
test_dataset = TextDataset('test.jsonl', improved_tokenizer, vocab)

In [327]:
# 批量处理数据
from torch.utils.data import DataLoader
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def collate_batch(batch):
    label_list, token_ids_list, offsets = [], [], [0]
    
    for tokens, label in batch:
        label_list.append(label)
        token_ids = torch.tensor(tokens, dtype=torch.int64)
        token_ids_list.append(token_ids)
        offsets.append(offsets[-1] + token_ids.size(0))
    
    labels = torch.tensor(label_list, dtype=torch.int64)
    token_ids = torch.cat(token_ids_list, dim=0)
    offsets = torch.tensor(offsets[:-1], dtype=torch.int64)
    return labels.to(device), token_ids.to(device), offsets.to(device)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=collate_batch)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=True, collate_fn=collate_batch)


### 2. Build the Model

In [328]:
class TextClassificationModel(torch.nn.Module):
    def __init__(self, vocab_size, embed_dim, num_class):
        super(TextClassificationModel, self).__init__()
        # 用EmbeddingBag层进行词嵌入
        self.embedding = torch.nn.EmbeddingBag(vocab_size, embed_dim,sparse=False)

        #用全连接层（2个隐藏层）
        #用torch.nn.Sequential 实现
        self.fc=nn.Sequential(
            nn.Linear(embed_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, num_class)
        )

        self.init_weights()

    def init_weights(self):
        initrange = 0.5 # 初始化权重的范围
        self.embedding.weight.data.uniform_(-initrange, initrange)
        # 初始化fc层权重和偏置
        for layer in self.fc:
            if isinstance(layer, nn.Linear):
                layer.weight.data.uniform_(-initrange, initrange)
                layer.bias.data.zero_()
                

    def forward(self, token_ids, offsets):
        embedded = self.embedding(token_ids, offsets)
        out = self.fc(embedded)
        return out
    

In [329]:
train_iter = iter(train_dataset)
num_class = len(set([label for _, label in train_iter]))
vocab_size = len(vocab)
emsize = 64  # embedding size

# 实例化模型
model = TextClassificationModel(vocab_size, emsize, num_class).to(device)

EPOCHS = 10 
LR = 5 
BATCH_SIZE = 8 

criterion = nn.CrossEntropyLoss()

optimizer = torch.optim.SGD(model.parameters(), lr=LR)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)

### 3. Train and Evaluate

In [330]:
def train(model, dataloader, optimizer, criterion, epoch: int):
    model.train()
    total_acc, total_count = 0, 0
    log_interval = 200
    start_time = time.time()

    for idx, (labels, token_ids, offsets) in enumerate(dataloader): 
        optimizer.zero_grad()
        output = model(token_ids, offsets)

        try:
            loss = criterion(output, labels)
        except Exception:
            print('Error in loss calculation')
            print('output: ', output.size())
            print('labels: ', labels.size())
            print('token_ids: ', token_ids)
            print('offsets: ', offsets)
            raise

        # Backward propagation, grad clipping, and optimization
        loss.backward()  # Backpropagate
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        optimizer.step()  # Update weights

        # Calculate correct prediction in current batch
        pred = torch.argmax(output, dim=1, keepdim=False)  # Apply argmax on output logits
        correct = (pred == labels).sum().item()  # Calculate correct predictions
        total_acc += correct  # Accumulate correct predictions

        total_count += labels.size(0)  # Increment total count
        if idx % log_interval == 0 and idx > 0:
            elapsed = time.time() - start_time
            print(
                "| epoch {:3d} | {:5d}/{:5d} batches "
                "| accuracy {:8.3f}".format(
                    epoch, idx, len(dataloader), total_acc / total_count
                )
            )
            total_acc, total_count = 0, 0
            start_time = time.time()

@torch.no_grad()
def evaluate(model, dataloader, criterion):
    model.eval()
    total_acc, total_count = 0, 0

    preds = []  
    labelset = []  

    for idx, (labels, token_ids, offsets) in enumerate(dataloader):
        if offsets[-1] > len(token_ids):
            print(idx)
            print("Labels:", labels)
            print("Token IDs:", token_ids)
            print("Offsets:", offsets)
        output = model(token_ids, offsets)  # 前向传播，获取模型输出
        loss = criterion(output, labels)  

        predicted_labels = torch.argmax(output, dim=1)  
        correct = (predicted_labels == labels).int().sum().item()  
        total_acc += correct  
        total_count += labels.size(0)

        preds.extend(predicted_labels.cpu().numpy())
        labelset.extend(labels.cpu().numpy())

    accuracy = total_acc / total_count  
    return accuracy, preds, labelset

def calculate_metrics(all_labels, all_preds):
    all_labels = np.array(all_labels)
    all_preds = np.array(all_preds)

    TP = np.sum((all_labels == 1) & (all_preds == 1))
    FP = np.sum((all_labels == 0) & (all_preds == 1))
    FN = np.sum((all_labels == 1) & (all_preds == 0))
    TN = np.sum((all_labels == 0) & (all_preds == 0))

    precision = TP / (TP + FP) if TP + FP > 0 else 0
    recall = TP / (TP + FN) if TP + FN > 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if precision + recall > 0 else 0
    accuracy = (TP + TN) / (TP + TN + FP + FN)

    return accuracy, precision, recall, f1

In [331]:
# 准备训练和测试数据
train_iter = iter(train_dataset)
test_iter = iter(test_dataset)
train_dataset = to_map_style_dataset(train_iter)
test_dataset = to_map_style_dataset(test_iter)

num_train = int(len(train_dataset) * 0.95)
split_train_, split_valid_ = random_split(
    train_dataset, [num_train, len(train_dataset) - num_train]
)

train_dataloader = DataLoader(
    split_train_, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch
)
valid_dataloader = DataLoader(
    split_valid_, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch
)
test_dataloader = DataLoader(
    test_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch
)

In [332]:
total_accu = None
for epoch in range(1, EPOCHS + 1):
    epoch_start_time = time.time()

    train(model, train_dataloader, optimizer, criterion, epoch)
    accu_val,_,_ = evaluate(model, valid_dataloader, criterion)

    if total_accu is not None and total_accu > accu_val:
        scheduler.step()
    else:
        total_accu = accu_val

    print("-" * 59)
    print(
        "| end of epoch {:3d} | time: {:5.2f}s | "
        "valid accuracy {:8.3f} ".format(
            epoch, time.time() - epoch_start_time, accu_val
        )
    )
    print("-" * 59)

test_accuracy, test_preds, test_labels = evaluate(model, test_dataloader, criterion)


num_classes = len(set(test_labels))
accuracy,precision, recall, f1 = calculate_metrics(test_labels, test_preds)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

| epoch   1 |   200/ 1506 batches | accuracy    0.672
| epoch   1 |   400/ 1506 batches | accuracy    0.698
| epoch   1 |   600/ 1506 batches | accuracy    0.693
| epoch   1 |   800/ 1506 batches | accuracy    0.695
| epoch   1 |  1000/ 1506 batches | accuracy    0.704
| epoch   1 |  1200/ 1506 batches | accuracy    0.686
| epoch   1 |  1400/ 1506 batches | accuracy    0.676
-----------------------------------------------------------
| end of epoch   1 | time:  0.78s | valid accuracy    0.713 
-----------------------------------------------------------
| epoch   2 |   200/ 1506 batches | accuracy    0.685
| epoch   2 |   400/ 1506 batches | accuracy    0.707
| epoch   2 |   600/ 1506 batches | accuracy    0.691
| epoch   2 |   800/ 1506 batches | accuracy    0.689
| epoch   2 |  1000/ 1506 batches | accuracy    0.693
| epoch   2 |  1200/ 1506 batches | accuracy    0.679
| epoch   2 |  1400/ 1506 batches | accuracy    0.691
-----------------------------------------------------------
| e

### 4. Explore Word Segmentation

In [333]:
# 使用jieba分词器
def jieba_tokenizer(text):
    return jieba.lcut(text)

vocab = build_vocab_from_file('train.jsonl', jieba_tokenizer)
# print('Vocabulary size:', len(vocab))

train_dataset = TextDataset('train.jsonl', jieba_tokenizer, vocab)
test_dataset = TextDataset('test.jsonl', jieba_tokenizer, vocab)

train_loader=DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=collate_batch)
test_loader=DataLoader(test_dataset, batch_size=8, shuffle=True, collate_fn=collate_batch)

model=TextClassificationModel(len(vocab), embed_dim=64, num_class=2).to(device)


EPOCHS = 10
total_accu = None
for epoch in range(1, EPOCHS + 1):
    epoch_start_time = time.time()

    train(model, train_dataloader, optimizer, criterion, epoch)
    accu_val,_,_ = evaluate(model, valid_dataloader, criterion)

    if total_accu is not None and total_accu > accu_val:
        scheduler.step()
    else:
        total_accu = accu_val

    print("-" * 59)
    print(
        "| end of epoch {:3d} | time: {:5.2f}s | "
        "valid accuracy {:8.3f} ".format(
            epoch, time.time() - epoch_start_time, accu_val
        )
    )
    print("-" * 59)

test_accuracy, test_preds, test_labels = evaluate(model, test_dataloader, criterion)

num_classes = len(set(test_labels))
accuracy,precision, recall, f1 = calculate_metrics(test_labels, test_preds)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

| epoch   1 |   200/ 1506 batches | accuracy    0.675
| epoch   1 |   400/ 1506 batches | accuracy    0.672
| epoch   1 |   600/ 1506 batches | accuracy    0.675
| epoch   1 |   800/ 1506 batches | accuracy    0.682
| epoch   1 |  1000/ 1506 batches | accuracy    0.681
| epoch   1 |  1200/ 1506 batches | accuracy    0.689
| epoch   1 |  1400/ 1506 batches | accuracy    0.706
-----------------------------------------------------------
| end of epoch   1 | time:  1.39s | valid accuracy    0.711 
-----------------------------------------------------------
| epoch   2 |   200/ 1506 batches | accuracy    0.688
| epoch   2 |   400/ 1506 batches | accuracy    0.671
| epoch   2 |   600/ 1506 batches | accuracy    0.688
| epoch   2 |   800/ 1506 batches | accuracy    0.685
| epoch   2 |  1000/ 1506 batches | accuracy    0.689
| epoch   2 |  1200/ 1506 batches | accuracy    0.683
| epoch   2 |  1400/ 1506 batches | accuracy    0.686
-----------------------------------------------------------
| e