In [31]:
# --- Before your go ----
# 1. Rename Assignment-03-###.ipynb where ### is your student ID.
# 2. The deadline of Assignment-03 is 23:59pm, 06-05-2024


# --- Explore HMM POS Taggers using Brown corpus ---
# In this assignment, you will explore three taggers for a Brown corpus.
# import your packages here
import warnings
warnings.filterwarnings('ignore')

In [32]:
# Task 1 --- Load and explore your data ---
# 1). load train/test samples from Brown corpus files, brown-train.txt, brown-test.txt.
# 2). load all 12 tags from brown-tag.txt and print it out
# 3). counting how many sentences and words in both train and test datasets.
# 4). for each tag, counting how many words in train and test. e.g, tag1: [count_tr, count_te]


def build_sent_word_dict(filename: str):
    lines = open(filename, "r").readlines()
    num_sents = 0
    num_words = 0
    sent_word_dict = []
    word_dict = []
    for line in lines:
        if line.startswith("b100-") and word_dict != []:
            num_sents += 1
            sent_word_dict.append(word_dict)
            word_dict = []
        elif len(line.split()) == 2:
            num_words += 1
            word, tag = line.split()
            word_dict.append({"word": word, "tag": tag})
    sent_word_dict.append(word_dict)
    return sent_word_dict, num_sents + 1, num_words


train_sents, train_num_sents, train_num_words = build_sent_word_dict("brown-train.txt")
test_sents, test_num_sents, test_num_words = build_sent_word_dict("brown-test.txt")
TAGS = open("brown-tag.txt", "r").readlines()
for i in range(len(TAGS)):
    TAGS[i] = TAGS[i].strip()

In [33]:
def get_tag_count(sents: list, tag: str):
    count = 0
    for sent in sents:
        for word in sent:
            if word["tag"] == tag:
                count += 1
    return count


print("Number of sentences in train:", train_num_sents)
print("Number of sentences in test:", test_num_sents)
print("Number of words in train:", train_num_words)
print("Number of words in test:", test_num_words)

major_tag = ""
major_count = 0
for tag in TAGS:
    count_tr = get_tag_count(train_sents, tag)
    count_te = get_tag_count(test_sents, tag)
    if count_tr > major_count:
        major_count = count_tr
        major_tag = tag
    print(f"Tag: {tag}   \t Count in train: {count_tr}\t Count in test: {count_te}")

Number of sentences in train: 45800
Number of sentences in test: 11540
Number of words in train: 928327
Number of words in test: 232865
Tag: .   	 Count in train: 117723	 Count in test: 29842
Tag: ADJ   	 Count in train: 66985	 Count in test: 16736
Tag: ADP   	 Count in train: 115752	 Count in test: 29014
Tag: ADV   	 Count in train: 44765	 Count in test: 11474
Tag: CONJ   	 Count in train: 30455	 Count in test: 7696
Tag: DET   	 Count in train: 109418	 Count in test: 27601
Tag: NOUN   	 Count in train: 220451	 Count in test: 55107
Tag: NUM   	 Count in train: 11921	 Count in test: 2953
Tag: PRON   	 Count in train: 39657	 Count in test: 9677
Tag: PRT   	 Count in train: 23889	 Count in test: 5940
Tag: VERB   	 Count in train: 146199	 Count in test: 36551
Tag: X   	 Count in train: 1112	 Count in test: 274


In [34]:
# Task 2 --- Method 1: Build a baseline method, namely, the most frequent tagger ---
#     If you can recall, we introduced a strong baseline method (See Dan's book in
# https://web.stanford.edu/~jurafsky/slp3/ed3book_jan72023.pdf Page 164.),
#     where we label each word by using the most frequent-used tag associated with it.
# 1). find the most frequent class label for each word in the training data.
#     For example, {tr_word_1:tag_1,tr_word_2:tag_2,...}
# 2). use your built method to predict tags for both train and test datasets.
#     You should print out two values: the accuracies of train and test samples.
#     You would expect that the accuracy on train will be > 9.0 (but never = 1.0) and higher than on test.

# Notice: since there are unkown words in test samples.
#  Following ways could handle this (choose one or create your own):
#  1). mark all words that appear only once in the data with a "UNK-x" tag
#  2). tag every out-of-vocabulary word with the majority tag among all training samples.
#  3). find more methods in https://github.com/Adamouization/POS-Tagging-and-Unknown-Words


def get_mostfreq_tag(sents: list):
    word_tag_freq = {}
    word_mostfreq_tag = {}
    for sent in sents:
        for word in sent:
            if word["word"] not in word_tag_freq:
                word_tag_freq[word["word"]] = {}
            if word["tag"] not in word_tag_freq[word["word"]]:
                word_tag_freq[word["word"]][word["tag"]] = 0
            word_tag_freq[word["word"]][word["tag"]] += 1
    for word in word_tag_freq:
        word_mostfreq_tag[word] = max(word_tag_freq[word], key=word_tag_freq[word].get)
    return word_mostfreq_tag


def predict_baseline(sents: list, word_mostfreq_tag: dict, num_words: int, major_tag: str):
    correct = 0
    for sent in sents:
        for word in sent:
            # 将未知词的tag设为major_tag
            if word["word"] not in word_mostfreq_tag:
                word_mostfreq_tag[word["word"]] = major_tag
            if word["tag"] == word_mostfreq_tag[word["word"]]:
                correct += 1
    return correct / num_words


trword_mostfreq_tag = get_mostfreq_tag(train_sents)
train_acc = predict_baseline(train_sents, trword_mostfreq_tag, train_num_words, major_tag)
test_acc = predict_baseline(test_sents, trword_mostfreq_tag, test_num_words, major_tag)
print("Accuracy on train:", train_acc)
print("Accuracy on test:", test_acc)

Accuracy on train: 0.9571961173164197
Accuracy on test: 0.945187125587787


In [39]:
# Task 3 --- Method 2: Build an HMM tagger ---
# 1) You should use nltk.tag.HiddenMarkovModelTagger to build an HMM tagger.
#    It has parameters: symbols, states, transitions, outputs, priors, transform (ignore it).
#    Specify these parameters properly. For example, you can use MLE to estimate transitions, outputs and priors.
#    That is, MLE to estimate matrix A (transition matrix), and matrix B (output probabilites) (See. Page 8.4.3)
# 2) After build your model, report both the accuracy of HMM tagger for train samples and test samples.
# 3) Compared with your baseline method, discuss that why your HMM tagger is better/worse than baseline method.

# Notice: You may also need to handle unknown words just like Task 2.

import nltk
from nltk.tag import HiddenMarkovModelTagger
from nltk.probability import ConditionalFreqDist, ConditionalProbDist, MLEProbDist

# fmt:off
def build_HMM_tagger(train_sents:list, test_sents:list, major_tag:str):
    symbols = set([word["word"] for sent in train_sents for word in sent])
    states = set([word["tag"] for sent in train_sents for word in sent])
    # 将未知词的tag设为major_tag
    for sent in test_sents:
        for word in sent:
            if word["word"] not in symbols:
                symbols.add(word["word"])
                train_sents.append([{"word": word["word"], "tag": major_tag}])
    transitions = ConditionalProbDist(
        ConditionalFreqDist(
            (tag1, tag2) 
            for sent in train_sents
            for (tag1, tag2) in nltk.bigrams([word["tag"] for word in sent])
        ),
        MLEProbDist,
    )
    outputs = ConditionalProbDist(
        ConditionalFreqDist(
            (word["tag"], word["word"])
            for sent in train_sents
            for word in sent
        ),
        MLEProbDist,
    )
    priors = MLEProbDist(
        nltk.FreqDist(
            sent[0]["tag"]
            for sent in train_sents if len(sent) > 1
        )
    )
    return nltk.tag.HiddenMarkovModelTagger(
        symbols=symbols,
        states=states,
        transitions=transitions,
        outputs=outputs,
        priors=priors
    )

train_data = []
test_data = []
for sent in train_sents:
    train_data.append([(word["word"], word["tag"]) for word in sent])
for sent in test_sents:
    test_data.append([(word["word"], word["tag"]) for word in sent])

HMM_tagger = build_HMM_tagger(train_sents[:], test_sents[:], major_tag)
train_acc = HMM_tagger.accuracy(train_data)
test_acc = HMM_tagger.accuracy(test_data)
print("Accuracy on train with HMM:", train_acc)
print("Accuracy on test with HMM:", test_acc)

Accuracy on train with HMM: 0.9756141962907466
Accuracy on test with HMM: 0.9622184527515942


In [35]:
# Task 4 --- Method 3: Fine-tuning on BERT-base model for POS-tagging ---
#
# 1) You may download a BERT model (say, you choose BERT-base cased)
#    and use tools in https://github.com/huggingface/transformers
# 2) After build your model, report both the accuracy of BERT tagger for train samples and test samples.
# 3) Compared with Method 1,2, discuss that why your BERT tagger is better/worse than these two.
#    1. 上下文理解能力更强： BERT 是基于 Transformer 架构的预训练模型，能够有效地捕捉句子中的上下文信息，从而更好地理解句子中的语义和语境。传统基线模型和 HMM 可能局限于局部特征或固定的上下文窗口，不能很好地捕捉长距离依赖关系
#    2. 端到端学习： BERT 是一个端到端的模型，可以直接在标注数据上进行端到端的监督学习，而不需要手工设计特征或规则。相比之下，传统的基线模型和 HMM 需要手动设计特征和转移概率，这通常需要领域知识和经验
#    3. 迁移学习： BERT 是在大规模无监督数据上进行预训练的，然后在特定任务上进行微调。这种迁移学习的方式可以使 BERT 在少量标注数据上也能取得很好的性能，而传统的基线模型和 HMM 在数据稀缺的情况下往往表现不佳
#    4. 处理非结构化文本能力： BERT 是为处理非结构化文本设计的，可以直接处理原始文本输入，而不需要额外的预处理或特征工程。传统的基线模型和 HMM 可能需要手动设计规则来处理非结构化文本，这增加了系统的复杂性和工程难度
#    5. 全局信息捕捉： BERT 是一个深层模型，可以利用多层的注意力机制来捕捉句子中的全局信息。相比之下，传统的基线模型和 HMM 可能只能利用局部信息，无法充分利用全局信息


import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForTokenClassification, get_linear_schedule_with_warmup, AdamW


class POSDataset(Dataset):
    def __init__(self, sents: list, TAGS: list, tokenizer: BertTokenizer, seq_len:int = 256):
        self.src = []
        self.tgt = []
        self.src_mask = []
        pad_token = tokenizer.pad_token_id
        for sent in sents:
            words = [word["word"] for word in sent]
            tags = [word["tag"] for word in sent]
            ids = tokenizer(words, is_split_into_words=True, add_special_tokens=False, return_tensors="pt")["input_ids"].squeeze(0)
            tokens = tokenizer.convert_ids_to_tokens(ids)
            words_stack = list("|".join(words))
            words_stack.append("|")

            stack_idx = 0
            tags_idx = 0
            labels = []
            for token in tokens:
                if token.startswith("##"):
                    assert token[2] == words_stack[stack_idx]
                    stack_idx += len(token) - 2
                    labels.append("X")
                else:
                    assert token[0] == words_stack[stack_idx]
                    stack_idx += len(token)
                    labels.append(tags[tags_idx])
                if words_stack[stack_idx] == "|":
                    stack_idx += 1
                    tags_idx += 1
            
            num_pad_tokens = seq_len - len(ids)
            ids = torch.cat([ids, torch.tensor([pad_token] * num_pad_tokens)])
            labels = labels + ["X"] * num_pad_tokens
            labels = torch.tensor([TAGS.index(tag) for tag in labels])
            assert len(ids) == seq_len, len(ids)
            assert len(labels) == seq_len, len(labels)
            
            self.src.append(ids)
            self.tgt.append(labels)
            self.src_mask.append(torch.ones_like(ids))

    def __len__(self):
        return len(self.src)

    def __getitem__(self, idx):
        return {"src": self.src[idx], "src_mask": self.src_mask[idx], "tgt": self.tgt[idx]}

seq_len = 256
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
train_dataset = POSDataset(train_sents, TAGS, tokenizer, seq_len)
test_dataset = POSDataset(test_sents, TAGS, tokenizer, seq_len)
train_loader = DataLoader(train_dataset, 16, shuffle=True)
test_loader = DataLoader(test_dataset, 16, shuffle=False)

In [36]:
model = BertForTokenClassification.from_pretrained("bert-base-cased", num_labels=len(TAGS)).cuda()
optimizer = AdamW(model.parameters(), lr=1e-5)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [37]:
from tqdm.notebook import tqdm
from torch.utils.tensorboard import SummaryWriter

def train_epoch(model, dataloader, optimizer, scheduler, writer:SummaryWriter):
    model.train()
    total_loss = 0
    progress_bar = tqdm(dataloader, desc="Training")
    for batch in progress_bar:
        optimizer.zero_grad()
        input_ids = batch["src"].cuda()
        attention_mask = batch["src_mask"].cuda()
        labels = batch["tgt"].cuda()

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()
        scheduler.step()

        progress_bar.set_postfix(loss=loss.item())
        writer.add_scalar("Loss", loss.item(), global_step=writer._n_iter)
        writer.flush()

    avg_loss = total_loss / len(dataloader)
    return avg_loss

def evaluate(model, dataloader, writer):
    model.eval()
    total_correct = 0
    total_tokens = 0
    progress_bar = tqdm(dataloader, desc="Evaluating")
    
    with torch.no_grad():
        for batch in progress_bar:
            input_ids = batch["src"].cuda()
            attention_mask = batch["src_mask"].cuda()
            labels = batch["tgt"].cuda()

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            predictions = torch.argmax(logits, dim=-1)

            mask = attention_mask.bool()
            correct = (predictions == labels) & mask
            total_correct += correct.sum().item()
            total_tokens += mask.sum().item()
            
            progress_bar.set_postfix(accuracy=total_correct / total_tokens)
            writer.add_scalar("Accuracy", total_correct / total_tokens, global_step=writer._n_iter)
            

    accuracy = total_correct / total_tokens
    return accuracy

In [38]:
num_epochs = 4
writer = SummaryWriter()
total_steps = len(train_loader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
for epoch in range(num_epochs):
    train_loss = train_epoch(model, train_loader, optimizer, scheduler, writer)
    train_accuracy = evaluate(model, train_loader, writer)
    test_accuracy = evaluate(model, test_loader, writer)

    print(f"Epoch {epoch + 1}/{num_epochs}")
    print(f"Train Loss: {train_loss:.4f}")
    print(f"Train Accuracy: {train_accuracy:.4f}")
    print(f"Test Accuracy: {test_accuracy:.4f}")

Training:   0%|          | 0/2863 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/2863 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/722 [00:00<?, ?it/s]

Epoch 1/4
Train Loss: 0.0191
Train Accuracy: 0.9992
Test Accuracy: 0.9990


Training:   0%|          | 0/2863 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/2863 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/722 [00:00<?, ?it/s]

Epoch 2/4
Train Loss: 0.0034
Train Accuracy: 0.9995
Test Accuracy: 0.9991


Training:   0%|          | 0/2863 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/2863 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/722 [00:00<?, ?it/s]

Epoch 3/4
Train Loss: 0.0023
Train Accuracy: 0.9996
Test Accuracy: 0.9992


Training:   0%|          | 0/2863 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/2863 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/722 [00:00<?, ?it/s]

Epoch 4/4
Train Loss: 0.0016
Train Accuracy: 0.9997
Test Accuracy: 0.9992
