# A4 MEMM implementation

In [9]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from utils import read_ner_data_from_connl, load_embedding_dict
from utils import get_tag_indices_from_scores
from utils import Indexer
from metrics import MetricsHandler

In [None]:
# Read each CoNLL split; every call returns a (words, tags) pair.
train_words, train_tags = read_ner_data_from_connl('data/train.txt')
dev_words, dev_tags = read_ner_data_from_connl('data/dev.txt')
test_words, test_tags = read_ner_data_from_connl('data/test.txt')

# Lowercase every token in all three splits.
# NOTE(review): presumably done so tokens match a lowercase pretrained
# embedding vocabulary — confirm against the embedding file that is loaded.
train_words = [word.lower() for word in train_words]
dev_words = [word.lower() for word in dev_words]
test_words = [word.lower() for word in test_words]

# Both indexers are built from the training split only; how unseen dev/test
# elements are mapped depends on Indexer.element_to_index (not visible here).
word_indexer = Indexer(train_words)
tag_indexer = Indexer(train_tags)

In [8]:

class NERDataset:
    """Index-addressable view over parallel word and tag sequences.

    Each item is a ``(word_index, tag_index)`` tuple obtained by passing the
    raw word and tag at that position through the ``element_to_index`` method
    of the corresponding indexer.
    """

    def __init__(self, words, tags, word_indexer, tag_indexer):
        """Store the raw sequences and the indexers used to encode them."""
        self.words, self.tags = words, tags
        self.word_indexer, self.tag_indexer = word_indexer, tag_indexer

    def __len__(self):
        """Return the number of words (the tag sequence is not consulted)."""
        return len(self.words)

    def __getitem__(self, idx):
        """Return the ``(word_index, tag_index)`` tuple for position ``idx``."""
        encoders = (self.word_indexer, self.tag_indexer)
        sequences = (self.words, self.tags)
        # The word is encoded first, then the tag.
        return tuple(
            encoder.element_to_index(sequence[idx])
            for encoder, sequence in zip(encoders, sequences)
        )

# Wrap each split in a NERDataset. All three share the same word/tag
# indexers, so indices are comparable across train, dev and test.
train_dataset = NERDataset(train_words, train_tags, word_indexer, tag_indexer)
dev_dataset = NERDataset(dev_words, dev_tags, word_indexer, tag_indexer)
test_dataset = NERDataset(test_words, test_tags, word_indexer, tag_indexer)

def get_batch(dataset, batch_size):
    """Yield consecutive batches of encoded words and tags.

    Args:
        dataset: Object exposing ``words``, ``tags``, ``word_indexer``,
            ``tag_indexer`` and ``__len__``.
        batch_size: Number of positions per batch; the final batch may be
            shorter.

    Yields:
        ``(word_indices, tag_indices)`` — two ``torch.long`` tensors holding
        the indexer outputs for one contiguous slice of the dataset.
    """
    all_words, all_tags = dataset.words, dataset.tags
    for start in range(0, len(dataset), batch_size):
        stop = start + batch_size
        word_ids = list(map(dataset.word_indexer.element_to_index, all_words[start:stop]))
        tag_ids = list(map(dataset.tag_indexer.element_to_index, all_tags[start:stop]))
        yield (
            torch.tensor(word_ids, dtype=torch.long),
            torch.tensor(tag_ids, dtype=torch.long),
        )

batch_size = 128
# get_batch is a generator, so each split is materialized into a list that can
# be iterated repeatedly. Batches keep the dataset's original order
# (no shuffling is applied).
train_loader = list(get_batch(train_dataset, batch_size))
dev_loader = list(get_batch(dev_dataset, batch_size))
test_loader = list(get_batch(test_dataset, batch_size))


In [None]:
glove_path = "data/glove.6B.100d.txt"
embeddings_index = load_embedding_dict(glove_path)
# NOTE(review): must equal the vector width stored in the file above
# (the "100d" in the filename suggests 100) — confirm if the file changes.
embedding_dim = 100
# One row per indexed word, initialised to zeros.
embedding_matrix = np.zeros((len(word_indexer), embedding_dim))
for word, idx in word_indexer.get_element_to_index_dict().items():
    embedding_vector = embeddings_index.get(word)
    # Words with no pretrained vector keep their all-zero row.
    if embedding_vector is not None:
        embedding_matrix[idx] = embedding_vector


100%|██████████| 400000/400000 [00:02<00:00, 139510.48it/s]


In [None]:
class MEMMNER(nn.Module):
    """Tagger that scores the current tag from a word and a previous tag.

    The word embedding and the previous-tag embedding are concatenated and
    fed through a single linear layer that outputs one score per tag.
    """

    def __init__(self, vocab_size, embedding_dim, tagset_size, tag_embedding_dim, embedding_matrix):
        """Build the embedding tables and the output layer.

        Args:
            vocab_size: Accepted for interface compatibility; not used, since
                the word table takes its size from ``embedding_matrix``.
            embedding_dim: Width of a word vector; used to size the linear
                layer's input, so it must match ``embedding_matrix``'s width.
            tagset_size: Number of tags (rows of the tag table and number of
                output scores).
            tag_embedding_dim: Width of a previous-tag vector.
            embedding_matrix: Initial word vectors; they remain trainable.
        """
        super().__init__()

        initial_vectors = torch.FloatTensor(embedding_matrix)
        # freeze=False keeps the pretrained vectors trainable (fine-tuning).
        self.word_embedding = nn.Embedding.from_pretrained(initial_vectors, freeze=False)
        self.tag_embedding = nn.Embedding(tagset_size, tag_embedding_dim)

        fused_dim = embedding_dim + tag_embedding_dim
        self.fc = nn.Linear(fused_dim, tagset_size)

    def forward(self, words, prev_tags):
        """Return tag scores for each ``(word, previous tag)`` index pair.

        For index tensors of shape ``(batch_size,)`` the result has shape
        ``(batch_size, tagset_size)``.
        """
        features = torch.cat(
            [self.word_embedding(words), self.tag_embedding(prev_tags)],
            dim=-1,
        )  # (batch_size, embedding_dim + tag_embedding_dim)
        return self.fc(features)


In [None]:
vocab_size = len(word_indexer)
tagset_size = len(tag_indexer)
tag_embedding_dim = 50  # tag embedding dimension; tunable
model = MEMMNER(vocab_size, embedding_dim, tagset_size, tag_embedding_dim, embedding_matrix)


# Label ids handed to the metrics handlers are simply 0..len(labels_str)-1.
# NOTE(review): this assumes those ids line up with the indices produced by
# tag_indexer (and that tagset_size covers them) — TODO confirm.
labels_str = ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC']
labels_int = list(range(len(labels_str)))
train_metrics = MetricsHandler(labels_int)
dev_metrics = MetricsHandler(labels_int)
test_metrics = MetricsHandler(labels_int)

# The model emits raw scores, which CrossEntropyLoss accepts directly.
loss_func = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)


In [None]:
def train_model(model, train_loader, dev_loader, optimizer, loss_func, train_metrics, dev_metrics, num_epochs=5, device=None, train_dataset=None, dev_dataset=None):
    """Train ``model`` and evaluate it on the dev set after every epoch.

    Args:
        model: Module called as ``model(words, prev_tags)`` that returns
            per-tag scores.
        train_loader: Iterable of ``(words, tags)`` tensor batches.
        dev_loader: Iterable of ``(words, tags)`` tensor batches, forwarded
            to ``evaluate_model``.
        optimizer: Optimizer stepped once per training batch.
        loss_func: Callable ``loss_func(output, tags)`` returning a scalar
            loss. The per-batch value is weighted by the batch size, which
            assumes a mean-reduced loss.
        train_metrics: Kept for interface compatibility; a fresh
            ``MetricsHandler`` replaces it at the start of every epoch.
        dev_metrics: Forwarded to ``evaluate_model``; replaced by the handler
            that call returns.
        num_epochs: Number of passes over ``train_loader``.
        device: Device the batches are moved to.
        train_dataset: Optional sized object; when given, the epoch loss is
            normalised by ``len(train_dataset)``. When omitted, the number of
            samples actually seen in the epoch is used instead.
        dev_dataset: Forwarded to ``evaluate_model`` for the same purpose.

    Returns:
        ``(model, train_metrics, dev_metrics, losses, dev_f1_scores)`` where
        the metrics handlers are those of the last epoch, ``losses`` holds
        the per-epoch training loss and ``dev_f1_scores`` the per-epoch dev
        F1 score.
    """
    losses = []
    dev_f1_scores = []

    for epoch in range(num_epochs):
        # evaluate_model() switches the model to eval mode at the end of each
        # epoch, so training mode has to be re-enabled here rather than once
        # before the loop.
        model.train()
        running_loss = 0.0
        samples_seen = 0
        train_metrics = MetricsHandler(labels_int)

        for words, tags in train_loader:
            words = words.to(device)
            tags = tags.to(device)
            batch_len = words.size(0)

            # NOTE(review): the "previous tag" fed to the model is always
            # index 0, so the tag embedding carries no sequence history.
            # Supplying real previous tags needs sequence-aligned batches and
            # matching decoding at evaluation time; left unchanged here so
            # training and evaluation stay consistent.
            prev_tags = torch.zeros(batch_len, dtype=torch.long).to(device)

            optimizer.zero_grad()
            output = model(words, prev_tags)  # (batch_size, tagset_size)
            loss = loss_func(output, tags)
            loss.backward()
            optimizer.step()

            predictions = torch.argmax(output, dim=-1).cpu().numpy()
            train_metrics.update(predictions, tags.cpu().numpy())
            running_loss += loss.item() * batch_len
            samples_seen += batch_len

        train_metrics.collect()
        # Fall back to the observed sample count when no dataset was passed
        # (the default), instead of failing on len(None).
        sample_count = len(train_dataset) if train_dataset is not None else samples_seen
        epoch_loss = running_loss / sample_count if sample_count else 0.0
        losses.append(epoch_loss)
        train_f1 = train_metrics.get_metrics()["F1-score"][-1]

        dev_loss, dev_metrics = evaluate_model(model, dev_loader, loss_func, dev_metrics, device, dev_dataset)
        dev_metrics.collect()
        dev_f1 = dev_metrics.get_metrics()["F1-score"][-1]
        dev_f1_scores.append(dev_f1)

        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}, Train F1: {train_f1:.4f}, Dev F1: {dev_f1:.4f}")

    return model, train_metrics, dev_metrics, losses, dev_f1_scores


def evaluate_model(model, data_loader, loss_func, eval_metrics, device=None, dataset=None):
    """Compute the average loss and collect prediction metrics on a data split.

    The model is switched to eval mode and left in that mode on return, so a
    caller that continues training must call ``model.train()`` again.

    Args:
        model: Module called as ``model(words, prev_tags)`` that returns
            per-tag scores.
        data_loader: Iterable of ``(words, tags)`` tensor batches.
        loss_func: Callable ``loss_func(output, tags)`` returning a scalar
            loss. The per-batch value is weighted by the batch size, which
            assumes a mean-reduced loss.
        eval_metrics: Kept for interface compatibility; a fresh
            ``MetricsHandler`` is created and returned instead.
        device: Device the batches are moved to.
        dataset: Optional sized object; when given, the loss is normalised by
            ``len(dataset)``. When omitted, the number of samples actually
            seen is used instead.

    Returns:
        ``(eval_loss, eval_metrics)`` — the average loss and the metrics
        handler updated with every batch (``collect()`` is left to the
        caller).
    """
    model.eval()
    eval_metrics = MetricsHandler(labels_int)
    total_loss = 0.0
    samples_seen = 0

    with torch.no_grad():
        for words, tags in data_loader:
            words = words.to(device)
            tags = tags.to(device)
            batch_len = words.size(0)

            # The previous tag is always index 0, mirroring how the model is
            # fed during training.
            prev_tags = torch.zeros(batch_len, dtype=torch.long).to(device)
            output = model(words, prev_tags)
            loss = loss_func(output, tags)
            total_loss += loss.item() * batch_len
            samples_seen += batch_len

            predictions = torch.argmax(output, dim=-1).cpu().numpy()
            eval_metrics.update(predictions, tags.cpu().numpy())

    # Fall back to the observed sample count when no dataset was passed
    # (the default), instead of failing on len(None).
    sample_count = len(dataset) if dataset is not None else samples_seen
    eval_loss = total_loss / sample_count if sample_count else 0.0
    return eval_loss, eval_metrics


In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

num_epochs = 10
print("开始训练...")

train_metrics = MetricsHandler(labels_int)
dev_metrics = MetricsHandler(labels_int)

# Train, evaluating on the dev split after each epoch.
model, train_metrics, dev_metrics, losses, dev_f1_scores = train_model(
    model, train_loader, dev_loader, optimizer, loss_func, train_metrics, dev_metrics,
    num_epochs=num_epochs, device=device, train_dataset=train_dataset, dev_dataset=dev_dataset
)


# Persist only the learned parameters (state_dict), not the whole module.
model_path = "memm_ner_model.pth"
torch.save(model.state_dict(), model_path)
print(f"模型参数已保存到 {model_path}")


# Final evaluation on the held-out test split.
print("开始测试...")
test_loss, test_metrics = evaluate_model(
    model, test_loader, loss_func, test_metrics, device=device, dataset=test_dataset
)
test_metrics.collect()
test_f1 = test_metrics.get_metrics()["F1-score"][-1]
print(f"Test F1 Score: {test_f1:.4f}")


# Report the dev F1 score of the first five epochs only.
print("\n前5个周期的开发集F-1分数：")
for i, f1 in enumerate(dev_f1_scores[:5], 1):
    print(f"Epoch {i}: Dev F1 Score: {f1:.4f}")

开始训练...


  return x[1, 1] / (x[1, 0] + x[1, 1])
  return x[1, 1]/(x[1, 1] + x[0, 1])


Epoch 1/10, Loss: 0.3933, Train F1: 0.5730, Dev F1: 0.6869
Epoch 2/10, Loss: 0.2106, Train F1: 0.7796, Dev F1: 0.7301
Epoch 3/10, Loss: 0.1628, Train F1: 0.8275, Dev F1: 0.7390
Epoch 4/10, Loss: 0.1399, Train F1: 0.8541, Dev F1: 0.7385
Epoch 5/10, Loss: 0.1281, Train F1: 0.8615, Dev F1: 0.7371
Epoch 6/10, Loss: 0.1217, Train F1: 0.8633, Dev F1: 0.7381
Epoch 7/10, Loss: 0.1180, Train F1: 0.8635, Dev F1: 0.7379
Epoch 8/10, Loss: 0.1158, Train F1: 0.8636, Dev F1: 0.7397
Epoch 9/10, Loss: 0.1143, Train F1: 0.8635, Dev F1: 0.7395
Epoch 10/10, Loss: 0.1134, Train F1: 0.8632, Dev F1: 0.7399
模型参数已保存到 memm_ner_model.pth
开始测试...
Test F1 Score: 0.6391

前5个周期的开发集F-1分数：
Epoch 1: Dev F1 Score: 0.6869
Epoch 2: Dev F1 Score: 0.7301
Epoch 3: Dev F1 Score: 0.7390
Epoch 4: Dev F1 Score: 0.7385
Epoch 5: Dev F1 Score: 0.7371
