In [1]:
from pytorch_pretrained_bert import BertModel, BertTokenizer
from torch.utils.data import Dataset, DataLoader
import numpy as np
import torch.nn as nn
import torch
import torch.optim as optim
import os
from torchtext.datasets import AG_NEWS
import random

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
BATCH_SIZE = 128
# PATH = os.path.join(os.getcwd(), "models", "bert-base-uncased")
# PATH = os.path.join(os.getcwd(), "models", "bert_uncased_L-2_H-128_A-2")
PATH = os.path.join(os.getcwd(), "models", "bert_uncased_L-8_H-512_A-8")
VOCAB_PATH = os.path.join(PATH, "vocab.txt")
WEIGHTS_PATH = os.path.join(PATH, "pytorch_model.bin")
# 加载bert的分词器
TOKENIZER = BertTokenizer.from_pretrained(VOCAB_PATH)
# 加载bert模型，这个路径文件夹下有bert_config.json配置文件和model.bin模型权重文件
BERT = BertModel.from_pretrained(PATH)

# s = "I'm not sure, this can work, lol -.-"

# tokens = TOKENIZER.tokenize(s)
# print("\\".join(tokens))
# # "i\\'\\m\\not\\sure\\,\\this\\can\\work\\,\\lo\\##l\\-\\.\\-"
# # 是否需要这样做？
# # tokens = ["[CLS]"] + tokens + ["[SEP]"]

# ids = torch.tensor([TOKENIZER.convert_tokens_to_ids(tokens)])
# print(ids.shape)
# # torch.Size([1, 15])
# print(ids)

# result = BERT(ids)
# print(len(result))
# # torch.Size([1, 15, 768])


## Prepare for the data

In [2]:
train_iter = AG_NEWS(split='train')
test_iter = AG_NEWS(split='test')

train_data = []
test_data = []
# convert 1-4 to 0-3
for (label, line) in train_iter:
    train_data.append([line, label-1])
for (label, line) in test_iter:
    test_data.append([line, label-1])

random.shuffle(train_data)
random.shuffle(test_data)

print(len(train_data))
print(len(test_data))


class AGNewsDataset(Dataset):
    def __init__(self, data, max_sequence_length=128):
        self.data = data
        self.max_sequence_length = max_sequence_length

    def __getitem__(self, index):
        text, label = self.data[index]
        tokens = TOKENIZER.tokenize(text)[:self.max_sequence_length-2]
        tokens = ['[CLS]'] + tokens + ['[SEP]']
        input_ids = TOKENIZER.convert_tokens_to_ids(tokens)
        attention_mask = [1] * len(input_ids)

        # 将序列填充到max_sequence_length长度，并创建相应的attention_mask
        padding_length = self.max_sequence_length - len(input_ids)
        input_ids += [0] * padding_length
        attention_mask += [0] * padding_length

        return torch.tensor(input_ids), torch.tensor(attention_mask), torch.tensor(label)


    def __len__(self):
        return len(self.data)

train_dataset = AGNewsDataset(train_data, max_sequence_length=128)
test_dataset = AGNewsDataset(test_data, max_sequence_length=128)

train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

print(len(train_dataloader))
print(len(test_dataloader))


120000
7600
938
60


## Define the model

In [3]:
class BERTCLasifier(nn.Module):
    def __init__(self, bert, num_classes=4):
        super(BERTCLasifier, self).__init__()
        self.bert = bert
        for param in self.bert.parameters():
            param.requires_grad = False  # 冻结BERT模型的权重
        self.dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask=None, token_type_ids=None):
        _, pooled_output = self.bert(input_ids, attention_mask, token_type_ids)
        pooled_output = self.dropout(pooled_output)
        return  self.fc(pooled_output)
    

model = BERTCLasifier(BERT, num_classes=4)

def print_model_summary(model):
    print(model)
    print("Model Summary:")
    total_params = 0
    for name, param in model.named_parameters():
        if param.requires_grad:
            print(f"Layer: {name} | Size: {param.size()} | Parameters: {param.numel()}")
            total_params += param.numel()
    print(f"Total Trainable Parameters: {total_params}")

print_model_summary(model)

BERTCLasifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 512, padding_idx=0)
      (position_embeddings): Embedding(512, 512)
      (token_type_embeddings): Embedding(2, 512)
      (LayerNorm): BertLayerNorm()
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-7): 8 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=512, out_features=512, bias=True)
              (key): Linear(in_features=512, out_features=512, bias=True)
              (value): Linear(in_features=512, out_features=512, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=512, out_features=512, bias=True)
              (LayerNorm): BertLayerNorm()
              (dropout): Dropout(p=0.1, inplace=False)
            )

## Train the model

### Load model

In [4]:
import re


model = BERTCLasifier(BERT, num_classes=4)
MODEL_NAME = "BERT_medium_"
RE_MODEL = re.compile(r"BERT_medium_(\d+).pth")


def save_model(model, path):
    torch.save(model.state_dict(), path)
    print(f"Model saved at {path}")

def load_model(model, path):
    model.load_state_dict(torch.load(path))
    print(f"Model loaded from {path}")

def check_model(model, path):
    if os.path.exists(path):
        load_model(model, path)
        return True
    return False

def get_latest_model(path):
    Latest_Model = RE_MODEL
    files = os.listdir(path)
    latest_model = 0
    for file in files:
        match = Latest_Model.match(file)
        if match:
            epoch = int(match.group(1))
            if epoch > latest_model:
                latest_model = epoch
    return latest_model

latest_model_epoch = get_latest_model(os.getcwd())
MODEL_PATH = os.path.join(os.getcwd(), MODEL_NAME + str(latest_model_epoch) + ".pth")


if check_model(model, MODEL_PATH):
    print("Model found, loading mode:", latest_model_epoch)
    print_model_summary(model)
else:
    print("No model found, training new model...")


No model found, training new model...


### Train model

In [5]:
def train(model, device, train_loader, optimizer, criterion, epoch, print_every=10, writer=None):
    model.train()
    total_loss = 0
    total_correct = 0
    for batch_idx, (input_ids, attention_mask, label) in enumerate(train_loader):
        input_ids, attention_mask, label = input_ids.to(device), attention_mask.to(device), label.to(device)
        optimizer.zero_grad()
        output = model(input_ids, attention_mask)
        loss = criterion(output, label)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        _, predicted = torch.max(output, 1)
        total_correct += (predicted == label).sum().item()

        if (batch_idx+1) % (len(train_loader)//print_every) == 0:
            print(f"Train Epoch: {epoch} | {batch_idx * BATCH_SIZE} / {len(train_loader.dataset)} [{(100 * batch_idx/len(train_loader)):.0f} %] | Loss: {total_loss/(batch_idx+1):.3f} | Accuracy: {total_correct/(BATCH_SIZE*(batch_idx+1)):.3f}")

    if writer:
        writer.add_scalar("Loss/train", total_loss/len(train_loader), epoch)
        writer.add_scalar("Accuracy/train", total_correct/len(train_loader), epoch)

    print(f"Train Epoch: {epoch} | Loss: {total_loss/len(train_loader):.3f} | Accuracy: {total_correct/len(train_loader):.3f}")


def test(model, device, test_loader, criterion, epoch, writer=None):
    model.eval()
    total_loss = 0
    total_correct = 0
    with torch.no_grad():
        for input_ids, attention_mask, label in test_loader:
            input_ids, attention_mask, label = input_ids.to(device), attention_mask.to(device), label.to(device)
            output = model(input_ids, attention_mask)
            loss = criterion(output, label)
            total_loss += loss.item()
            _, predicted = torch.max(output, 1)
            total_correct += (predicted == label).sum().item()

    if writer:
        writer.add_scalar("Loss/test", total_loss/len(test_loader), epoch)
        writer.add_scalar("Accuracy/test", total_correct/len(test_loader), epoch)

    print(f"Test Epoch: {epoch} | Loss: {total_loss/len(test_loader):.3f} | Accuracy: {total_correct/len(test_loader):.3f}")
        


In [6]:
from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter("runs/BERT/medium/")

LEARNING_RATE = 3e-5
EPOCHS = 10


optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
criterion = nn.CrossEntropyLoss()

model.to(DEVICE)


for epoch in range(1, EPOCHS+1):
    if latest_model_epoch:
        epoch += latest_model_epoch + 1
        latest_model_epoch = None

    train(model, DEVICE, train_dataloader, optimizer, criterion, epoch, writer=writer)
    test(model, DEVICE, test_dataloader, criterion, epoch, writer=writer)
    if epoch % 2 == 0:
        save_model(model, os.path.join(os.getcwd(), "BERT_medium_" + str(epoch) + ".pth"))
writer.close()

save_model(model, os.path.join(os.getcwd(), "BERT_medium_" + str(epoch) + ".pth"))


Train Epoch: 0 | 11776 / 120000 [10 %] | Loss: 1.386 | Accuracy: 0.272
Train Epoch: 0 | 23680 / 120000 [20 %] | Loss: 1.374 | Accuracy: 0.300
Train Epoch: 0 | 35584 / 120000 [30 %] | Loss: 1.363 | Accuracy: 0.324
Train Epoch: 0 | 47488 / 120000 [40 %] | Loss: 1.353 | Accuracy: 0.346
Train Epoch: 0 | 59392 / 120000 [49 %] | Loss: 1.344 | Accuracy: 0.364
Train Epoch: 0 | 71296 / 120000 [59 %] | Loss: 1.335 | Accuracy: 0.381
Train Epoch: 0 | 83200 / 120000 [69 %] | Loss: 1.327 | Accuracy: 0.396
Train Epoch: 0 | 95104 / 120000 [79 %] | Loss: 1.319 | Accuracy: 0.409
Train Epoch: 0 | 107008 / 120000 [89 %] | Loss: 1.311 | Accuracy: 0.422
Train Epoch: 0 | 118912 / 120000 [99 %] | Loss: 1.304 | Accuracy: 0.434
Epoch: 0 | Loss: 1.303 | Accuracy: 55.650
Test Epoch: 0 | Loss: 1.209 | Accuracy: 79.217
Model saved at e:\VSCODE\Python\DL-Hw\BERT_medium_0.pth
Train Epoch: 1 | 11776 / 120000 [10 %] | Loss: 1.225 | Accuracy: 0.554
Train Epoch: 1 | 23680 / 120000 [20 %] | Loss: 1.218 | Accuracy: 0.563
T

KeyboardInterrupt: 