In [1]:
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch import nn
from torch.optim import Adam
from torch.utils.data import DataLoader, Dataset
from torcheval.metrics.functional import multiclass_f1_score
from datasets import load_dataset
import json
import matplotlib.pyplot as plt
import numpy as np
import os
import logging
import pandas as pd

from types import SimpleNamespace
from model import TransformerClassifier
from datetime import datetime
from data import MyDataset, TextDataset
from utils import logprint, create_unique_folder, plot_and_save

## Hyperparameters

In [2]:
config = SimpleNamespace()

# transformer
config.model_type = 'bert' # [bert, roberta, xlnet]
config.freeze_layer = 8     # the first n number of transformer layers to freeze
config.freeze_embedding = True  # whether to freeze the word embedding layer in the transformer
config.freeze_pool = True       # for bert/robert only. whether to freeze the pooling layer 
config.freeze_summary = True    # for xlnet only. whether to freeze the summary layer

# cls
config.input_size = 768          # the input dimension to the classifier, equal to the output of transformer's logit
config.hidden_layers = []         # a list of int. The hidden layers of the cls, e.g. [256, 128] means two hidden layers of 256 and 128 respectively
config.num_classes = 8          # number of classes to classify

# data
config.dataset_folder = 'USs/user_stories_score_full.csv'
config.train_batch_size = 32
config.test_batch_size = 32

# train
config.epochs = 2
config.lr = 2e-5
config.weight_decay = 0.01
config.log_freq = 10        # log/print frequency
config.device = 'cuda' if torch.cuda.is_available() else 'cpu'

# output
config.output_folder = 'output/roberta'
config.output_folder = create_unique_folder(config.output_folder)

Folder created at: output/roberta_8


## Data Preparation

### 1. Raw text data

In [3]:

# 创建数据集和数据加载器
dataset = MyDataset(config.dataset_folder)

# 分出训练集和测试集
US_train, US_test, label_train, label_test = train_test_split(
    dataset.US, dataset.label, test_size=0.2, random_state=42
)

# 创建训练集和测试集的 Dataset 实例
train_dataset = MyDataset(None)
train_dataset.US = US_train
train_dataset.label = label_train

test_dataset = MyDataset(None)
test_dataset.US = US_test
test_dataset.label = label_test

# 将数据集的US处理成列表 然后进行tokenizer
train_text = train_dataset.US.tolist()
test_text = test_dataset.US.tolist()

### 2. Tokenization & Vectorization

In [4]:
# init tokenizer

if config.model_type == 'bert':
    from transformers import BertTokenizer
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')  # out: ['input_ids', 'token_type_ids', 'attention_mask']
elif config.model_type == 'roberta':
    from transformers import RobertaTokenizer
    tokenizer = RobertaTokenizer.from_pretrained('roberta-base')    # out: ['input_ids', 'attention_mask']
elif config.model_type == 'xlnet':
    from transformers import XLNetTokenizer
    tokenizer = XLNetTokenizer.from_pretrained('xlnet/xlnet-base-cased')  # out: ['input_ids', 'token_type_ids', 'attention_mask']
else:
    raise NotImplementedError

In [5]:
# tokenize features

train_features = tokenizer(text=train_text,
                           add_special_tokens=True,
                           padding='max_length',
                           truncation=True,
                           max_length=128,
                           return_tensors='pt')

test_features = tokenizer(text=test_text,
                          add_special_tokens=True,
                          padding='max_length',
                          truncation=True,
                          max_length=128,
                          return_tensors='pt')

print('input attributes', train_features.keys())
print(train_features['input_ids'].max())

input attributes dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
tensor(29589)


In [6]:
# vectorize labels

# 把训练和测试的label格式转换成tensor，因为tensor()不能接受str类型的list，所以使用OneHot转换成数字类型
labelEncoder= LabelEncoder()
train_labels = torch.tensor(labelEncoder.fit_transform(train_dataset.label.values))
test_labels = torch.tensor(labelEncoder.fit_transform(test_dataset.label.values))

print('labels shape:', train_labels.shape)

labels shape: torch.Size([1433])


### 3. Prepare Dataset and DataLoader

In [7]:
train_data = TextDataset(train_features, train_labels)
test_data = TextDataset(test_features, test_labels)

train_loader = DataLoader(train_data, batch_size=config.train_batch_size, shuffle=True)
test_loader = DataLoader(test_data, batch_size=config.test_batch_size, shuffle=False)

# Init models

In [8]:
# init transformer backbone and freeze weights

if config.model_type == 'bert':
    from transformers import BertModel
    from model import freeze_bert_weights
    backbone = BertModel.from_pretrained('bert-base-uncased')
    print(backbone)
    freeze_bert_weights(backbone, config.freeze_layer, config.freeze_embedding, config.freeze_pool)
elif config.model_type == 'roberta':
    from transformers import BertModel
    from model import freeze_bert_weights
    backbone = BertModel.from_pretrained('roberta-base')
    print(backbone)
    freeze_bert_weights(backbone, config.freeze_layer, config.freeze_embedding, config.freeze_pool)
elif config.model_type == 'xlnet':
    from transformers import XLNetForSequenceClassification
    from model import freeze_xlnet_weights
    backbone = XLNetForSequenceClassification.from_pretrained('xlnet/xlnet-base-cased', num_labels=config.num_classes) 
    print(backbone)
    freeze_xlnet_weights(backbone, config.freeze_layer, config.freeze_embedding, config.freeze_summary)
else:
    raise NotImplementedError

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

In [9]:
# init classifier
if config.model_type == 'xlnet':
    model = backbone
else:
    model = TransformerClassifier(backbone=backbone,
                                input_size=config.input_size,
                                hidden_layers=config.hidden_layers,
                                num_classes=config.num_classes)
model = model.to(config.device)

In [10]:
# init optimizer
optimizer = Adam(model.parameters(), lr=config.lr, weight_decay=config.weight_decay)

# loss function
loss_fn = nn.CrossEntropyLoss()

# Train

In [11]:
# reset handler
for handler in logging.root.handlers[:]:
        logging.root.removeHandler(handler)

logging.basicConfig(filename=os.path.join(config.output_folder, 'train_log.log'), level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s', filemode='w')

logging.info(f'configurations: {config}')
logging.info(model)

In [12]:
best_f1 = 0
train_loss_lst, train_f1_lst, train_acc_lst, test_loss_lst, test_f1_lst, test_acc_lst = [], [], [], [], [], []
print(f"开始训练: {datetime.now()} ")
for i in range(config.epochs):
    total_labels_train = []
    total_logits_train = []
    gradnorm_list = []
    for j, (X, y) in enumerate(train_loader):
        for k in X.keys():
            X[k] = X[k].to(config.device)
        labels = y.to(config.device).to(torch.int64)

        if config.model_type == 'xlnet':
            X['labels'] = labels

        # forward
        output = model(**X)

        # obtain loss and stats
        if config.model_type == 'xlnet':
            loss = output.loss
            logits = output.logits
        else:
            # for bert/roberta
            logits = output
            loss = loss_fn(logits, labels)

        optimizer.zero_grad()
        loss.backward()
        
        gradnorm = torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        gradnorm_list.append(gradnorm)
        
        optimizer.step()

        with torch.no_grad():
            total_labels_train.append(labels)
            total_logits_train.append(logits)

        if j % config.log_freq == 0:
            logprint(f"epoch {i}, batch {j}/{len(train_loader)}, train loss: {loss.item()}")

    # 打印梯度范数的均值
    print(f'梯度范数', sum(gradnorm_list) / len(gradnorm_list))
    # 评估训练结果
    with torch.no_grad():
        train_logits_all = torch.concat(total_logits_train)
        train_labels_all = torch.concat(total_labels_train).to(torch.int64)
        train_loss = loss_fn(train_logits_all, train_labels_all)
        train_f1 = multiclass_f1_score(train_logits_all, train_labels_all, num_classes=8, average='macro')
        train_acc = torch.sum(train_logits_all.argmax(-1) == train_labels_all) / len(train_labels_all)
        train_loss_lst.append(train_loss.item())
        train_f1_lst.append(train_f1.item())
        train_acc_lst.append(train_acc.item())
    print(f"开始测试: {datetime.now()} ")
    with torch.no_grad():
        total_logits_test = []
        for X, y in test_loader:
            for k in X.keys():
                X[k] = X[k].to(config.device)
            labels = y.to(config.device).to(torch.int64)

            if config.model_type == 'xlnet':
                X['labels'] = labels

            output = model(**X)

            if config.model_type == 'xlnet':
                logits = output.logits
            else:
                logits = output
            total_logits_test.append(logits)

        test_logits_all = torch.concat(total_logits_test)
        test_labels_all = test_data.labels.to(config.device).to(torch.int64)

        test_loss = loss_fn(test_logits_all, test_labels_all)
        test_f1 = multiclass_f1_score(test_logits_all, test_labels_all, num_classes=8, average='macro')
        test_acc = torch.sum(test_logits_all.argmax(-1) == test_labels_all) / len(test_labels_all)
        test_loss_lst.append(test_loss.item())
        test_f1_lst.append(test_f1.item())
        test_acc_lst.append(test_acc.item())
    
    print(f"结束测试: {datetime.now()} ")
    if test_f1 > best_f1:
        best_f1 = test_f1.item()
        torch.save(model.state_dict(), os.path.join(config.output_folder, 'best_model.pt'))
        torch.save(optimizer.state_dict(), os.path.join(config.output_folder, 'optimizer.pt'))

        logprint('Saving models ...')

    logprint(
        f"epoch {i} train loss: {train_loss}, train_f1:{train_f1}, train acc: {train_acc}\n  test loss: {test_loss}, test f1: {test_f1}, test acc: {test_acc}")
    logprint(f'epoch {i} best f1: {best_f1}')
    logprint(' ')  # empty line to separate epochs

    with open(os.path.join(config.output_folder, 'results.json'), 'w') as f:
        json.dump({
            'train_loss': train_loss_lst,
            'train_f1': train_f1_lst,
            'train_acc': train_acc_lst,
            'test_loss': test_loss_lst,
            'test_f1': test_f1_lst,
            'test_acc': test_acc_lst,
        }, f)

开始训练: 2024-08-02 00:23:30.800693 
epoch 0, batch 0/45, train loss: 1.9474589824676514
epoch 0, batch 10/45, train loss: 1.9180608987808228
epoch 0, batch 20/45, train loss: 1.8207385540008545
epoch 0, batch 30/45, train loss: 1.533968210220337
epoch 0, batch 40/45, train loss: 1.5406452417373657
梯度范数 tensor(6.0650, device='cuda:0')
开始测试: 2024-08-02 00:23:39.570096 
结束测试: 2024-08-02 00:23:40.920317 
Saving models ...
epoch 0 train loss: 1.7368096113204956, train_f1:0.07860605418682098, train acc: 0.30076760053634644
  test loss: 1.6018669605255127, test f1: 0.061662860214710236, test acc: 0.2729805111885071
epoch 0 best f1: 0.061662860214710236
 
epoch 1, batch 0/45, train loss: 1.4921958446502686
epoch 1, batch 10/45, train loss: 1.3977398872375488
epoch 1, batch 20/45, train loss: 1.3478947877883911
epoch 1, batch 30/45, train loss: 1.6216673851013184
epoch 1, batch 40/45, train loss: 1.3294156789779663
梯度范数 tensor(4.0218, device='cuda:0')
开始测试: 2024-08-02 00:23:50.330058 
结束测试: 2024-

In [13]:
# 绘制 accuracy 曲线
plot_and_save(test_f1_lst, 'Test Accuracy', config.output_folder, 'test_accuracy.png')

# 绘制 test_f1_lst 曲线
plot_and_save(test_f1_lst, 'Test F1 Score', config.output_folder, 'test_f1.png')

# 绘制 test_loss_lst 曲线
plot_and_save(test_loss_lst, 'Test Loss', config.output_folder, 'test_loss.png')