In [1]:
!nvidia-smi

Tue Mar  7 09:49:47 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.89.02    Driver Version: 525.89.02    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-PCIE...  On   | 00000000:67:03.0 Off |                  Off |
| N/A   34C    P0    27W / 250W |      0MiB / 32768MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [4]:
!pip install transformers SentencePiece torch tqdm


Looking in indexes: https://repo.huaweicloud.com/repository/pypi/simple


In [5]:
!pip install -U scikit-learn

Looking in indexes: https://repo.huaweicloud.com/repository/pypi/simple
Collecting scikit-learn
  Downloading https://repo.huaweicloud.com/repository/pypi/packages/f0/95/0ea0a2412e33080a47ec02802210c008a7a540471581c95145f030d304b4/scikit_learn-1.2.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (9.8 MB)
[K     |████████████████████████████████| 9.8 MB 22.2 MB/s eta 0:00:01
Collecting threadpoolctl>=2.0.0
  Downloading https://repo.huaweicloud.com/repository/pypi/packages/61/cf/6e354304bcb9c6413c4e02a747b600061c21d38ba51e7e544ac7bc66aecc/threadpoolctl-3.1.0-py3-none-any.whl (14 kB)
Collecting scipy>=1.3.2
  Downloading https://repo.huaweicloud.com/repository/pypi/packages/56/af/6a2b90fe280e89466d84747054667f74b84a8304f75931a173090919991f/scipy-1.9.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (33.8 MB)
[K     |████████████████████████████████| 33.8 MB 54.7 MB/s eta 0:00:01
[?25hCollecting scikit-learn
  Downloading https://repo.huaweicloud.com/repository/pypi/

In [6]:
import math
from tqdm import tqdm
import numpy as np
from transformers import MT5ForConditionalGeneration, T5Tokenizer
import torch
import torch.nn as nn
from sklearn.metrics import accuracy_score

In [7]:
class SoftEmbedding(nn.Module):
    def __init__(self, 
                wte: nn.Embedding,
                n_tokens: int = 10, 
                random_range: float = 0.5,
                initialize_from_vocab: bool = True):
        #它的构造函数需要传入一个原始的transformer word embedding（wte），
        #n_tokens表示任务需要的标记数量，默认为10
        #random_range表示初始化embedding时使用的随机范围，默认为0.5，
        #initialize_from_vocab表示是否从默认的vocab中初始化embedding，默认为True
        #    这个类用来给模型附加一个用于学习的embedding
        super(SoftEmbedding, self).__init__()
        self.wte = wte
        self.n_tokens = n_tokens
        #这行代码的作用是将 initialize_embedding 方法返回的 tensor 转化为 nn.Parameter，并将其赋值给 self.learned_embedding。
        #nn.parameter.Parameter 是 nn.Parameter 的别名，用于将一个 tensor 转化为一个可优化的参数。在 PyTorch 中，需要优化的参数都需要被包装为 nn.Parameter 类型。
      #nn.parameter.Parameter 的输入可以是任意 shape 的 tensor，它会被包装为一个 nn.Parameter 对象，并保存在一个 module 中。
    #在这个 module 中，这个 nn.Parameter 对象可以像其他 PyTorch 的 tensor 一样使用，但它还可以被注册到 optimizer 中，从而被优化器更新它的值。
    #通常情况下，nn.Embedding 的嵌入向量是随机初始化的，并不是根据特定的预先训练好的词向量来初始化的。但是，也可以将预训练好的词向量加载到 nn.Embedding 中，以便更好地初始化嵌入向量。
        self.learned_embedding = nn.parameter.Parameter(self.initialize_embedding(wte,
                                                                                  n_tokens, 
                                                                                  random_range, 
                                                                                initialize_from_vocab))
 #(n_tokens, embedding_size)，其中 embedding_size 是原始 Transformer word embedding 的维度（通常是 768 或 1024 等）。
#该 tensor 的每一行表示一个 token 的 embedding。           
    def initialize_embedding(self, 
                             wte: nn.Embedding,
                             n_tokens: int = 10, 
                             random_range: float = 0.5, 
                             initialize_from_vocab: bool = True):
        """initializes learned embedding
        Args:
            same as __init__
        Returns:
            torch.float: initialized using original schemes
        """
         # 有两种初始化方式，一种是从预训练模型copy一部分token，进行训练
        # 另一种是随机生成一部分训练
        # 结果上来说区别不大
 #如果 initialize_from_vocab 为 True，则会从原始的 Transformer word embedding 中复制前 n_tokens 个 token 对应的 embedding 作为 self.learned_embedding 的初始值。
        #这里使用 .clone().detach() 的原因是为了避免共享原始 embedding 的权重，从而使得 self.learned_embedding 的值独立于原始 embedding 的值
        if initialize_from_vocab:
            return self.wte.weight[:n_tokens].clone().detach()
#如果 initialize_from_vocab 为 False，则会随机生成一个形状为 (n_tokens, embedding_size) 的 tensor 作为 self.learned_embedding 的初始值，
#其中 tensor 中每个元素的值都在 [-random_range, random_range] 的范围内随机生成。
#这种初始化方式通常用于在没有预训练模型的情况下训练自定义任务。
        return torch.FloatTensor(n_tokens, wte.weight.size(1)).uniform_(-random_range, random_range)
            
    def forward(self, tokens):
        """run forward pass
        Args:
            tokens (torch.long): input tokens before encoding
        Returns:
            torch.float: encoding of text concatenated with learned task specifc embedding
        """
#self.wte 是一个 nn.Embedding 类的实例，它接收一个大小为 (batch_size, sequence_length) 的张量作为输入，其中每个元素都是一个代表单词在词表中索引的整数。
#tokens 张量的形状为 (batch_size, sequence_length)        
        input_embedding = self.wte(tokens[:, self.n_tokens:])
        learned_embedding = self.learned_embedding.repeat(input_embedding.size(0), 1, 1)
         # 把我们新加入的固定长度的，用于代表任务的prompt embedding，和实际的embedding合并
        return torch.cat([learned_embedding, input_embedding], 1)

In [8]:
#zh-dataset-inews 是一个用于中文文本分类任务的数据集，包含了10个类别的新闻文本，每个类别有大约3000条数据。这个数据集在国内的一些NLP竞赛中被广泛使用。
!pip install zh-dataset-inews

Looking in indexes: https://repo.huaweicloud.com/repository/pypi/simple
Collecting zh-dataset-inews
  Downloading https://repo.huaweicloud.com/repository/pypi/packages/94/fc/ff3c59f09108ee6a8884cce9e1bd54ea6dda2af1b1de35cf3a11363f19b6/zh_dataset_inews-0.0.2-py3-none-any.whl (11.3 MB)
[K     |████████████████████████████████| 11.3 MB 69 kB/s eta 0:00:01     |█████████                       | 3.2 MB 1.6 MB/s eta 0:00:06
[?25hInstalling collected packages: zh-dataset-inews
Successfully installed zh-dataset-inews-0.0.2


In [9]:
#这行代码引入了 zh_dataset_inews 数据集中的训练集、验证集和测试集数据。
#title_train 是一个列表，包含了训练集中所有新闻的标题，label_train 是一个列表，包含了训练集中所有新闻的标签。
#title_dev 和 label_dev 分别是验证集中的新闻标题和标签，title_test 和 label_test 分别是测试集中的新闻标题和标签。
from zh_dataset_inews import title_train, label_train, title_dev, label_dev, title_test, label_test

In [10]:
def generate_data(batch_size, n_tokens, title_data, label_data):
#定义了一个labels列表，其中存储了三个Tensor，用于将标签转换为模型所需的形式。
    labels = [
        torch.tensor([[3]]),  # \x00
        torch.tensor([[4]]),  # \x01
        torch.tensor([[5]]),  # \x02
    ]

    def yield_data(x_batch, y_batch, l_batch):
# 对 x_batch 进行 padding 操作，将不同长度的文本填充到相同长度。
#x_batch 中的每个元素都是一个 token id 列表，通过 padding 后，这些列表会组成一个形状为 (batch_size, max_len) 的张量 x，
#其中 max_len 是 x_batch 中最长的 token id 列表的长度。 如果 batch_first=True，则 x 的第一个维度是 batch size，否则 x 的第二个维度是 batch size。
        x = torch.nn.utils.rnn.pad_sequence(x_batch, batch_first=True)
        y = torch.cat(y_batch, dim=0)
        m = (x > 0).to(torch.float32)
        decoder_input_ids = torch.full((x.size(0), n_tokens), 1)
        if torch.cuda.is_available():
            x = x.cuda()
            y = y.cuda()
            m = m.cuda()
            decoder_input_ids = decoder_input_ids.cuda()
        return x, y, m, decoder_input_ids, l_batch

    x_batch, y_batch, l_batch = [], [], []
    for x, y in zip(title_data, label_data):
        context = x
        inputs = tokenizer(context, return_tensors="pt")
        inputs['input_ids'] = torch.cat([torch.full((1, n_tokens), 1), inputs['input_ids']], 1)
        l_batch.append(y)
        y = labels[y]
        y = torch.cat([torch.full((1, n_tokens - 1), -100), y], 1)
        x_batch.append(inputs['input_ids'][0])
        y_batch.append(y)
        if len(x_batch) >= batch_size:
            yield yield_data(x_batch, y_batch, l_batch)
            x_batch, y_batch, l_batch = [], [], []

    if len(x_batch) > 0:
        yield yield_data(x_batch, y_batch, l_batch)
        x_batch, y_batch, l_batch = [], [], []

In [11]:
model = MT5ForConditionalGeneration.from_pretrained("google/mt5-base")
tokenizer = T5Tokenizer.from_pretrained("google/mt5-base")
n_tokens = 100
s_wte = SoftEmbedding(model.get_input_embeddings(), 
                      n_tokens=n_tokens, 
                      initialize_from_vocab=True)
model.set_input_embeddings(s_wte)
if torch.cuda.is_available():
    model = model.cuda()

Downloading (…)lve/main/config.json:   0%|          | 0.00/702 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/2.33G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/376 [00:00<?, ?B/s]

In [14]:
parameters = list(model.parameters())
for x in parameters[1:]:  # frozen
    x.requires_grad = False

In [15]:
parameters[0]

Parameter containing:
tensor([[ 1.7500e+00, -1.6719e+00,  2.4062e+00,  ...,  6.9580e-03,
         -9.8828e-01, -4.6875e-01],
        [ 8.5625e+00,  5.5625e+00, -1.7109e+00,  ...,  7.7812e+00,
         -5.2812e+00, -3.2188e+00],
        [ 6.8750e-01, -4.5312e-01,  5.7812e-01,  ...,  7.3828e-01,
         -3.0078e-01,  2.0312e-01],
        ...,
        [-4.9219e-01,  1.9141e-01, -4.3555e-01,  ..., -8.0469e-01,
         -4.3359e-01,  5.8594e-01],
        [ 4.9609e-01,  1.1797e+00,  3.7109e-01,  ...,  1.7090e-01,
         -2.5195e-01, -3.3789e-01],
        [ 1.6328e+00,  3.4961e-01,  3.9062e-01,  ...,  1.9336e-01,
         -7.4219e-01,  3.1836e-01]], device='cuda:0', requires_grad=True)

In [16]:
parameters[2]

Parameter containing:
tensor([[-1.3977e-02,  3.8818e-02,  5.7129e-02,  ...,  4.9316e-02,
         -8.1177e-03, -3.8147e-03],
        [ 6.3965e-02, -1.0193e-02, -2.0020e-02,  ..., -8.3618e-03,
         -1.1902e-02, -2.6978e-02],
        [-1.6357e-02, -4.4922e-02,  4.8584e-02,  ..., -1.6479e-02,
         -4.0039e-02,  6.3782e-03],
        ...,
        [ 7.7820e-03, -6.5918e-03, -3.9062e-03,  ...,  1.9165e-02,
          7.4863e-05, -2.6001e-02],
        [-1.4587e-02,  1.8433e-02, -2.6489e-02,  ..., -3.9062e-02,
         -4.0527e-02,  4.1992e-02],
        [ 7.8125e-02,  1.6602e-02,  6.4941e-02,  ...,  4.2152e-04,
          4.5166e-02, -1.1780e-02]], device='cuda:0')

In [17]:
for x, y, m, dii, true_labels in generate_data(8, n_tokens, title_train, label_train):
    assert dii.shape == y.shape
    print(y.shape)
    outputs = model(input_ids=x, labels=y, attention_mask=m, decoder_input_ids=dii)
    assert outputs['logits'].shape[:2] == y.shape
    pred_labels = outputs['logits'][:, -1, 3:6].argmax(-1).detach().cpu().numpy().tolist()
    break

torch.Size([8, 100])


In [18]:
batch_size = 16
n_epoch = 50
total_batch = math.ceil(len(title_train) / batch_size)
dev_total_batch = math.ceil(len(title_dev) / batch_size)
use_ce_loss = False
ce_loss = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(s_wte.parameters(), lr=0.5)

for epoch in range(n_epoch):
    print('epoch', epoch)

    all_true_labels = []
    all_pred_labels = []
    losses = []
    pbar = tqdm(enumerate(generate_data(batch_size, n_tokens, title_train, label_train)), total=total_batch)
    for i, (x, y, m, dii, true_labels) in pbar:
        all_true_labels += true_labels
        
        optimizer.zero_grad()
        outputs = model(input_ids=x, labels=y, attention_mask=m, decoder_input_ids=dii)
        pred_labels = outputs['logits'][:, -1, 3:6].argmax(-1).detach().cpu().numpy().tolist()
        all_pred_labels += pred_labels

        if use_ce_loss:
            logits = outputs['logits'][:, -1, 3:6]
            true_labels_tensor = torch.tensor(true_labels, dtype=torch.long).cuda()
            loss = ce_loss(logits, true_labels_tensor)
        else:
            loss = outputs.loss
        loss.backward()
        optimizer.step()
        loss_value = float(loss.detach().cpu().numpy().tolist()) / batch_size
        losses.append(loss_value)

        acc = accuracy_score(all_true_labels, all_pred_labels)
        pbar.set_description(f'train: loss={np.mean(losses):.4f}, acc={acc:.4f}')

    all_true_labels = []
    all_pred_labels = []
    losses = []
    with torch.no_grad():
        pbar = tqdm(enumerate(generate_data(batch_size, n_tokens, title_dev, label_dev)), total=dev_total_batch)
        for i, (x, y, m, dii, true_labels) in pbar:
            all_true_labels += true_labels
            outputs = model(input_ids=x, labels=y, attention_mask=m, decoder_input_ids=dii)
            loss = outputs.loss
            loss_value = float(loss.detach().cpu().numpy().tolist()) / batch_size
            losses.append(loss_value)
            pred_labels = outputs['logits'][:, -1, 3:6].argmax(-1).detach().cpu().numpy().tolist()
            all_pred_labels += pred_labels
            acc = accuracy_score(all_true_labels, all_pred_labels)
            pbar.set_description(f'dev: loss={np.mean(losses):.4f}, acc={acc:.4f}')

epoch 0


train: loss=0.8892, acc=0.2917: 100%|██████████| 335/335 [01:45<00:00,  3.16it/s]
dev: loss=0.4703, acc=0.4214: 100%|██████████| 63/63 [00:09<00:00,  6.71it/s]


epoch 1


train: loss=0.4553, acc=0.3925: 100%|██████████| 335/335 [01:46<00:00,  3.16it/s]
dev: loss=0.4243, acc=0.4214: 100%|██████████| 63/63 [00:09<00:00,  6.65it/s]


epoch 2


train: loss=0.6164, acc=0.3567: 100%|██████████| 335/335 [01:45<00:00,  3.16it/s]
dev: loss=0.4274, acc=0.4895: 100%|██████████| 63/63 [00:09<00:00,  6.72it/s]


epoch 3


train: loss=0.3784, acc=0.4075: 100%|██████████| 335/335 [01:45<00:00,  3.17it/s]
dev: loss=0.2968, acc=0.3073: 100%|██████████| 63/63 [00:09<00:00,  6.74it/s]


epoch 4


train: loss=0.2987, acc=0.3074: 100%|██████████| 335/335 [01:45<00:00,  3.16it/s]
dev: loss=0.2826, acc=0.4725: 100%|██████████| 63/63 [00:09<00:00,  6.71it/s]


epoch 5


train: loss=0.2108, acc=0.3671: 100%|██████████| 335/335 [01:45<00:00,  3.16it/s]
dev: loss=0.1473, acc=0.4765: 100%|██████████| 63/63 [00:09<00:00,  6.70it/s]


epoch 6


train: loss=0.5205, acc=0.4007: 100%|██████████| 335/335 [01:45<00:00,  3.16it/s]
dev: loss=0.3577, acc=0.4925: 100%|██████████| 63/63 [00:09<00:00,  6.72it/s]


epoch 7


train: loss=0.3455, acc=0.4351: 100%|██████████| 335/335 [01:45<00:00,  3.17it/s]
dev: loss=0.4046, acc=0.5455: 100%|██████████| 63/63 [00:09<00:00,  6.70it/s]


epoch 8


train: loss=0.2818, acc=0.4459: 100%|██████████| 335/335 [01:45<00:00,  3.16it/s]
dev: loss=0.2130, acc=0.4895: 100%|██████████| 63/63 [00:09<00:00,  6.70it/s]


epoch 9


train: loss=0.1414, acc=0.4504: 100%|██████████| 335/335 [01:45<00:00,  3.16it/s]
dev: loss=0.0850, acc=0.4925: 100%|██████████| 63/63 [00:09<00:00,  6.70it/s]


epoch 10


train: loss=0.0741, acc=0.4799: 100%|██████████| 335/335 [01:45<00:00,  3.17it/s]
dev: loss=0.0603, acc=0.4795: 100%|██████████| 63/63 [00:09<00:00,  6.72it/s]


epoch 11


train: loss=0.0725, acc=0.4915: 100%|██████████| 335/335 [01:45<00:00,  3.16it/s]
dev: loss=0.0660, acc=0.4925: 100%|██████████| 63/63 [00:09<00:00,  6.72it/s]


epoch 12


train: loss=0.0727, acc=0.4962: 100%|██████████| 335/335 [01:45<00:00,  3.17it/s]
dev: loss=0.0847, acc=0.4925: 100%|██████████| 63/63 [00:09<00:00,  6.70it/s]


epoch 13


train: loss=0.0691, acc=0.5113: 100%|██████████| 335/335 [01:46<00:00,  3.16it/s]
dev: loss=0.0858, acc=0.4925: 100%|██████████| 63/63 [00:09<00:00,  6.70it/s]


epoch 14


train: loss=0.0694, acc=0.5173: 100%|██████████| 335/335 [01:45<00:00,  3.17it/s]
dev: loss=0.0810, acc=0.5085: 100%|██████████| 63/63 [00:09<00:00,  6.71it/s]


epoch 15


train: loss=0.0659, acc=0.5453: 100%|██████████| 335/335 [01:45<00:00,  3.16it/s]
dev: loss=0.0769, acc=0.5255: 100%|██████████| 63/63 [00:09<00:00,  6.70it/s]


epoch 16


train: loss=0.0711, acc=0.5430: 100%|██████████| 335/335 [01:45<00:00,  3.17it/s]
dev: loss=0.1116, acc=0.4925: 100%|██████████| 63/63 [00:09<00:00,  6.69it/s]


epoch 17


train: loss=0.1799, acc=0.4387: 100%|██████████| 335/335 [01:45<00:00,  3.17it/s]
dev: loss=0.0769, acc=0.4925: 100%|██████████| 63/63 [00:09<00:00,  6.72it/s]


epoch 18


train: loss=0.0714, acc=0.4883: 100%|██████████| 335/335 [01:46<00:00,  3.16it/s]
dev: loss=0.0893, acc=0.4965: 100%|██████████| 63/63 [00:09<00:00,  6.71it/s]


epoch 19


train: loss=0.0694, acc=0.5074: 100%|██████████| 335/335 [01:45<00:00,  3.17it/s]
dev: loss=0.0842, acc=0.5025: 100%|██████████| 63/63 [00:09<00:00,  6.66it/s]


epoch 20


train: loss=0.0666, acc=0.5294: 100%|██████████| 335/335 [01:45<00:00,  3.16it/s]
dev: loss=0.0783, acc=0.6076: 100%|██████████| 63/63 [00:09<00:00,  6.68it/s]


epoch 21


train: loss=0.0640, acc=0.5507: 100%|██████████| 335/335 [01:46<00:00,  3.16it/s]
dev: loss=0.0558, acc=0.5866: 100%|██████████| 63/63 [00:09<00:00,  6.71it/s]


epoch 22


train: loss=0.0623, acc=0.5705: 100%|██████████| 335/335 [01:46<00:00,  3.16it/s]
dev: loss=0.0573, acc=0.6226: 100%|██████████| 63/63 [00:09<00:00,  6.64it/s]


epoch 23


train: loss=0.0601, acc=0.5963: 100%|██████████| 335/335 [01:46<00:00,  3.16it/s]
dev: loss=0.0668, acc=0.6587: 100%|██████████| 63/63 [00:09<00:00,  6.72it/s]


epoch 24


train: loss=0.0597, acc=0.5996: 100%|██████████| 335/335 [01:45<00:00,  3.16it/s]
dev: loss=0.0550, acc=0.6166: 100%|██████████| 63/63 [00:09<00:00,  6.69it/s]


epoch 25


train: loss=0.0607, acc=0.6090: 100%|██████████| 335/335 [01:46<00:00,  3.16it/s]
dev: loss=0.0659, acc=0.6817: 100%|██████████| 63/63 [00:09<00:00,  6.67it/s]


epoch 26


train: loss=0.0601, acc=0.6176: 100%|██████████| 335/335 [01:45<00:00,  3.16it/s]
dev: loss=0.0692, acc=0.6807: 100%|██████████| 63/63 [00:09<00:00,  6.73it/s]


epoch 27


train: loss=0.0570, acc=0.6409: 100%|██████████| 335/335 [01:45<00:00,  3.16it/s]
dev: loss=0.0596, acc=0.6907: 100%|██████████| 63/63 [00:09<00:00,  6.72it/s]


epoch 28


train: loss=0.0718, acc=0.6088: 100%|██████████| 335/335 [01:45<00:00,  3.17it/s]
dev: loss=0.0705, acc=0.6156: 100%|██████████| 63/63 [00:09<00:00,  6.70it/s]


epoch 29


train: loss=0.0598, acc=0.6317: 100%|██████████| 335/335 [01:45<00:00,  3.17it/s]
dev: loss=0.0630, acc=0.6947: 100%|██████████| 63/63 [00:09<00:00,  6.73it/s]


epoch 30


train: loss=0.0540, acc=0.6652: 100%|██████████| 335/335 [01:45<00:00,  3.16it/s]
dev: loss=0.0614, acc=0.7037: 100%|██████████| 63/63 [00:09<00:00,  6.71it/s]


epoch 31


train: loss=0.0539, acc=0.6691: 100%|██████████| 335/335 [01:45<00:00,  3.16it/s]
dev: loss=0.0634, acc=0.6967: 100%|██████████| 63/63 [00:09<00:00,  6.69it/s]


epoch 32


train: loss=0.0522, acc=0.6723: 100%|██████████| 335/335 [01:45<00:00,  3.17it/s]
dev: loss=0.0603, acc=0.7127: 100%|██████████| 63/63 [00:09<00:00,  6.69it/s]


epoch 33


train: loss=0.0501, acc=0.6896: 100%|██████████| 335/335 [01:46<00:00,  3.16it/s]
dev: loss=0.0530, acc=0.7007: 100%|██████████| 63/63 [00:09<00:00,  6.67it/s]


epoch 34


train: loss=0.0519, acc=0.6917: 100%|██████████| 335/335 [01:45<00:00,  3.16it/s]
dev: loss=0.0478, acc=0.7327: 100%|██████████| 63/63 [00:09<00:00,  6.63it/s]


epoch 35


train: loss=0.0516, acc=0.6913: 100%|██████████| 335/335 [01:46<00:00,  3.16it/s]
dev: loss=0.0609, acc=0.7157: 100%|██████████| 63/63 [00:09<00:00,  6.70it/s]


epoch 36


train: loss=0.6932, acc=0.3669: 100%|██████████| 335/335 [01:45<00:00,  3.16it/s]
dev: loss=0.2883, acc=0.4765: 100%|██████████| 63/63 [00:09<00:00,  6.71it/s]


epoch 37


train: loss=0.1852, acc=0.4125: 100%|██████████| 335/335 [01:45<00:00,  3.16it/s]
dev: loss=0.0781, acc=0.4915: 100%|██████████| 63/63 [00:09<00:00,  6.67it/s]


epoch 38


train: loss=0.0802, acc=0.4642: 100%|██████████| 335/335 [01:46<00:00,  3.16it/s]
dev: loss=0.0654, acc=0.4925: 100%|██████████| 63/63 [00:09<00:00,  6.69it/s]


epoch 39


train: loss=0.0703, acc=0.4553: 100%|██████████| 335/335 [01:45<00:00,  3.17it/s]
dev: loss=0.0619, acc=0.4915: 100%|██████████| 63/63 [00:09<00:00,  6.74it/s]


epoch 40


train: loss=0.0678, acc=0.4855: 100%|██████████| 335/335 [01:45<00:00,  3.17it/s]
dev: loss=0.0640, acc=0.4925: 100%|██████████| 63/63 [00:09<00:00,  6.68it/s]


epoch 41


train: loss=0.0653, acc=0.4803: 100%|██████████| 335/335 [01:45<00:00,  3.16it/s]
dev: loss=0.0623, acc=0.4915: 100%|██████████| 63/63 [00:09<00:00,  6.73it/s]


epoch 42


train: loss=0.0653, acc=0.4958: 100%|██████████| 335/335 [01:45<00:00,  3.16it/s]
dev: loss=0.0618, acc=0.4925: 100%|██████████| 63/63 [00:09<00:00,  6.71it/s]


epoch 43


train: loss=0.1161, acc=0.4689: 100%|██████████| 335/335 [01:45<00:00,  3.16it/s]
dev: loss=0.0717, acc=0.4925: 100%|██████████| 63/63 [00:09<00:00,  6.69it/s]


epoch 44


train: loss=0.0676, acc=0.4874: 100%|██████████| 335/335 [01:45<00:00,  3.16it/s]
dev: loss=0.0656, acc=0.4925: 100%|██████████| 63/63 [00:09<00:00,  6.68it/s]


epoch 45


train: loss=0.0649, acc=0.5023: 100%|██████████| 335/335 [01:45<00:00,  3.16it/s]
dev: loss=0.0622, acc=0.4925: 100%|██████████| 63/63 [00:09<00:00,  6.69it/s]


epoch 46


train: loss=0.0635, acc=0.5169: 100%|██████████| 335/335 [01:45<00:00,  3.17it/s]
dev: loss=0.0640, acc=0.4925: 100%|██████████| 63/63 [00:09<00:00,  6.68it/s]


epoch 47


train: loss=0.0644, acc=0.5199: 100%|██████████| 335/335 [01:45<00:00,  3.17it/s]
dev: loss=0.0653, acc=0.4955: 100%|██████████| 63/63 [00:09<00:00,  6.70it/s]


epoch 48


train: loss=0.0648, acc=0.5119: 100%|██████████| 335/335 [01:45<00:00,  3.16it/s]
dev: loss=0.0676, acc=0.4905: 100%|██████████| 63/63 [00:09<00:00,  6.68it/s]


epoch 49


train: loss=0.0680, acc=0.4988: 100%|██████████| 335/335 [01:45<00:00,  3.17it/s]
dev: loss=0.0667, acc=0.4925: 100%|██████████| 63/63 [00:09<00:00,  6.71it/s]


In [20]:
parameters2 = list(model.parameters())

In [21]:
!nvidia-smi

Tue Mar  7 12:28:36 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.89.02    Driver Version: 525.89.02    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-PCIE...  On   | 00000000:67:03.0 Off |                  Off |
| N/A   37C    P0    37W / 250W |  18130MiB / 32768MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [22]:
parameters2[0]

Parameter containing:
tensor([[-12.1830, -30.1978, -20.4185,  ..., -27.8833,  15.5076,   8.6759],
        [-18.3980,  27.7653, -24.2388,  ...,  10.0886,  15.9386,  -1.8228],
        [-15.5309,   1.9090,   4.1854,  ..., -15.4622,   7.6856,   9.5103],
        ...,
        [ 32.2267,  -8.5860, -27.1125,  ..., -16.3302, -17.3018,  -6.5299],
        [ 10.1303,   1.3383, -12.2403,  ...,  16.5293,  -9.3344,  -0.4445],
        [  7.7325,  32.6109,  -3.6557,  ...,  22.8400,   9.9415,   9.7184]],
       device='cuda:0', requires_grad=True)

In [23]:
parameters2[2]

Parameter containing:
tensor([[-1.3977e-02,  3.8818e-02,  5.7129e-02,  ...,  4.9316e-02,
         -8.1177e-03, -3.8147e-03],
        [ 6.3965e-02, -1.0193e-02, -2.0020e-02,  ..., -8.3618e-03,
         -1.1902e-02, -2.6978e-02],
        [-1.6357e-02, -4.4922e-02,  4.8584e-02,  ..., -1.6479e-02,
         -4.0039e-02,  6.3782e-03],
        ...,
        [ 7.7820e-03, -6.5918e-03, -3.9062e-03,  ...,  1.9165e-02,
          7.4863e-05, -2.6001e-02],
        [-1.4587e-02,  1.8433e-02, -2.6489e-02,  ..., -3.9062e-02,
         -4.0527e-02,  4.1992e-02],
        [ 7.8125e-02,  1.6602e-02,  6.4941e-02,  ...,  4.2152e-04,
          4.5166e-02, -1.1780e-02]], device='cuda:0')

In [24]:
def predict(text):
    inputs = tokenizer(text, return_tensors='pt')
    inputs['input_ids'] = torch.cat([torch.full((1, n_tokens), 1), inputs['input_ids']], 1)

    decoder_input_ids = torch.full((1, n_tokens), 1)
    with torch.no_grad():
        outputs = model(input_ids=inputs['input_ids'].cuda(), decoder_input_ids=decoder_input_ids.cuda())
    logits = outputs['logits'][:, -1, 3:6]
    pred = logits.argmax(-1).detach().cpu().numpy()[0]
    # print(logits)
    return pred

In [25]:
train_rets = []
for i in tqdm(range(len(title_train))):
    pred = predict(title_train[i])
    train_rets.append((label_train[i], pred, title_train[i]))

100%|██████████| 5355/5355 [04:49<00:00, 18.48it/s]


In [26]:
rets = []
for i in tqdm(range(len(title_test))):
    pred = predict(title_test[i])
    rets.append((label_test[i], pred, title_test[i]))

100%|██████████| 999/999 [00:52<00:00, 19.07it/s]


In [27]:
print(
    accuracy_score(
        [x[0] for x in train_rets],
        [x[1] for x in train_rets],
    )
)

0.5017740429505135


In [28]:
print(
    accuracy_score(
        [x[0] for x in rets],
        [x[1] for x in rets],
    )
)

0.4994994994994995


In [29]:
print(
    accuracy_score(
        [x[0] for x in rets],
        [0] * len(rets),
    ),
    accuracy_score(
        [x[0] for x in rets],
        [1] * len(rets),
    ),
    accuracy_score(
        [x[0] for x in rets],
        [2] * len(rets),
    )
)

0.0990990990990991 0.4944944944944945 0.4064064064064064
