# Build a Transformer from scratch

In [None]:
import os
from os import listdir

In [None]:
os.listdir()

['data.py', 'mask.py', 'utils.py', 'model.py', '__pycache__']

In [None]:
os.chdir('/content/drive/MyDrive/transformer')

In [None]:
import torch

from model import Transformer
from data import v2i, i2v, loader
from mask import mask_pad, mask_tril
from tqdm import tqdm


def predict(x):
    '''
    :param x: [1, n]
    :return: predicted sentence
    '''
    n = x.shape[1]
    mask_pad_x = mask_pad(x)
    pred = [v2i['<START>']] + [v2i['<PAD>']] * (n - 1)
    pred = torch.LongTensor(pred).unsqueeze(0)
    x = transformer.embed_x(x)
    x = transformer.encoder(x, mask_pad_x)

    for i in range(n - 1):
        y = pred
        mask_tril_y = mask_tril(y)
        y = transformer.embed_y(y)
        y = transformer.decoder(x, y, mask_pad_x, mask_tril_y)
        out = transformer.fc_out(y)
        out = out[:, i, :].argmax(dim=1).detach()
        pred[:, i + 1] = out

    return pred

In [None]:
lr = 1e-3
words_size = len(v2i)
embed_dims = 32
drop_rate = 0.1
heads = 4
sens_len = 50
epochs = 1

In [None]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [None]:
transformer = Transformer(words_size=words_size, sens_len=sens_len, heads=heads, embed_dims=embed_dims,
                          drop_rate=drop_rate)
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(transformer.parameters(), lr=lr)

for epoch in range(epochs):
    pbar = tqdm(enumerate(loader), total=len(loader))
    for i, (x, y) in pbar:
        pred = transformer(x, y[:, :-1])
        pred = pred.reshape(-1, words_size)
        y=y[:, 1:].reshape(-1)
        selected = y!=v2i['<PAD>']
        pred = pred[selected]
        y = y[selected]
        loss = loss_fn(pred, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        pred = pred.argmax(dim=1)
        pbar.set_description(f"epoch {epoch + 1} iter {i}: train loss {loss.item():.5f}. lr {lr:e} acc {((pred == y).sum().item())/len(pred)}")

epoch 1 iter 6249: train loss 0.07620. lr 1.000000e-03 acc 0.9819548872180451: 100%|██████████| 6250/6250 [06:56<00:00, 15.00it/s]


In [None]:
torch.save(transformer.state_dict(), 'transformer.pt')

In [None]:
transformer.load_state_dict(torch.load('transformer.pt'))

<All keys matched successfully>

In [None]:
def toStr(data):
  data = data.tolist()
  str = ''
  for idx in data:
    str += i2v[idx]
  return str

In [None]:
for i, (x, y) in enumerate(loader):
    print('x:', toStr(x[0]))
    print('y:', toStr(y[0]))
    print('pred:', toStr(predict(x[0].unsqueeze(0))[0]))
    break

x: <START>ODM5KYWBSGBRNLFTUS8DUJEHVD6SNFFHWD2VF6NY<END><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD>
y: <START><GGLS>YYN6FV2DWHFFNS6DVHEJUD8SUTFLNRBGSBWYK5MDO<END><PAD><PAD><PAD><PAD><PAD><PAD><PAD>
pred: <START><GGLS>YYN6FV2DWHFFNS6DVHEJUD8SUTFLNRBGSBWYK5MDO<END>D<END>M<END><END><END>


# Pretrained Model From HuggingFace

## Environments Preparing

In [54]:
!pip install transformers
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [55]:
!python -c "from transformers import pipeline; print(pipeline('sentiment-analysis')('we love you'))"

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
[{'label': 'POSITIVE', 'score': 0.9998704195022583}]


In [56]:
import torch
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cpu


## Sentiment Analysis

In [None]:
from datasets import load_dataset

class Dataset(torch.utils.data.Dataset):
  def __init__(self, split):
    self.dataset = load_dataset(path='seamew/ChnSentiCorp', split=split)

  def __len__(self):
    return len(self.dataset)

  def __getitem__(self, i):
    text = self.dataset[i]['text']
    label = self.dataset[i]['label']
    return text, label

# train_dataset, test_dataset = Dataset('train'), Dataset('test')

In [None]:
# len(train_dataset), train_dataset[0], len(test_dataset), test_dataset[0]

In [None]:
from transformers import AutoTokenizer, BertModel

tokenizer = AutoTokenizer.from_pretrained('hfl/chinese-roberta-wwm-ext')

pretrained = BertModel.from_pretrained('hfl/chinese-roberta-wwm-ext').to(device)

Some weights of the model checkpoint at hfl/chinese-roberta-wwm-ext were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
tokenizer

PreTrainedTokenizerFast(name_or_path='hfl/chinese-roberta-wwm-ext', vocab_size=21128, model_max_len=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [None]:
pretrained

In [None]:
def collate_fn(data):
    sents = [i[0] for i in data]
    labels = [i[1] for i in data]

    data = tokenizer.batch_encode_plus(batch_text_or_text_pairs=sents,
                                   truncation=True,
                                   padding='max_length',
                                   max_length=500,
                                   return_tensors='pt',
                                   return_length=True)

    #input_ids:编码之后的数字
    #attention_mask:是补零的位置是0,其他位置是1
    input_ids = data['input_ids']
    attention_mask = data['attention_mask']
    token_type_ids = data['token_type_ids']
    labels = torch.LongTensor(labels)

    return input_ids.to(device), attention_mask.to(device), token_type_ids.to(device), labels.to(device)

loader = torch.utils.data.DataLoader(dataset=Dataset('train'),
                                     batch_size=16,
                                     collate_fn=collate_fn,
                                     shuffle=True,
                                     drop_last=True)

for i, (input_ids, attention_mask, token_type_ids,
        labels) in enumerate(loader):
    # input_ids.to(device), attention_mask.to(device), token_type_ids.to(device)
    break

print(len(loader))
input_ids.shape, attention_mask.shape, token_type_ids.shape, labels



600


(torch.Size([16, 500]),
 torch.Size([16, 500]),
 torch.Size([16, 500]),
 tensor([0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0], device='cuda:0'))

In [None]:
for params in pretrained.parameters():
  params.requires_grad_(False)

pretrained(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids).last_hidden_state.shape

torch.Size([16, 500, 768])

In [None]:
class Model(torch.nn.Module):
  def __init__(self):
    super().__init__()
    self.fc1 = torch.nn.Linear(768, 2)
    # self.fc2 = torch.nn.Linear(1024, 2)

  def forward(self, input_ids, attention_mask, token_type_ids):
    with torch.no_grad():
      out = pretrained(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)

    out = self.fc1(out.last_hidden_state[:, 0])
    # out = self.fc2(out)
    out = out.softmax(dim=1)

    return out

In [None]:
model = Model().to(device)
model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids).shape

torch.Size([16, 2])

In [None]:
from transformers import AdamW
lr = 3e-4
optimizer = AdamW(model.parameters(), lr=lr)
loss_fn = torch.nn.CrossEntropyLoss()



In [None]:
from tqdm import tqdm

epochs = 10

for epoch in range(epochs):
    pbar = tqdm(enumerate(loader), total=len(loader))
    for i, (input_ids, attention_mask, token_type_ids, labels) in pbar:
        pred = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        loss = loss_fn(pred, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        pred = pred.argmax(dim=1)
        pbar.set_description(f"epoch {epoch + 1} iter {i}: train loss {loss.item():.5f}. lr {lr:e} acc {((pred == labels).sum().item())/len(pred)}")

epoch 1 iter 74: train loss 0.57248. lr 3.000000e-04 acc 0.8125: 100%|██████████| 75/75 [05:10<00:00,  4.14s/it]
epoch 2 iter 74: train loss 0.51837. lr 3.000000e-04 acc 0.90625: 100%|██████████| 75/75 [05:27<00:00,  4.37s/it]
epoch 3 iter 74: train loss 0.48793. lr 3.000000e-04 acc 0.8828125: 100%|██████████| 75/75 [05:27<00:00,  4.37s/it]
epoch 4 iter 74: train loss 0.51333. lr 3.000000e-04 acc 0.8515625: 100%|██████████| 75/75 [05:27<00:00,  4.37s/it]
epoch 5 iter 74: train loss 0.47276. lr 3.000000e-04 acc 0.90625: 100%|██████████| 75/75 [05:27<00:00,  4.37s/it]
epoch 6 iter 74: train loss 0.47329. lr 3.000000e-04 acc 0.8515625: 100%|██████████| 75/75 [05:27<00:00,  4.36s/it]
epoch 7 iter 74: train loss 0.46494. lr 3.000000e-04 acc 0.84375: 100%|██████████| 75/75 [05:27<00:00,  4.36s/it]
epoch 8 iter 74: train loss 0.46827. lr 3.000000e-04 acc 0.8828125: 100%|██████████| 75/75 [05:27<00:00,  4.37s/it]
epoch 9 iter 74: train loss 0.48457. lr 3.000000e-04 acc 0.8359375: 100%|████████

In [None]:
torch.save(model.state_dict(), 'sa.pt')

In [None]:
model.load_state_dict(torch.load('sa.pt'))

<All keys matched successfully>

In [None]:
def eval_():
  loader = torch.utils.data.DataLoader(dataset=Dataset('test'),
                                     batch_size=16,
                                     collate_fn=collate_fn,
                                     shuffle=True,
                                     drop_last=True)
  for epoch in range(1):
    correct = 0
    len = 0
    for i, (input_ids, attention_mask, token_type_ids, labels) in enumerate(loader):
        pred = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        pred = pred.argmax(dim=1)
        print(tokenizer.decode(input_ids[0]))
        print('label[0]:', labels[0].item(), 'pred[0]:', pred[0].item())
        print('acc:', ((pred == labels).sum().item())/pred.shape[0])
        correct += (pred == labels).sum().item()
        len += pred.shape[0]
        if i == 20:
          break
    print('total acc:', correct/len)

eval_()



[CLS] 整 体 感 觉 不 错 ， 一 般 用 用 的 话 完 全 足 够 了 。 装 系 统 也 不 费 劲 ， 先 进 到 bios 下 面 把 ahci 改 成 ide ， 再 设 置 光 驱 启 动 （ 这 个 跟 原 来 不 到 一 样 ， 在 boot 下 面 按 f5 或 f6 把 第 二 项 cddvdw 调 到 第 一 个 ） ， 我 用 的 番 茄 花 园 ， 直 接 就 装 上 了 。 再 用 送 的 光 盘 挨 个 装 驱 动 就 可 以 了 [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] 

## Fill in the blank

In [None]:
from datasets import load_dataset

class Dataset(torch.utils.data.Dataset):
  def __init__(self, split):
    self.dataset = load_dataset(path='seamew/ChnSentiCorp', split=split)
    def f(data):
      return len(data['text'])>30
    self.dataset = self.dataset.filter(f)

  def __len__(self):
    return len(self.dataset)

  def __getitem__(self, i):
    text = self.dataset[i]['text']
    return text

train_dataset, test_dataset = Dataset('train'), Dataset('test')
len(train_dataset), train_dataset[0], len(test_dataset), test_dataset[0]



  0%|          | 0/10 [00:00<?, ?ba/s]



  0%|          | 0/2 [00:00<?, ?ba/s]

(9192,
 '选择珠江花园的原因就是方便，有电动扶梯直接到达海边，周围餐馆、食廊、商场、超市、摊位一应俱全。酒店装修一般，但还算整洁。 泳池在大堂的屋顶，因此很小，不过女儿倒是喜欢。 包的早餐是西式的，还算丰富。 服务吗，一般',
 1145,
 '怀着十分激动的心情放映，可是看着看着发现，在放映完毕后，出现一集米老鼠的动画片！开始还怀疑是不是赠送的个别现象，可是后来发现每张DVD后面都有！真不知道生产商怎么想的，我想看的是猫和老鼠，不是米老鼠！如果厂家是想赠送的话，那就全套米老鼠和唐老鸭都赠送，只在每张DVD后面添加一集算什么？？简直是画蛇添足！！')

In [None]:
from transformers import AutoTokenizer, BertModel

tokenizer = AutoTokenizer.from_pretrained('hfl/chinese-roberta-wwm-ext')

pretrained = BertModel.from_pretrained('hfl/chinese-roberta-wwm-ext').to(device)

Some weights of the model checkpoint at hfl/chinese-roberta-wwm-ext were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
def collate_fn(data):
    data = tokenizer.batch_encode_plus(batch_text_or_text_pairs=data,
                                   truncation=True,
                                   padding='max_length',
                                   max_length=50,
                                   return_tensors='pt',
                                   return_length=True)
    
    input_ids = data['input_ids']
    attention_mask = data['attention_mask']
    token_type_ids = data['token_type_ids']

    #把第15个词固定替换为mask
    labels = input_ids[:, 15].reshape(-1).clone()
    input_ids[:, 15] = tokenizer.get_vocab()[tokenizer.mask_token]

    return input_ids.to(device), attention_mask.to(device), token_type_ids.to(device), labels.to(device)

loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                     batch_size=64,
                                     collate_fn=collate_fn,
                                     shuffle=True,
                                     drop_last=True)

for i, (input_ids, attention_mask, token_type_ids,
        labels) in enumerate(loader):
    break

print(len(loader))
print(tokenizer.decode(input_ids[0]))
print(tokenizer.decode(labels[0]))
input_ids.shape, attention_mask.shape, token_type_ids.shape, labels.shape

143
[CLS] 驱 动 程 序 安 装 比 较 麻 烦 ， xp 安 装 [MASK] 卡 驱 动 的 时 候 老 不 成 功 ， 后 来 发 现 是 没 有 打 hd 声 卡 的 补 丁 ， 郁 闷 了 半 天 ！ 呵 [SEP]
声


(torch.Size([64, 50]),
 torch.Size([64, 50]),
 torch.Size([64, 50]),
 torch.Size([64]))

In [None]:
for params in pretrained.parameters():
  params.requires_grad_(False)

pretrained(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids).last_hidden_state.shape

torch.Size([64, 50, 768])

In [None]:
class Model(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.fc = torch.nn.Linear(768, tokenizer.vocab_size)

    def forward(self, input_ids, attention_mask, token_type_ids):
        with torch.no_grad():
            out = pretrained(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)

        out = self.fc(out.last_hidden_state[:, 15])

        return out

In [None]:
model = Model().to(device)
model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids).shape

torch.Size([64, 21128])

In [None]:
from transformers import AdamW
lr = 3e-4
optimizer = AdamW(model.parameters(), lr=lr)
loss_fn = torch.nn.CrossEntropyLoss()



In [None]:
from tqdm import tqdm

epochs = 10

for epoch in range(epochs):
    pbar = tqdm(enumerate(loader), total=len(loader))
    for i, (input_ids, attention_mask, token_type_ids, labels) in pbar:
        pred = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        loss = loss_fn(pred, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        pred = pred.argmax(dim=1)
        pbar.set_description(f"epoch {epoch + 1} iter {i}: train loss {loss.item():.5f}. lr {lr:e} acc {((pred == labels).sum().item())/len(pred)}")

epoch 1 iter 142: train loss 5.58074. lr 3.000000e-04 acc 0.328125: 100%|██████████| 143/143 [00:31<00:00,  4.49it/s]
epoch 2 iter 142: train loss 4.03321. lr 3.000000e-04 acc 0.375: 100%|██████████| 143/143 [00:30<00:00,  4.66it/s]
epoch 3 iter 142: train loss 3.84832. lr 3.000000e-04 acc 0.328125: 100%|██████████| 143/143 [00:33<00:00,  4.27it/s]
epoch 4 iter 142: train loss 2.59680. lr 3.000000e-04 acc 0.578125: 100%|██████████| 143/143 [00:32<00:00,  4.33it/s]
epoch 5 iter 142: train loss 1.93086. lr 3.000000e-04 acc 0.6875: 100%|██████████| 143/143 [00:30<00:00,  4.69it/s]
epoch 6 iter 142: train loss 1.77666. lr 3.000000e-04 acc 0.6875: 100%|██████████| 143/143 [00:32<00:00,  4.37it/s]
epoch 7 iter 142: train loss 1.51500. lr 3.000000e-04 acc 0.71875: 100%|██████████| 143/143 [00:32<00:00,  4.40it/s]
epoch 8 iter 142: train loss 1.27297. lr 3.000000e-04 acc 0.796875: 100%|██████████| 143/143 [00:32<00:00,  4.34it/s]
epoch 9 iter 142: train loss 0.74662. lr 3.000000e-04 acc 0.8593

In [None]:
def eval_():
  loader = torch.utils.data.DataLoader(dataset=Dataset('test'),
                                     batch_size=16,
                                     collate_fn=collate_fn,
                                     shuffle=True,
                                     drop_last=True)
  for epoch in range(1):
    correct = 0
    len = 0
    for i, (input_ids, attention_mask, token_type_ids, labels) in enumerate(loader):
        pred = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        pred = pred.argmax(dim=1)
        print(tokenizer.decode(input_ids[0]))
        print('label[0]:', tokenizer.decode(labels[0]), 'pred[0]:', tokenizer.decode(pred[0]))
        print('acc:', ((pred == labels).sum().item())/pred.shape[0])
        correct += (pred == labels).sum().item()
        len += pred.shape[0]
        if i == 10:
          break
    print('total acc:', correct/len)

eval_()



[CLS] 写 的 确 实 不 好 。 没 有 文 采 。 和 外 [MASK] 也 没 有 多 大 关 系 。 真 不 知 这 本 书 怎 么 这 么 多 评 论 。 忽 悠 人 呀 。 呵 呵 。 [SEP] [PAD] [PAD] [PAD]
label[0]: 企 pred[0]: 观
acc: 0.625
[CLS] 选 择 的 事 例 太 离 奇 了 ， 夸 大 了 心 [MASK] 咨 询 的 现 实 意 义 ， 让 人 失 去 了 信 任 感 ！ 如 果 说 这 样 写 的 效 果 能 在 一 开 始 抓 住 [SEP]
label[0]: 理 pred[0]: 理
acc: 0.6875
[CLS] 感 动. 还 有 锥 心 的 悲 哀. 读 过 就 [MASK] 也 无 法 忘 怀. 很 怕 被 拍 成 影 视, 毁 掉 心 中 美 好 的 人 物 形 象. 太 感 动 [SEP] [PAD] [PAD] [PAD] [PAD]
label[0]: 再 pred[0]: 是
acc: 0.5625
[CLS] 配 置 强 大 ， 外 观 漂 亮 ， 发 货 很 快 [MASK] 第 一 天 下 单 ， 第 二 天 就 到 了 ， 爽 ！ ！ [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]
label[0]: ， pred[0]: ，
acc: 0.75
[CLS] 挺 悲 哀 的, 住 的 是 双 标 间 国 庆 是 [MASK], 在 杭 州 住 了 个 锦 华 之 旅 已 经 很 挫 了, 这 位 仁 兄 有 过 之 而 无 不 及, 依 然 的 一 [SEP]
label[0]: 230 pred[0]: 了
acc: 0.625
[CLS] 首 先 许 多 人 说 交 通 不 便 ， 那 诸 位 [MASK] 是 坐 惯 地 铁 了 ， 其 实 香 港 还 是 有 很 多 地 方 并 不 在 地 铁 站 15 分 钟 的 步 行 范 围 内 [SEP]
label[0]: 真 pred[0]: 真
acc: 0.6875
[CLS] 刚 买 不 到 一 周 ， 现 在 报 

## Relation between two parts

In [None]:
from datasets import load_dataset
import random
class Dataset(torch.utils.data.Dataset):
  def __init__(self, split):
    self.dataset = load_dataset(path='seamew/ChnSentiCorp', split=split)
    def f(data):
      return len(data['text'])>40
    self.dataset = self.dataset.filter(f)

  def __len__(self):
    return len(self.dataset)

  def __getitem__(self, i):
    text = self.dataset[i]['text']
    sens1 = text[:20]
    sens2 = text[20:40]
    label = 1

    if random.randint(0, 1) == 0:
      j = random.randint(0, len(self.dataset) - 1)
      sens2 = self.dataset[j]['text'][20:40]
      label = 0
    
    return sens1, sens2, label

In [None]:
from transformers import AutoTokenizer, BertModel

tokenizer = AutoTokenizer.from_pretrained('hfl/chinese-roberta-wwm-ext')

pretrained = BertModel.from_pretrained('hfl/chinese-roberta-wwm-ext').to(device)

Some weights of the model checkpoint at hfl/chinese-roberta-wwm-ext were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
def collate_fn(data):
    sents = [i[:2] for i in data]
    labels = [i[2] for i in data]
    data = tokenizer.batch_encode_plus(batch_text_or_text_pairs=sents,
                                   truncation=True,
                                   padding='max_length',
                                   max_length=50,
                                   return_tensors='pt',
                                   return_length=True,
                                   add_special_tokens=True)
    
    input_ids = data['input_ids']
    attention_mask = data['attention_mask']
    token_type_ids = data['token_type_ids']
    labels = torch.LongTensor(labels)

    return input_ids.to(device), attention_mask.to(device), token_type_ids.to(device), labels.to(device)

loader = torch.utils.data.DataLoader(dataset=Dataset('train'),
                                     batch_size=64,
                                     collate_fn=collate_fn,
                                     shuffle=True,
                                     drop_last=True)

for i, (input_ids, attention_mask, token_type_ids,
        labels) in enumerate(loader):
    break

print(len(loader))
print(tokenizer.decode(input_ids[0]))
print(labels)
input_ids.shape, attention_mask.shape, token_type_ids.shape, labels.shape



125
[CLS] 有 点 重 ， 是 个 遗 憾 。 能 买 这 么 小 的 笔 记 本 ， 就 [SEP] 化 、 表 达 娱 乐 化 ， 终 究 难 成 传 世 之 作 。 当 下 中 [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]
tensor([0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1,
        0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1], device='cuda:0')


(torch.Size([64, 50]),
 torch.Size([64, 50]),
 torch.Size([64, 50]),
 torch.Size([64]))

In [None]:
for params in pretrained.parameters():
  params.requires_grad_(False)

pretrained(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids).last_hidden_state.shape

torch.Size([64, 50, 768])

In [None]:
class Model(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.fc = torch.nn.Linear(768, 2)

    def forward(self, input_ids, attention_mask, token_type_ids):
        with torch.no_grad():
            out = pretrained(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)

        out = self.fc(out.last_hidden_state[:, 0])
        out = out.softmax(dim=1)
        return out

In [None]:
model = Model().to(device)
model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids).shape

torch.Size([64, 2])

In [None]:
from transformers import AdamW
lr = 3e-4
optimizer = AdamW(model.parameters(), lr=lr)
loss_fn = torch.nn.CrossEntropyLoss()

In [None]:
from tqdm import tqdm

epochs = 10

for epoch in range(epochs):
    pbar = tqdm(enumerate(loader), total=len(loader))
    for i, (input_ids, attention_mask, token_type_ids, labels) in pbar:
        pred = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        loss = loss_fn(pred, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        pred = pred.argmax(dim=1)
        pbar.set_description(f"epoch {epoch + 1} iter {i}: train loss {loss.item():.5f}. lr {lr:e} acc {((pred == labels).sum().item())/len(pred)}")

epoch 1 iter 124: train loss 0.62106. lr 3.000000e-04 acc 0.75: 100%|██████████| 125/125 [00:24<00:00,  5.12it/s]
epoch 2 iter 124: train loss 0.58472. lr 3.000000e-04 acc 0.765625: 100%|██████████| 125/125 [00:24<00:00,  5.06it/s]
epoch 3 iter 124: train loss 0.59849. lr 3.000000e-04 acc 0.78125: 100%|██████████| 125/125 [00:24<00:00,  5.09it/s]
epoch 4 iter 124: train loss 0.58367. lr 3.000000e-04 acc 0.734375: 100%|██████████| 125/125 [00:24<00:00,  5.05it/s]
epoch 5 iter 124: train loss 0.56347. lr 3.000000e-04 acc 0.78125: 100%|██████████| 125/125 [00:24<00:00,  5.07it/s]
epoch 6 iter 124: train loss 0.59242. lr 3.000000e-04 acc 0.703125: 100%|██████████| 125/125 [00:24<00:00,  5.07it/s]
epoch 7 iter 124: train loss 0.55611. lr 3.000000e-04 acc 0.75: 100%|██████████| 125/125 [00:24<00:00,  5.06it/s]
epoch 8 iter 124: train loss 0.52165. lr 3.000000e-04 acc 0.828125: 100%|██████████| 125/125 [00:24<00:00,  5.06it/s]
epoch 9 iter 124: train loss 0.54852. lr 3.000000e-04 acc 0.765625

In [None]:
def eval_():
  loader = torch.utils.data.DataLoader(dataset=Dataset('test'),
                                     batch_size=16,
                                     collate_fn=collate_fn,
                                     shuffle=True,
                                     drop_last=True)
  for epoch in range(1):
    correct = 0
    len = 0
    for i, (input_ids, attention_mask, token_type_ids, labels) in enumerate(loader):
        pred = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        pred = pred.argmax(dim=1)
        print(tokenizer.decode(input_ids[0]))
        print('label[0]:', labels[0].item(), 'pred[0]:', pred[0].item())
        print('acc:', ((pred == labels).sum().item())/pred.shape[0])
        correct += (pred == labels).sum().item()
        len += pred.shape[0]
        if i == 20:
          break
    print('total acc:', correct/len)

eval_()



[CLS] 交 通 位 置 很 好, 服 务 设 施 还 可 以. 但 价 格 有 点 [SEP] 盘 上 的 右 边 shift 键 设 计 不 好 。 太 长 了 。 [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]
label[0]: 0 pred[0]: 0
acc: 0.875
[CLS] 08 年 8 月 19 日 我 和 先 生 、 儿 子 入 住 武 夷 茶 [SEP] 苑 大 酒 店 ， 房 间 空 调 不 足 ， 我 们 开 窗 通 风 ， （ [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]
label[0]: 1 pred[0]: 0
acc: 0.75
[CLS] 这 套 书 是 买 给 儿 子 的 ， 小 家 伙 两 岁 半 不 到 ， 正 [SEP] 是 这 套 书 适 合 的 年 龄 阶 段 。 之 前 看 了 许 多 评 价 [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]
label[0]: 1 pred[0]: 1
acc: 0.8125
[CLS] 里 面 的 内 容 很 浅 ， 而 且 没 有 系 统 性 ， 没 有 买 和 [SEP] 地 点 好 ， 在 市 中 心 的 中 山 广 场 边 上 ， 上 哪 都 很 [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]
label[0]: 0 pred[0]: 0
acc: 0.8125
[CLS] 作 者 的 文 笔 还 行 ， 但 通 篇 感 觉 太 琐 碎 ， 有 点 文 [SEP] 人 的 无 病 呻 吟 。 自 由 主 义 者 。 作 者 的 品 性 不 敢 [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]
label[0]: 1 pred[0]: 1
acc: 0.75
[CLS] 显 示 屏 太 烂, 信 号 切 换 的 时 候 有 波 纹, 延 迟 高 [SEP] 拖 影 明 显, 触 控 板 差 到 极, 我 敢 说 99 % 的 人 [SEP] [PAD] [PA

## Named Entity Recognition

In [57]:
from datasets import load_dataset

class Dataset(torch.utils.data.Dataset):
  def __init__(self, split):
    self.dataset = load_dataset(path='peoples_daily_ner', split=split)
    def f(data):
            return len(data['tokens']) <= 512 - 2
    self.dataset = self.dataset.filter(f)

  def __len__(self):
    return len(self.dataset)

  def __getitem__(self, i):
    tokens = self.dataset[i]['tokens']
    ner_tags = self.dataset[i]['ner_tags']
    return tokens, ner_tags

In [58]:
from transformers import AutoTokenizer, BertModel

tokenizer = AutoTokenizer.from_pretrained('hfl/chinese-roberta-wwm-ext')

pretrained = BertModel.from_pretrained('hfl/chinese-roberta-wwm-ext').to(device)

Some weights of the model checkpoint at hfl/chinese-roberta-wwm-ext were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [59]:
def collate_fn(data):
    tokens = [i[0] for i in data]
    ner_tags = [i[1] for i in data]

    data = tokenizer.batch_encode_plus(batch_text_or_text_pairs=tokens,
                                   truncation=True,
                                   padding=True,
                                   return_tensors='pt',
                                   is_split_into_words=True)

    input_ids = data['input_ids']
    attention_mask = data['attention_mask']
    token_type_ids = data['token_type_ids']
    # ner_tags = torch.LongTensor(ner_tags)
    lens = input_ids.shape[1]
    for i in range(len(ner_tags)):
      ner_tags[i] = [7] + ner_tags[i]
      ner_tags[i] += [7] * lens
      ner_tags[i] = ner_tags[i][:lens]
    ner_tags = torch.LongTensor(ner_tags)

    return input_ids.to(device), attention_mask.to(device), token_type_ids.to(device), ner_tags.to(device)

loader = torch.utils.data.DataLoader(dataset=Dataset('train'),
                                     batch_size=16,
                                     collate_fn=collate_fn,
                                     shuffle=True,
                                     drop_last=True)

for i, (input_ids, attention_mask, token_type_ids,
        ner_tags) in enumerate(loader):
    # input_ids.to(device), attention_mask.to(device), token_type_ids.to(device)
    break

print(len(loader))
input_ids.shape, attention_mask.shape, token_type_ids.shape, ner_tags.shape

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


1303


(torch.Size([16, 73]),
 torch.Size([16, 73]),
 torch.Size([16, 73]),
 torch.Size([16, 73]))

In [60]:
tokenizer.decode(input_ids[0]), ner_tags[0]

('[CLS] 这 是 3 0 年 前 北 京 知 青 郭 路 生 写 下 的 诗 句 。 [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]',
 tensor([7, 0, 0, 0, 0, 0, 0, 5, 6, 0, 0, 1, 2, 2, 0, 0, 0, 0, 0, 0, 7, 7, 7, 7,
         7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
         7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
         7]))

In [76]:
for params in pretrained.parameters():
  params.requires_grad_(True)

pretrained(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids).last_hidden_state.shape

torch.Size([16, 73, 768])

In [77]:
class Model(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.pretrained = pretrained
        self.rnn = torch.nn.GRU(768, 768, batch_first=True)
        self.fc = torch.nn.Linear(768, 8)

    def forward(self, input_ids, attention_mask, token_type_ids):
        # with torch.no_grad():
        out = self.pretrained(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids).last_hidden_state

        out, _ = self.rnn(out)
        out = self.fc(out)
        out = out.softmax(dim=2)
        return out

model = Model().to(device)

In [78]:
model(input_ids, attention_mask, token_type_ids).shape

torch.Size([16, 73, 8])

In [79]:
from transformers import AdamW
lr = 3e-4
optimizer = AdamW(model.parameters(), lr=lr)
loss_fn = torch.nn.CrossEntropyLoss()



In [80]:
def remove_pads(pred, ner_tags, attention_mask):
    '''
    :param pred: [b, n, 8]
    :param ner_tags: [b, n]
    :param attention_mask: [b, n]
    :return: pred [b x n, 8], ner_tags[b x n]
    '''
    pred = pred.reshape(-1, 8)
    ner_tags = ner_tags.reshape(-1)
    select_no_pads = attention_mask.reshape(-1) == 1
    pred = pred[select_no_pads]
    ner_tags = ner_tags[select_no_pads]
    return pred, ner_tags

In [81]:
# acc1: calculate accuracy through whole sentence
# acc2: calculate accuracy include only named entity
def cal_acc(pred, ner_tags):
  acc1 = (pred==ner_tags).sum().item()/len(pred)
  pred = pred.reshape(-1)
  ner_tags = ner_tags.reshape(-1)
  select = ner_tags != 0
  pred = pred[select]
  ner_tags = ner_tags[select]
  acc2 = (pred==ner_tags).sum().item()/len(pred)
  return acc1, acc2

In [82]:
from tqdm import tqdm

epochs = 1

for epoch in range(epochs):
    pbar = tqdm(enumerate(loader), total=len(loader))
    for i, (input_ids, attention_mask, token_type_ids, ner_tags) in pbar:
        pred = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        pred, ner_tags = remove_pads(pred, ner_tags, attention_mask)
        loss = loss_fn(pred, ner_tags)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        pred = pred.argmax(dim=1)
        acc1, acc2 = cal_acc(pred, ner_tags)
        pbar.set_description(f"epoch {epoch + 1} iter {i}: train loss {loss.item():.5f}. lr {lr:e} acc(w) {acc1} acc(wo) {acc2}")

epoch 1 iter 1302: train loss 1.42306. lr 3.000000e-04 acc(w) 0.8509485094850948 acc(wo) 0.22535211267605634: 100%|██████████| 1303/1303 [7:25:40<00:00, 20.52s/it]


In [83]:
def eval_():
  loader = torch.utils.data.DataLoader(dataset=Dataset('test'),
                                     batch_size=16,
                                     collate_fn=collate_fn,
                                     shuffle=True,
                                     drop_last=True)
  for epoch in range(1):
    for i, (input_ids, attention_mask, token_type_ids, ner_tags) in enumerate(loader):
        pred = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        pred, ner_tags = remove_pads(pred, ner_tags, attention_mask)
        pred = pred.argmax(dim=1)
        print(tokenizer.decode(input_ids[0]))
        print('acc:', cal_acc(pred, ner_tags))
        print('ner_tags[0]:', ner_tags[:(len(ner_tags)//16)])
        print('pred[0]:', pred[:(len(pred)//16)])
        if i == 10:
          break

eval_()



[CLS] 正 因 为 买 壳 是 场 外 协 议 转 让 ， 整 个 买 壳 过 程 自 始 至 终 处 在 一 个 [UNK] 黑 箱 子 [UNK] 中 ， 公 众 股 东 无 法 得 知 内 情 ， 存 在 着 内 幕 交 易 的 种 种 可 能 。 [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]
acc: (0.9398034398034398, 0.3950617283950617)
ner_tags[0]: tensor([7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0])
pred[0]: tensor([7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0])
[CLS] 只 有 脚 踏 实 地 ， 在 体 制 、 机 制 和 技 术 上 不 断 创 新 ， 高 技 术 企 业 更 上 一 层 楼 才 有 希 望 。 [SEP] [PAD] [PAD] 