<a href="https://colab.research.google.com/github/JieShenAI/torch/blob/main/huggingface/example/prompt/code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

参考：

* https://www.kaggle.com/code/xuyouqian/prompt-pet


In [None]:
!pip install transformers

In [38]:
import tqdm

In [2]:
import torch
from transformers import AutoTokenizer, BertForMaskedLM, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import Dataset, DataLoader

In [3]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/624 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/110k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/269k [00:00<?, ?B/s]

In [4]:
data_home = "drive/MyDrive/torch/example/prompt/data/"

In [5]:
config = {
    "train": data_home + "sentiment.train.data",
    "val": data_home + "sentiment.valid.data",
    "test": data_home + "sentiment.test.data",
    "device": torch.device("cuda") if torch.cuda.is_available() else torch.device('cpu'),
    "batch": 2,
}
batch = config['batch']
device = config['device']

In [6]:
def read_file(filename):
  with open(filename,'r') as f:
    lines = f.readlines()
    labels = []
    texts = []
    for line in lines:
      line = line.strip()
      if len(line) == 0:
        continue
      labels.append(int(line[-1]))
      texts.append(line[:-1].strip())
  return texts, labels

In [7]:
tokenizer.convert_tokens_to_ids('好'), tokenizer.convert_tokens_to_ids('差')

(1962, 2345)

In [25]:
class SelfDataset(Dataset):

  def __init__(self, filename):
    self.texts, self.labels = read_file(filename)
    
  def __getitem__(self, idx):
    label = self.labels[idx]
    text = self.texts[idx][:400] # "总之，我给出[MASK]评。"，不能被截断丢弃。故采取提前截断(max_length=512)。
    if label == 1:
      true_token = tokenizer.convert_tokens_to_ids('好')
    else:
      true_token = tokenizer.convert_tokens_to_ids('差')

    mask_feature = text + "总之，我给出[MASK]评。"
    out = tokenizer(mask_feature,
                      max_length = 512,
                      padding='max_length',
                      return_tensors='pt',
                      # pin_memory=True,
                    )
    label = out.input_ids
    label = torch.where(label==tokenizer.mask_token_id, true_token, -100) # 只留下[MASK]的label，其余全部赋值为 -100
    out['labels'] = label
    return out, true_token

  def __len__(self):
    return len(self.texts)

In [26]:
train_dataset = SelfDataset(config['train'])
val_dataset = SelfDataset(config['val'])
test_dataset = SelfDataset(config['test'])

In [None]:
train_dataset[0]

In [27]:
train_dataloader = DataLoader(train_dataset, batch, True)
val_dataloader = DataLoader(val_dataset, batch, False)
test_dataloader = DataLoader(test_dataset, batch, False)

In [11]:
epoch = 5

In [12]:
model = BertForMaskedLM.from_pretrained("bert-base-chinese").to(device)
optimizer = AdamW(model.parameters(),
            lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
            eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
          )

Downloading pytorch_model.bin:   0%|          | 0.00/412M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [13]:
total_steps = epoch * len(train_dataloader)
scheduler = get_linear_schedule_with_warmup(optimizer, 
                        num_warmup_steps = 0, # Default value in run_glue.py
                        num_training_steps = total_steps)

In [50]:
len(train_dataloader)

8442

In [14]:
def train():
  for e in range(1):
    # train_acc = 0
    train_loss = 0

    step = 0
    for train_data in train_dataloader:
      for k, v in train_data.items():
        train_data[k] = v.squeeze(1).to(device)

      optimizer.zero_grad()
      out = model(**train_data)
      loss = out.loss
      loss.backward()
      optimizer.step()
      scheduler.step()
      train_loss += loss.item()
      # logits = out.logits
      # print("logits.shape", logits.shape)
      # y_pred = torch.argmax(logits, dim=-1).cpu().numpy()
      step += 1

      if step % 200 == 0:
        train_loss /= 200
        print(f"step: {step} train_loss: {train_loss}")
        train_loss = 0
    break
train()

step: 200 train_loss: 0.6197824552629027
step: 400 train_loss: 0.3460462169506354
step: 600 train_loss: 0.3331507772622717
step: 800 train_loss: 0.31038539956916794
step: 1000 train_loss: 0.35886777213541793
step: 1200 train_loss: 0.2748750020949228
step: 1400 train_loss: 0.2516974240774289
step: 1600 train_loss: 0.3158522216184065
step: 1800 train_loss: 0.20871683244797168
step: 2000 train_loss: 0.2121273777998431
step: 2200 train_loss: 0.24842879755015018
step: 2400 train_loss: 0.22413170982035807
step: 2600 train_loss: 0.23992963119482738
step: 2800 train_loss: 0.17553325985230914
step: 3000 train_loss: 0.18577118164947024
step: 3200 train_loss: 0.1633494084555423
step: 3400 train_loss: 0.25744080392868
step: 3600 train_loss: 0.23135659381281584
step: 3800 train_loss: 0.24935374524793588
step: 4000 train_loss: 0.21996071830275468
step: 4200 train_loss: 0.19394611883428298
step: 4400 train_loss: 0.23327608389587112
step: 4600 train_loss: 0.23041521312872645
step: 4800 train_loss: 0.2

KeyboardInterrupt: ignored

In [15]:
model.save_pretrained(data_home + "/model/trained_masklm")

In [None]:
train_dataset[0]

In [None]:
for idx in range(10):
  tmp = train_dataset[idx]
  for k, v in tmp.items():
    tmp[k] = v.squeeze(1).to(device)
  pred = model(**tmp).logits.argmax(-1)

In [40]:
@torch.no_grad()
def evaluate():
  model.eval()
  total_num = 0
  rights = 0
  for data, label_token in val_dataloader:
    print(total_num, end=' ')
    label_token = label_token.to(device)
    for k, v in data.items():
      data[k] = v.squeeze(1).to(device)
    pred = model(**data).logits.argmax(-1)[:,0]
    rights += torch.sum(pred == label_token)
    # total_num += data.size(0)
    total_num += 2
  print(rights / total_num)
evaluate()

0 2 4 6 8 10 12 14 16 18 20 22 24 26 28 30 32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62 64 66 68 70 72 74 76 78 80 82 84 86 88 90 92 94 96 98 100 102 104 106 108 110 112 114 116 118 120 122 124 126 128 130 132 134 136 138 140 142 144 146 148 150 152 154 156 158 160 162 164 166 168 170 172 174 176 178 180 182 184 186 188 190 192 194 196 198 200 202 204 206 208 210 212 214 216 218 220 222 224 226 228 230 232 234 236 238 240 242 244 246 248 250 252 254 256 258 260 262 264 266 268 270 272 274 276 278 280 282 284 286 288 290 292 294 296 298 300 302 304 306 308 310 312 314 316 318 320 322 324 326 328 330 332 334 336 338 340 342 344 346 348 350 352 354 356 358 360 362 364 366 368 370 372 374 376 378 380 382 384 386 388 390 392 394 396 398 400 402 404 406 408 410 412 414 416 418 420 422 424 426 428 430 432 434 436 438 440 442 444 446 448 450 452 454 456 458 460 462 464 466 468 470 472 474 476 478 480 482 484 486 488 490 492 494 496 498 500 502 504 506 508 510 512 514 516 518 520 522 524 526

验证集的准确率

tensor(0.9257, device='cuda:0')