In [1]:
!pip install transformers
!pip install datasets
!pip install accelerate

In [2]:
import logging
import math
import os
import random

import datasets
import torch
from datasets import load_dataset
from torch.utils.data.dataloader import DataLoader
from tqdm.auto import tqdm

import transformers
from accelerate import Accelerator
from transformers import (
    CONFIG_MAPPING,
    MODEL_MAPPING,
    AdamW,
    AutoConfig,
    AutoModelForMaskedLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    SchedulerType,
    get_scheduler,
    set_seed,
)


In [3]:
# Load the local text file with the 'text' loader and expose it as two splits.
# NOTE(review): 'train' and 'validation' point at the same file, so any
# validation metric is computed on data the model was also trained on -
# confirm this is intended.
# NOTE(review): this assignment rebinds the name `datasets`, shadowing the
# `datasets` module imported at the top of the notebook.
datasets = load_dataset('text', data_files={'train': './data/THUCNewsChinese.txt',
                                      'validation': './data/THUCNewsChinese.txt'})

# Show one raw example as the cell output.
datasets["train"][10]

Using custom data configuration default-2c4186e5d3e59f96
Reusing dataset text (C:\Users\Nan\.cache\huggingface\datasets\text\default-2c4186e5d3e59f96\0.0.0\e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5)
100%|██████████| 2/2 [00:00<00:00, 23.26it/s]


{'text': '上海2010上半年四六级考试报名4月8日前完成\t教育'}

In [4]:
from datasets import ClassLabel
import random
import pandas as pd
from IPython.display import display, HTML

def show_random_elements(dataset, num_examples=10):
    """Display `num_examples` distinct, randomly chosen rows of `dataset` as an HTML table.

    Args:
        dataset: object supporting `len()`, indexing with a list of row
            indices, and a `features` mapping of column name to feature type.
        num_examples: how many distinct rows to show; must not exceed
            `len(dataset)`.

    Columns whose feature type is a `ClassLabel` are shown with their label
    names instead of integer ids.
    """
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."

    # Rejection sampling: keep drawing indices until enough distinct ones are collected.
    chosen = []
    while len(chosen) < num_examples:
        candidate = random.randint(0, len(dataset) - 1)
        if candidate not in chosen:
            chosen.append(candidate)

    frame = pd.DataFrame(dataset[chosen])
    for name, feature in dataset.features.items():
        if not isinstance(feature, ClassLabel):
            continue
        # Replace integer class ids by their human-readable names.
        frame[name] = frame[name].transform(lambda idx: feature.names[idx])
    display(HTML(frame.to_html()))

In [5]:
# Preview randomly chosen training rows (the helper's default is 10 rows).
show_random_elements(datasets["train"])

Unnamed: 0,text
0,印度加息引发忧虑 日胶沪胶携手暴跌\t财经
1,纽约金价小幅回调 退守1010关口\t财经
2,分析称新闻集团可能收购Twitter\t科技
3,宏天威科技：电子商务就像空气和水\t科技
4,高考应试实用宝典：三大阶段不同学习方法\t教育
5,活塞主帅详解最后一投关键 老将：防守姚明太困难\t体育
6,选择红玫瑰还是白玫瑰--考研三大焦点探讨\t教育
7,亚奥绿城北京诚园85平精装两居3月28日开盘(图)\t房产
8,瑞信上调KB Home评级至跑赢大盘\t股票
9,谢安琪樊少皇自豪担任世界最大国旗持旗手(图)\t娱乐


In [6]:
# Checkpoint name shared by the tokenizer created here and the model loaded
# later from the same `model_checkpoint`.
# NOTE(review): the corpus previewed above is Chinese, while this checkpoint
# appears to be an English one - confirm it is the intended choice.
model_checkpoint = "distilroberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

In [7]:
def tokenize_function(examples):
    """Tokenize the "text" field of `examples` with the notebook-level `tokenizer`.

    Args:
        examples: mapping with a "text" entry (a single string or a list of
            strings, as produced by indexing or slicing the dataset).

    Returns:
        Whatever `tokenizer(...)` returns for that text.
    """
    texts = examples["text"]
    return tokenizer(texts)


# Smoke test: tokenize the first two training rows and display the result.
tokenize_function(datasets["train"][:2])

{'input_ids': [[0, 47643, 47658, 12736, 49583, 49156, 48823, 18164, 41907, 27, 7258, 43251, 4394, 15113, 44636, 11582, 36714, 6248, 3602, 42393, 15389, 9264, 37127, 11582, 5543, 46890, 5782, 134, 46015, 9085, 46015, 15113, 37127, 13859, 3726, 47873, 18400, 48998, 50117, 48607, 27, 36484, 9264, 14292, 2], [0, 46015, 10470, 49429, 46890, 18400, 36714, 10809, 3602, 48956, 27, 48620, 14285, 47504, 12736, 47994, 8384, 47994, 8384, 48145, 18400, 48662, 4726, 43251, 4394, 15113, 42393, 10172, 15113, 46015, 10278, 36714, 10809, 3602, 48956, 27, 36714, 15375, 19002, 48956, 4333, 48334, 10172, 47983, 15113, 47842, 3602, 41907, 10659, 15389, 50117, 36714, 6248, 3602, 37127, 27969, 7471, 2]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [8]:
# Tokenize every split and drop the raw "text" column so only the tokenizer
# outputs remain.
# batched=True hands tokenize_function a list of texts per call instead of a
# single row; the resulting columns are the same, with far fewer tokenizer calls.
tokenized_datasets = datasets.map(tokenize_function, batched=True, remove_columns=["text"])

Loading cached processed dataset at C:\Users\Nan\.cache\huggingface\datasets\text\default-2c4186e5d3e59f96\0.0.0\e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5\cache-432417ac0d854f96.arrow
Loading cached processed dataset at C:\Users\Nan\.cache\huggingface\datasets\text\default-2c4186e5d3e59f96\0.0.0\e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5\cache-47490aba4b15440d.arrow


In [9]:
# Number of tokens in every training block.
block_size = 128


def group_texts(examples, block_size=block_size):
    """Concatenate a batch of tokenized examples and re-split it into fixed-size blocks.

    Args:
        examples: mapping from column name to a list of per-example lists
            (e.g. "input_ids", "attention_mask").
        block_size: length of every output block. Defaults to the value of
            the notebook-level `block_size` at the time this function is defined.

    Returns:
        A dict with the same keys as `examples`, where each value is a list of
        `block_size`-long chunks of the concatenated column, plus a "labels"
        entry that is a shallow copy of the "input_ids" chunk list. Tokens that
        do not fill a complete final block are dropped.
    """
    from itertools import chain

    # Concatenate all texts. chain runs in linear time, whereas sum(lists, [])
    # re-copies the accumulated list on every addition.
    concatenated_examples = {k: list(chain.from_iterable(examples[k])) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder; padding could be added instead if the model
    # supported it. Customize this part to your needs.
    total_length = (total_length // block_size) * block_size
    # Split by chunks of block_size.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

In [10]:
# Number of worker processes used for the grouping step below.
preprocessing_num_workers = 4

# Regroup the tokenized examples into fixed-size blocks with group_texts.
# batched=True passes group_texts a batch of examples per call, which is what
# lets it concatenate neighbouring examples before splitting.
# NOTE: this rebinds `tokenized_datasets` to the grouped result.
tokenized_datasets = tokenized_datasets.map(
    group_texts,
    batched=True,
    num_proc=preprocessing_num_workers,
)

Loading cached processed dataset at C:\Users\Nan\.cache\huggingface\datasets\text\default-2c4186e5d3e59f96\0.0.0\e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5\cache-66202a1c38f70b70.arrow
Loading cached processed dataset at C:\Users\Nan\.cache\huggingface\datasets\text\default-2c4186e5d3e59f96\0.0.0\e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5\cache-f84c255c4f2fcec3.arrow
Loading cached processed dataset at C:\Users\Nan\.cache\huggingface\datasets\text\default-2c4186e5d3e59f96\0.0.0\e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5\cache-f416a373cacf0953.arrow
Loading cached processed dataset at C:\Users\Nan\.cache\huggingface\datasets\text\default-2c4186e5d3e59f96\0.0.0\e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5\cache-311ed957ef9dc0fc.arrow
Loading cached processed dataset at C:\Users\Nan\.cache\huggingface\datasets\text\default-2c4186e5d3e59f96\0.0.0\e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf2

In [11]:
# Pick out the two splits that the DataLoaders are built from.
train_dataset = tokenized_datasets["train"]
eval_dataset = tokenized_datasets["validation"]
# Display the training split summary as the cell output.
train_dataset

Dataset({
    features: ['attention_mask', 'input_ids', 'labels'],
    num_rows: 73594
})

In [12]:
# --- Training hyperparameters -------------------------------------------------
seed = 42
num_train_epochs = 3
gradient_accumulation_steps = 1
per_device_train_batch_size = 8
per_device_eval_batch_size = 8
learning_rate = 5e-5

# Seed the random number generators so that the shuffling of the training
# DataLoader and the random token masking done by the collator are reproducible
# across kernel restarts.
set_seed(seed)

# The collator masks tokens on the fly (15% masking probability) and builds the
# MLM labels for every batch it collates.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)
train_dataloader = DataLoader(train_dataset, shuffle=True, collate_fn=data_collator, batch_size=per_device_train_batch_size)
eval_dataloader = DataLoader(eval_dataset, collate_fn=data_collator, batch_size=per_device_eval_batch_size)

In [13]:
# --- Model and optimizer ------------------------------------------------------
accelerator = Accelerator()
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

# Weight decay applied to every parameter except biases and LayerNorm weights.
# 0.0 reproduces the previous behaviour, where both groups used no decay.
weight_decay = 0.0
no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
    {
        "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
        "weight_decay": weight_decay,
    },
    {
        "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
        "weight_decay": 0.0,
    },
]

optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)

# --- Scheduler and step bookkeeping -------------------------------------------
# Computed after accelerator.prepare so that len(train_dataloader) is the length
# of the prepared dataloader that the loop below actually iterates.
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / gradient_accumulation_steps)
max_train_steps = num_train_epochs * num_update_steps_per_epoch

lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=max_train_steps,
)

# Number of examples contributing to one optimizer update across all processes.
total_batch_size = per_device_train_batch_size * accelerator.num_processes * gradient_accumulation_steps

# --- Train! -------------------------------------------------------------------
# Only show the progress bar once on each machine.
progress_bar = tqdm(range(max_train_steps), disable=not accelerator.is_local_main_process)
completed_steps = 0
for epoch in range(num_train_epochs):
    model.train()
    for step, batch in enumerate(train_dataloader):
        outputs = model(**batch)
        # Scale the loss so the gradients accumulated over several micro-batches
        # add up to their average.
        loss = outputs.loss / gradient_accumulation_steps
        accelerator.backward(loss)

        # Update once every `gradient_accumulation_steps` micro-batches, and on
        # the last batch of the epoch so no gradient is left over. `step + 1`
        # is the number of micro-batches seen so far; testing `step` itself
        # would trigger an update right after the first micro-batch and could
        # yield more updates per epoch than num_update_steps_per_epoch.
        is_last_batch = step == len(train_dataloader) - 1
        if (step + 1) % gradient_accumulation_steps == 0 or is_last_batch:
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)
            completed_steps += 1

        if completed_steps >= max_train_steps:
            break

    # --- Evaluation at the end of every epoch ---------------------------------
    model.eval()
    losses = []
    for batch in eval_dataloader:
        with torch.no_grad():
            outputs = model(**batch)

        loss = outputs.loss
        # Repeat the batch loss once per eval-batch slot before gathering it
        # from all processes; the surplus entries are trimmed below.
        losses.append(accelerator.gather(loss.repeat(per_device_eval_batch_size)))

    losses = torch.cat(losses)
    losses = losses[: len(eval_dataset)]
    try:
        perplexity = math.exp(torch.mean(losses))
    except OverflowError:
        # math.exp raises for very large mean losses; report that as infinity.
        perplexity = float("inf")

    # Report the metric; previously it was computed and silently discarded.
    if accelerator.is_local_main_process:
        print(f"epoch {epoch}: perplexity: {perplexity}")

# --- Save the trained model ---------------------------------------------------
accelerator.wait_for_everyone()
unwrapped_model = accelerator.unwrap_model(model)

output_dir = '../model_dir/bert_MLM/'
unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)

Downloading: 100%|██████████| 331M/331M [02:19<00:00, 2.37MB/s]
 13%|█▎        | 3590/27600 [27:14<3:04:35,  2.17it/s]