In [1]:
# !pip install transformers
# !pip install datasets
# !pip install accelerate

In [1]:
import logging
import math
import os
import random

import datasets
import torch
from datasets import load_dataset
from torch.utils.data.dataloader import DataLoader
from tqdm.auto import tqdm

import transformers
from accelerate import Accelerator
from transformers import (
    CONFIG_MAPPING,
    MODEL_MAPPING,
    AdamW,
    AutoConfig,
    AutoModelForMaskedLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    SchedulerType,
    get_scheduler,
    set_seed,
)


In [2]:
# Load the raw text corpus as a DatasetDict with "train" and "validation" splits.
# NOTE(review): both splits read the same file, and the name `datasets` rebinds the
# imported `datasets` module for the rest of the notebook -- confirm both are intended.
datasets = load_dataset(
    "text",
    data_files=dict(
        train="./data/THUCNewsChinese.txt",
        validation="./data/THUCNewsChinese.txt",
    ),
)


Using custom data configuration default-3ba77d458fdd72a6


Downloading and preparing dataset text/default to C:\Users\Nan\.cache\huggingface\datasets\text\default-3ba77d458fdd72a6\0.0.0\e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5...


100%|██████████| 2/2 [00:00<00:00, 2008.77it/s]
100%|██████████| 2/2 [00:00<00:00, 499.80it/s]


Dataset text downloaded and prepared to C:\Users\Nan\.cache\huggingface\datasets\text\default-3ba77d458fdd72a6\0.0.0\e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5. Subsequent calls will reuse this data.


100%|██████████| 2/2 [00:00<00:00, 331.09it/s]


In [3]:
# Display the loaded DatasetDict to check its splits, features and row counts.
datasets

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 200000
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 200000
    })
})

In [9]:
# Checkpoint name used to load the tokenizer here and the model weights later on.
# NOTE(review): the corpus file is named THUCNewsChinese while this checkpoint name
# suggests an English model -- verify that the tokenizer/model match the data.
model_checkpoint = "distilroberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)


def tokenize_function(examples):
    """Tokenize the ``"text"`` entry of a batch with the notebook-level ``tokenizer``.

    Args:
        examples: Mapping with a ``"text"`` entry, as supplied by the ``map`` call
            that uses this function.

    Returns:
        Whatever ``tokenizer`` returns for those texts.
    """
    texts = examples["text"]
    return tokenizer(texts)

ValueError: check_hostname requires server_hostname

In [25]:
# Tokenize every split in batches using 4 worker processes; the raw "text"
# column is removed from the result.
tokenized_datasets = datasets.map(
    tokenize_function,
    batched=True,
    num_proc=4,
    remove_columns=["text"],
)

#0:   0%|          | 0/50 [00:00<?, ?ba/s]
#1:   0%|          | 0/50 [00:00<?, ?ba/s][A

#2:   0%|          | 0/50 [00:00<?, ?ba/s][A[A


#0:   2%|▏         | 1/50 [00:00<00:05,  8.64ba/s]
#1:   2%|▏         | 1/50 [00:00<00:05,  8.66ba/s][A

#2:   2%|▏         | 1/50 [00:00<00:05,  9.30ba/s][A[A


#3:   4%|▍         | 2/50 [00:00<00:04, 11.52ba/s][A[A[A
#1:   4%|▍         | 2/50 [00:00<00:05,  8.76ba/s][A

#0:   6%|▌         | 3/50 [00:00<00:05,  9.21ba/s]
#1:   6%|▌         | 3/50 [00:00<00:05,  8.89ba/s][A

#2:   6%|▌         | 3/50 [00:00<00:05,  9.12ba/s][A[A


#3:   8%|▊         | 4/50 [00:00<00:03, 11.65ba/s][A[A[A
#1:   8%|▊         | 4/50 [00:00<00:05,  9.09ba/s][A

#0:  10%|█         | 5/50 [00:00<00:04,  9.41ba/s]
#0:  12%|█▏        | 6/50 [00:00<00:07,  5.72ba/s]


#3:  12%|█▏        | 6/50 [00:00<00:05,  7.86ba/s][A[A[A
#1:  12%|█▏        | 6/50 [00:00<00:07,  5.73ba/s][A

#0:  14%|█▍        | 7/50 [00:00<00:06,  6.55ba/s]


#3:  16%|█▌        | 8/50 [

In [19]:
# Inspect the tokenized DatasetDict.
# NOTE(review): the saved output of this cell lists a `test` split, while the loading
# cell defines `train`/`validation` -- the output looks stale; re-run on a fresh kernel
# to confirm.
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['attention_mask', 'input_ids'],
        num_rows: 200000
    })
    test: Dataset({
        features: ['attention_mask', 'input_ids'],
        num_rows: 200000
    })
})

In [20]:
def group_texts(examples):
    """Concatenate a batch of tokenized sequences and re-split it into fixed-size chunks.

    Every column of ``examples`` is flattened into one long list and cut into
    consecutive pieces of ``block_size`` items. ``block_size`` is read from the
    notebook's global namespace and must be defined before this function is
    called. Items left over after the last full chunk are dropped.

    Args:
        examples: Mapping of column name -> list of sequences. Must contain an
            ``"input_ids"`` column.

    Returns:
        dict with the same columns, each a list of ``block_size``-long chunks,
        plus a ``"labels"`` column that is a shallow copy of the chunked
        ``"input_ids"`` list.
    """
    from itertools import chain

    # chain.from_iterable flattens in linear time; sum(lists, []) re-copies the
    # accumulated list on every addition and is quadratic in the batch size.
    concatenated_examples = {k: list(chain.from_iterable(examples[k])) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder; padding could be added instead if the model
    # supported it. Customize this part to your needs.
    total_length = (total_length // block_size) * block_size
    # Split into chunks of block_size.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

In [27]:
# Number of worker processes used by the map call below.
preprocessing_num_workers = 4
# Length of each chunk produced by group_texts (it reads this global).
block_size = 128

# Regroup every split into block_size-long chunks.
# NOTE: this rebinds `tokenized_datasets`, so re-running the cell applies
# group_texts to data that has already been grouped.
tokenized_datasets = tokenized_datasets.map(group_texts, batched=True, num_proc=preprocessing_num_workers)

#0:   0%|          | 0/50 [00:00<?, ?ba/s]
#1:   0%|          | 0/50 [00:00<?, ?ba/s][A

#2:   0%|          | 0/50 [00:00<?, ?ba/s][A[A


#0:   2%|▏         | 1/50 [00:00<00:10,  4.89ba/s]


#3:   2%|▏         | 1/50 [00:00<00:09,  5.00ba/s][A[A[A
#1:   2%|▏         | 1/50 [00:00<00:14,  3.40ba/s][A

#0:   4%|▍         | 2/50 [00:00<00:09,  4.92ba/s]


#3:   4%|▍         | 2/50 [00:00<00:09,  4.99ba/s][A[A[A
#1:   4%|▍         | 2/50 [00:00<00:12,  3.81ba/s][A

#0:   6%|▌         | 3/50 [00:00<00:09,  4.84ba/s]


#3:   6%|▌         | 3/50 [00:00<00:09,  4.75ba/s][A[A[A
#1:   6%|▌         | 3/50 [00:00<00:11,  4.03ba/s][A

#0:   8%|▊         | 4/50 [00:00<00:09,  4.89ba/s]


#3:   8%|▊         | 4/50 [00:00<00:09,  4.71ba/s][A[A[A

#2:   8%|▊         | 4/50 [00:00<00:10,  4.34ba/s][A[A
#0:  10%|█         | 5/50 [00:01<00:09,  4.93ba/s]


#3:  10%|█         | 5/50 [00:01<00:09,  4.87ba/s][A[A[A

#2:  10%|█         | 5/50 [00:01<00:10,  4.45ba/s][A[A
#0:  12%|█▏  

In [30]:
# Select the splits that feed the training and evaluation DataLoaders.
train_dataset = tokenized_datasets["train"]
# NOTE(review): both splits were loaded from the same text file, so evaluating on
# this split is not a held-out measurement -- confirm that is intended.
eval_dataset = tokenized_datasets["validation"]

In [32]:
# Training hyper-parameters (each defined exactly once; DataCollatorForLanguageModeling
# and DataLoader come from the notebook's import cell).
num_train_epochs = 3
gradient_accumulation_steps = 1
per_device_train_batch_size = 8
per_device_eval_batch_size = 8
learning_rate = 5e-5

# Collator used by both loaders, configured with mlm_probability=0.15.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)
# Only the training loader shuffles; the evaluation loader keeps dataset order.
train_dataloader = DataLoader(
    train_dataset,
    shuffle=True,
    collate_fn=data_collator,
    batch_size=per_device_train_batch_size,
)
eval_dataloader = DataLoader(
    eval_dataset,
    collate_fn=data_collator,
    batch_size=per_device_eval_batch_size,
)

In [46]:
# Fine-tune the masked-language model with Accelerate, evaluate after every epoch,
# then save the final weights. AutoModelForMaskedLM comes from the import cell.
accelerator = Accelerator()
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

# Parameters are split into two groups so that biases and LayerNorm weights can be
# given a different weight decay. Both groups currently use 0.0, so weight decay is
# effectively disabled; change the first group's value to enable it.
no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
    {
        "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
        "weight_decay": 0.0,
    },
    {
        "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
        "weight_decay": 0.0,
    },
]

optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
# NOTE: this rebinds the dataloaders, so re-running the cell prepares them again.
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)

# Scheduler and math around the number of training steps. The length is taken after
# prepare() because the prepared dataloader is the one that is iterated below.
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / gradient_accumulation_steps)
max_train_steps = num_train_epochs * num_update_steps_per_epoch

lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=max_train_steps,
)

# Effective number of samples per optimizer update across all processes.
total_batch_size = per_device_train_batch_size * accelerator.num_processes * gradient_accumulation_steps

# Train!
# Only show the progress bar once on each machine.
progress_bar = tqdm(range(max_train_steps), disable=not accelerator.is_local_main_process)
completed_steps = 0
for epoch in range(num_train_epochs):
    model.train()
    for step, batch in enumerate(train_dataloader):
        outputs = model(**batch)
        # Scale the loss so that accumulated gradients average over the micro-batches.
        loss = outputs.loss / gradient_accumulation_steps
        accelerator.backward(loss)
        # Update once a full group of micro-batches has been accumulated, or on the
        # final batch of the epoch. Using (step + 1) avoids an update after the very
        # first micro-batch and keeps the number of updates per epoch equal to
        # num_update_steps_per_epoch, which the scheduler was sized for.
        is_last_batch = step == len(train_dataloader) - 1
        if (step + 1) % gradient_accumulation_steps == 0 or is_last_batch:
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)
            completed_steps += 1

        if completed_steps >= max_train_steps:
            break

    # Evaluate after each epoch.
    model.eval()
    losses = []
    for step, batch in enumerate(eval_dataloader):
        with torch.no_grad():
            outputs = model(**batch)

        loss = outputs.loss
        # Repeat the batch loss once per sample so the gathered tensor can be
        # truncated to the dataset size below.
        losses.append(accelerator.gather(loss.repeat(per_device_eval_batch_size)))

    losses = torch.cat(losses)
    losses = losses[: len(eval_dataset)]
    try:
        perplexity = math.exp(torch.mean(losses))
    except OverflowError:
        # math.exp raises for very large mean losses; report infinity instead of aborting.
        perplexity = float("inf")
    # Report the metric; previously it was computed and silently discarded.
    accelerator.print(f"epoch {epoch}: perplexity: {perplexity}")

# Make sure every process has finished before saving the unwrapped model.
accelerator.wait_for_everyone()
unwrapped_model = accelerator.unwrap_model(model)

output_dir='../model_dir/bert_MLM/'
unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)

HBox(children=(FloatProgress(value=0.0, max=7218.0), HTML(value='')))