In [13]:
!pip install transformers
!pip install datasets
!pip install accelerate



In [14]:
import logging
import math
import os
import random

import datasets
import torch
from datasets import load_dataset
from torch.utils.data.dataloader import DataLoader
from tqdm.auto import tqdm

import transformers
from accelerate import Accelerator
from transformers import (
    CONFIG_MAPPING,
    MODEL_MAPPING,
    AdamW,
    AutoConfig,
    AutoModelForMaskedLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    SchedulerType,
    get_scheduler,
    set_seed,
)


In [15]:
# Load the 'wikitext-2-raw-v1' configuration of the 'wikitext' dataset.
# NOTE(review): this rebinds the name `datasets`, which the import cell bound to the
# `datasets` module; from here on `datasets` is the loaded dataset object, not the module.
# Later cells read this variable, so a rename has to be applied to all of them together.
datasets = load_dataset('wikitext', 'wikitext-2-raw-v1')

Reusing dataset wikitext (/root/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/aa5e094000ec7afeb74c3be92c88313cd6f132d564c7effd961c10fd47c76f20)


In [16]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# NOTE(review): no other cell visible in this notebook reads or writes under
# /content/drive — confirm the mount is still needed.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [17]:
# Display the loaded dataset object (its splits, features and row counts).
datasets

DatasetDict({
    test: Dataset({
        features: ['text'],
        num_rows: 4358
    })
    train: Dataset({
        features: ['text'],
        num_rows: 36718
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 3760
    })
})

In [18]:
# Checkpoint name; reused below to load both the tokenizer and the masked-LM model.
model_checkpoint = "distilroberta-base"
# use_fast=True asks AutoTokenizer for the "fast" tokenizer implementation.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)
def tokenize_function(examples):
    """Tokenize the ``"text"`` column of a batch.

    Args:
        examples: Mapping that holds the batch's raw strings under ``"text"``.

    Returns:
        Whatever the module-level ``tokenizer`` returns for those strings.
    """
    texts = examples["text"]
    encoded = tokenizer(texts)
    return encoded

In [19]:
# Tokenize every split in batches using 4 worker processes; the raw "text"
# column is removed from the result.
tokenized_datasets = datasets.map(
    tokenize_function,
    batched=True,
    num_proc=4,
    remove_columns=["text"],
)

  

HBox(children=(FloatProgress(value=0.0, description='#0', max=2.0, style=ProgressStyle(description_width='init…

  

HBox(children=(FloatProgress(value=0.0, description='#1', max=2.0, style=ProgressStyle(description_width='init…

HBox(children=(FloatProgress(value=0.0, description='#2', max=2.0, style=ProgressStyle(description_width='init…

HBox(children=(FloatProgress(value=0.0, description='#3', max=2.0, style=ProgressStyle(description_width='init…

Token indices sequence length is longer than the specified maximum sequence length for this model (544 > 512). Running this sequence through the model will result in indexing errors






  

HBox(children=(FloatProgress(value=0.0, description='#0', max=10.0, style=ProgressStyle(description_width='ini…

 

HBox(children=(FloatProgress(value=0.0, description='#1', max=10.0, style=ProgressStyle(description_width='ini…

 

HBox(children=(FloatProgress(value=0.0, description='#2', max=10.0, style=ProgressStyle(description_width='ini…

HBox(children=(FloatProgress(value=0.0, description='#3', max=10.0, style=ProgressStyle(description_width='ini…

Token indices sequence length is longer than the specified maximum sequence length for this model (560 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (528 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (638 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (522 > 512). Running this sequence through the model will result in indexing errors






    

HBox(children=(FloatProgress(value=0.0, description='#0', max=1.0, style=ProgressStyle(description_width='init…

HBox(children=(FloatProgress(value=0.0, description='#3', max=1.0, style=ProgressStyle(description_width='init…

HBox(children=(FloatProgress(value=0.0, description='#2', max=1.0, style=ProgressStyle(description_width='init…

HBox(children=(FloatProgress(value=0.0, description='#1', max=1.0, style=ProgressStyle(description_width='init…







In [21]:
# Peek at the first two rows of the tokenized training split.
tokenized_datasets["train"][:2]

{'attention_mask': [[1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
 'input_ids': [[0, 2],
  [0, 5457, 468, 44068, 6374, 41674, 6395, 5457, 1437, 50118, 2]]}

In [22]:
def group_texts(examples):
    """Concatenate a batch of tokenized rows and re-split it into fixed-size blocks.

    The block length is read from the module-level ``block_size``.

    Args:
        examples: Mapping from column name (``"input_ids"`` must be present) to a
            list of per-row lists for the current batch.

    Returns:
        dict: The same keys, each mapped to a list of ``block_size``-long chunks of
        the concatenated column, plus ``"labels"``, a shallow copy of the
        ``"input_ids"`` chunk list. Trailing items that do not fill a whole block
        are dropped.
    """
    # Imported locally so the function stays self-contained in its own cell.
    from itertools import chain

    # Concatenate all texts. chain.from_iterable flattens in linear time, whereas
    # sum(lists, []) re-copies the growing list on every addition (quadratic).
    concatenated_examples = {k: list(chain.from_iterable(examples[k])) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
    # customize this part to your needs.
    total_length = (total_length // block_size) * block_size
    # Split by chunks of block_size.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    # The labels start out as the inputs themselves (list-level copy only).
    result["labels"] = result["input_ids"].copy()
    return result

In [42]:
# Chunk length used by group_texts when it re-splits the concatenated rows.
block_size = 128
# Number of worker processes handed to the map call below.
preprocessing_num_workers = 4


# Apply group_texts batch-wise to every split.
# NOTE(review): the result is bound back to `tokenized_datasets`, so this cell is not
# idempotent — running it again feeds the already-grouped data through group_texts.
tokenized_datasets = tokenized_datasets.map(
    group_texts,
    batched=True,
    num_proc=preprocessing_num_workers,
)

 

HBox(children=(FloatProgress(value=0.0, description='#0', max=2.0, style=ProgressStyle(description_width='init…

   

HBox(children=(FloatProgress(value=0.0, description='#1', max=2.0, style=ProgressStyle(description_width='init…

HBox(children=(FloatProgress(value=0.0, description='#2', max=2.0, style=ProgressStyle(description_width='init…

HBox(children=(FloatProgress(value=0.0, description='#3', max=2.0, style=ProgressStyle(description_width='init…





 

HBox(children=(FloatProgress(value=0.0, description='#0', max=10.0, style=ProgressStyle(description_width='ini…

 

HBox(children=(FloatProgress(value=0.0, description='#1', max=10.0, style=ProgressStyle(description_width='ini…

 

HBox(children=(FloatProgress(value=0.0, description='#2', max=10.0, style=ProgressStyle(description_width='ini…

 

HBox(children=(FloatProgress(value=0.0, description='#3', max=10.0, style=ProgressStyle(description_width='ini…





  

HBox(children=(FloatProgress(value=0.0, description='#0', max=1.0, style=ProgressStyle(description_width='init…

  

HBox(children=(FloatProgress(value=0.0, description='#1', max=1.0, style=ProgressStyle(description_width='init…

HBox(children=(FloatProgress(value=0.0, description='#2', max=1.0, style=ProgressStyle(description_width='init…

HBox(children=(FloatProgress(value=0.0, description='#3', max=1.0, style=ProgressStyle(description_width='init…







In [45]:
# Named handles for the two splits used downstream; `eval_dataset` is also used
# in the evaluation loop to trim the gathered losses to the dataset length.
train_dataset, eval_dataset = tokenized_datasets["train"], tokenized_datasets["validation"]

In [44]:
# Training hyper-parameters (each one defined exactly once).
num_train_epochs = 3
gradient_accumulation_steps = 1
per_device_train_batch_size = 8
per_device_eval_batch_size = 8
learning_rate = 5e-5


# Collator for masked language modelling with a masking probability of 0.15.
# DataCollatorForLanguageModeling is already imported in the notebook's import cell.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

# Build the loaders from `train_dataset` / `eval_dataset`, the split handles bound in
# the cell above. The name `lm_datasets` is not defined anywhere in this notebook and
# raised NameError on a fresh kernel.
train_dataloader = DataLoader(
    train_dataset,
    shuffle=True,
    collate_fn=data_collator,
    batch_size=per_device_train_batch_size,
)
eval_dataloader = DataLoader(
    eval_dataset,
    collate_fn=data_collator,
    batch_size=per_device_eval_batch_size,
)

In [46]:
# Accelerator handles device placement / distributed wrapping for everything passed
# to `prepare` below.
accelerator = Accelerator()
# AutoModelForMaskedLM is already imported in the notebook's import cell.
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

# Weight decay for the parameters that are allowed to decay. Kept at 0.0 (the value
# previously hard-coded in both groups); change it here to actually enable decay.
weight_decay = 0.0
# Parameters whose name contains one of these substrings never receive weight decay.
no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
    {
        "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
        "weight_decay": weight_decay,
    },
    {
        "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
        "weight_decay": 0.0,
    },
]
# Number of samples that contribute to one optimizer update across all processes.
total_batch_size = per_device_train_batch_size * accelerator.num_processes * gradient_accumulation_steps

optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
# NOTE(review): the dataloader names are rebound to their prepared versions, so
# re-running this cell passes already-prepared loaders to `prepare` again.
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)

# Scheduler and math around the number of training steps.
# Computed after `prepare` so len(train_dataloader) is that of the prepared loader.
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / gradient_accumulation_steps)
max_train_steps = num_train_epochs * num_update_steps_per_epoch

lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=max_train_steps,
)

# Train!
# Only show the progress bar once on each machine.
progress_bar = tqdm(range(max_train_steps), disable=not accelerator.is_local_main_process)
completed_steps = 0
for epoch in range(num_train_epochs):
    model.train()
    for step, batch in enumerate(train_dataloader):
        outputs = model(**batch)
        # Scale the loss so gradients accumulated over several batches average out.
        loss = outputs.loss / gradient_accumulation_steps
        accelerator.backward(loss)
        # Update once every `gradient_accumulation_steps` batches (counting from 1)
        # and on the final batch of the epoch. Using `step + 1` keeps the number of
        # updates per epoch equal to ceil(len(train_dataloader) / steps), which is
        # what `max_train_steps` was derived from; `step % steps == 0` would also
        # fire on step 0 after a single batch.
        is_last_batch = step == len(train_dataloader) - 1
        if (step + 1) % gradient_accumulation_steps == 0 or is_last_batch:
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)
            completed_steps += 1

        if completed_steps >= max_train_steps:
            break

    # Evaluate on the validation loader after every epoch.
    model.eval()
    losses = []
    for step, batch in enumerate(eval_dataloader):
        with torch.no_grad():
            outputs = model(**batch)

        loss = outputs.loss
        # Repeat the batch loss once per sample slot, then gather it from all processes.
        losses.append(accelerator.gather(loss.repeat(per_device_eval_batch_size)))

    losses = torch.cat(losses)
    # Trim to the dataset length so surplus repeated entries are not counted.
    losses = losses[: len(eval_dataset)]
    try:
        perplexity = math.exp(torch.mean(losses))
    except OverflowError:
        # math.exp raises for very large mean losses; report that as infinite perplexity.
        perplexity = float("inf")

    # Report the metric; it was previously computed and silently discarded.
    if accelerator.is_local_main_process:
        print(f"epoch {epoch}: perplexity: {perplexity}")

# Make sure every process has finished before writing the checkpoint.
accelerator.wait_for_everyone()
# Strip the wrapper added by accelerator.prepare before saving.
unwrapped_model = accelerator.unwrap_model(model)

# NOTE(review): '/model_dir' is an absolute path at the filesystem root, not under the
# /content/drive mount created earlier — confirm this is the intended destination.
output_dir = '/model_dir'
unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
# Store the tokenizer next to the weights so the directory can be reloaded on its own.
if accelerator.is_main_process:
    tokenizer.save_pretrained(output_dir)

HBox(children=(FloatProgress(value=0.0, max=7218.0), HTML(value='')))