In [1]:
import itertools
from pathlib import Path
from typing import *

import transformers
import datasets
import torch
import numpy as np
import rich
from beartype import beartype
from tqdm.notebook import tqdm

In [2]:
# example we found too late: https://github.com/huggingface/notebooks/blob/master/examples/language_modeling_from_scratch.ipynb

In [3]:
config = transformers.GPT2LMHeadModel.from_pretrained("distilgpt2").config
model = transformers.GPT2LMHeadModel(config)
tokenizer = transformers.GPT2Tokenizer.from_pretrained("distilgpt2")

In [4]:
tokenizer.pad_token_id = [-100]

In [5]:
import math
split = "80%"
block_size = tokenizer.model_max_length

@beartype
def group_fn(sample: List[int], block_size: int) -> list:
    return [sample[i : i + block_size] for i in range(0, math.ceil(len(sample) / block_size), block_size)]


ds_t = datasets.load_dataset("wikitext", "wikitext-2-v1", split="train[:80%]")
ds_e = datasets.load_dataset("wikitext", "wikitext-2-v1", split="train[80%:]")
ds_t = ds_t.filter(lambda x: len(x["text"]) > 0)
ds_e = ds_e.filter(lambda x: len(x["text"]) > 0)
ds_t = ds_t.map(lambda x: tokenizer.batch_encode_plus(x["text"],), batched=True, remove_columns=["text"])
ds_e = ds_e.map(lambda x: tokenizer.batch_encode_plus(x["text"],), batched=True, remove_columns=["text"])


    https://www.python.org/dev/peps/pep-0585
Reusing dataset wikitext (/home/mila/g/gagnonju/.cache/huggingface/datasets/wikitext/wikitext-2-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126)
Reusing dataset wikitext (/home/mila/g/gagnonju/.cache/huggingface/datasets/wikitext/wikitext-2-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126)
Loading cached processed dataset at /home/mila/g/gagnonju/.cache/huggingface/datasets/wikitext/wikitext-2-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126/cache-70194fc0022e6c77.arrow
Loading cached processed dataset at /home/mila/g/gagnonju/.cache/huggingface/datasets/wikitext/wikitext-2-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126/cache-4bedad4d85684c5c.arrow


  0%|          | 0/19 [00:00<?, ?ba/s]

  0%|          | 0/5 [00:00<?, ?ba/s]

In [6]:

def group_texts(examples, block_size):
    # Concatenate all texts.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
        # customize this part to your needs.
    total_length = (total_length // block_size) * block_size
    # Split by chunks of max_len.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

block_size = 256

ds_t = ds_t.map(
    lambda x: group_texts(x, block_size),
    batched=True,
    batch_size=1000,
    num_proc=4,
)

ds_e = ds_e.map(
    lambda x: group_texts(x, block_size),
    batched=True,
    batch_size=1000,
    num_proc=4,
)

In [7]:
args = transformers.TrainingArguments(
    do_eval=True, 
    evaluation_strategy="steps", 
    eval_steps=100, 
    output_dir="clm_experimentation_out/", 
    per_device_train_batch_size=30, 
    per_device_eval_batch_size=120,
)
trainer = transformers.Trainer(
    model=model, 
    tokenizer=tokenizer, 
    train_dataset=ds_t, 
    eval_dataset=ds_e, 
    args=args,
)

In [8]:
trainer.train()

***** Running training *****
  Num examples = 7478
  Num Epochs = 3
  Instantaneous batch size per device = 30
  Total train batch size (w. parallel, distributed & accumulation) = 90
  Gradient Accumulation steps = 1
  Total optimization steps = 252
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mretroboost[0m (use `wandb login --relogin` to force relogin)




Step,Training Loss,Validation Loss
100,No log,6.492918


***** Running Evaluation *****
  Num examples = 1972
  Batch size = 360


In [None]:
dir(datasets)

In [None]:
import datasets
acc = datasets.load_metric("accuracy")

In [None]:
dir(acc)