In [1]:
import transformers
print(transformers.__version__)

4.28.1


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from transformers.utils import send_example_telemetry
send_example_telemetry("language_modeling_notebook", framework="pytorch")

In [3]:
from datasets import load_dataset

wikitext = load_dataset("wikitext", "wikitext-2-raw-v1")
wikitext_doc = load_dataset("EleutherAI/wikitext_document_level", "wikitext-103-raw-v1")
wikicorpus = load_dataset("wikicorpus", "raw_en")

Found cached dataset wikitext (/home/lxyuan/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126)
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 72.93it/s]
Found cached dataset wikitext_document_level (/home/lxyuan/.cache/huggingface/datasets/EleutherAI___wikitext_document_level/wikitext-103-raw-v1/1.0.0/c7f10a7786444f898dd236db33d4bee9b130f8cbcac690e7bde9b0d027e19fc1)
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 30.26it/s]
Found cached dataset wikicorpus (/home/lxyuan/.cache/huggingface/datasets/wikicorpus/raw_en/0.0.0/6dff92752a49f4e34e7562070fd35f469a684a915648fabfc18c7bdd25fde3bd)
100%|████████████████████

##### Preview the datasets and standardize them on a single `text` column

In [4]:
wikitext

DatasetDict({
    test: Dataset({
        features: ['text'],
        num_rows: 4358
    })
    train: Dataset({
        features: ['text'],
        num_rows: 36718
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 3760
    })
})

In [5]:
wikitext_doc

DatasetDict({
    test: Dataset({
        features: ['page'],
        num_rows: 62
    })
    train: Dataset({
        features: ['page'],
        num_rows: 29444
    })
    validation: Dataset({
        features: ['page'],
        num_rows: 60
    })
})

In [6]:
wikicorpus

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'text'],
        num_rows: 1359146
    })
})

In [7]:
# Align the document-level corpus with the other datasets by exposing its
# content under "text". The guard keeps the cell safe to re-run: once the
# "page" column has been renamed there is nothing left to do.
if "page" in wikitext_doc["train"].column_names:
    wikitext_doc = wikitext_doc.rename_column("page", "text")
wikitext_doc

DatasetDict({
    test: Dataset({
        features: ['text'],
        num_rows: 62
    })
    train: Dataset({
        features: ['text'],
        num_rows: 29444
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 60
    })
})

In [8]:
# Keep only the shared "text" column. The columns to drop are derived from the
# current schema, so re-running the cell after "id"/"title" are already gone
# leaves the dataset untouched instead of raising.
extra_columns = [name for name in wikicorpus["train"].column_names if name != "text"]
if extra_columns:
    wikicorpus["train"] = wikicorpus["train"].remove_columns(extra_columns)

In [9]:
wikicorpus

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 1359146
    })
})

##### Combine them into one big `DatasetDict`

In [10]:
from datasets import concatenate_datasets, DatasetDict

# Stack the per-source splits row-wise into one dataset per split. wikicorpus
# only provides a train split, so it contributes to training data only;
# validation and test come from wikitext and wikitext_doc.
# NOTE(review): wikitext-2 and wikitext-103 may cover overlapping articles, in
# which case the same text is counted more than once per split — confirm.
wiki_train = concatenate_datasets([wikitext["train"], wikicorpus["train"], wikitext_doc["train"]])
wiki_val = concatenate_datasets([wikitext["validation"], wikitext_doc["validation"]])
wiki_test = concatenate_datasets([wikitext["test"], wikitext_doc["test"]])

In [11]:
wiki_test

Dataset({
    features: ['text'],
    num_rows: 4420
})

In [12]:
# Bundle the three combined splits into a single DatasetDict; the keys are
# listed in the same order they were previously inserted.
datasets = DatasetDict(
    {
        "train": wiki_train,
        "validation": wiki_val,
        "test": wiki_test,
    }
)

In [13]:
datasets

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 1425308
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 3820
    })
    test: Dataset({
        features: ['text'],
        num_rows: 4420
    })
})

In [14]:
model_checkpoint = "distilgpt2"

In [15]:
from transformers import AutoTokenizer
    
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

In [16]:
def tokenize_function(examples):
    """Run the notebook-level ``tokenizer`` over the ``"text"`` field of a batch.

    ``examples`` is indexed by column name; the tokenizer's output is returned
    unchanged.
    """
    batch_texts = examples["text"]
    encoded = tokenizer(batch_texts)
    return encoded

In [17]:
# Tokenize every split in batches using 4 worker processes. The raw "text"
# column is removed so that only the tokenizer outputs remain as features.
tokenized_datasets = datasets.map(tokenize_function, batched=True, num_proc=4, remove_columns=["text"])

Loading cached processed dataset at /home/lxyuan/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126/cache-accf673833a674d4_*_of_00004.arrow
Loading cached processed dataset at /home/lxyuan/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126/cache-fb8ed8fc73149429_*_of_00004.arrow
Loading cached processed dataset at /home/lxyuan/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126/cache-5c27cd15b892403e_*_of_00004.arrow


In [18]:
tokenized_datasets["train"][1]

{'input_ids': [796, 569, 18354, 7496, 17740, 6711, 796, 220, 198],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [19]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 1425308
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 3820
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 4420
    })
})

In [20]:
block_size = 256

In [21]:
def group_texts(examples, chunk_size=None):
    """Concatenate a batch of tokenized texts and re-split it into fixed-size blocks.

    Args:
        examples: Mapping from feature name (it must include ``"input_ids"``)
            to a list of per-example token lists, as supplied by a batched
            ``map`` call.
        chunk_size: Length of every output block. When omitted, the
            notebook-level ``block_size`` is used, so existing
            ``map(group_texts, ...)`` calls behave exactly as before.

    Returns:
        A dict with the same keys as ``examples`` plus ``"labels"``. Every value
        is a list of blocks holding exactly ``chunk_size`` items, and
        ``"labels"`` is a shallow copy of the ``"input_ids"`` list.
    """
    # Local import keeps the function self-contained when it is shipped to
    # worker processes.
    from itertools import chain

    size = block_size if chunk_size is None else chunk_size
    # Flatten each feature in linear time. ``sum(lists, [])`` re-copies the
    # accumulated list on every addition and is quadratic in the batch size.
    concatenated_examples = {k: list(chain.from_iterable(examples[k])) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # Drop the small remainder that does not fill a whole block. Padding could
    # be used instead if the model supported it; customize this to your needs.
    total_length = (total_length // size) * size
    # Split every feature into consecutive chunks of ``size`` items.
    result = {
        k: [t[i : i + size] for i in range(0, total_length, size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

In [22]:
# Re-chunk every split into fixed-length blocks with group_texts, using 4
# worker processes. group_texts discards the leftover tokens of each batch it
# receives, so batch_size (1000 examples per call) determines how often a
# partial block is dropped.
lm_datasets = tokenized_datasets.map(
    group_texts,
    batched=True,
    batch_size=1000,
    num_proc=4,
)

Loading cached processed dataset at /home/lxyuan/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126/cache-47653298c8756f40_*_of_00004.arrow
Loading cached processed dataset at /home/lxyuan/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126/cache-49ee38042176ce98_*_of_00004.arrow
Loading cached processed dataset at /home/lxyuan/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126/cache-79bb773d955e10cc_*_of_00004.arrow


In [23]:
lm_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 3669733
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1940
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 2220
    })
})

In [24]:
from transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained(model_checkpoint)

In [25]:
from transformers import Trainer, TrainingArguments

In [26]:
# Use the last path component of the checkpoint id (drops any "org/" prefix)
# as the base of the run name.
model_name = model_checkpoint.split("/")[-1]
training_args = TrainingArguments(
    # First positional argument: the run/output directory name
    # (output_dir in the TrainingArguments API).
    f"{model_name}-finetuned-wiki",
    # Evaluate, checkpoint and log once per epoch.
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    logging_strategy = "epoch",
    # 16 sequences per device x 32 accumulation steps = 512 sequences per
    # optimizer update on each device.
    gradient_accumulation_steps=32,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    num_train_epochs=10,
    # Reload the best checkpoint when training finishes, where "best" means
    # the lowest eval_loss (greater_is_better=False).
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    # NOTE(review): fp16=True presumably requires a CUDA-capable device —
    # confirm before running this cell on CPU-only hardware.
    fp16=True,
    push_to_hub=False,
)

In [27]:
# Train on the chunked train split and evaluate on the chunked validation
# split; the test split is held back for the final evaluation.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_datasets["train"],
    eval_dataset=lm_datasets["validation"],
)

In [None]:
trainer.train()



Epoch,Training Loss,Validation Loss
0,3.476,3.37038
1,3.3828,3.320355
2,3.3466,3.292329
3,3.3248,3.2728
4,3.3086,3.259739
5,3.2977,3.250431
6,3.2889,3.243709
7,3.2834,3.23863
8,3.279,3.2359


In [None]:
import math
# Called without a dataset, so this is expected to evaluate on the
# eval_dataset given to the Trainer (the validation split).
eval_results = trainer.evaluate()
# Perplexity is reported as exp(eval_loss), rounded to two decimals.
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

In [None]:
# Same perplexity computation on the test split, which was passed neither as
# train_dataset nor as eval_dataset when the Trainer was built.
test_results = trainer.evaluate(lm_datasets["test"])
print(f"Perplexity: {math.exp(test_results['eval_loss']):.2f}")

In [None]:
# Write the tokenizer files under ./models/tokenizer/.
tokenizer.save_pretrained("./models/tokenizer/")
# NOTE(review): save_model() is called without a path, so it presumably writes
# to the TrainingArguments output directory — a different location from the
# tokenizer above. Confirm that downstream loading code expects the two to be
# stored separately.
trainer.save_model()