## Training a language model using Transformers

Training a language model is a more advanced form of machine learning, so we need to install some libraries that are not installed by default in Google Colab. We also clone our GitHub repository as usual.

In [None]:
# Install the third-party packages this notebook needs.
# The leading "!" makes the notebook run each line as a shell command.
!pip install datasets
!pip install transformers
!pip install sentencepiece
!pip install accelerate
# Clone the course repository; later cells read the texts and tokenizer
# files from /content/AISocIMP23.
!git clone https://github.com/Joaoffg/AISocIMP23/

After installing the libraries, we need to import them so we can use them in our code.

In [None]:
from datasets import load_dataset
import re
import os
import torch
from transformers import LlamaTokenizer, LlamaConfig, LlamaModel, LlamaForCausalLM, Trainer, TrainingArguments

This part loads all of the text data that you will use to train the language model.

In [None]:
# Read every plain-text file in the course's "Texts" folder into a dataset
# using the generic "text" loader.
dataset = load_dataset("text", data_dir="/content/AISocIMP23/Week 4/Texts")


This part tokenizes the dataset, which means that it converts all of the words into numbers that can be processed by the neural network.

In [None]:
# Load the tokenizer files shipped with the cloned course repository.
tokenizer = LlamaTokenizer.from_pretrained(
    '/content/AISocIMP23/Week 4/Token'
)

def chunk_examples(examples, chunk_lenght=128, min_chunk_lenght=25):
    """Tokenize a batch of texts and cut them into fixed-length training chunks.

    Every text with more than ``min_chunk_lenght`` tokens is wrapped in
    BOS/EOS tokens and split into consecutive windows of ``chunk_lenght``
    tokens. A short final window is right-padded with the EOS id; the padded
    positions get attention mask 0 and label -100 so that they are ignored
    by the language-modeling loss.

    Uses the module-level ``tokenizer``. The (misspelled) parameter names are
    kept as-is so that existing keyword callers keep working.

    Args:
        examples: Batch mapping with a ``"text"`` entry holding a list of
            strings (the form ``Dataset.map(..., batched=True)`` passes in).
        chunk_lenght: Number of tokens in every emitted chunk.
        min_chunk_lenght: Texts with this many tokens or fewer (counted
            before BOS/EOS are added) are skipped.

    Returns:
        ``{"chunks": [...]}`` where each element is a dict holding the 1-D
        tensors ``input_ids``, ``attention_mask`` and ``labels``, each of
        length ``chunk_lenght``.
    """
    # Label value that the Hugging Face causal-LM loss skips.
    ignore_index = -100
    bos_id = tokenizer.bos_token_id
    eos_id = tokenizer.eos_token_id

    chunks = []
    for text in examples["text"]:
        tokenized = tokenizer(text, add_special_tokens=False)
        if len(tokenized.input_ids) <= min_chunk_lenght:
            continue

        input_ids = [bos_id] + tokenized.input_ids + [eos_id]
        attention_mask = [1] + tokenized.attention_mask + [1]

        # Walk over the full sequence *including* BOS/EOS; bounding the loop
        # by the raw token count would silently drop the tail of some texts.
        for start in range(0, len(input_ids), chunk_lenght):
            chunk_ids = input_ids[start:start + chunk_lenght]
            chunk_mask = attention_mask[start:start + chunk_lenght]

            # A one-token window has no next-token target after the causal
            # shift, so it would contribute no (or an undefined) loss.
            if len(chunk_ids) < 2:
                continue

            labels = list(chunk_ids)
            missing = chunk_lenght - len(chunk_ids)
            if missing > 0:
                chunk_ids = chunk_ids + [eos_id] * missing
                chunk_mask = chunk_mask + [0] * missing
                # Do not train the model to predict padding.
                labels = labels + [ignore_index] * missing

            chunks.append({
                "input_ids": torch.tensor(chunk_ids),
                "attention_mask": torch.tensor(chunk_mask),
                "labels": torch.tensor(labels),
            })

    return {"chunks": chunks}


# Run chunk_examples over the dataset in batches; the raw 'text' column is
# removed so only the produced "chunks" column remains.
chunked_dataset = dataset.map(
    chunk_examples,
    batched=True,
    remove_columns=['text'],
)

Here we define our model architecture; we are using a LLaMA-based model for this exercise. You can change the `complexity_reduction` factor below to make the model simpler (a higher value) or more complex (a lower value).

In [None]:
# Divisor applied to the baseline model dimensions below:
# a larger value gives a smaller model.
complexity_reduction = 4

config = LlamaConfig(
    vocab_size=32000,
    # Baseline sizes scaled down by complexity_reduction, truncated to int.
    hidden_size=int(2048 / complexity_reduction),
    intermediate_size=int(5120 / complexity_reduction),
    num_hidden_layers=int(16 / complexity_reduction),
    num_attention_heads=int(16 / complexity_reduction),
    max_position_embeddings=2048,
    rms_norm_eps=1e-12,
)

# Build a freshly initialised model from the config and report its size.
model = LlamaForCausalLM(config)
model_size = sum(param.numel() for param in model.parameters())
print(f"LlaMa Model Size: {model_size/1000**2:.1f}M parameters")

# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

Here you define the training arguments. You can ignore most of them, as they are default values, but you may want to tweak the batch sizes and the learning rate.

In [None]:
# Hyper-parameters and bookkeeping options for the Hugging Face Trainer.
# No evaluation is configured: the evaluation strategy is left at its
# default ("no") rather than passed explicitly, because the keyword for it
# was renamed between transformers releases (evaluation_strategy ->
# eval_strategy) and relying on the default works with either.
args = TrainingArguments(
    output_dir="erasmian-lm/medium",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    logging_steps=5_000,
    num_train_epochs=1,
    weight_decay=0.1,
    warmup_steps=1_000,
    lr_scheduler_type="cosine",
    learning_rate=0.0001,
    fp16=True,  # mixed-precision training; expects a GPU runtime
    # Checkpoint once per epoch; a step-based save interval would be
    # ignored under this strategy, so none is set.
    save_strategy="epoch",
    save_total_limit=1,  # keep only the most recent checkpoint on disk
)


This step trains the model, exciting!

In [None]:
# Build the Trainer around the model, the tokenizer and the chunked
# training examples.
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    train_dataset=chunked_dataset['train']['chunks'],
)

# torch.compile is deliberately not applied at this point: wrapping `model`
# after the Trainer has been constructed does not affect training, because
# the Trainer keeps its own reference to the original module. To train a
# compiled model, pass torch_compile=True to TrainingArguments instead.
trainer.train()

And here you can test how the model is actually performing by generating some text. You can write the start of the text between "" in the input_text field.

In [None]:
from transformers import GenerationConfig

# Prompt that the model will continue.
input_text = "Erasmus University is"

# Sampling settings for generation. The special-token ids are set explicitly
# so that generation stops at the end-of-sequence token regardless of whether
# the installed transformers version falls back to the model defaults when a
# custom GenerationConfig is supplied.
generation_config = GenerationConfig(
    temperature=1,
    top_p=0.95,
    top_k=50,
    repetition_penalty=1,
    do_sample=True,
    num_return_sequences=1,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.eos_token_id,
)

# Tokenize the prompt and put both the inputs and the model on the GPU.
inputs = tokenizer(input_text, return_tensors="pt")
inputs = inputs.to("cuda:0")
model = model.to("cuda:0")

# Pass the config to generate(); otherwise the settings above are never used.
outputs = model.generate(
    **inputs,
    generation_config=generation_config,
    num_beams=1,
    do_sample=True,
    max_length=128,
)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))