In [5]:
# import necessary packages
import sys, os
import torch 
import numpy as np
import evaluate
from trl import SFTTrainer, setup_chat_format
from transformers import (pipeline,
                          AutoTokenizer,
                          AutoModelForCausalLM,
                          DataCollatorForLanguageModeling,
                          get_scheduler)
from datasets import load_dataset
from torch.utils.data import DataLoader
from torch.optim import AdamW
from tqdm.auto import tqdm
from IPython.display import clear_output

sys.path.append('../')

# custom imports
from utils.GetLowestGPU import GetLowestGPU

device = GetLowestGPU()

Device set to cuda:0


In [6]:
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Instantiate Model and Dataset

In [8]:
# options
model_path = "meta-llama/Meta-Llama-3-8B"
dataset_path = "webis/tldr-17" #test dataset

# load tokenizer and model
pipeline = pipeline('text-generation', 
                    model=model_path,
                    model_kwargs={'torch_dtype': torch.bfloat16},
                    device_map = 'auto'
                    )

ValueError: The following `model_kwargs` are not used by the model: ['model', 'model_kwargs', 'device_map'] (note: typos in the generate arguments will also show up in this list)

In [9]:
# test model
text = "the answer to life is"
print(pipeline(text, max_length=100, do_sample=True, temperature=0.9)[0]['generated_text'])

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


the answer to life is 42”)
  5. The number of weeks from September 2006 to August 2011, when the first edition of this book was published.
  6. The number of hours of daylight on the summer solstice in the northern hemisphere. The same number of hours of night fall in the southern hemisphere on the same day.
  7. The number of seconds that a “long ton” of 2240 pounds, weighs.
  8


In [10]:
# load dataset
raw_dataset = load_dataset(dataset_path, split = 'train[:5000]')



You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/4.33k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.79k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/9.14k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.14G [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3848330 [00:00<?, ? examples/s]

# Preprocessing

In [49]:
# define functions
def join_text(examples):
    text = [" ".join(entry["text"]) for entry in examples["answers"]]
    examples["text"] = text
    return examples

def preprocess_data(examples):
    inp = examples["text"]

    tokenized_data = tokenizer(text = inp,
                               padding="max_length", 
                               truncation=True, 
                               max_length=512)
    return tokenized_data

In [53]:
# add special tokens to tokenizer
tokenizer.pad_token = tokenizer.eos_token
model.resize_token_embeddings(len(tokenizer))

preprocessed_dataset = raw_dataset.map(join_text,
                                       batched=True)

tokenized_dataset = preprocessed_dataset.map(preprocess_data, 
                                    batched=True,
                                    remove_columns=preprocessed_dataset.column_names
                                    )
tokenized_dataset.with_format("torch")

# check tokenized dataset output
tokenized_dataset

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 5000
})

In [54]:
# test before training
text = "The answer to the ultimate question of life, the universe and everything is"
print(pipeline(text, max_length=100)[0]['generated_text'])

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


The answer to the ultimate question of life, the universe and everything is 42. The question is what is the meaning of life? And the answer is 42. The question is what is the meaning of life? And the answer is 42. The question is what is the meaning of life? And the answer is 42. The question is what is the meaning of life? And the answer is 42. The question is what is the meaning of life? And the answer is 


# Create Dataloaders

In [55]:
# instantiate data collator
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# split data
tokenized_dataset = tokenized_dataset.train_test_split(test_size=0.1)

train_dataloader = DataLoader(tokenized_dataset['train'],
                              batch_size=8, 
                              collate_fn=data_collator)

val_dataloader = DataLoader(tokenized_dataset['test'],
                            batch_size=8,
                            collate_fn=data_collator)

In [56]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 4500
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 500
    })
})

In [11]:
outputs = model(**batch)
print(outputs.loss, outputs.logits.shape)

tensor(1.7970, grad_fn=<ToCopyBackward0>) torch.Size([8, 128, 128256])


# Training

In [52]:
# initialize optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# and scheduler
num_epochs = 1
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

print(num_training_steps)

NameError: name 'train_dataloader' is not defined

In [13]:
# loop through epochs
for epoch in range(num_epochs):
    clear_output(wait=True)
    
    print("=====================")
    print(f"Epoch {epoch + 1}")
    print("=====================")

    # set model to train mode
    model.train()

    # initialize train loss, val loss
    train_loss = 0.0
    val_loss = 0.0

    # loop through train data
    print("Training...")
    for batch in train_dataloader:

        # grab batch and map to device
        batch = {k: v.to(device) for k, v in batch.items()}

        # forward pass
        outputs = model(**batch)
        loss = outputs.loss

        train_loss += loss.item()

        # backward pass
        loss.backward()

        # update optimizer
        optimizer.step()

        # update scheduler
        lr_scheduler.step()

        # zero gradients
        optimizer.zero_grad()

    train_loss = train_loss / (len(train_dataloader) / batch_size)

    # set to eval mode
    model.eval()
    print("Validating...")
    for batch in val_dataloader:

        # get batch
        batch = {k: v.to(device) for k, v in batch.items()}

        # forward pass
        with torch.no_grad():
            outputs = model(**batch)

        # get loss
        loss = outputs.loss
        val_loss += loss.item()


    val_loss = val_loss / (len(val_dataloader) / batch_size)

    print(f"Avg. Train Loss: {train_loss}, Avg. Val Loss: {val_loss}")
    # print("Evaluation metrics:", metric.compute())


Epoch 3
Training...


  0%|          | 0/563 [00:00<?, ?it/s]

Validating...


  0%|          | 0/63 [00:00<?, ?it/s]

Avg. Train Loss: 1.0257362152712808, Avg. Val Loss: 18.35613748762343


# Prediction

In [15]:
# test after training
text = "The answer to the ultimate question of life, the universe and everything is"
print(pipeline(text, max_length=100)[0]['generated_text'])

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


The answer to the ultimate question of life, the universe and everything is... * The universe is huge but not infinite * There are many many many many many many many many many many many many many many many many many many many many many many many many many many many many many many many many many many many many many manythe answer to the ultimate question of life, the universe and everything is... * The universe is huge but not infinite * There are many many many many many many many many many
