In [1]:
# import necessary packages
import sys, os
import torch 
import numpy as np
import evaluate
from trl import SFTTrainer, setup_chat_format
from transformers import (pipeline,
                          AutoTokenizer,
                          AutoModelForCausalLM,
                          DataCollatorForLanguageModeling,
                      ``    get_scheduler)
from datasets import load_dataset
from torch.utils.data import DataLoader
from torch.optim import AdamW
from tqdm.auto import tqdm
from IPython.display import clear_output

sys.path.append('../')

# custom imports
from utils.GetLowestGPU import GetLowestGPU

device = GetLowestGPU()

Device set to cuda:3


In [2]:
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Instantiate Model and Dataset

In [3]:
# options
model_path = "meta-llama/Meta-Llama-3-8B"
dataset_path = "allenai/peS2o"

# load tokenizer and model
pipeline = pipeline('text-generation', 
                    model=model_path,
                    model_kwargs={'torch_dtype': torch.bfloat16},
                    device_map = 'auto'
                    )

model, tokenizer = pipeline.model, pipeline.tokenizer



Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
# load dataset
raw_dataset = load_dataset(dataset_path, "v2", streaming=True, trust_remote_code=True)

# check format of data
raw_dataset

IterableDatasetDict({
    train: IterableDataset({
        features: ['added', 'created', 'id', 'source', 'text', 'version'],
        n_shards: 20
    })
    validation: IterableDataset({
        features: ['added', 'created', 'id', 'source', 'text', 'version'],
        n_shards: 2
    })
})

# Preprocessing

In [7]:
# define functions
def preprocess_data(examples):
    text = examples['text']
    tokenized_data = tokenizer(text=text,
                               text_target=text,
                               max_length=512, 
                               return_tensors='pt',
                               truncation=True, 
                               padding='max_length')
    
    return tokenized_data

In [8]:
# add special tokens to tokenizer
tokenizer.pad_token = tokenizer.eos_token
model.resize_token_embeddings(len(tokenizer))

tokenized_dataset = raw_dataset.map(preprocess_data, batched=True)
tokenized_dataset = tokenized_dataset.remove_columns(['added', 'created', 'id', 'source', 'text', 'version'])
tokenized_dataset.with_format("torch")

# check tokenized dataset output
tokenized_dataset

IterableDatasetDict({
    train: IterableDataset({
        features: Unknown,
        n_shards: 20
    })
    validation: IterableDataset({
        features: Unknown,
        n_shards: 2
    })
})

In [9]:
next(iter(tokenized_dataset['train']))

{'input_ids': tensor([128000,     58,  12755,   9860,  27375,    315,  37229,  15131,   6629,
          29413,   2908,  43738,    449,    445,    946,     82,  21075,  29413,
           2908,  30662,  44947,   6674,    198,   1271,  19874,    279,  34933,
          15105,    323,   2875,   9860,  27375,    315,  37229,  15131,   6629,
          29413,   2908,    320,  56493,      8,  43738,    449,    445,  17485,
          21075,  29413,   2908,   4286,  39174,     50,    198,  26556,   6841,
            220,   1049,     23,    323,   5936,    220,    679,     15,     11,
          80679,   6978,    449,  44561,  11134,   1051,  12020,    449,    445,
          17485,  21075,  29413,   2908,   1234,    802,    339,  90879,    323,
           7946,   8272,    709,     13,   2684,   1051,    220,   3971,  25000,
            323,    220,   1682,  28585,     11,  20330,    505,    220,   1114,
            311,    220,   3391,   1667,    449,    459,   5578,    315,    220,
           1682

# Create Dataloaders

In [11]:
# instantiate data collator
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

train_dataloader = DataLoader(tokenized_dataset['train'],
                              batch_size=8, 
                              collate_fn=data_collator,
                              num_workers=20)

val_dataloader = DataLoader(tokenized_dataset['validation'],
                            batch_size=8,
                            collate_fn=data_collator,
                            num_workers=2)

In [12]:
# inspect sample batch
batch = next(iter(train_dataloader))

{k: v.shape for k, v in batch.items()}

{'input_ids': torch.Size([8, 512]),
 'attention_mask': torch.Size([8, 512]),
 'labels': torch.Size([8, 512])}

In [13]:
outputs = model(**batch)
print(outputs.loss, outputs.logits.shape)

tensor(1.8214, grad_fn=<ToCopyBackward0>) torch.Size([8, 512, 128256])


# Training

In [12]:
# initialize optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# and scheduler
num_epochs = 3
num_training_steps = num_epochs * len(dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

print(num_training_steps)

300


In [13]:
# eval loop

# define metrics
# metric = evaluate.load("glue", "mrpc")

# loop through epochs
for epoch in range(num_epochs):
    
    clear_output(wait=True)

    print(f"Epoch {epoch + 1}\n=====================")

    # set model to train mode
    model.train()

    # initialize train loss, val loss
    train_loss = 0.0
    val_loss = 0.0

    # loop through train data
    print("Training...")
    for batch in train_dataloader:

        # grab batch and map to device
        batch = {k: v.to(device) for k, v in batch.items()}

        # forward pass
        outputs = model(**batch)
        loss = outputs.loss

        train_loss += loss.item()

        # backward pass
        loss.backward()

        # update optimizer
        optimizer.step()

        # update scheduler
        lr_scheduler.step()

        # zero gradients
        optimizer.zero_grad()

    train_loss = train_loss / (len(train_dataloader) / batch_size)

    # set to eval mode
    model.eval()
    print("Validating...")
    for batch in val_dataloader:

        # get batch
        batch = {k: v.to(device) for k, v in batch.items()}

        # forward pass
        with torch.no_grad():
            outputs = model(**batch)

        # get loss
        loss = outputs.loss
        val_loss += loss.item()

        # get logits, predictions
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        # metric.add_batch(predictions=predictions, references=batch["labels"])


    val_loss = val_loss / (len(val_dataloader) / batch_size)

    print(f"Avg. Train Loss: {train_loss}, Avg. Val Loss: {val_loss}")
    # print("Evaluation metrics:", metric.compute())


Epoch 1
Training...


0it [00:00, ?it/s]

  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


# Prediction

In [15]:
# run a test prediction
messages = [

]

terminators = [
    pipeline.tokenizer.eos_token_id,
    pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>")
]

outputs = pipeline(
    messages,
    max_new_tokens=256,
    eos_token_id=terminators,
    do_sample=True,
    temperature=0.6,
    top_p=0.9,
)
print(outputs[0]["generated_text"][-1])

{'role': 'assistant', 'content': ' \n \n           \n   \n          \n                                                          -    —    -  —    —    —  —  — -•  — - ————————————————————————————————————————————————————————————————————————~———~———~—————————~————~—————————————————————————————————————————————~——-——'}
