In [1]:
# import necessary packages
import sys
import torch 
import numpy as np
import evaluate
from trl import SFTTrainer, setup_chat_format
from transformers import (pipeline,
                          AutoTokenizer,
                          AutoModelForCausalLM,
                          DataCollatorWithPadding,
                          get_scheduler)
from datasets import load_dataset
from torch.utils.data import DataLoader
from torch.optim import AdamW
from tqdm.auto import tqdm
from IPython.display import clear_output

sys.path.append('../')

# custom imports
from utils.GetLowestGPU import GetLowestGPU

device = GetLowestGPU()

Device set to cuda:0


# Instantiate Model and Dataset

In [2]:
# options
model_path = "meta-llama/Meta-Llama-3-8B"
dataset_path = "allenai/peS2o"

# load tokenizer and model
pipeline = pipeline('text-generation', 
                    model=model_path,
                    model_kwargs={'torch_dtype': torch.bfloat16},
                    device_map = 'auto'
                    )

model, tokenizer = pipeline.model, pipeline.tokenizer



Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [21]:
# load dataset
raw_dataset = load_dataset(dataset_path, "v2", streaming=True)

# check format of data
raw_dataset

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


IterableDatasetDict({
    train: IterableDataset({
        features: ['added', 'created', 'id', 'source', 'text', 'version'],
        n_shards: 20
    })
    validation: IterableDataset({
        features: ['added', 'created', 'id', 'source', 'text', 'version'],
        n_shards: 2
    })
})

# Preprocessing

In [20]:
# define functions
def preprocess_data(examples):
    text = examples['text']
    return tokenizer(text=text,
                     text_target=text,
                     max_length=512, 
                     return_tensors='pt',
                     truncation=True, 
                     padding='max_length')

In [19]:
# add special tokens to tokenizer
tokenizer.add_special_tokens({'pad_token': '<pad>'})
model.resize_token_embeddings(len(tokenizer))

tokenized_dataset = raw_dataset.map(preprocess_data, batched=True)
tokenized_dataset.with_format("torch")

# check tokenized dataset output
tokenized_dataset

IterableDatasetDict({
    train: IterableDataset({
        features: Unknown,
        n_shards: 20
    })
    validation: IterableDataset({
        features: Unknown,
        n_shards: 2
    })
})

In [13]:
next(iter(tokenized_dataset['train']))

{'added': '2018-04-03T04:47:30.209Z',
 'created': '2011-08-01T00:00:00.000Z',
 'id': '40474210',
 'source': 's2ag/train',
 'text': '[Short-term effectiveness of anterior cruciate ligament reconstruction with LARs artificial ligament].\n\nOBJECTIVE\nTo investigate the surgical technique and short-term effectiveness of anterior cruciate ligament (ACL) reconstruction with LARS artificial ligament.\n\n\nMETHODS\nBetween November 2008 and April 2010, eighty patients with ACL injury were treated with LARS artificial ligament under arthroscope and successfully followed up. There were 51 males and 29 females, aged from 17 to 43 years with an average of 29.2 years. The injuries were caused by sport in 63 cases, traffic accident in 14 cases, and bruise in 3 cases. There were 43 left knees and 37 right knees. The disease duration ranged from 10 days to 11 months. The anterior drawer test, Lachman test, and pivot shift test for all cases were rated as positive. The preoperative Lysholm score was 5

In [18]:
# instantiate data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# remove cols that are not needed and set format to torch
tokenized_dataset = tokenized_dataset.remove_columns(['added', 'created', 'id', 'source', 'text'])

NotImplementedError: Subclasses of Dataset should implement __getitem__.

# Create Dataloaders

In [8]:
train_dataloader = DataLoader(tokenized_dataset['train'],
                              batch_size=8, 
                              collate_fn=data_collator,
                              num_workers=20)

val_dataloader = DataLoader(tokenized_dataset['validation'],
                            batch_size=8,
                            collate_fn=data_collator,
                            num_workers=2)

In [9]:
# inspect sample batch
batch = next(iter(train_dataloader))

{k: v.shape for k, v in batch.items()}

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

ValueError: Caught ValueError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/mnt/DGX01/Personal/krusepi/.venv/lib/python3.8/site-packages/transformers/tokenization_utils_base.py", line 759, in convert_to_tensors
    tensor = as_tensor(value)
  File "/mnt/DGX01/Personal/krusepi/.venv/lib/python3.8/site-packages/transformers/tokenization_utils_base.py", line 721, in as_tensor
    return torch.tensor(value)
ValueError: too many dimensions 'str'

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/mnt/DGX01/Personal/krusepi/.venv/lib/python3.8/site-packages/torch/utils/data/_utils/worker.py", line 308, in _worker_loop
    data = fetcher.fetch(index)  # type: ignore[possibly-undefined]
  File "/mnt/DGX01/Personal/krusepi/.venv/lib/python3.8/site-packages/torch/utils/data/_utils/fetch.py", line 42, in fetch
    return self.collate_fn(data)
  File "/mnt/DGX01/Personal/krusepi/.venv/lib/python3.8/site-packages/transformers/data/data_collator.py", line 271, in __call__
    batch = pad_without_fast_tokenizer_warning(
  File "/mnt/DGX01/Personal/krusepi/.venv/lib/python3.8/site-packages/transformers/data/data_collator.py", line 66, in pad_without_fast_tokenizer_warning
    padded = tokenizer.pad(*pad_args, **pad_kwargs)
  File "/mnt/DGX01/Personal/krusepi/.venv/lib/python3.8/site-packages/transformers/tokenization_utils_base.py", line 3355, in pad
    return BatchEncoding(batch_outputs, tensor_type=return_tensors)
  File "/mnt/DGX01/Personal/krusepi/.venv/lib/python3.8/site-packages/transformers/tokenization_utils_base.py", line 224, in __init__
    self.convert_to_tensors(tensor_type=tensor_type, prepend_batch_axis=prepend_batch_axis)
  File "/mnt/DGX01/Personal/krusepi/.venv/lib/python3.8/site-packages/transformers/tokenization_utils_base.py", line 775, in convert_to_tensors
    raise ValueError(
ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`version` in this case) have excessive nesting (inputs type `list` where type `int` is expected).


In [14]:
outputs = model(**batch)
print(outputs.loss, outputs.logits.shape)

{'logits': tensor([[[ 4.9688,  6.1250, 10.8125,  ...,  0.8398,  4.2188, -1.5625],
         [ 5.3125,  6.0625,  3.9375,  ..., -2.9375, -0.7812,  1.3281],
         [ 2.7031,  3.1406,  5.3438,  ...,  2.3281, -1.4453, -0.2109],
         ...,
         [ 5.3750,  7.0625,  4.3125,  ..., -2.9688, -1.7969,  0.0176],
         [ 5.2812,  7.3438,  4.6250,  ..., -2.8438, -1.3828, -0.3496],
         [ 5.3125,  7.3125,  4.3750,  ..., -2.9219, -1.2578, -0.3242]],

        [[ 4.9688,  6.1250, 10.8125,  ...,  0.8398,  4.2188, -1.5625],
         [ 5.3125,  6.0625,  3.9375,  ..., -2.9375, -0.7812,  1.3281],
         [ 2.7031,  3.1406,  5.3438,  ...,  2.3281, -1.4453, -0.2109],
         ...,
         [ 5.5938,  5.2188,  3.6094,  ..., -3.0781, -1.6172, -1.3203],
         [ 5.5000,  4.5312,  3.2969,  ..., -2.7031, -2.2031, -1.0938],
         [ 5.6562,  4.6875,  3.2031,  ..., -2.7344, -2.3125, -1.1328]],

        [[ 4.9688,  6.1250, 10.8125,  ...,  0.8398,  4.2188, -1.5625],
         [ 5.3125,  6.0625,  3.937

# Training

In [11]:
# initialize optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# and scheduler
num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

print(num_training_steps)

300


In [12]:
# eval loop

# define metrics
# metric = evaluate.load("glue", "mrpc")

# loop through epochs
for epoch in range(num_epochs):
    
    clear_output(wait=True)

    print(f"Epoch {epoch + 1}\n=====================")

    # set model to train mode
    model.train()

    # initialize train loss, val loss
    train_loss = 0.0
    val_loss = 0.0

    # loop through train data
    print("Training...")
    for batch in tqdm(train_dataloader):

        # grab batch and map to device
        batch = {k: v.to(device) for k, v in batch.items()}

        # forward pass
        outputs = model(**batch)
        loss = outputs.loss

        train_loss += loss.item()

        # backward pass
        loss.backward()

        # update optimizer
        optimizer.step()

        # update scheduler
        lr_scheduler.step()

        # zero gradients
        optimizer.zero_grad()

    train_loss = train_loss / (len(train_dataloader) / batch_size)

    # set to eval mode
    model.eval()
    print("Validating...")
    for batch in val_dataloader:

        # get batch
        batch = {k: v.to(device) for k, v in batch.items()}

        # forward pass
        with torch.no_grad():
            outputs = model(**batch)

        # get loss
        loss = outputs.loss
        val_loss += loss.item()

        # get logits, predictions
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        # metric.add_batch(predictions=predictions, references=batch["labels"])


    val_loss = val_loss / (len(val_dataloader) / batch_size)

    print(f"Avg. Train Loss: {train_loss}, Avg. Val Loss: {val_loss}")
    # print("Evaluation metrics:", metric.compute())


Epoch 3
Training...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  0%|          | 0/100 [00:01<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Validating...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Avg. Train Loss: 3.0257192957401275, Avg. Val Loss: 4.766729793548584


# Prediction

In [15]:
# run a test prediction
messages = [
    {"role": "system", "content": "You are a doctor."},
    {"role": "user", "content": "I have a headache. What should I do?"},
]

terminators = [
    pipeline.tokenizer.eos_token_id,
    pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>")
]

outputs = pipeline(
    messages,
    max_new_tokens=256,
    eos_token_id=terminators,
    do_sample=True,
    temperature=0.6,
    top_p=0.9,
)
print(outputs[0]["generated_text"][-1])

{'role': 'assistant', 'content': ' \n \n           \n   \n          \n                                                          -    —    -  —    —    —  —  — -•  — - ————————————————————————————————————————————————————————————————————————~———~———~—————————~————~—————————————————————————————————————————————~——-——'}
