In [1]:
# import necessary packages
import sys, os
import torch 
import numpy as np
import evaluate
import accelerate
from accelerate import Accelerator
from peft import get_peft_config, get_peft_model, LoraConfig, TaskType
from trl import SFTTrainer, setup_chat_format
from transformers import (pipeline,
                          AutoTokenizer,
                          AutoModelForCausalLM,
                          DataCollatorForLanguageModeling,
                          DataCollatorWithPadding,
                          get_scheduler)
from datasets import load_dataset
from torch.utils.data import DataLoader
from torch.optim import AdamW
from tqdm.auto import tqdm
from IPython.display import clear_output

sys.path.append('../')

# custom imports
from utils.GetLowestGPU import GetLowestGPU

device = GetLowestGPU()

[2024-06-09 19:12:25,282] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/usr/bin/ld: cannot find -laio
collect2: error: ld returned 1 exit status


Device set to cuda:1


In [2]:
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Instantiate Model and Dataset

In [3]:
# options
model_path = "meta-llama/Meta-Llama-3-8B"
dataset_path = "allenai/peS2o"

peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM, inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1
)

accelerator = Accelerator()

# load tokenizer and model
pipeline = pipeline('text-generation', 
                    model=model_path,
                    model_kwargs={'torch_dtype': torch.bfloat16},
                    device_map = 'auto',
                    
                    )

pipeline.model = get_peft_model(pipeline.model, peft_config)
pipeline.tokenizer.pad_token = pipeline.tokenizer.eos_token
pipeline.model.generation_config.pad_token_id = pipeline.tokenizer.eos_token_id

pipeline.model.print_trainable_parameters()

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


trainable params: 3,407,872 || all params: 8,033,669,120 || trainable%: 0.0424


In [4]:
# load dataset
raw_dataset = load_dataset(dataset_path, "v2", streaming=True, trust_remote_code=True)

# check format of data
raw_dataset

IterableDatasetDict({
    train: IterableDataset({
        features: ['added', 'created', 'id', 'source', 'text', 'version'],
        n_shards: 20
    })
    validation: IterableDataset({
        features: ['added', 'created', 'id', 'source', 'text', 'version'],
        n_shards: 2
    })
})

# Preprocessing

In [5]:
# define functions
def preprocess_data(examples):
    tokenized_data = pipeline.tokenizer(text=examples['text'],
                               padding='max_length', 
                               truncation=True, 
                               max_length=512)
    
    labels = tokenized_data['input_ids'].copy()

    for i in range(len(labels)):
        if labels[i][-1] != pipeline.tokenizer.pad_token_id:
            labels[i] = labels[i][1:] + [pipeline.tokenizer.pad_token_id]
        else:
            labels[i] = labels[i][1:] + [-100]

    labels = [[-100 if x == pipeline.tokenizer.pad_token_id else x for x in y] for y in labels]
    tokenized_data['labels'] = labels
    
    return tokenized_data

In [6]:
# add special tokens to tokenizer
pipeline.tokenizer.pad_token = pipeline.tokenizer.eos_token
pipeline.model.resize_token_embeddings(len(pipeline.tokenizer))

tokenized_dataset = raw_dataset.map(preprocess_data,
                                    batched=True,
                                    remove_columns=raw_dataset['train'].column_names,)
tokenized_dataset.with_format("torch")

# check tokenized dataset output
tokenized_dataset

IterableDatasetDict({
    train: IterableDataset({
        features: Unknown,
        n_shards: 20
    })
    validation: IterableDataset({
        features: Unknown,
        n_shards: 2
    })
})

# Create Dataloaders

In [7]:
# instantiate data collator
data_collator = DataCollatorWithPadding(tokenizer=pipeline.tokenizer)

train_dataloader = DataLoader(tokenized_dataset['train'],
                              batch_size=8, 
                              collate_fn=data_collator,
                              num_workers=20)

val_dataloader = DataLoader(tokenized_dataset['validation'],
                            batch_size=8,
                            collate_fn=data_collator,
                            num_workers=2)

In [8]:
# inspect sample batch
batch = next(iter(train_dataloader))
{k: v.shape for k, v in batch.items()}

{'input_ids': torch.Size([8, 512]),
 'attention_mask': torch.Size([8, 512]),
 'labels': torch.Size([8, 512])}

In [9]:
outputs = pipeline.model(**batch)
print(outputs.loss, outputs.logits.shape)

tensor(11.4993, grad_fn=<ToCopyBackward0>) torch.Size([8, 512, 128256])


# Training

In [10]:
# run a test prediction
messages = ["network biology is"]

terminators = [
    pipeline.tokenizer.eos_token_id,
    pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>")
]

outputs = pipeline(
    messages,
    max_new_tokens=256,
    eos_token_id=terminators,
    do_sample=True,
    temperature=0.6,
    top_p=0.9,
)
print(outputs)

[[{'generated_text': 'network biology is the study of the interactions of the elements of biological systems, such as genes, proteins and metabolic pathways. It studies biological function in terms of the network rather than individual parts. The goal of the field is to understand the structure and dynamics of the networks controlling cell function and to describe the mechanisms that govern their activity. In practice, network biology is the application of the tools and approaches of network theory to the study of biological systems. The field is also called systems biology, but the latter term is sometimes used to refer specifically to the study of biochemical reactions and the flow of information through a cell. The term network biology was introduced by Marc Vidal in 2004. In 2005, Vidal and Albert-László Barabási co-edited a special issue of Nature Reviews Genetics on network biology. In 2006, Barabási and Tanya Berger-Wolf co-edited a special issue of the Journal of Computational 

In [11]:
# init optimizer
optimizer = AdamW(pipeline.model.parameters(), lr=1e-7)

# init scheduler
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=1000,
    num_training_steps=30000
)

pipeline.model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
    pipeline.model, optimizer, train_dataloader, lr_scheduler)

In [12]:
# text for example after training
text = ["Network biology is"]

num_epochs = 3

# loop
for epoch in range(num_epochs):
    
    print("=====================")
    print(f"Epoch {epoch + 1}")
    print("=====================")

    # set model to train mode
    pipeline.model.train()

    # initialize train loss, val loss
    running_train_loss = 0.0
    running_val_loss = 0.0

    # loop through train data
    print("Training...")
    i = 0
    for batch in train_dataloader:

        # grab batch and map to device
        batch = {k: v.to(device) for k, v in batch.items()}

        # forward pass
        outputs = pipeline.model(**batch)
        loss = outputs.loss


        print(f"batch loss: {loss:.4f}\r", end="")

        running_train_loss += loss.item()

        # backward pass
        accelerator.backward(loss)

        # update optimizer, scheduler
        optimizer.step()
        lr_scheduler.step()

        # zero gradients
        optimizer.zero_grad()

        i += 1
        if i % 250 == 0:
            print(f"Processed {i} batches; Printing example response...")
            print(pipeline(text, max_length=100, truncation=True))
        
        if i == 10000:
            break
        
    # set model to eval mode
    pipeline.model.eval()

    for batch in val_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = pipeline.model(**batch)
            loss = outputs.loss
            running_val_loss += loss.item()
        
    val_loss = running_val_loss / len(val_dataloader)

    print("Printing example response...")
    print(pipeline(text, max_length=100, truncation=True))

    train_loss = running_train_loss / len(train_dataloader)
    print(f"Avg. Train Loss: {train_loss:.4f}, Avg. Val Loss: {val_loss:.4f}")
    # print("Evaluation metrics:", metric.compute())

print("Training Complete!")

Epoch 1
Training...
batch loss: 11.4993

  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


Processed 250 batches; Printing example response...
[[{'generated_text': 'Network biology is an emerging interdisciplinary field that studies biological systems using a network perspective. It is an integrative approach that combines experimental, computational and theoretical methods to understand the interactions between molecules, cells and tissues in health and disease. It is a rapidly developing field, with new methods and applications being developed at a rapid pace.\nThe main goal of this course is to provide an introduction to the field of network biology, with a focus on methods and applications. The course will cover a wide range of topics,'}]]
Processed 500 batches; Printing example response...
[[{'generated_text': 'Network biology is a field of science that studies biological systems using a combination of network theory and biological data. The systems studied include, but are not limited to, metabolic networks, gene networks, cellular networks, and neural networks. Networ

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Processed 2500 batches; Printing example response...
[[{'generated_text': 'Network biology is a field of science that deals with the structure and function of biological networks. The most common biological networks are gene regulatory networks, protein interaction networks, metabolic networks, and signalling networks. A biological network can be represented as a graph where the nodes represent the network components (e.g. genes, proteins, metabolites) and the edges represent the interactions between them (e.g. regulatory interactions, physical interactions, metabolic reactions). Biological networks can be used to study a wide range of biological processes'}]]
Processed 2750 batches; Printing example response...
[[{'generated_text': 'Network biology is the study of biological systems in which the behavior of the whole system is not easily inferred from the behavior of its parts. The field is often referred to as biological network science, and the systems studied are often referred to 

KeyboardInterrupt: 

In [13]:
# run a test prediction
messages = ["network biology is"]

terminators = [
    pipeline.tokenizer.eos_token_id,
    pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>")
]

outputs = pipeline(
    messages,
    max_new_tokens=256,
    eos_token_id=terminators,
    do_sample=True,
    temperature=0.6,
    top_p=0.9,
)
print(outputs)