In [1]:
# import necessary packages
import sys, os
import torch 
import numpy as np
from peft import get_peft_config, get_peft_model, LoraConfig, TaskType
from trl import SFTTrainer, setup_chat_format
from transformers import (pipeline,
                          AutoTokenizer,
                          AutoModelForCausalLM,
                          DataCollatorForLanguageModeling,
                          DataCollatorWithPadding,
                          get_scheduler)
from datasets import load_dataset
from torch.utils.data import DataLoader
from torch.optim import AdamW
from tqdm.auto import tqdm
from IPython.display import clear_output

sys.path.append('../')

# custom imports
from utils.GetLowestGPU import GetLowestGPU

device = GetLowestGPU()

[2024-06-11 09:48:45,745] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/usr/bin/ld: cannot find -laio
collect2: error: ld returned 1 exit status


Device set to cuda:1


In [2]:
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Instantiate Model and Dataset

In [3]:
# options
model_path = "meta-llama/Meta-Llama-3-8B"
dataset_path = "allenai/peS2o"

peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM, inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1
)

# load tokenizer and model
pipeline = pipeline('text-generation', 
                    model=model_path,
                    model_kwargs={'torch_dtype': torch.bfloat16},
                    device_map = 'auto'
                    )

pipeline.model = get_peft_model(pipeline.model, peft_config)
pipeline.tokenizer.pad_token = pipeline.tokenizer.eos_token
pipeline.model.generation_config.pad_token_id = pipeline.tokenizer.eos_token_id

pipeline.model.print_trainable_parameters()

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


trainable params: 3,407,872 || all params: 8,033,669,120 || trainable%: 0.0424


In [4]:
# load dataset
raw_dataset = load_dataset(dataset_path, "v2", streaming=True, trust_remote_code=True)

# check format of data
raw_dataset

IterableDatasetDict({
    train: IterableDataset({
        features: ['added', 'created', 'id', 'source', 'text', 'version'],
        n_shards: 20
    })
    validation: IterableDataset({
        features: ['added', 'created', 'id', 'source', 'text', 'version'],
        n_shards: 2
    })
})

# Preprocessing

In [5]:
# define functions
def preprocess_data(examples):
    tokenized_data = pipeline.tokenizer(text=examples['text'],
                               padding='max_length', 
                               truncation=True, 
                               max_length=128)
    
    labels = tokenized_data['input_ids'].copy()

    for i in range(len(labels)):
        if labels[i][-1] != pipeline.tokenizer.pad_token_id:
            labels[i] = labels[i][1:] + [pipeline.tokenizer.pad_token_id]
        else:
            labels[i] = labels[i][1:] + [-100]

    labels = [[-100 if x == pipeline.tokenizer.pad_token_id else x for x in y] for y in labels]
    tokenized_data['labels'] = labels
    
    return tokenized_data

In [6]:
# add special tokens to tokenizer
pipeline.tokenizer.pad_token = pipeline.tokenizer.eos_token
pipeline.model.resize_token_embeddings(len(pipeline.tokenizer))

tokenized_dataset = raw_dataset.map(preprocess_data,
                                    batched=True,
                                    remove_columns=raw_dataset['train'].column_names,)
tokenized_dataset.with_format("torch")

# check tokenized dataset output
tokenized_dataset

IterableDatasetDict({
    train: IterableDataset({
        features: Unknown,
        n_shards: 20
    })
    validation: IterableDataset({
        features: Unknown,
        n_shards: 2
    })
})

# Create Dataloaders

In [7]:
# instantiate data collator
data_collator = DataCollatorWithPadding(tokenizer=pipeline.tokenizer)

train_dataloader = DataLoader(tokenized_dataset['train'],
                              batch_size=8, 
                              collate_fn=data_collator,
                              num_workers=20)

val_dataloader = DataLoader(tokenized_dataset['validation'],
                            batch_size=8,
                            collate_fn=data_collator,
                            num_workers=2)

In [8]:
# inspect sample batch
batch = next(iter(train_dataloader))
{k: v.shape for k, v in batch.items()}

{'input_ids': torch.Size([8, 128]),
 'attention_mask': torch.Size([8, 128]),
 'labels': torch.Size([8, 128])}

In [9]:
outputs = pipeline.model(**batch)
print(outputs.loss, outputs.logits.shape)

tensor(11.3959, grad_fn=<ToCopyBackward0>) torch.Size([8, 128, 128256])


# Training

In [10]:
# run a test prediction
text = ["Network biology is"]

terminators = [
    pipeline.tokenizer.eos_token_id
]

outputs = pipeline(
    text,
    max_new_tokens=256,
    eos_token_id=terminators,
    do_sample=True,
    temperature=0.6,
    top_p=0.9,
)
print(outputs[0][0]['generated_text'])

Network biology is the study of biological systems and processes using a network perspective. The approach is to treat a biological system as a network of interacting molecules, cells, or other entities. Network biology has been used to study a wide range of biological processes, including gene regulation, protein-protein interactions, and metabolic pathways.
What are the main principles of network biology?
The main principles of network biology are that biological systems can be modeled as networks of interacting molecules, cells, or other entities; that these networks can be used to understand the behavior of biological systems; and that the study of these networks can lead to new insights into biological processes.
How does network biology differ from traditional biology?
Network biology is a relatively new field that takes a different approach to studying biology than traditional biology. Traditional biology is focused on understanding the individual components of a biological syst

In [11]:
# pipeline.model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
#     pipeline.model, optimizer, train_dataloader, lr_scheduler)

In [12]:
# options
num_batches = 10_000
num_epochs = 1
best_val_loss = np.inf
checkpoint_path = '../checkpoints/checkpoint_{0}.pt'
log_path = '../logs/log.csv'

# init optimizer
optimizer = AdamW(pipeline.model.parameters(), lr=1e-5)

# init scheduler
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=1000,
    num_training_steps=num_epochs * num_batches,
)

with open(log_path, 'w') as f: 
    f.write(f'epoch,iter_num,train_loss,val_loss\n')

# loop
for epoch in range(num_epochs):

    clear_output(wait=True)

    print("=====================")
    print(f"Epoch {epoch + 1}")
    print("=====================")

    # initialize train loss, val loss
    running_train_loss = 0.0
    running_val_loss = 0.0

    # loop through train data
    print("Training...")
    i = 0
    with tqdm(total=num_batches) as pbar:
        for train_batch, val_batch in zip(train_dataloader, val_dataloader):
            
            ## training
            # set model to train mode
            pipeline.model.train()

            # grab batch and map to device
            train_batch = {k: v.to(device) for k, v in train_batch.items()}

            # forward pass
            outputs = pipeline.model(**batch)
            train_loss = outputs.loss

            running_train_loss += train_loss.item()

            # backward pass
            train_loss.backward()
            # accelerator.backward(loss)

            # clip gradients
            torch.nn.utils.clip_grad_norm_(pipeline.model.parameters(), 1.0)

            # update optimizer, scheduler
            optimizer.step()
            lr_scheduler.step()

            # zero gradients
            optimizer.zero_grad()
            
            ## validation
            # set model to eval mode
            pipeline.model.eval()
            # loop through val data
            val_batch = {k: v.to(device) for k, v in val_batch.items()}
            with torch.no_grad():
                outputs = pipeline.model(**batch)
                val_loss = outputs.loss
                running_val_loss += val_loss.item()
                if val_loss < best_val_loss:
                    best_val_loss = val_loss
            
            print(f"Train Batch Loss: {train_loss:.4f} | Val Batch Loss: {val_loss:.4f} | Best Val. Loss: {best_val_loss:.4f}\r", end="")

            i += 1
            pbar.update(1)
            if i % 1000 == 0:

                # # save model checkpoint
                # checkpoint = {
                #     'model': pipeline.model.state_dict(),
                #     'optimizer': optimizer.state_dict(),
                #     'epoch': epoch,
                #     'iter_num': i,``
                #     'best_val_loss': best_val_loss,
                # }
                # torch.save(checkpoint, checkpoint_path.format(i))

                # print example output
                print(f"Batch {i} of {num_batches}; Printing Example Response...")
                print(pipeline(text,
                               max_new_tokens=256,
                               eos_token_id=terminators,
                               do_sample=True,
                               temperature=0.6,
                               top_p=0.9)[0][0]['generated_text'])

            # write to log
            with open(log_path, 'a') as f: 
                f.write(f'{epoch},{i},{train_loss},{val_loss}\n')
            
            if i == num_batches:
                print(f"Reached {num_batches} batches; breaking...")
                break
    
    train_loss = running_train_loss / num_batches
    val_loss = running_val_loss / num_batches

    print("Epoch Complete; Printing example response...")
    print(pipeline(text, max_length=100, truncation=True))

    train_loss = running_train_loss / len(train_dataloader)
    print(f"Avg. Train Loss: {train_loss:.4f}, Avg. Val Loss: {val_loss:.4f}")
    # print("Evaluation metrics:", metric.compute())

print("Training Complete!")

Epoch 1
Training...


  0%|          | 0/10000 [00:00<?, ?it/s]

  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


Batch 1000 of 10000; Printing Example Response... | Best Val. Loss: 0.7985316
Network biology is the analysis of biological systems using network theory the study of interactions among a set of objects such as genes proteins or the study of the network of connections among proteins in a cell is an example of systems biology network biology is the application of graph theory a sub branch of mathematics to the study of biological systems. The biological network is a complex system that can be studied using the tools of network theory in this review we first present the basic concepts of network theory then we introduce the application of network theory in biological systems including biological networks the network of
Batch 2000 of 10000; Printing Example Response... | Best Val. Loss: 0.1987
Network biology is the study of biological systems from the perspective of complex systems and networks. In contrast to the traditional reductionist approach, systems biology aims to study the system

KeyboardInterrupt: 

In [None]:
# run a test prediction
outputs = pipeline(
    text,
    max_new_tokens=256,
    eos_token_id=terminators,
    do_sample=True,
    temperature=0.6,
    top_p=0.9,
)
print(outputs[0][0]['generated_text'])

network biology is an interdisciplinary field that investigates complex biological. A systematic approach to the study of biological systems. Network theory. Biological network modeling is an important tool to. The biological network modeling is an important tool to understand. Biological network modeling is an important tool to understand the complex. Biological network modeling is an important tool to understand the complex. Biological network modeling is an important tool to understand the complex. Biological network modeling is an important tool to understand the complex. Biological network modeling is an important tool to understand the complex. Biological network modeling is an important tool to understand the complex. Biological network modeling is an important tool to understand the complex. Biological network modeling is an important tool to understand the complex. Biological network modeling is an important tool to understand the complex. Biological network modeling is an imp