In [1]:
# import necessary packages
import sys
import torch 
import numpy as np
from accelerate import Accelerator
from peft import get_peft_model, LoraConfig, TaskType
from functools import partial
from importlib import reload
from transformers import (pipeline,
                          DataCollatorWithPadding,
                          get_scheduler)
from datasets import load_dataset
from torch.utils.data import DataLoader
from torch.optim import AdamW
from tqdm.auto import tqdm
from IPython.display import clear_output

sys.path.append('../')

# custom modules
import utils.preprocessing as pp

# Instantiate Model and Dataset

In [2]:
# Configuration and model setup: builds the Accelerator, the LoRA config and the
# text-generation pipeline whose model is then wrapped with the LoRA adapter.
# options
model_path = "meta-llama/Meta-Llama-3-8B"
dataset_path = "allenai/peS2o"

# for distributed training
accelerator = Accelerator()

# for PEFT: LoRA settings for causal-LM fine-tuning (training mode, rank 8)
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM, inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1
)

# load tokenizer and model
# The result is bound to the name `pipeline` because the rest of the notebook refers
# to it, but that shadows the `pipeline` factory imported from transformers. Calling
# the factory through the module keeps this cell re-runnable: with the bare name, a
# second execution would call the already-built pipeline object instead of the factory.
import transformers

pipeline = transformers.pipeline('text-generation',
                                 model=model_path,
                                 model_kwargs={'torch_dtype': torch.bfloat16},
                                 device_map=accelerator.device,
                                 )

# wrap the base model with the LoRA adapter
pipeline.model = get_peft_model(pipeline.model, peft_config)

# reuse EOS as the padding token for the tokenizer and for generation
pipeline.tokenizer.pad_token = pipeline.tokenizer.eos_token
pipeline.tokenizer.pad_token_id = pipeline.tokenizer.eos_token_id
pipeline.model.generation_config.pad_token_id = pipeline.tokenizer.eos_token_id

# report trainable vs. total parameter counts
pipeline.model.print_trainable_parameters()

In [None]:
# Open the "v2" configuration of the corpus in streaming mode.
raw_dataset = load_dataset(
    path=dataset_path,
    name="v2",
    streaming=True,
    trust_remote_code=True,
)

# Show the available splits and their features.
raw_dataset

IterableDatasetDict({
    train: IterableDataset({
        features: ['added', 'created', 'id', 'source', 'text', 'version'],
        n_shards: 20
    })
    validation: IterableDataset({
        features: ['added', 'created', 'id', 'source', 'text', 'version'],
        n_shards: 2
    })
})

# Preprocessing

In [None]:
# Tokenize the streamed dataset with the project's preprocessing helper.
# Re-import the helper module so edits to utils/preprocessing.py are picked up.
reload(pp)

# Make sure padding uses EOS and the embedding matrix matches the tokenizer size.
# No new tokens are added in this notebook, so the resize is presumably a no-op
# safeguard; confirm before relying on it.
pipeline.tokenizer.pad_token = pipeline.tokenizer.eos_token
pipeline.model.resize_token_embeddings(len(pipeline.tokenizer))

# Bind the tokenization options; the exact meaning of `type` and `max_length`
# is defined in pp.tokenize_data (not shown here).
tokenize_fn = partial(pp.tokenize_data,
                      type = "nextchar",
                      pipeline_name = pipeline,
                      max_length = 100)

tokenized_dataset = raw_dataset.map(tokenize_fn,
                                    batched=True,
                                    remove_columns=raw_dataset['train'].column_names,)

# with_format returns a new dataset rather than modifying in place, so the result
# has to be assigned; previously it was discarded and the call had no effect.
tokenized_dataset = tokenized_dataset.with_format("torch")

# check tokenized dataset output
tokenized_dataset["train"]

IterableDataset({
    features: Unknown,
    n_shards: 20
})

# Create Dataloaders

In [None]:
# Collator that pads each batch using the pipeline's tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=pipeline.tokenizer)

# Training loader: batches of 8, read by 20 worker processes.
train_dataloader = DataLoader(
    dataset=tokenized_dataset['train'],
    collate_fn=data_collator,
    num_workers=20,
    batch_size=8,
)

# Validation loader: same batch size and collator, 2 worker processes.
val_dataloader = DataLoader(
    dataset=tokenized_dataset['validation'],
    collate_fn=data_collator,
    num_workers=2,
    batch_size=8,
)

In [None]:
# Pull the first training batch and display the shape of every tensor in it.
batch = next(iter(train_dataloader))
dict((name, tensor.shape) for name, tensor in batch.items())

{'input_ids': torch.Size([8, 100]), 'attention_mask': torch.Size([8, 100])}

In [None]:
# Sanity check: one forward pass on the sample batch, using the inputs as labels,
# to confirm that a loss and logits come back.
outputs = pipeline.model(
    batch['input_ids'].to(accelerator.device),
    attention_mask=batch['attention_mask'].to(accelerator.device),
    labels=batch['input_ids'].to(accelerator.device),
)
print(outputs.loss, outputs.logits.shape)

We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)


tensor(2.1162, device='cuda:0', grad_fn=<NllLossBackward0>) torch.Size([8, 100, 128256])


# Training

In [None]:
# Baseline generation before fine-tuning: sample a continuation of a short prompt.
text = ["Systems biology"]

# Generation stops when the tokenizer's EOS token is produced.
terminators = [pipeline.tokenizer.eos_token_id]

outputs = pipeline(
    text,
    do_sample=True,
    temperature=0.6,
    top_p=0.9,
    max_new_tokens=100,
    eos_token_id=terminators,
)

# First candidate for the first (only) prompt.
print(outputs[0][0]['generated_text'])

Systems biology of cancer stem cells
The cancer stem cell (CSC) hypothesis posits that a subset of cancer cells possesses stem cell-like properties and is responsible for tumour initiation, maintenance and relapse. This hypothesis has been the subject of intense scrutiny over the past decade, and has been the focus of numerous reviews. Here, we provide an overview of the evidence supporting the CSC hypothesis, including the identification of putative CSC markers, the development of CSC models, and the characterization of CSCs in various


In [None]:
# Fine-tuning loop. Each "epoch" processes at most `num_batches` paired
# train/validation batches: one optimizer step on the train batch, then a
# no-grad loss evaluation on the validation batch. Per-batch losses are appended
# to a CSV log; a single checkpoint is written after the final epoch.
# options
num_batches = 1_000
num_epochs = 10
best_val_loss = np.inf
checkpoint_path = '../checkpoints/checkpoint_{0}.pt'
log_path = '../logs/log.csv'

# init optimizer
optimizer = AdamW(pipeline.model.parameters(), lr=1e-5)

# init scheduler (the schedule length assumes exactly num_batches steps per epoch)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=1000,
    num_training_steps=num_epochs * num_batches,
)

pipeline.model, optimizer, train_dataloader, val_dataloader, lr_scheduler = accelerator.prepare(
    pipeline.model, optimizer, train_dataloader, val_dataloader, lr_scheduler)

# start a fresh log with the CSV header
with open(log_path, 'w') as f:
    f.write('epoch,iter_num,train_loss,val_loss\n')

# loop
for epoch in range(num_epochs):

    clear_output(wait=True)

    running_train_loss = 0.0
    running_val_loss = 0.0
    # number of batches actually processed this epoch; zip() below stops as soon as
    # either loader is exhausted, so this can be smaller than num_batches
    batches_seen = 0

    print("=====================")
    print(f"Epoch {epoch + 1}")
    print("=====================")

    # loop through train data
    print("Training...")
    with tqdm(total=num_batches) as pbar:
        for i, (train_batch, val_batch) in enumerate(zip(train_dataloader, val_dataloader)):

            ## training
            # set model to train mode
            pipeline.model.train()

            # grab batch and map to device
            train_batch = {k: v.to(accelerator.device) for k, v in train_batch.items()}

            # forward pass (inputs double as labels for the language-modelling loss)
            outputs = pipeline.model(train_batch['input_ids'],
                                     labels=train_batch['input_ids'],
                                     attention_mask=train_batch['attention_mask'])
            train_loss_tensor = outputs.loss

            # backward pass
            accelerator.backward(train_loss_tensor)

            # clip gradients through the accelerator so clipping stays correct for
            # whatever precision / distributed setup the accelerator manages
            accelerator.clip_grad_norm_(pipeline.model.parameters(), 1.0)

            # update optimizer, scheduler
            optimizer.step()
            lr_scheduler.step()

            # zero gradients
            optimizer.zero_grad()

            # keep a plain float for logging and accumulation; formatting the tensor
            # into the CSV would write its repr, whose commas break the row
            train_loss = train_loss_tensor.item()
            running_train_loss += train_loss

            ## validation
            # set model to eval mode
            pipeline.model.eval()
            val_batch = {k: v.to(accelerator.device) for k, v in val_batch.items()}
            with torch.no_grad():
                outputs = pipeline.model(val_batch['input_ids'],
                                         labels=val_batch['input_ids'],
                                         attention_mask=val_batch['attention_mask'])
            val_loss = outputs.loss.item()
            running_val_loss += val_loss

            # best per-batch validation loss so far, kept as a float so the
            # checkpoint does not pickle a device tensor
            if val_loss < best_val_loss:
                best_val_loss = val_loss

            batches_seen += 1

            print(f"Train Batch Loss: {train_loss:.4f} | Val Batch Loss: {val_loss:.4f} | Best Val. Loss: {best_val_loss:.4f}\r", end="")

            pbar.update(1)

            # write to log
            with open(log_path, 'a') as f:
                f.write(f'{epoch},{i},{train_loss},{val_loss}\n')

            # i is zero-based: stop once num_batches batches have been processed
            # (comparing i == num_batches ran one batch too many per epoch)
            if i + 1 >= num_batches:
                print(f"Reached {num_batches} batches; starting next epoch...")

                # break out of batching loop
                break

    # average over the batches that actually ran; guard against an empty epoch
    train_loss = running_train_loss / max(batches_seen, 1)
    val_loss = running_val_loss / max(batches_seen, 1)
    print(f"Avg. Train Loss: {train_loss:.4f}, Avg. Val Loss: {val_loss:.4f}")

# print example output
print("Training Complete; Printing Example Response...")
print(pipeline(text,
               max_new_tokens=256,
               eos_token_id=terminators,
               no_repeat_ngram_size=3,
               do_sample=True,
               top_k=100,
               top_p=0.9,
               temperature=0.6)[0][0]['generated_text'])


# NOTE(review): the checkpoint is written once, after the last epoch, and is named
# after the last batch index `i` (now num_batches - 1 for a full epoch). Confirm
# whether a per-epoch checkpoint was intended.
print(f"Saving model checkpoint to {checkpoint_path.format(i)}")
# save model checkpoint
checkpoint = {'model': pipeline.model.state_dict(),
              'optimizer': optimizer.state_dict(),
              'epoch': epoch,
              'iter_num': i,
              'best_val_loss': best_val_loss,
              }
torch.save(checkpoint, checkpoint_path.format(i))

print("Training Complete!")

Epoch 2
Training...


  0%|          | 0/10 [00:00<?, ?it/s]

In [None]:
# Generate from the same prompt after fine-tuning, with a longer token budget
# and 3-gram repetition blocking.
outputs = pipeline(
    text,
    do_sample=True,
    temperature=0.6,
    top_p=0.9,
    top_k=100,
    no_repeat_ngram_size=3,
    max_new_tokens=1024,
    eos_token_id=terminators,
)

# First candidate for the first (only) prompt.
print(outputs[0][0]['generated_text'])

Systems biology is the study of biological systems.

Systems biology, a new approach to the study and understanding of biological processes, is emerging as a powerful tool for investigating the dynamic properties of biological networks. The field of systems biology is based on the integration of high-throughput data with computational analysis, which enables the development of models that can be used to make predictions about the behavior of biological entities. The goal of systems biologists is to understand the complex interactions between genes, proteins, and other molecules that drive the behavior and function of cells. This approach is particularly useful in understanding how cells respond to external stimuli and how diseases develop. Systems biology has the potential to revolutionize the way we study and understand biological processes. By integrating data from multiple sources, systems biologist can gain a more complete picture of the underlying mechanisms that drive biological 