In [1]:
!export HF_HOME=../hf_cache

In [2]:
!git config --global credential.helper store
!pip install accelerate
!git config --global user.name "Neelectric"
!git config --global user.email "Neel.R@web.de"

[0m

In [3]:
import wandb
from wandb import AlertLevel
wandb.init(project="biollama_v2", # the project I am working on
           tags=["hf_sft", "BioLlamaV2"]) # the Hyperparameters I want to keep track of

wandb.alert(
    title="Initialising training run",
    text=f"We have started training",
    level=AlertLevel.WARN,
    wait_duration=300,
)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mnelectric[0m ([33mneelectric[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [4]:
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
print("imported...")
print(transformers.__version__)

  from .autonotebook import tqdm as notebook_tqdm


imported...
4.39.3


Next, we pull the huggingface token from the local config. Remember to make a copy of config_default.yml, name it config.yml and add your huggingface token. config.yml is in the gitignore so unless you misspell config.yml, it should not get pushed to github.

In [5]:
# token = input(f"Enter token: ")
from box import Box
with open("config.yml", "r") as f:
    config = Box.from_yaml(f.read())
token = config.secrets.hf_token

We also need to verfiy whether the GPU supports bfloat16 (less precision but more range). If not (for example if running on TITAN GPUs), we use float16

In [6]:
if torch.cuda.is_bf16_supported():
    torch_dtype = torch.bfloat16
else:
    torch_dtype = torch.float16
torch_dtype = torch.float32
print(f"Using dtype: {torch_dtype}")

Using dtype: torch.float32


In [7]:
print("started script")

llama_path = "h2oai/h2ogpt-4096-llama2-7b-chat"
# llama_path = "meta-llama/Llama-2-7b-chat-hf"
# llama_path = "TheBloke/Llama-2-7B-Chat-GPTQ"

tokenizer = AutoTokenizer.from_pretrained(llama_path, 
                                          token = token,
                                          cache_dir = "../hf_cache/")
model = AutoModelForCausalLM.from_pretrained(llama_path, 
                                             token = token,
                                            #  device_map = "auto",
                                            device_map = "cuda:0",
                                             torch_dtype = torch_dtype,
                                             cache_dir = "../hf_cache/")

print("model loaded!")

started script


Loading checkpoint shards: 100%|██████████| 2/2 [00:07<00:00,  3.97s/it]

model loaded!





In [8]:
from time import time
max_new_tokens = 150
prompt = "In the era of generative AI, "
medmcqa2 = """
Question.
Low insulin to glucagon ratio is not seen in:
(A) Glycogen synthesis
(B) Glycogen breakdown
(C) Gluconeogenesis
(D) Ketogenesis
Answer. 
"""
tokenized_prompt = tokenizer.encode(medmcqa2, return_tensors = "pt").to('cuda')
time_before = time()
raw_output = model.generate(tokenized_prompt,
                            max_new_tokens = max_new_tokens,
                            temperature = 0.01)
time_after = time()
time_taken = time_after - time_before
untokenized_output = tokenizer.decode(raw_output[0], skip_special_tokens = True)
num_generated = len(raw_output[0]) - len(tokenized_prompt[0])
print(untokenized_output)
print(f"newly generated {num_generated}")
print(f"{num_generated / time_taken} t/s")


Question.
Low insulin to glucagon ratio is not seen in:
(A) Glycogen synthesis
(B) Glycogen breakdown
(C) Gluconeogenesis
(D) Ketogenesis
Answer. 
(D) Ketogenesis

Explanation:
A low insulin to glucagon ratio is typically seen in states of ketosis, where the body is relying on ketones for energy instead of glucose. Therefore, option (D) Ketogenesis is the correct answer.
Glycogen synthesis (A) and glycogen breakdown (B) are processes that occur in the liver and are regulated by insulin and glucagon, respectively. Gluconeogenesis (C) is the process by which the liver and kidneys produce glucose from non-carbohydrate sources, such as amino ac
newly generated 150
20.485244009078365 t/s


In [9]:
import os
path = os.path.abspath(os.path.join(os.getcwd(), os.pardir)) + "/pubmed_cleaned/"
print(path)
files = os.listdir(path)
print(files[:5])
# load in the first file and print first 20 lines
# with open(path + files[0], "r") as f:
#     lines = f.readlines()
#     for line in lines[:20]:
#         print(line)
from datasets import load_dataset
print(path + files[0])
dataset = load_dataset("text", 
                       cache_dir = "../hf_cache/",
                       data_files = path + files[0], 
                       split = "train")

/nfs/primary/pubmed_cleaned/
['abs_1_14.tsv', 'abs_1_15.tsv', 'abs_1_23.tsv', 'abs_1_0.tsv', 'abs_1_30.tsv']
/nfs/primary/pubmed_cleaned/abs_1_14.tsv


In [10]:
dataset

Dataset({
    features: ['text'],
    num_rows: 67773
})

In [11]:
def param_count(m):
    params = sum([p.numel() for p in m.parameters()])/1_000_000
    trainable_params = sum([p.numel() for p in m.parameters() if p.requires_grad])/1_000_000
    print(f"Total params: {params:.2f}M, Trainable: {trainable_params:.2f}M")
    return params, trainable_params

for param in model.parameters(): 
    param.requires_grad = True
model.model.embed_tokens.weight.requires_grad_(False);
params, trainable_params = param_count(model)

Total params: 6738.42M, Trainable: 6607.34M


In [12]:
from trl import SFTTrainer
from transformers import TrainingArguments
batch_size = 2
total_num_steps = 50
output_dir = "../"
training_args = TrainingArguments(
    output_dir=output_dir,
    # per_device_train_batch_size=batch_size,
    # per_device_eval_batch_size=batch_size//2,
    # bf16=False,
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    warmup_steps=total_num_steps // 10,
    num_train_epochs=2,
    # max_steps = -1
    gradient_accumulation_steps=1,
    gradient_checkpointing=True,
    # evaluation_strategy="steps",
    # eval_steps=total_num_steps // 6,
    # logging strategies
    logging_dir=f"{output_dir}/logs",
    logging_strategy="steps",
    logging_steps=1,
    save_strategy="epoch", #changed to epoch so we save every epoch i guess?
    save_total_limit=2,
)

In [13]:
def create_prompt(row):
    text = row["text"]
    return text
# create_prompt(dataset[0])

In [14]:
trainer = SFTTrainer(
    model,
    train_dataset=dataset,
    dataset_text_field="text",
    # eval_dataset=test_dataset,
    packing=True,
    max_seq_length=512,
    args=training_args,
    formatting_func=create_prompt,
    # compute_metrics=token_accuracy,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [15]:
#set llama model config use_cache to false!!!
trainer.train()
wandb.finish()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


In [None]:
trainer.save_model(output_dir)
#print contents of output_dir
!ls -l $output_dir
#print full path of output_dir
# !pwd $output_dir