In [1]:
!pip install transformers datasets evaluate peft trl bitsandbytes

import os
import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments, pipeline, logging
from peft import LoraConfig
from trl import SFTTrainer

# Identifiers shared by the rest of the notebook:
#   base_model      - Hub id of the checkpoint loaded with from_pretrained
#   guanaco_dataset - Hub id of the dataset passed to load_dataset
#   new_model       - local directory name the trained weights/tokenizer are saved under
base_model, guanaco_dataset, new_model = (
    "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T",
    "mlabonne/guanaco-llama2-1k",
    "llama-1.1B-rare_disease",
)

Collecting datasets
  Using cached datasets-3.4.1-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Using cached evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting trl
  Using cached trl-0.16.0-py3-none-any.whl.metadata (12 kB)
Collecting bitsandbytes
  Using cached bitsandbytes-0.45.3-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Using cached dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Using cached multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Using cached fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.13.0->peft)
  Using cached nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.13.0->peft)
  Using cached n

In [2]:
# Load the base causal-LM checkpoint and adjust two config fields before training.
model = AutoModelForCausalLM.from_pretrained(base_model, device_map='auto')
model.config.use_cache = False  # disable the KV cache flag on the model config
model.config.pretraining_tp = 1

# trust_remote_code is intentionally omitted: the recorded downloads for this
# checkpoint are plain tokenizer data files, so the built-in tokenizer classes are
# enough, and enabling the flag would allow Python code shipped in the Hub repo
# to execute locally.
tokenizer = AutoTokenizer.from_pretrained(base_model)
tokenizer.pad_token = tokenizer.eos_token  # reuse EOS as the padding token
tokenizer.padding_side = 'right'  # pad at the end of each sequence

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/560 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/4.40G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/129 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/776 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [3]:
# Baseline generation: query the freshly loaded checkpoint, before the training cell runs.
logging.set_verbosity(logging.CRITICAL)  # only CRITICAL-level transformers logs get through
prompt = "Which phenotypes and genes are associated with cystic fibrosis?"
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=200,
)
# Wrap the question in the [INST] ... [/INST] instruction markers.
result = pipe("<s>[INST] " + prompt + " [/INST]")
print(result[0]['generated_text'])

<s>[INST] Which phenotypes and genes are associated with cystic fibrosis? [/INST]
[INST] Which phenotypes and genes are associated with cystic fibrosis? [/INST]
[INST] Which phenotypes and genes are associated with cystic fibrosis? [/INST]
[INST] Which phenotypes and genes are associated with cystic fibrosis? [/INST]
[INST] Which phenotypes and genes are associated with cystic fibrosis? [/INST]
[INST] Which phenotypes and genes are associated with cystic fibrosis? [/INST]
[INST] Which phenotypes and genes are associated with cystic fibrosis? [/INST]
[INST] Which phenotypes and genes are associated with cystic fibrosis? [/INST


In [4]:
import pandas as pd
from datasets import load_dataset, Dataset

# Read the fine-tuning CSV and keep only its 'text' column.
data_ = pd.read_csv('/content/finetuning_datasets.csv').loc[:, ['text']]

# Wrap the single-column frame as a Hugging Face Dataset.
dataset = Dataset.from_pandas(data_)

# Optional: persist a copy of the dataset on disk.
dataset.save_to_disk("formatted_dataset")

# Show the dataset's features and row count.
print(dataset)

Saving the dataset (0/1 shards):   0%|          | 0/66427 [00:00<?, ? examples/s]

Dataset({
    features: ['text'],
    num_rows: 66427
})


In [None]:
# Load the guanaco split under its own name. Binding it to `dataset` would replace
# the CSV-derived dataset built in the previous cell, so a top-to-bottom run
# (Restart & Run All) would fine-tune on these rows instead of the CSV data.
guanaco_reference = load_dataset(guanaco_dataset, split="train")
guanaco_reference

Dataset({
    features: ['text'],
    num_rows: 1000
})

In [5]:
import gc  # garbage collection

# LoRA adapter configuration.
peft_params = LoraConfig(
    lora_alpha=16,  # scaling factor for the LoRA update when it is added to the forward output
    lora_dropout=0.1,  # 10% dropout on the LoRA branch during training
    r=64,  # LoRA rank: the low-rank matrices have one dimension of 64
    bias="none",  # bias terms are not trained
    task_type="CAUSAL_LM",
)

# Optimizer / schedule / checkpointing settings.
# NOTE(review): report_to is not set; the recorded run stopped mid-cell to ask for a
# wandb API key. Consider report_to="none" if experiment tracking is not wanted.
training_params = TrainingArguments(
    output_dir='./results',
    num_train_epochs=2,  # two passes over the dataset
    per_device_train_batch_size=2,  # micro-batch size of 2
    gradient_accumulation_steps=16,  # 2 * 16 = 32 sequences per optimizer step
    optim="adamw_torch",
    save_steps=25,  # checkpoint every 25 steps
    logging_steps=1,
    learning_rate=2e-4,  # peak step size for the optimizer update
    weight_decay=0.001,
    fp16=True,  # 16-bit mixed precision
    bf16=False,  # not supported on V100
    max_grad_norm=0.3,  # gradient-clipping threshold
    max_steps=-1,  # no step cap; num_train_epochs decides the run length
    warmup_ratio=0.03,  # learning-rate warmup
    group_by_length=True,
    lr_scheduler_type="cosine",  # cosine learning-rate schedule
)

# Pass the tokenizer configured earlier (pad token, padding side) explicitly.
# Without processing_class the trainer loads its own tokenizer from the model
# name, and that earlier configuration is never used.
trainer = SFTTrainer(
    model,
    train_dataset=dataset,
    peft_config=peft_params,  # parameter-efficient fine-tuning (LoRA)
    args=training_params,
    processing_class=tokenizer,
)

# Release unreferenced Python objects and cached CUDA blocks before training starts.
gc.collect()
torch.cuda.empty_cache()

trainer.train()  # train the model

# Save the trained weights and the tokenizer side by side. processing_class is the
# current accessor; trainer.tokenizer is deprecated.
trainer.model.save_pretrained(new_model)
trainer.processing_class.save_pretrained(new_model)

Converting train dataset to ChatML:   0%|          | 0/66427 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/66427 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/66427 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/66427 [00:00<?, ? examples/s]

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mjabervalinejad[0m ([33mjabervalinejad-virginia-tech[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


{'loss': 1.7646, 'grad_norm': 0.2490529865026474, 'learning_rate': 1.6000000000000001e-06, 'num_tokens': 32768.0, 'mean_token_accuracy': 0.6165689155459404, 'epoch': 0.0004817245739748299}
{'loss': 1.8315, 'grad_norm': 0.18303054571151733, 'learning_rate': 3.2000000000000003e-06, 'num_tokens': 61917.0, 'mean_token_accuracy': 0.6006165780127048, 'epoch': 0.0009634491479496598}
{'loss': 1.8629, 'grad_norm': 0.11139564961194992, 'learning_rate': 4.800000000000001e-06, 'num_tokens': 82412.0, 'mean_token_accuracy': 0.6062610149383545, 'epoch': 0.0014451737219244897}
{'loss': 1.8841, 'grad_norm': 0.10882170498371124, 'learning_rate': 6.4000000000000006e-06, 'num_tokens': 97161.0, 'mean_token_accuracy': 0.6127886660397053, 'epoch': 0.0019268982958993196}
{'loss': 1.9716, 'grad_norm': 0.12197726219892502, 'learning_rate': 8.000000000000001e-06, 'num_tokens': 108585.0, 'mean_token_accuracy': 0.5960244499146938, 'epoch': 0.0024086228698741493}
{'loss': 2.0291, 'grad_norm': 0.12860728800296783, '



{'loss': 1.4018, 'grad_norm': 0.1479865163564682, 'learning_rate': 0.00015105522391568987, 'num_tokens': 7571523.0, 'mean_token_accuracy': 0.6766251251101494, 'epoch': 0.6989823568374782}
{'loss': 1.4802, 'grad_norm': 0.14072781801223755, 'learning_rate': 0.00015098809562068, 'num_tokens': 7600168.0, 'mean_token_accuracy': 0.6590957008302212, 'epoch': 0.699464081411453}
{'loss': 1.3109, 'grad_norm': 0.13849017024040222, 'learning_rate': 0.00015092093626314587, 'num_tokens': 7618949.0, 'mean_token_accuracy': 0.6880845911800861, 'epoch': 0.6999458059854279}
{'loss': 1.3871, 'grad_norm': 0.16090485453605652, 'learning_rate': 0.0001508537458840017, 'num_tokens': 7633109.0, 'mean_token_accuracy': 0.6817996166646481, 'epoch': 0.7004275305594027}
{'loss': 1.4534, 'grad_norm': 0.18882609903812408, 'learning_rate': 0.00015078652452418063, 'num_tokens': 7644326.0, 'mean_token_accuracy': 0.6651064418256283, 'epoch': 0.7009092551333775}
{'loss': 1.4468, 'grad_norm': 0.17211921513080597, 'learning_



{'loss': 1.3944, 'grad_norm': 0.13119511306285858, 'learning_rate': 0.0001259635703851713, 'num_tokens': 9370943.0, 'mean_token_accuracy': 0.6843841709196568, 'epoch': 0.8675859577286686}
{'loss': 1.4845, 'grad_norm': 0.1259888857603073, 'learning_rate': 0.0001258881871583559, 'num_tokens': 9402667.0, 'mean_token_accuracy': 0.6628498286008835, 'epoch': 0.8680676823026434}
{'loss': 1.432, 'grad_norm': 0.11302439123392105, 'learning_rate': 0.00012581278816016427, 'num_tokens': 9425550.0, 'mean_token_accuracy': 0.6809404604136944, 'epoch': 0.8685494068766183}
{'loss': 1.4521, 'grad_norm': 0.11803051084280014, 'learning_rate': 0.00012573737343653024, 'num_tokens': 9442522.0, 'mean_token_accuracy': 0.6635617725551128, 'epoch': 0.8690311314505931}
{'loss': 1.4111, 'grad_norm': 0.14467696845531464, 'learning_rate': 0.00012566194303339739, 'num_tokens': 9455218.0, 'mean_token_accuracy': 0.6774500943720341, 'epoch': 0.869512856024568}
{'loss': 1.4081, 'grad_norm': 0.14070159196853638, 'learning

KeyboardInterrupt: 

In [6]:
# Save the weights trained so far plus the tokenizer; needed here because the
# training cell was interrupted before its own save lines ran.
trainer.model.save_pretrained(new_model)
# trainer.tokenizer is deprecated; processing_class is the supported accessor.
trainer.processing_class.save_pretrained(new_model)

('llama-1.1B-rare_disease/tokenizer_config.json',
 'llama-1.1B-rare_disease/special_tokens_map.json',
 'llama-1.1B-rare_disease/tokenizer.model',
 'llama-1.1B-rare_disease/added_tokens.json',
 'llama-1.1B-rare_disease/tokenizer.json')

In [7]:
# Training leaves the module in train mode, so dropout layers (including the LoRA
# dropout of 0.1) would still be active while generating. Switch to eval mode first.
model.eval()

# Same question as the baseline cell, now answered after fine-tuning.
prompt = "Which phenotypes and genes are associated with cystic fibrosis?"
pipe = pipeline(task='text-generation', model=model, tokenizer=tokenizer, max_length=200)
result = pipe(f'<s>[INST] {prompt} [/INST]')
print(result[0]['generated_text'])

<s>[INST] Which phenotypes and genes are associated with cystic fibrosis? [/INST] The following phenotypes and genes are associated with cystic fibrosis based on a review of the literature (additional references were found by searching MEDLINE and PubMed) 1. Abnormality of the pancreas 2. Abnormality of the liver 3. Abnormality of the bile ducts 4. Abnormality of the gallbladder 5. Abnormality of the spleen 6. Abnormality of the kidney 7. Abnormality of the adrenal gland 8. Abnormality of the heart 9. Abnormality of the lungs 10. Abnormality of the lymph nodes 11. Abnormality of the lymphatic system 12. Abnormality of the


In [9]:
# Second probe of the model: a gene/disease association question with a longer
# length budget (max_length=500).
prompt = "is there any association between gene CARD9 and familial chronic mucocutaneous?"
pipe = pipeline(
    task='text-generation',
    model=model,
    tokenizer=tokenizer,
    max_length=500,
)
# Wrap the question in the [INST] ... [/INST] instruction markers.
result = pipe('<s>[INST] ' + prompt + ' [/INST]')
print(result[0]['generated_text'])

<s>[INST] is there any association between gene CARD9 and familial chronic mucocutaneous? [/INST] The gene CARD9 is associated with familial chronic mucocutaneous with or without lymphadenopathy based on a MENDELIAN relationship. 
  * {1:<NAME> et al. (2015)} described a new phenotype, familial chronic mucocutaneous with or without lymphadenopathy, based on a MENDELIAN relationship. 
  * {1:<NAME> et al. (2015)} described a new phenotype, familial chronic mucocutaneous with or without lymphadenopathy, based on a MENDELIAN relationship. 
  * {1:<NAME> et al. (2015)} described a new phenotype, familial chronic mucocutaneous with or without lymphadenopathy, based on a MENDELIAN relationship. 
  * {1:<NAME> et al. (2015)} described a new phenotype, familial chronic mucocutaneous with or without lymphadenopathy, based on a MENDELIAN relationship. 
  * {1:<NAME> et al. (2015)} described a new phenotype, familial chronic mucocutaneous with or without lymphadenopathy, based on a MENDELIAN rela