In [1]:
%%capture
!pip install unsloth
# Also get the latest nightly Unsloth!
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

In [2]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048
dtype = None
load_in_4bit = True

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
==((====))==  Unsloth 2024.9.post4: Fast Llama patching. Transformers = 4.44.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.1+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/340 [00:00<?, ?B/s]

In [3]:
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaExtendedRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,),

In [4]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
    use_rslora = False,
    loftq_config = None,
)

Unsloth 2024.9.post4 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [5]:
from unsloth.chat_templates import get_chat_template

tokenizer = get_chat_template(
    tokenizer,
    chat_template = "llama-3.1",
)

def formatting_prompts_func(examples):
    convos = examples["conversations"]
    texts = [tokenizer.apply_chat_template(convo, tokenize = False, add_generation_prompt = False) for convo in convos]
    return { "text" : texts, }
pass

In [6]:
from datasets import load_dataset
dataset = load_dataset("qiaojin/PubMedQA", "pqa_artificial", split="train[:1000]") # openlifescienceai/medmcqa

README.md:   0%|          | 0.00/5.19k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/233M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/211269 [00:00<?, ? examples/s]

In [7]:
dataset.num_rows

1000

In [8]:
import pprint as pp
pp.pprint(dataset[0]["context"])
print(dataset[0]["question"])

{'contexts': ['Chronic rhinosinusitis (CRS) is a heterogeneous disease with an '
              'uncertain pathogenesis. Group 2 innate lymphoid cells (ILC2s) '
              'represent a recently discovered cell population which has been '
              'implicated in driving Th2 inflammation in CRS; however, their '
              'relationship with clinical disease characteristics has yet to '
              'be investigated.',
              'The aim of this study was to identify ILC2s in sinus mucosa in '
              'patients with CRS and controls and compare ILC2s across '
              'characteristics of disease.',
              'A cross-sectional study of patients with CRS undergoing '
              'endoscopic sinus surgery was conducted. Sinus mucosal biopsies '
              'were obtained during surgery and control tissue from patients '
              'undergoing pituitary tumour resection through transphenoidal '
              'approach. ILC2s were identified as CD45(+) Li

In [9]:
class context:
  def __init__(self, context, labels, meshes=None):
    self.context = context
    self.labels = labels
    self.meshes = meshes
    self.parsed_context = None

  def parse(self):
    parsed_context = str()
    for i in range(len(self.labels)):
      label = self.labels[i]
      if label:
        parsed_context += label + " is " + self.context[i] +  "\n"
      else:
        parsed_context += self.context[i] + "\n"
      self.parsed_context = parsed_context
    return parsed_context

  def parse_with_question(self, context_to_parse, question):
    new_question = context_to_parse + "\n" + " And here is my question for you " + question
    self.parsed_with_question = new_question
    return new_question

  def get_context(self):
    return self.parsed_context

  def __str__(self):
    return self.parsed_context
  def __repr__(self):
    return self.parsed_context()


In [10]:
def transform_ds(example):
  relevant_context = context(example['context']["contexts"], example['context']["labels"])
  context_str = relevant_context.parse()
  question = relevant_context.parse_with_question(context_str, example["question"])
  del relevant_context
  return {
      "conversations": [
          {
              "role": "human",
              "content": question
          },
          {
              "role": "assistant",
              "content": example["long_answer"]
          }
      ]
  }

In [11]:
dataset = dataset.map(transform_ds)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [12]:
dataset[0]["conversations"]

[{'content': 'BACKGROUND is Chronic rhinosinusitis (CRS) is a heterogeneous disease with an uncertain pathogenesis. Group 2 innate lymphoid cells (ILC2s) represent a recently discovered cell population which has been implicated in driving Th2 inflammation in CRS; however, their relationship with clinical disease characteristics has yet to be investigated.\nOBJECTIVE is The aim of this study was to identify ILC2s in sinus mucosa in patients with CRS and controls and compare ILC2s across characteristics of disease.\nMETHODS is A cross-sectional study of patients with CRS undergoing endoscopic sinus surgery was conducted. Sinus mucosal biopsies were obtained during surgery and control tissue from patients undergoing pituitary tumour resection through transphenoidal approach. ILC2s were identified as CD45(+) Lin(-) CD127(+) CD4(-) CD8(-) CRTH2(CD294)(+) CD161(+) cells in single cell suspensions through flow cytometry. ILC2 frequencies, measured as a percentage of CD45(+) cells, were compar

In [13]:
from unsloth.chat_templates import standardize_sharegpt
dataset = standardize_sharegpt(dataset)

Standardizing format:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [14]:
dataset = dataset.map(formatting_prompts_func, batched = True,)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [15]:
print(dataset[0]['text'])

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 July 2024

<|eot_id|><|start_header_id|>user<|end_header_id|>

BACKGROUND is Chronic rhinosinusitis (CRS) is a heterogeneous disease with an uncertain pathogenesis. Group 2 innate lymphoid cells (ILC2s) represent a recently discovered cell population which has been implicated in driving Th2 inflammation in CRS; however, their relationship with clinical disease characteristics has yet to be investigated.
OBJECTIVE is The aim of this study was to identify ILC2s in sinus mucosa in patients with CRS and controls and compare ILC2s across characteristics of disease.
METHODS is A cross-sectional study of patients with CRS undergoing endoscopic sinus surgery was conducted. Sinus mucosal biopsies were obtained during surgery and control tissue from patients undergoing pituitary tumour resection through transphenoidal approach. ILC2s were identified as CD45(+) Lin(-) CD127(+) CD4(-) 

In [16]:
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    data_collator = DataCollatorForSeq2Seq(tokenizer = tokenizer),
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        num_train_epochs = 1,
        max_steps = int(dataset.num_rows / 8),
        learning_rate = 2e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
    ),
)

Map (num_proc=2):   0%|          | 0/1000 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


In [17]:
from unsloth.chat_templates import train_on_responses_only
trainer = train_on_responses_only(
    trainer,
    instruction_part = "<|start_header_id|>user<|end_header_id|>\n\n",
    response_part = "<|start_header_id|>assistant<|end_header_id|>\n\n",
)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [18]:
tokenizer.decode(trainer.train_dataset[5]["input_ids"])

'<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 July 2024\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nBACKGROUND is Hidradenitis suppurativa (HS) is a chronic inflammatory disease involving intertriginous skin. Previous epidemiologic studies have been limited by small sample size.\nOBJECTIVE is We sought to describe the prevalence and comorbidities of HS in a large patient care database.\nMETHODS is In this retrospective case-control study, we chart-validated all patients within a hospital database who received at least 1 billing code for HS between 1980 and 2013. Verified cases were matched with controls based on age, gender, and race. Prevalences of a priori selected comorbidities were compared between HS and control groups.\nRESULTS is A total of 2292 patients at Massachusetts General Hospital received at least 1 code for HS. A total of 1776 cases had a validated diagnosis of HS, yielding a prevalence

In [19]:
space = tokenizer(" ", add_special_tokens = False).input_ids[0]
tokenizer.decode([space if x == -100 else x for x in trainer.train_dataset[5]["labels"]])

'                                                                                                                                                                                                                                                                                                                                \n\nControl subjects were not validated for absence of HS and comorbidity validation was not performed for either group.<|eot_id|>'

In [20]:
trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 1,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 125
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss
1,2.332
2,2.2583
3,2.3168
4,2.1316
5,1.9041
6,2.0372
7,1.2912
8,1.5298
9,1.6217
10,1.4263


TrainOutput(global_step=125, training_loss=1.5669615154266356, metrics={'train_runtime': 1405.8644, 'train_samples_per_second': 0.711, 'train_steps_per_second': 0.089, 'total_flos': 2.2780159813730304e+16, 'train_loss': 1.5669615154266356, 'epoch': 1.0})

In [21]:
from unsloth.chat_templates import get_chat_template

tokenizer = get_chat_template(
    tokenizer,
    chat_template = "llama-3.1",
)
FastLanguageModel.for_inference(model)

messages = [
    {"role": "user", "content": "What is fibromyalgia?"},
]
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True,
    return_tensors = "pt",
).to("cuda")

outputs = model.generate(input_ids = inputs, max_new_tokens = 64, use_cache = True,
                         temperature = 1.5, min_p = 0.1)
tokenizer.batch_decode(outputs)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


['<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 July 2024\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhat is fibromyalgia?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nFibromyalgia syndrome (FMS) is a widespread soft tissue disorder, characterized by widespread muscle pain, fatigue, and tender points. A tender point is an area on the body where pain is elicited by light digital pressure, especially if performed with the index and middle fingers, but only on one side,']

In [22]:
FastLanguageModel.for_inference(model)

messages = [
    {"role": "user", "content": "What is fibromyalgia?"},
]
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True,
    return_tensors = "pt",
).to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer, skip_prompt = True)
_ = model.generate(input_ids = inputs, streamer = text_streamer, max_new_tokens = 128,
                   use_cache = True, temperature = 1.5, min_p = 0.1)

Fibromyalgia is a common, disabling disorder of unclear etiology. It is characterized by chronic musculoskeletal pain, fatigue, sleep disturbance, and cognitive complaints. Diagnosis is clinical, supported by tender point count (4-13 points) and validated pain questionnaires. It has been reported in up to 8% of women in their fifth and sixth decade of life and 3% of men in the United States. The condition affects women 10 times more frequently than men and its prevalence is increasing as the population ages. The etiology of fibromyalgia is not fully understood, but recent data point towards an inter


In [27]:
import os

In [29]:
model.push_to_hub("mohamedalcafory/PubMed_Llama3.1_Based_model", token = os.environ["HUGGINGFACE_HUB_TOKEN"])

No files have been modified since last commit. Skipping to prevent empty commit.


Saved model to https://huggingface.co/mohamedalcafory/PubMed_Llama3.1_Based_model


In [28]:
tokenizer.push_to_hub("PubMed_Llama3.1_Based_model", token = os.environ["HUGGINGFACE_HUB_TOKEN"])

No files have been modified since last commit. Skipping to prevent empty commit.


In [None]:
model.save_pretrained("/content/drive/MyDrive/GenAI/Pubmed_model") # Local saving
tokenizer.save_pretrained("/content/drive/MyDrive/GenAI/Pubmed_model")

('/content/drive/MyDrive/GenAI/med_model/tokenizer_config.json',
 '/content/drive/MyDrive/GenAI/med_model/special_tokens_map.json',
 '/content/drive/MyDrive/GenAI/med_model/tokenizer.json')