In [None]:
!pip install transformers
!pip install accelerate
!pip install datasets
!pip install bitsandbytes
!pip install peft
!pip install trl

Collecting accelerate
  Downloading accelerate-0.26.1-py3-none-any.whl (270 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m270.9/270.9 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.26.1
Collecting datasets
  Downloading datasets-2.16.1-py3-none-any.whl (507 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m16.0 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m19.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: dill, multiprocess, datasets
Successfull

In [None]:
from huggingface_hub import notebook_login

# Interactive Hugging Face Hub login (renders a login widget in the notebook).
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

### Load Dataset

In [None]:
from datasets import load_dataset

# Fixed seed so the train/test split (and everything trained on it) is reproducible.
SPLIT_SEED = 42
test_size = 0.2

dataset = load_dataset("TokenBender/code_instructions_122k_alpaca_style", split="train")

# Hold out `test_size` of the rows for evaluation; the result is indexed by
# "train" and "test" in the next cell.
dataset = dataset.train_test_split(test_size=test_size, seed=SPLIT_SEED)
dataset

In [None]:
# Pull both splits out of the split result in one step.
train_dataset, test_dataset = dataset["train"], dataset["test"]

In [None]:
# Preview the first five training rows as a pandas DataFrame.
train_dataset.to_pandas().head(5)

Unnamed: 0,instruction,input,output,id,source,title,clean_text,raw_text,url,overview
0,Answer this question truthfully,What are the recommended first-line treatments...,The recommended first-line treatments for prim...,,,,,,,
1,Answer this question truthfully,What complement deficiency increases the risk ...,"Early complement deficiencies, specifically de...",,,,,,,
2,Answer this question truthfully,"In asthma pathogenesis, what is the role of TH...",TH2 cells play a role in asthma pathogenesis b...,,,,,,,
3,,,,4118910a1b7d55d54d67d03e214040c74e3bb0fe,pubmed,Providing care for older adults in the Emergen...,Providing care for older adults in the Emergen...,,https://link.springer.com/content/pdf/10.1007/...,
4,Answer this question truthfully,Which form of Raynaud's phenomenon is characte...,The form of Raynaud's phenomenon that is chara...,,,,,,,


### Load Tokenizer

In [None]:
# Hub id of the base checkpoint; used to load both the tokenizer and the model.
model_id = "mistralai/Mistral-7B-Instruct-v0.1"


In [None]:
from transformers import AutoTokenizer

# add_eos_token=True asks the tokenizer to append EOS to the text it encodes.
# NOTE(review): this same tokenizer is later passed to get_answer for generation, and
# the recorded output shows "</s>" directly after "[/INST]" — confirm that is intended.
tokenizer = AutoTokenizer.from_pretrained(model_id, add_eos_token=True)
# Use EOS as the padding token and pad on the right.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

### Load Model

In [None]:
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# 4-bit NF4 quantization with double quantization, computing in bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    # Was misspelled `bnb__4bit_compute_dtype` (double underscore), so the
    # requested compute dtype never reached the config.
    bnb_4bit_compute_dtype=torch.bfloat16,
)

In [None]:
# Load the base checkpoint with the quantization config defined above.
# device_map={"": 0} presumably places the whole model on device 0 — confirm.
model = AutoModelForCausalLM.from_pretrained(model_id,
                                             quantization_config=bnb_config,
                                             device_map={"": 0})

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
def get_answer(query: str, model, tokenizer) -> str:
    """Generate a sampled completion for ``query`` with ``model``.

    The query is inserted into the ``[INST] ... [/INST]`` template below, encoded
    with ``tokenizer``, moved to the model's device and passed to ``model.generate``.

    :param query: str: Instruction/question inserted into the prompt template
    :param model: Model exposing ``generate(**inputs, ...)``
    :param tokenizer: Tokenizer used to encode the prompt and decode the output
    :return: str: First decoded sequence (the prompt followed by the completion;
        special tokens are not stripped)
    """
    # Follow the model's own device instead of hard-coding "cuda"; objects that
    # do not expose `.device` keep the previous "cuda" behaviour.
    device = getattr(model, "device", "cuda")

    prompt_template = """
    [INST]
    Below is an instruction that describes a task. Write a response that appropriately completes the request.
    {query}
    [/INST]
    """

    prompt = prompt_template.format(query=query)

    # NOTE(review): if the tokenizer was built with add_eos_token=True, EOS is
    # appended to the prompt before generation — confirm this is intended.
    encodeds = tokenizer(prompt,
                         return_tensors="pt",
                         add_special_tokens=True)

    model_inputs = encodeds.to(device)

    # Sampling is enabled, so repeated calls can return different text.
    generated_ids = model.generate(**model_inputs,
                                   max_new_tokens=1000,
                                   do_sample=True,
                                   pad_token_id=tokenizer.eos_token_id)

    decoded = tokenizer.batch_decode(generated_ids)
    return decoded[0]

In [None]:
# Sanity-check the model on one medical question before any fine-tuning is applied.
query = "EPIDEMIOLOGY & DEMOGRAPHICS of abdominal Abdominal Aortic Aneurysm ?"

result = get_answer(query, model, tokenizer)
print(result)

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


<s> 
    [INST]
    Below is an instruction that describes a task. Write a response that appropriately completes the request.
    EPIDEMIOLOGY & DEMOGRAPHICS of abdominal Abdominal Aortic Aneurysm ?
    [/INST]
    </s>
    Thank you for your request. I have found some information regarding the epidemiology and demographics of abdominal aortic aneurysms.

    An abdominal aortic aneurysm (AAA) is a bulge in the wall of the aorta, which is the main artery that supplies blood to the rest of the body from the heart. The risk of developing an AAA increases with age, particularly after the age of 65, and is more likely to occur in men than women. It also tends to run in families, so people with a family history of AAA should be especially vigilant about having their aorta screened.

    It is estimated that about 5-10% of all people with an AAA have a family history of this condition. Other factors that can increase the risk of an AAA include smoking, high blood pressure, high-fat diet, low

### Generate Prompt Format

In [None]:
def format_prompt(data_point):
    """Build the training text for one sample.

    The text is an ``[INST] ... [/INST]`` block (fixed prefix, the sample's
    instruction and, when present, its input) followed directly by the output.

    :param data_point: dict: Sample with "instruction", "input" and "output" keys
    :return: str: Formatted (untokenized) prompt text
    """

    prefix_text = ('Below is an instruction that describes a task. Write a response that '
                   'appropriately completes the request.\n\n')

    # NOTE(review): a field whose value is None is rendered as the literal text
    # "None" by the f-strings below — consider filtering such rows out upstream.

    # Samples with additional context info.
    if data_point["input"]:
        text = f"""[INST]{prefix_text} Context: {data_point["instruction"]} Question: {data_point["input"]} [/INST]{data_point["output"]}"""
    # Samples without additional context info.
    else:
        text = f"""[INST]{prefix_text} Context: {data_point["instruction"]} [/INST]{data_point["output"]}"""

    return text


In [None]:
def generate_prompt(dataset, tokenizer):
    """Attach a formatted "prompt" column to ``dataset`` and tokenize it.

    :param dataset: Dataset whose rows are accepted by ``format_prompt``
    :param tokenizer: Callable applied to batches of prompt strings
    :return: Shuffled dataset with the "prompt" column and the tokenizer outputs added
    """
    # One formatted prompt string per row, in the dataset's current order.
    prompts = list(map(format_prompt, dataset))
    with_prompts = dataset.add_column("prompt", prompts)

    # Fixed seed keeps the shuffled order the same from run to run.
    shuffled = with_prompts.shuffle(seed=2)

    def _tokenize(samples):
        return tokenizer(samples["prompt"])

    return shuffled.map(_tokenize, batched=True)

In [None]:
# Add the "prompt" column and the tokenizer outputs to both splits.
train_dataset = generate_prompt(train_dataset, tokenizer)
test_dataset = generate_prompt(test_dataset, tokenizer)

Flattening the indices:   0%|          | 0/78478 [00:00<?, ? examples/s]

Map:   0%|          | 0/78478 [00:00<?, ? examples/s]

Flattening the indices:   0%|          | 0/19620 [00:00<?, ? examples/s]

Map:   0%|          | 0/19620 [00:00<?, ? examples/s]

In [None]:
# Show the columns and row count of the processed test split.
print(test_dataset)

Dataset({
    features: ['instruction', 'input', 'output', 'id', 'source', 'title', 'clean_text', 'raw_text', 'url', 'overview', 'prompt', 'input_ids', 'attention_mask'],
    num_rows: 19620
})


In [None]:
# Show the formatted prompt of the first test row.
print(test_dataset[0]["prompt"])

[INST]
    Answer this medical guidelines in english.
    If no answer: "i do not know."
    

     Context: None [/INST]
None


### Apply LoRA

In [None]:
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model

# Turn the generation cache off for training (see the inline reminder to turn it back on).
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
# Enable gradient checkpointing, then run PEFT's k-bit training preparation on the model.
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)


In [None]:
# Print the module tree; the projection layers show up as Linear4bit modules.
print(model)

MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): MistralRotaryEmbedding()
        )
        (mlp): MistralMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): MistralRMSNorm()
        (post_attention_layernorm): MistralRMSNorm()
      )

In [None]:
import bitsandbytes as bnb

def find_all_linear_names(model, linear_cls=None):
    """Collect the leaf names of the linear layers that LoRA should target.

    :param model: Model exposing ``named_modules()`` yielding ``(name, module)`` pairs
    :param linear_cls: Layer class (or tuple of classes) to match; defaults to
        ``bnb.nn.Linear4bit``
    :return: list: Sorted, de-duplicated last path components of the matching
        module names (e.g. "q_proj"), with "lm_head" excluded
    """
    if linear_cls is None:
        linear_cls = bnb.nn.Linear4bit

    lora_module_names = {
        # "model.layers.0.self_attn.q_proj" -> "q_proj"; a dot-free name is kept as is.
        name.rsplit(".", 1)[-1]
        for name, module in model.named_modules()
        if isinstance(module, linear_cls)
    }

    # Drop the output head once, after the scan (original note: needed for 16bit).
    lora_module_names.discard("lm_head")

    # Sorted so the target-module list is the same on every run.
    return sorted(lora_module_names)


In [None]:
# Names of the 4-bit linear layers; used as the LoRA target modules.
modules = find_all_linear_names(model)
print(modules)

['gate_proj', 'up_proj', 'v_proj', 'down_proj', 'q_proj', 'o_proj', 'k_proj']


In [None]:
from peft import LoraConfig, get_peft_model

# LoRA settings: rank 8, alpha 32, 5% dropout, applied to every module name in `modules`.
lora_config = LoraConfig(r=8,
                         lora_alpha=32,
                         target_modules=modules,
                         lora_dropout=0.05,
                         bias="none",
                         task_type="CAUSAL_LM")

# Wrap the model with the LoRA adapters described by lora_config.
model = get_peft_model(model, lora_config)

In [None]:
# Report how many parameters are trainable relative to the total parameter count.
trainable, total = model.get_nb_trainable_parameters()
print(f"Trainable: {trainable} | total: {total} | Percentage: {trainable/total*100:.4f}%")

Trainable: 20971520 | total: 7262703616 | Percentage: 0.2888%


### Define Training Arguments

In [None]:
from transformers import TrainingArguments

# Directory passed to the trainer as its output location.
output_dir = "FineTune_Mistral-7B-Instruct-v0.1"

# Per device: 20 samples per step x 10 accumulation steps per optimizer update.
training_arguments = TrainingArguments(output_dir=output_dir,
                                       warmup_steps=1,
                                       per_device_train_batch_size=20,
                                       per_device_eval_batch_size=20,
                                       gradient_accumulation_steps=10,
                                       gradient_checkpointing=True,
                                       num_train_epochs=1,            # 10 epochs take too long (71 hours)
                                       learning_rate=2.5e-4,          # Learning rate for fine-tuning
                                       lr_scheduler_type="cosine",
                                       bf16=True,
                                       optim="paged_adamw_8bit",
                                       logging_steps=100,             # Report the training loss every 100 steps
                                       logging_dir="./logs",          # Directory for storing logs
                                       save_strategy="epoch",         # Save a checkpoint at the end of each epoch
                                       save_steps=100,                # NOTE(review): presumably unused while save_strategy="epoch" — confirm
                                       eval_steps=100,                # Evaluate every 100 steps
                                       evaluation_strategy="steps",   # Evaluate on a step schedule (see eval_steps)
                                       # do_eval=True,                # Perform evaluation at the end of training
                                       # max_steps=100,
                                       # push_to_hub=True
                                       )


### Define Data Collator

In [None]:
from transformers import DataCollatorForLanguageModeling

# Collator for causal language modeling (mlm=False).
# NOTE(review): the tokenizer's pad token is its EOS token; if this collator masks
# pad positions out of the labels, EOS is masked too — confirm the model still learns to stop.
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

### Define Trainer

In [None]:
# trainer = Trainer(model=model,
#                   train_dataset=train_dataset,
#                   eval_dataset=test_dataset,
#                   args=training_arguments,
#                   data_collator=data_collator)

In [None]:
from trl import SFTTrainer

# Supervised fine-tuning on the "prompt" text column, evaluating on the test split.
# NOTE(review): `model` was already wrapped by get_peft_model and lora_config is
# passed again here — confirm the adapters are not applied twice.
trainer = SFTTrainer(model=model,
                     train_dataset=train_dataset,
                     eval_dataset=test_dataset,
                     dataset_text_field="prompt",
                     peft_config=lora_config,
                     args=training_arguments,
                     data_collator=data_collator)

In [None]:
# Run fine-tuning (the recorded run reports a train_runtime of about 31,461 seconds).
trainer.train()

Step,Training Loss,Validation Loss
100,0.8647,0.794749
200,0.7845,0.753923
300,0.745,0.729647


Step,Training Loss,Validation Loss
100,0.8647,0.794749
200,0.7845,0.753923
300,0.745,0.729647


TrainOutput(global_step=392, training_loss=0.7837686538696289, metrics={'train_runtime': 31460.9627, 'train_samples_per_second': 2.494, 'train_steps_per_second': 0.012, 'total_flos': 2.1113620444412314e+18, 'train_loss': 0.7837686538696289, 'epoch': 1.0})

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so files can be written there.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Google Drive folder that receives the fine-tuned weights.
new_model = "/content/drive/MyDrive/Mistral-7B-Instruct-v0.1-Medical-Finetune"

In [None]:
# Persist the trained model to `new_model` (for a PEFT-wrapped model this is
# presumably just the adapter weights — confirm).
trainer.model.save_pretrained(new_model)

In [None]:
from huggingface_hub import notebook_login

# Second interactive Hub login (same call as the login cell near the top of the notebook).
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
import torch
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM
from peft import PeftModel

model_id = "mistralai/Mistral-7B-Instruct-v0.1"
new_model = "/content/drive/MyDrive/Mistral-7B-Instruct-v0.1-Medical-Finetune"

# Configure padding BEFORE saving so the tokenizer written to disk matches the
# one used (and pushed) afterwards.
tokenizer = AutoTokenizer.from_pretrained(model_id, add_eos_token=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Reload the base checkpoint in float16, without the 4-bit quantization config.
base_model = AutoModelForCausalLM.from_pretrained(model_id,
                                                  low_cpu_mem_usage=True,
                                                  return_dict=True,
                                                  torch_dtype=torch.float16,
                                                  device_map={"": 0})

# Attach the saved adapter from `new_model`, then merge it into the base weights.
merged_model = PeftModel.from_pretrained(base_model, new_model).merge_and_unload()

# Save the merged model. The keyword was previously misspelled
# `safe_serialiaztion`, so the intended option was never passed.
merged_model.save_pretrained("merged_model", safe_serialization=True)
tokenizer.save_pretrained("merged_model")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Inspect the merged model; the projection layers are listed as plain Linear modules.
merged_model

MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): MistralRotaryEmbedding()
        )
        (mlp): MistralMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): MistralRMSNorm()
        (post_attention_layernorm): MistralRMSNorm()
      )
    )
    (norm): MistralRM

In [None]:
# Push the model and tokenizer to the Hugging Face Model Hub

# NOTE: `new_model` is reused here as the Hub repository name; in earlier cells
# it held the Google Drive path of the adapter.
new_model = "Mistral-7B-Instruct-v0.1-Medical-Finetune"

merged_model.push_to_hub(new_model)#, use_temp_dir=False)
tokenizer.push_to_hub(new_model)#, use_temp_dir=False)

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

CommitInfo(commit_url='https://huggingface.co/TachyHealthResearch/Mistral-7B-Instruct-v0.1-Medical-Finetune/commit/6513b1f354f2b1836135529fb20765a3058f733c', commit_message='Upload tokenizer', commit_description='', oid='6513b1f354f2b1836135529fb20765a3058f733c', pr_url=None, pr_revision=None, pr_num=None)

### Evaluation

In [None]:
def get_answer(query: str, model, tokenizer) -> str:
  """Generate a sampled completion for ``query`` with ``model``.

  The query is inserted into the ``[INST] ... [/INST]`` template below, encoded
  with ``tokenizer``, moved to the model's device and passed to ``model.generate``.

  :param query: str: Instruction/question inserted into the prompt template
  :param model: Model exposing ``generate(**inputs, ...)``
  :param tokenizer: Tokenizer used to encode the prompt and decode the output
  :return: str: First decoded sequence (the prompt followed by the completion;
      special tokens are not stripped)
  """
  # Follow the model's own device instead of hard-coding "cuda"; objects that
  # do not expose `.device` keep the previous "cuda" behaviour.
  device = getattr(model, "device", "cuda")

  prompt_template = """
  [INST]
  Below is an instruction that describes a task. Write a response that appropriately completes the request.
  {query}
  [/INST]
  """
  prompt = prompt_template.format(query=query)

  # NOTE(review): if the tokenizer was built with add_eos_token=True, EOS is
  # appended to the prompt before generation — confirm this is intended.
  encodeds = tokenizer(prompt, return_tensors="pt", add_special_tokens=True)

  model_inputs = encodeds.to(device)

  # Use the `model` argument; the previous version ignored it and always
  # called the global `merged_model`.
  generated_ids = model.generate(**model_inputs,
                                 max_new_tokens=1000,
                                 do_sample=True,
                                 pad_token_id=tokenizer.eos_token_id)

  decoded = tokenizer.batch_decode(generated_ids)
  return decoded[0]



In [None]:
# Ask the merged model the same question that was put to the model before fine-tuning.
query="EPIDEMIOLOGY & DEMOGRAPHICS of abdominal Abdominal Aortic Aneurysm ?"

result = get_answer(query=query,
                    model=merged_model,
                    tokenizer=tokenizer)
print(result)

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


<s> 
  [INST]
  Below is an instruction that describes a task. Write a response that appropriately completes the request.
  EPIDEMIOLOGY & DEMOGRAPHICS of abdominal Abdominal Aortic Aneurysm ?
  [/INST]
  </s>
The following table summarizes data related to epidemiology among patients with aneurysms and ruptures and demographic, risk and causes factors. 
Patient Gender Total number of patients 11,588 160 Patients of all ages Total number of deaths by AAD in all ages Total number of deaths due to AAD in patients of all ages Total number of deaths in patients older than or equal to 60 years Total number of deaths in all age patients
Female 5,563 141 1.3 2,109 150 0.9 
Male 6,025 159 1.1 5,379 235 0.9 
There is limited data on the epidemiology and demographics of women with Aneurysms. A review of literature published up to 2007 reported 19 cases of abdominal aortic aneurysm in women . Another review published in 2014 reported that the majority of the AAA cases were female .
A review of the