In [None]:
!pip install "unsloth[colab] @ git+https://github.com/unslothai/unsloth.git"
!pip install "git+https://github.com/huggingface/transformers.git"
!pip install huggingface-hub xformers
!git clone https://github.com/ggerganov/llama.cpp
!cd llama.cpp && make clean && LLAMA_CUBLAS=1 make -j
!pip install gguf protobuf
# !CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python --verbose

**Setting up ENV**

In [None]:
from unsloth import FastLanguageModel
import torch

# Maximum sequence length; the same value is reused by the LoRA and trainer cells.
max_seq_length = 2048
# None lets Unsloth choose the compute dtype for the current GPU. The TrainingArguments
# cell turns on bf16 whenever the GPU supports it, so hard-coding torch.float16 here would
# disagree with the trainer on bf16-capable hardware.
dtype = None
# Load the pre-quantized 4-bit checkpoint in 4-bit.
load_in_4bit = True

# Returns the base model together with its matching tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/mistral-7b-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

**LoRA Config**

In [None]:
# Projection layers that receive LoRA adapters.
lora_target_modules = [
    'v_proj',
    'down_proj',
    'up_proj',
    'o_proj',
    'q_proj',
    'gate_proj',
    'k_proj',
]

# Wrap the loaded base model with LoRA adapters.
model = FastLanguageModel.get_peft_model(
    model,
    target_modules = lora_target_modules,
    r = 32,            # LoRA rank; the author suggests 8-32 for smaller models such as 7B
    lora_alpha = 16,
    lora_dropout = 0,  # any value is supported, but 0 is the optimized path
    bias = "none",     # any value is supported, but "none" is the optimized path
    use_gradient_checkpointing = True,
    max_seq_length = max_seq_length,
    random_state = 3407,
    # Enable the next line when introducing new chat formats or embedding tokens:
    # modules_to_save = ["lm_head", "embed_tokens"]
)

Unsloth 2024.1 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


**Data-Prep**

In [None]:
# Training prompt template. formatting_prompts_func fills the first {} with the symptoms
# text and the second {} with the disease name joined to its treatments.
base_prompt = """
User: I am experincing {}
### Assistant :
These symptoms implies you may have {}"""

def remove_duplicate_lines(docstring):
    """Return `docstring` with repeated lines removed.

    The text is split on newlines; only the first occurrence of each distinct
    line is kept, in its original position, and the survivors are re-joined
    with newlines.
    """
    seen = set()
    kept = []
    for line in docstring.split('\n'):
        if line in seen:
            continue
        seen.add(line)
        kept.append(line)
    return '\n'.join(kept)

def formatting_prompts_func(damn):
    """Build the training text for every row of a batch.

    Args:
        damn: a batch mapping with the columns "Name", "Symptoms" and
            "Treatments", each an equally indexed sequence.

    Returns:
        A dict with the single key "deta" holding one formatted prompt per row:
        `base_prompt` filled with the symptoms and with the disease name joined
        to its treatments, de-duplicated line by line, and with the scraping
        artifact "Open pop-up dialog box" removed.

    NOTE(review): no end-of-sequence token is appended to the texts here —
    confirm whether that is intended.
    """
    separator = ". Some of the common treatments include "
    texts = []
    rows = zip(damn["Name"], damn["Symptoms"], damn["Treatments"])
    for name, symptoms, treatments in rows:
        diagnosis = f"{name}{separator}{treatments}"
        prompt = remove_duplicate_lines(base_prompt.format(symptoms, diagnosis))
        texts.append(prompt.replace("Open pop-up dialog box", ""))
    return {"deta": texts}

from datasets import load_dataset

# Fetch the train split and add the "deta" prompt column, formatting rows batch by batch.
dataset = load_dataset("QuyenAnhDE/Diseases_Symptoms", split="train").map(
    formatting_prompts_func,
    batched=True,
)

In [None]:
# Show the formatted prompt of the first example as a sanity check.
dataset[0]["deta"]

**Training-Params**

In [None]:
from transformers import TrainingArguments

# Hyper-parameters a reader is most likely to tune.
epoch = 1
batch_size = 2
learning_rate = 69e-5

# Query the GPU once: train in bf16 when it is supported, otherwise fall back to fp16.
bf16_supported = torch.cuda.is_bf16_supported()

args = TrainingArguments(
    output_dir = "outputs",
    seed = 3407,
    # Batch of `batch_size` per device, accumulated over 4 steps before each optimizer update.
    per_device_train_batch_size = batch_size,
    gradient_accumulation_steps = 4,
    # Training length is given in epochs; setting max_steps instead would override it.
    num_train_epochs = epoch,
    # Optimizer and learning-rate schedule.
    optim = "adamw_8bit",
    learning_rate = learning_rate,
    weight_decay = 0.01,
    lr_scheduler_type = "linear",
    warmup_steps = 5,
    # Mixed precision: exactly one of the two flags is enabled.
    bf16 = bf16_supported,
    fp16 = not bf16_supported,
    # Log the loss at every step.
    logging_steps = 1,
)

***Training the Model Using SFTTrainer***

In [None]:
from trl import SFTTrainer

# Supervised fine-tuning on the "deta" text column built in the data-prep cell.
trainer = SFTTrainer(
    model = model,
    # Hand over the tokenizer returned by FastLanguageModel.from_pretrained. When it is
    # omitted the trainer fetches a separate tokenizer from the hub (the tokenizer file
    # downloads in this cell's recorded output), so training and inference would not be
    # guaranteed to share the same tokenizer object and settings.
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "deta",
    max_seq_length = max_seq_length,
    args = args,
)

tokenizer_config.json:   0%|          | 0.00/971 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/438 [00:00<?, ?B/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

**Memory Stats For Training**


---
*Unsloth*


In [None]:
# Record the GPU model, its total memory and the peak memory reserved before training;
# the post-training stats cell subtracts this baseline.
bytes_per_gb = 1024 ** 3
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / bytes_per_gb, 3)
max_memory = round(gpu_stats.total_memory / bytes_per_gb, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = Tesla T4. Max memory = 14.748 GB.
5.074 GB of memory reserved.


**Training and Stats**

In [None]:
# Run fine-tuning; the returned object's `metrics` dict is read by the stats cell that follows.
trainer_stats = trainer.train()

Unsloth: `use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`


Step,Training Loss
1,2.6368
2,2.9058
3,2.4127
4,1.8608
5,1.1721
6,0.8082
7,0.7625
8,0.5957
9,0.5781
10,0.6073


**Overall Time-Memory Stats**

In [None]:
# Peak reserved GPU memory after training, in GB, and the part added since the baseline.
used_memory = round(torch.cuda.max_memory_reserved() / 1024 ** 3, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)

# The same two figures as a share of the GPU's total memory.
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)

# Wall-clock training time as reported by the trainer.
train_seconds = trainer_stats.metrics['train_runtime']
train_minutes = round(train_seconds / 60, 2)

print(f"{train_seconds} seconds used for training.")
print(f"{train_minutes} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

310.2704 seconds used for training.
5.17 minutes used for training.
Peak reserved memory = 5.715 GB.
Peak reserved memory for training = 0.641 GB.
Peak reserved memory % of max memory = 38.751 %.
Peak reserved memory for training % of max memory = 4.346 %.


### **Inference**
Let's see whether our fine-tuning has affected the model's responses.

In [None]:
# Patient complaint to test with.
query = "Dizziness and Nauseous after smoking a ciggarate"
# Must stay empty: it fills the second placeholder so the model writes the diagnosis itself.
response = ''

# Inference prompt: a system instruction, a short previous exchange, then the current
# turn, which ends with the same wording as the training template.
base_prompt_2 = """[INST]###System:You are a highly Qualified Doctor.Who responds and tries to resolve patient's all problem;problem can be physical,mental,social or of any type;
you must help the patient at any cost.You always respond as doctor.
###Previous Conversation:
Patient: Hi
Doctor: Hello, CutiePie.
###Current Conversation:
Patient: I am experincing {}
Doctor: These symptoms implies you may have {} """

# Tokenize a batch containing the single filled-in prompt and move it to the GPU.
prompt = base_prompt_2.format(query, response)
inputs = tokenizer([prompt], return_tensors = "pt").to("cuda")

# Generate up to 128 new tokens and display the decoded batch.
outputs = model.generate(**inputs, max_new_tokens = 128, use_cache = True)
tokenizer.batch_decode(outputs)

["<s> [INST]###System:You are a highly Qualified Doctor.Who responds and tries to resolve patient's all problem;problem can be physical,mental,social or of any type;\nyou must help the patient at any cost.You always respond as doctor.\n###Previous Conversation:\nPatient: Hi\nDoctor: Hello, CutiePie.\n###Current Conversation:\nPatient: I am experincing Dizziness and Nauseous after smoking a ciggarate\nDoctor: These symptoms implies you may have   Nicotine Poisoning. Some of the common treatments include Supportive care, activated charcoal administration, gastric decontamination, supportive care, antidote administration [/INST]</s>"]

**For Freeing GPU Cache**

In [None]:
import gc

# Drop the training metrics object, collect garbage and release cached CUDA memory.
# `model` is intentionally NOT set to None here: the "Saving Trained model" cell right
# after this one passes `model` to unsloth_save_model, so clearing it would make that
# cell fail on a top-to-bottom run. (The `trainer` object still references the model as
# well, so rebinding `model` alone would not have freed its memory.)
trainer_stats = None
gc.collect()
torch.cuda.empty_cache()

**Saving Trained model**

In [None]:
from unsloth import unsloth_save_model

# Write the fine-tuned model and tokenizer to the local "output_model" directory without
# uploading to the Hub. unsloth_save_model takes the same arguments as model.save_pretrained.
unsloth_save_model(
    model,
    tokenizer,
    "output_model",
    push_to_hub = False,
    token = None,
)


**Quantizing**

In [None]:
import locale
# Make locale.getpreferredencoding() always report UTF-8 before running the shell commands
# (presumably a workaround for `!` commands failing under a non-UTF-8 locale — TODO confirm).
locale.getpreferredencoding = lambda: "UTF-8"
# Convert the saved model directory into a single f16 GGUF file with llama.cpp's converter.
!python llama.cpp/convert.py /content/output_model \
        --outfile /content/output_model-unsloth.gguf \
        --outtype f16


In [None]:
!./llama.cpp/quantize ./output_model-unsloth.gguf \
        /content/output_model-unsloth_q4_k_m.gguf q4_k_m