# Experiment 1: Fine-Tuning Llama 3 on Medical Datasets
>
![image](https://github.com/Basel-anaya/LoreWeaver/assets/81964452/9ad6eca1-7cc4-4907-a4e6-ae056b8d0b23)

## Fine-tuning Llama 3 on a Medical Dataset
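- This code is an example of fine-tuning the LLaVA model on the PATHVQA dataset. (Note: the code cells in this notebook actually fine-tune `meta-llama/Meta-Llama-3-8B` on the `medarc/sft_multimedqa` dataset through a class named `FineTuner`; the names `PATHVQAFineTuner`, `CLIPProcessor`, `preprocess_image`, and `predict_answer` mentioned below do not appear in those cells.)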

- The code first imports the required libraries and modules. It then loads the `PATHVQA dataset` using the load_dataset function from the datasets library.

- The dataset is preprocessed using the `CLIPProcessor` to extract features from the images and tokenize the questions and answers. The preprocessed data is then converted to the PyTorch format and stored in the dataset variable.

- The `PATHVQAFineTuner` class is defined, which handles loading the model and tokenizer, preprocessing the image, fine-tuning the model, and saving the model.

- Inside the class, the `load_model_and_tokenizer` method loads the model and tokenizer from the specified directory. The model is wrapped with `DeepSpeed` for distributed training.

- The `preprocess_image` method takes an image path and preprocesses it for model input.

- The `fine_tune` method fine-tunes the model using the provided dataset and training arguments.

- The `save_model` method saves the fine-tuned model and tokenizer.

- The `predict_answer` method takes a question and preprocessed image tensor and generates an answer using the fine-tuned model.

- At the end of the code, an instance of the `PATHVQAFineTuner` class is created, the model and tokenizer are loaded, and the model is fine-tuned using the provided dataset. Finally, the fine-tuned model is saved.

In [0]:
# Install / upgrade the libraries imported by the cells below
# (datasets, transformers, bitsandbytes, peft, trl, torch, accelerate).
%pip install -U datasets
%pip install -U transformers
%pip install bitsandbytes
%pip install -U peft
%pip install -U trl
# NOTE(review): torch is first installed (unpinned) together with torchvision and
# torchaudio from the cu121 wheel index, then torch alone is uninstalled and
# re-installed pinned to 2.1.2. torchvision/torchaudio from the first install are
# left in place - confirm they are compatible with torch 2.1.2.
%pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu121
%pip uninstall -y torch
%pip install torch==2.1.2
%pip install accelerate==0.30.1
# NOTE(review): the model is later loaded with attn_implementation="flash_attention_2",
# but no flash-attn package is installed in this cell - confirm it is provided elsewhere.

### Importing necessary libraries

In [0]:
# Restart the Python process via the Databricks `dbutils` helper.
# Presumably needed so the packages installed by the %pip cell above are
# picked up by the following cells - confirm against the Databricks docs.
dbutils.library.restartPython()

In [0]:
import os
import sys
sys.path.append('/Workspace/Users/baselanaya@gmail.com/')
import torch
import torchvision
import requests
from io import BytesIO
from PIL import Image
from datasets import load_dataset
import bitsandbytes as bnb
from huggingface_hub import upload_folder
from torch.utils.data import DataLoader, TensorDataset
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
from medformer.custom_optimizer import SophiaG
from transformers import (AutoModelForCausalLM,
                          AutoTokenizer, 
                          BitsAndBytesConfig, 
                          TrainingArguments, 
                          deepspeed,
                          PreTrainedTokenizerFast, 
                          logging)
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
from trl import ORPOConfig, ORPOTrainer, setup_chat_format, SFTTrainer

### Medical Dataset Loading & Preprocessing

In [0]:
from transformers import GPT2Tokenizer

# Initialize a GPT2Tokenizer instance from the pretrained "gpt2" checkpoint.
# NOTE(review): the model fine-tuned later in this file is
# meta-llama/Meta-Llama-3-8B, which is loaded with its own AutoTokenizer.
# Token ids produced by this GPT-2 tokenizer presumably do not correspond to
# that model's vocabulary - confirm which tokenizer is intended here.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Load the dataset. split="all" asks `load_dataset` for every available split
# of medarc/sft_multimedqa at once (presumably concatenated into one Dataset -
# confirm against the `datasets` documentation).
sft_dataset = load_dataset("medarc/sft_multimedqa", split="all")

def preprocess_text_example(example, input_text_column, target_text_column, tokenizer):
    """Tokenize the input and target text of a single example.

    Args:
        example: Mapping that holds the raw text under ``input_text_column``
            and ``target_text_column``.
        input_text_column: Key of the text that becomes ``input_ids``.
        target_text_column: Key of the text that becomes ``labels``.
        tokenizer: Object exposing
            ``encode(text, add_special_tokens=..., return_tensors=...)``
            whose result supports ``.squeeze(0)``.

    Returns:
        A dict with two entries, ``"input_ids"`` and ``"labels"``, each being
        the encoded text with its leading (batch) dimension squeezed away.
    """

    def _encode_column(column_name):
        # Encode without special tokens and ask for a PyTorch tensor, then
        # drop the leading dimension added by return_tensors="pt".
        encoded = tokenizer.encode(
            example[column_name],
            add_special_tokens=False,
            return_tensors="pt",
        )
        return encoded.squeeze(0)

    # Input first, target second - same evaluation order as a straight-line version.
    return {
        "input_ids": _encode_column(input_text_column),
        "labels": _encode_column(target_text_column),
    }

# Map the preprocess_text_example function over the dataset, one row at a time.
# preprocess_text_example expects a SINGLE example (one prompt string and one
# completion string). With batched=True every column would arrive as a list of
# strings, which tokenizer.encode does not treat as separate texts, so the map
# is deliberately left un-batched. Constant arguments go through fn_kwargs
# instead of a wrapping lambda.
sft_dataset = sft_dataset.map(
    preprocess_text_example,
    fn_kwargs={
        "input_text_column": "prompt",
        "target_text_column": "completion",
        "tokenizer": tokenizer,
    },
)

# Set the dataset format for torch: only 'input_ids' and 'labels' are returned
# (as tensors) when rows are read.
sft_dataset.set_format(type='torch', columns=['input_ids', 'labels'])




### DeepSpeed Configurations (ZeRO-Stage III)

In [0]:
def _build_ds_config():
    """Assemble the DeepSpeed configuration dictionary used for training.

    The dictionary selects ZeRO stage 3 with both optimizer state and
    parameters offloaded to CPU. Every value written as the string "auto" is
    left unresolved here (see the Hugging Face DeepSpeed integration docs for
    how such placeholders are filled in).
    """
    auto = "auto"

    def cpu_offload():
        # A fresh dict per call so the two offload sections never share state.
        return {"device": "cpu", "pin_memory": True}

    fp16_section = {
        "enabled": auto,
        "loss_scale": 0,
        "loss_scale_window": 1000,
        "initial_scale_power": 16,
        "hysteresis": 2,
        "min_loss_scale": 1,
    }
    bf16_section = {"enabled": auto}

    optimizer_section = {
        "type": "AdamW",
        "params": {
            "lr": auto,
            "betas": auto,
            "eps": auto,
            "weight_decay": auto,
        },
    }
    scheduler_section = {
        "type": "WarmupLR",
        "params": {
            "warmup_min_lr": auto,
            "warmup_max_lr": auto,
            "warmup_num_steps": auto,
        },
    }

    zero_section = {
        "stage": 3,
        "offload_optimizer": cpu_offload(),
        "offload_param": cpu_offload(),
        "overlap_comm": True,
        "contiguous_gradients": True,
        "sub_group_size": 1e9,
        "reduce_bucket_size": auto,
        "stage3_prefetch_bucket_size": auto,
        "stage3_param_persistence_threshold": auto,
        "stage3_max_live_parameters": 1e9,
        "stage3_max_reuse_distance": 1e9,
        "gather_16bit_weights_on_model_save": True,
    }

    return {
        "fp16": fp16_section,
        "bf16": bf16_section,
        "optimizer": optimizer_section,
        "scheduler": scheduler_section,
        "zero_optimization": zero_section,
        "gradient_accumulation_steps": auto,
        "gradient_clipping": auto,
        "train_batch_size": auto,
        "train_micro_batch_size_per_gpu": auto,
        "steps_per_print": 1e5,
        "wall_clock_breakdown": False,
    }


# Module-level DeepSpeed configuration referenced by the fine-tuning class.
ds_config = _build_ds_config()

### Fine Tuning Main Class

In [0]:
class FineTuner:
    """Supervised fine-tuning of a 4-bit quantized causal LM with a DoRA adapter.

    Typical use::

        tuner = FineTuner(model_name_or_path=..., output_dir=...)
        tuner.load_model_and_tokenizer()
        tuner.fine_tune(dataset=...)

    Attributes:
        model_name_or_path: Hub id or local path of the base model.
        output_dir: Directory that receives checkpoints and the final model.
        local_rank: Rank value forwarded to ``TrainingArguments``.
        deepspeed_config: DeepSpeed configuration dict handed to the trainer.
        dataset: Default training dataset (the module-level ``sft_dataset``).
        model / tokenizer: ``None`` until ``load_model_and_tokenizer`` runs.
    """

    def __init__(self, model_name_or_path, output_dir, local_rank=1, deepspeed_config=ds_config, learning_rate=0.001, per_device_train_batch_size=16):
        """Store the run settings and build the 4-bit quantization config.

        Args:
            model_name_or_path: Hub id or local path of the base model.
            output_dir: Where checkpoints and the final model are written.
            local_rank: Rank of this process; ``-1`` means "not distributed".
            deepspeed_config: DeepSpeed configuration dictionary.
            learning_rate: Stored on the instance for reference.
            per_device_train_batch_size: Per-device batch size used for the
                load-time ``TrainingArguments``.
        """
        self.model_name_or_path = model_name_or_path
        self.output_dir = output_dir
        self.local_rank = local_rank
        # Previously the argument was accepted but dropped and the global
        # ds_config was used everywhere; keep what the caller passed.
        self.deepspeed_config = deepspeed_config
        self.dataset = sft_dataset
        self.learning_rate = learning_rate
        self.weight_decay = 0.001
        self.device = torch.device('cuda', local_rank) if torch.cuda.is_available() and local_rank >= 0 else torch.device('cpu')
        self.per_device_train_batch_size = per_device_train_batch_size

        # Populated by load_model_and_tokenizer().
        self.model = None
        self.tokenizer = None

        # QLoRA-style quantization: NF4 4-bit weights, bf16 compute.
        self.bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
            bnb_4bit_use_double_quant=False
        )

    @staticmethod
    def _distributed_ready():
        """Return True only when a torch.distributed process group is initialized."""
        return dist.is_available() and dist.is_initialized()

    def _is_main_process(self):
        """Return True for the one process that should write files to disk."""
        if not self._distributed_ready():
            # Single-process run: this process is the only writer, whatever
            # value local_rank was given.
            return True
        return dist.get_rank() == 0

    def load_model_and_tokenizer(self):
        """Load the tokenizer and the 4-bit quantized model, ready for k-bit training.

        Sets ``self.tokenizer`` and ``self.model``. The model gets the chat
        format applied via ``setup_chat_format`` and is then passed through
        ``prepare_model_for_kbit_training``.
        """
        # Created BEFORE from_pretrained on purpose: per the Hugging Face
        # DeepSpeed integration docs, ZeRO-3 aware loading is enabled by a
        # live TrainingArguments object that carries the DeepSpeed config.
        training_args = TrainingArguments(
            output_dir=self.output_dir,
            per_device_train_batch_size=self.per_device_train_batch_size,
            gradient_accumulation_steps=1,
            deepspeed=self.deepspeed_config,
            optim='lion_8bit'
        )

        # Load tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name_or_path)

        # Load model
        # NOTE(review): "flash_attention_2" needs the flash-attn package to be
        # installed in the environment - confirm it is available.
        self.model = AutoModelForCausalLM.from_pretrained(
            self.model_name_or_path,
            torch_dtype=torch.bfloat16,
            quantization_config=self.bnb_config,
            attn_implementation="flash_attention_2",
        )
        self.model, self.tokenizer = setup_chat_format(self.model, self.tokenizer)
        self.model = prepare_model_for_kbit_training(self.model)

    def fine_tune(
            self,
            dataset,
            output_dir_prefix="./checkpoints/",
            num_train_epochs=2,
            per_device_train_batch_size=16,
            gradient_accumulation_steps=4,
            lr_scheduler_type="cosine",
            warmup_ratio=0.03,
            save_steps=25,
            logging_steps=25,
            max_grad_norm=0.3,
            max_seq_length=1024,
            device_map="auto",
            learning_rate=0.001,
            weight_decay=0.001,
            max_steps=-1
    ):
        """Train a DoRA adapter on ``dataset`` with ``SFTTrainer`` and save the result.

        Args:
            dataset: Training dataset; when ``None`` the instance default
                ``self.dataset`` is used.
            output_dir_prefix: Directory that receives the trained adapter.
            num_train_epochs, per_device_train_batch_size,
            gradient_accumulation_steps, lr_scheduler_type, warmup_ratio,
            save_steps, logging_steps, max_grad_norm, max_steps,
            learning_rate, weight_decay: Forwarded to ``TrainingArguments``.
            max_seq_length: Forwarded to ``SFTTrainer``.
            device_map: Accepted for backward compatibility; not used here.

        Raises:
            RuntimeError: If ``load_model_and_tokenizer`` has not been called.
        """
        if self.model is None or self.tokenizer is None:
            raise RuntimeError(
                "Model/tokenizer not loaded; call load_model_and_tokenizer() first."
            )

        # Honour the caller's dataset instead of silently replacing it with
        # the module-level one.
        train_dataset = self.dataset if dataset is None else dataset
        dataset_text_field = "prompt"

        # Synchronize processes only when a process group actually exists;
        # calling barrier() without one raises.
        if self._distributed_ready():
            dist.barrier()

        # Define training arguments. The optimizer is selected here through
        # `optim`; no optimizer object is built by hand.
        training_arguments = TrainingArguments(
            output_dir=self.output_dir,
            num_train_epochs=num_train_epochs,
            per_device_train_batch_size=per_device_train_batch_size,
            gradient_accumulation_steps=gradient_accumulation_steps,
            optim='lion_8bit',
            learning_rate=learning_rate,
            weight_decay=weight_decay,
            save_steps=save_steps,
            logging_steps=logging_steps,
            bf16=True,
            max_grad_norm=max_grad_norm,
            max_steps=max_steps,
            warmup_ratio=warmup_ratio,
            group_by_length=True,
            lr_scheduler_type=lr_scheduler_type,
            deepspeed=self.deepspeed_config,
            local_rank=self.local_rank
        )

        # Initialize DoRA configuration
        peft_config = LoraConfig(
                lora_alpha=32,
                lora_dropout=0.05,
                r=16,
                bias="none",
                task_type="CAUSAL_LM",
                target_modules= ["q_proj", "k_proj", "v_proj", "o_proj",
                                 "gate_proj", "up_proj", "down_proj",],
                use_dora=True
        )

        # `optimizers` is intentionally not passed: it expects an
        # (optimizer, lr_scheduler) tuple, not an optimizer name.
        trainer = SFTTrainer(
                model=self.model,
                train_dataset=train_dataset,
                peft_config=peft_config,
                dataset_text_field=dataset_text_field,
                max_seq_length=max_seq_length,
                tokenizer=self.tokenizer,
                args=training_arguments,
        )

        # DeepSpeed is set up by the trainer itself from
        # TrainingArguments.deepspeed; no manual initialization call is needed.
        trainer.train()

        # The trainer wrapped the base model with the adapter; keep that
        # wrapped model so save_model() persists the trained weights.
        self.model = trainer.model

        # Write the trained adapter through the trainer.
        trainer.save_model(output_dir_prefix)

        # Save the fine-tuned model
        self.save_model()

    def save_model(self):
        """Save model and tokenizer to ``self.output_dir`` from the main process.

        Errors while saving are reported with ``print`` rather than raised, so
        that every process still reaches the barrier below.
        """
        # NOTE(review): under ZeRO stage 3 the parameters are partitioned
        # across processes; confirm that save_pretrained from a single process
        # writes complete weights in that setup.
        if self._is_main_process():
            try:
                self.model.save_pretrained(self.output_dir)
                self.tokenizer.save_pretrained(self.output_dir)
                print("Model and tokenizer saved successfully.")
            except Exception as e:
                print(f"Error saving model and tokenizer: {e}")

        # Wait for all processes to finish (only meaningful with a process group)
        if self._distributed_ready():
            dist.barrier()

# Build the fine-tuner for the Llama 3 8B base model and load its tokenizer/model
finetuner = FineTuner(model_name_or_path='meta-llama/Meta-Llama-3-8B', output_dir='./checkpoints/finetuned_model', learning_rate=0.001)
finetuner.load_model_and_tokenizer()

# Fine-tune the model.
# fine_tune() finishes by calling save_model() itself, so no separate
# finetuner.save_model() call is made afterwards (it would only write the same
# files a second time).
finetuner.fine_tune(
   dataset=sft_dataset,
   output_dir_prefix="./checkpoints/",
)

### Merging the LoRA Weights

In [0]:
# Load the original Llama 3 base model in bf16 (the dtype used at training time)
original_llama_model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Meta-Llama-3-8B",
    torch_dtype=torch.bfloat16,
)

# The training model went through setup_chat_format, which adds chat tokens to
# the tokenizer and resizes the embeddings; mirror that here so the base model
# matches the tokenizer that is saved next to it.
# NOTE(review): confirm the adapter in "Reverb/medllama3-8B" was trained with
# this same tokenizer.
original_llama_model.resize_token_embeddings(len(finetuner.tokenizer))

# Load the LoRA weights on top of the base model
adapter = PeftModel.from_pretrained(original_llama_model, "Reverb/medllama3-8B")

# Fold the adapter weights into the base weights and drop the PEFT wrappers
combined_model = adapter.merge_and_unload()

# Save the final fine-tuned Llama model together with the tokenizer used for
# training (not the GPT-2 tokenizer from the preprocessing cell)
combined_model.save_pretrained("./final_model/medllama3-8B", is_main_process=True)
finetuner.tokenizer.save_pretrained("./final_model/medllama3-8B")

# Deploying the final fine-tuned Llama model to Hugging Face
upload_folder(
    folder_path="./final_model/medllama3-8B",
    repo_id="Reverb/medllama3-8B",
    repo_type="model",
    commit_message="Upload fine-tuned Llama 3 model"
)