# Connect to Google Drive to load the dataset

In [19]:
# Mount Google Drive at /content/drive; the dataset JSON files read later in
# the notebook live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Install unsloth

In [20]:
%%capture
# %%capture (which must remain the first line of the cell) hides the pip output.
# Step 1 installs the released unsloth package; step 2 removes it and installs
# the current state of the GitHub repository with the "colab-new" extra.
# NOTE(review): presumably step 1 exists to pull in the dependencies — confirm.
# NOTE(review): nothing is version-pinned, so a re-run may pick up a different
# unsloth build than the one whose output is recorded in this notebook.
!pip install unsloth
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"


# Initialize the model

## Quantization
1. Quantization Configuration
- The line load_in_4bit = True indicates that the model is being loaded with 4-bit quantization
- The model weights are represented in a 4-bit format to reduce memory usage and speed up computation

2. Impact of Quantization:
- Reduces the memory footprint of the model significantly, especially for large models like Meta-Llama-3.1-8B
- Improves inference speed while slightly sacrificing numerical precision

## LoRA
1. LoRA Applied via get_peft_model:
- The method FastLanguageModel.get_peft_model applies LoRA (Low-Rank Adaptation) to the model 
- Key parameters:
    + r=16: Specifies the rank of the low-rank matrices introduced in certain layers
    + target_modules: Defines the model layers where LoRA is applied (e.g. query, key, value projections, etc)
    + lora_alpha=16: controls the scaling factor for LoRA updates.
    + lora_dropout=0: no dropout is applied to the low-rank matrices during training.
    + bias="none": no bias parameters are trained; the model's existing bias terms are left unchanged.
2. Purpose of LoRA:
- Enables task-specific fine-tuning without updating the entire model's weights.
- Only a small set of trainable parameters (low-rank matrices) is added to specific layers. 


In [21]:
from unsloth import FastLanguageModel
import torch

# Loading options. max_seq_length is reused by the SFTTrainer in the training cell.
max_seq_length = 2048  # sequence length passed to the model loader and the trainer
dtype = None  # NOTE(review): presumably lets unsloth choose the dtype automatically — confirm
load_in_4bit = True  # load the weights with 4-bit quantization

# Load the 4-bit Llama 3.1 8B Instruct checkpoint together with its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

# Attach LoRA adapters to the loaded model; `model` is rebound to the wrapped
# model, which is what the trainer and the inference cell use afterwards.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,  # rank of the low-rank adapter matrices
    # Attention projections (q/k/v/o) and MLP projections (gate/up/down) receive adapters.
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_alpha=16,  # scaling factor for the LoRA updates
    lora_dropout=0,  # no dropout on the adapter layers
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,  # same value as the seed given to TrainingArguments in the training cell
    use_rslora=False,
)


==((====))==  Unsloth 2024.11.10: Fast Llama patching. Transformers:4.46.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 7.5. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


# Preparing data for training

In [22]:
from datasets import load_dataset, Dataset
from unsloth.chat_templates import get_chat_template

# Rebind the tokenizer to one configured with the "llama-3.1" chat template;
# formatting_prompts_func relies on tokenizer.apply_chat_template to render
# each conversation into a training string.
tokenizer = get_chat_template(tokenizer, chat_template="llama-3.1")

def load_multiple_datasets(file_paths):
    """Load each JSON file in ``file_paths`` as a separate dataset.

    Args:
        file_paths: Iterable of paths to JSON files.

    Returns:
        list: One dataset (the ``train`` split) per path, in the same order
        as ``file_paths``. An empty input yields an empty list.
    """
    return [
        load_dataset('json', data_files=path, split='train')
        for path in file_paths
    ]

def create_conversations(datasets):
    """Turn instruction/output pairs into two-turn chat conversations.

    Args:
        datasets: Iterable of objects indexable by the column names
            ``"instruction"`` and ``"output"``, each yielding a sequence.

    Returns:
        dict: ``{"conversations": [...]}`` where every item is a list of two
        messages: the instruction as the ``user`` turn followed by the output
        as the ``assistant`` turn. Pairs keep dataset order, then row order.
    """
    conversations = []
    for dataset in datasets:
        instructions = dataset["instruction"]
        outputs = dataset["output"]
        for question, answer in zip(instructions, outputs):
            user_turn = {"role": "user", "content": question}
            assistant_turn = {"role": "assistant", "content": answer}
            conversations.append([user_turn, assistant_turn])
    return {"conversations": conversations}

def formatting_prompts_func(examples):
    """Render a batch of conversations to plain training strings.

    Uses the module-level ``tokenizer`` and its chat template; no tokenization
    is performed and no generation prompt is appended.

    Args:
        examples: Batch mapping with a ``"conversations"`` entry holding a
            list of conversations (each a list of role/content messages).

    Returns:
        dict: ``{"text": [...]}`` with one rendered string per conversation.
    """
    rendered = []
    for conversation in examples["conversations"]:
        rendered.append(
            tokenizer.apply_chat_template(
                conversation,
                tokenize=False,
                add_generation_prompt=False,
            )
        )
    return {"text": rendered}

# JSON files on the mounted Drive that together form the fine-tuning corpus.
file_paths = [
    '/content/drive/MyDrive/LLM project/FAQ_SCHOOL.json',
    '/content/drive/MyDrive/LLM project/FAQ_MEUH.json',
    '/content/drive/MyDrive/LLM project/FAQ_DL.json',
    '/content/drive/MyDrive/LLM project/FAQ_DECISION_ANALYSIS.json',
    '/content/drive/MyDrive/LLM project/FAQ_MATHS_ML.json',
    '/content/drive/MyDrive/LLM project/FAQ_ADVANCED_ML.json',
    '/content/drive/MyDrive/LLM project/FAQ_PRO_CONTRACT.json',
    '/content/drive/MyDrive/LLM project/GEN_CONVERSATION.json',

]

# Load every file, then flatten all instruction/output pairs into a single
# list of two-turn (user, assistant) conversations.
datasets = load_multiple_datasets(file_paths)
all_conversations = create_conversations(datasets)

# NOTE(review): Dataset is already imported at the top of this cell; this
# second import is redundant.
from datasets import Dataset
conversation_dataset = Dataset.from_dict(all_conversations)

# Add a "text" column holding each conversation rendered with the chat
# template; the trainer reads it via dataset_text_field="text".
formatted_dataset = conversation_dataset.map(formatting_prompts_func, batched=True)


Map:   0%|          | 0/205 [00:00<?, ? examples/s]

# Training

In [23]:
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from unsloth import is_bfloat16_supported

# Supervised fine-tuning on the rendered "text" column of formatted_dataset.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=formatted_dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer),
    dataset_num_proc=2,
    packing=False,
    args=TrainingArguments(
        num_train_epochs=5,
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,  # 2 x 4 = total batch size 8, as reported in the training banner
        warmup_steps=5,
        # max_steps=60,
        learning_rate=2e-4,
        # Exactly one of fp16 / bf16 is enabled, depending on hardware support.
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        logging_steps=1,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs",
        report_to="none",
    ),
)
from unsloth.chat_templates import train_on_responses_only

# Hand the trainer the header strings that open a user turn and an assistant
# turn so that training targets the assistant responses; the label check
# below shows which positions ended up masked (-100).
trainer = train_on_responses_only(
    trainer,
    instruction_part="<|start_header_id|>user<|end_header_id|>\n\n",
    response_part="<|start_header_id|>assistant<|end_header_id|>\n\n",
)

# Sanity check on one example: the full tokenized prompt, then its labels with
# every masked (-100) position replaced by a space token. These used to be
# bare expressions in the middle of the cell, whose values are discarded by
# the notebook (only a cell's last expression is displayed), so print them.
print(tokenizer.decode(trainer.train_dataset[5]["input_ids"]))
space = tokenizer(" ", add_special_tokens=False).input_ids[0]
print(tokenizer.decode([space if x == -100 else x for x in trainer.train_dataset[5]["labels"]]))

trainer_stats = trainer.train()

# Persist the fine-tuned model and tokenizer to the local 'finetuned_model' directory.
model.save_pretrained('finetuned_model')
tokenizer.save_pretrained('finetuned_model')

# Release cached GPU memory held by PyTorch after training.
torch.cuda.empty_cache()


Map (num_proc=2):   0%|          | 0/205 [00:00<?, ? examples/s]

Map:   0%|          | 0/205 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 205 | Num Epochs = 5
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 125
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss
1,3.09
2,2.9104
3,2.2619
4,2.7941
5,2.2559
6,1.841
7,2.2549
8,2.1109
9,2.214
10,2.2548


# Integrate the model with Gradio interface

In [27]:
!pip install --upgrade gradio
import gradio as gr
import csv
from unsloth.chat_templates import get_chat_template

tokenizer = get_chat_template(tokenizer, chat_template="llama-3.1")
model = FastLanguageModel.for_inference(model)
def generate_response(user_input):
    """Generate the fine-tuned model's reply to a single user message.

    The message is wrapped in a one-turn chat, rendered with the tokenizer's
    chat template (generation prompt appended), and a reply is sampled from
    the module-level ``model``. Only the newly generated tokens are decoded.

    Args:
        user_input: The question typed into the Gradio textbox.

    Returns:
        str: The stripped model reply. If ``user_input`` is empty or only
        whitespace, a short prompt asking for a question. If anything raises
        during tokenization or generation, ``"Error: <message>"`` so the UI
        shows the problem instead of failing.
    """
    from collections.abc import Mapping

    # Nothing to answer: skip tokenization and generation entirely.
    if not user_input or not user_input.strip():
        return "Please enter a question."

    try:
        # The model was placed on its device when it was loaded, so send the
        # inputs to wherever the model lives instead of moving the (4-bit
        # quantized) model on every request.
        device = next(model.parameters()).device

        messages = [{"role": "user", "content": user_input}]
        print("User Input:", user_input)
        inputs = tokenizer.apply_chat_template(
            messages,
            tokenize=True,
            add_generation_prompt=True,
            return_tensors="pt",
        )

        # The tokenizer may return a bare tensor of ids or a dict-like batch.
        # Mapping (rather than dict) also matches dict-like containers that do
        # not subclass dict.
        if isinstance(inputs, Mapping):
            input_ids = inputs["input_ids"].to(device)
            attention_mask = inputs.get("attention_mask")
        elif isinstance(inputs, torch.Tensor):
            input_ids = inputs.to(device)
            attention_mask = None
        else:
            raise ValueError("Unexpected input type returned by tokenizer.")

        if attention_mask is None:
            # A single, unpadded prompt: every position is a real token.
            attention_mask = torch.ones_like(input_ids)
        else:
            attention_mask = attention_mask.to(device)

        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=256,
            use_cache=True,
            temperature=1.0,
            top_p=0.9,
            do_sample=True,
            eos_token_id=tokenizer.eos_token_id,
        )
        # Drop the prompt tokens so only the generated continuation is decoded.
        response = tokenizer.batch_decode(
            outputs[:, input_ids.shape[1]:],
            skip_special_tokens=True
        )[0]
        return response.strip()

    except Exception as e:
        # Deliberate best-effort: surface the error text in the UI and the log.
        error_message = f"Error: {str(e)}"
        print(error_message)
        return error_message

# Build the input and output text components, then wire them to
# generate_response in a simple single-function Gradio interface.
question_box = gr.Textbox(lines=5, label="Enter your question")
answer_box = gr.Textbox(label="Model Response")

iface = gr.Interface(
    fn=generate_response,
    inputs=question_box,
    outputs=answer_box,
    title="Fine-Tuned LLM Chatbot",
    description="Ask any question, and the fine-tuned LLM will respond.",
)

# Start the web UI.
iface.launch()


Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://06e3e263cfcfe54bbd.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


