# **Mistral Model Fine Tuning**

In [None]:
!pip install -U trl peft bitsandbytes transformers datasets accelerate evaluate bert-score rouge_score --quiet


**<h1>Imports & Login</h1>**

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model
import torch
from datasets import load_dataset
import bitsandbytes as bnb
from trl import SFTTrainer, SFTConfig
from evaluate import load
from bert_score import score as bert_score
from tqdm import tqdm
import gc

In [3]:
from huggingface_hub import login
login(token="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")


# **Model Implementation**

In [4]:
base_model = "mistralai/Mistral-7B-Instruct-v0.2"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)

model = AutoModelForCausalLM.from_pretrained(base_model, quantization_config=bnb_config, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(base_model)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/596 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.10k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [5]:
dataset = load_dataset("haistudy/en_law_qa", split="train")
instruction = "You are a Law Assistant. Please answer the following question."

def format_chat(row):
    row["text"] = f"[INST] {instruction} {row['Question']} [/INST] {row['Answer']}"
    return row

dataset = dataset.map(format_chat)
split_dataset = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = split_dataset["train"]
eval_dataset = split_dataset["test"]


README.md:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

data%20UTF-8.csv:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5560 [00:00<?, ? examples/s]

Map:   0%|          | 0/5560 [00:00<?, ? examples/s]

# **LoRA (Low-Rank Adaptation)**

In [6]:
import bitsandbytes as bnb

def find_all_linear_names(model):
    cls = bnb.nn.Linear4bit
    names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names.add(name.split('.')[-1])
    return list(names - {"lm_head"})

target_modules = find_all_linear_names(model)

lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=target_modules
)

model = get_peft_model(model, lora_config)


# **Training Model**

In [7]:
tokenizer.truncation_side = "left"
tokenizer.model_max_length = 1024

training_args = SFTConfig(
    output_dir="mistral-law-finetuned",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,
    learning_rate=2e-4,
    num_train_epochs=1,
    optim="paged_adamw_32bit",
    fp16=True,
    group_by_length=True,
    report_to="none",
    max_length=512,
)

trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    args=training_args,
    peft_config=lora_config,
)



Converting train dataset to ChatML:   0%|          | 0/5004 [00:00<?, ? examples/s]

Adding EOS to train dataset:   0%|          | 0/5004 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/5004 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/5004 [00:00<?, ? examples/s]

Converting eval dataset to ChatML:   0%|          | 0/556 [00:00<?, ? examples/s]

Adding EOS to eval dataset:   0%|          | 0/556 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/556 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/556 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [None]:
trainer.train()



Step,Training Loss
500,0.6464


**<h1>Evaluation</h1>**

In [9]:
def simple_f1(pred, label):
    pred_tokens = pred.split()
    label_tokens = label.split()
    common = set(pred_tokens) & set(label_tokens)
    if not common: return 0.0
    precision = len(common) / len(pred_tokens)
    recall = len(common) / len(label_tokens)
    return 2 * (precision * recall) / (precision + recall) if (precision + recall) else 0.0

def evaluate_model(model, tokenizer, dataset, max_new_tokens=60, batch_size=8):
    model.eval()
    rouge = load("rouge")
    results, f1s, preds, labels = [], [], [], []

    for i in tqdm(range(0, len(dataset), batch_size)):
        batch = dataset[i:i+batch_size]
        tokenizer.padding_side = "left"
        tokenizer.pad_token = tokenizer.eos_token
        inputs = tokenizer(batch["text"], return_tensors="pt", padding=True, truncation=True,
                           max_length=tokenizer.model_max_length).to(model.device)

        with torch.no_grad():
            outputs = model.generate(input_ids=inputs["input_ids"],
                                     attention_mask=inputs["attention_mask"],
                                     max_new_tokens=max_new_tokens, do_sample=False)

        decoded_preds = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        decoded_labels = batch["Answer"]

        for pred, label in zip(decoded_preds, decoded_labels):
            pred, label = pred.strip(), label.strip()
            preds.append(pred)
            labels.append(label)
            f1s.append(simple_f1(pred, label))
            results.append({"prediction": pred, "reference": label, "f1": f1s[-1]})

        del inputs, outputs
        torch.cuda.empty_cache()
        gc.collect()

    rougeL = rouge.compute(predictions=preds, references=labels)["rougeL"]
    _, _, bert_F1 = bert_score(preds, labels, lang="en", device="cpu")

    print(f"\nAverage F1 Score:  {sum(f1s)/len(f1s):.4f}")
    print(f"ROUGE-L Score:     {rougeL:.4f}")
    print(f"BERTScore (F1):    {bert_F1.mean().item():.4f}")

    return {
        "f1": sum(f1s) / len(f1s),
        "rougeL": rougeL,
        "bertscore_f1": bert_F1.mean().item(),
        "results": results
    }

# Run evaluation
metrics = evaluate_model(model, tokenizer, eval_dataset)




Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

  0%|          | 0/70 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  1%|▏         | 1/70 [00:03<04:22,  3.80s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  3%|▎         | 2/70 [00:15<09:47,  8.63s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  4%|▍         | 3/70 [00:19<07:04,  6.33s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  6%|▌         | 4/70 [00:23<05:48,  5.27s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  7%|▋         | 5/70 [00:27<05:14,  4.84s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  9%|▊         | 6/70 [00:30<04:33,  4.27s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
 10%|█         | 7/70 [00:34<04:26,  4.22s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
 11%|█▏        | 8/70 [00:37<04:02,  3.91s/it]Setting `pad_token_id` to `eos_token_id`:2 for ope

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Average F1 Score:  0.6402
ROUGE-L Score:     0.6960
BERTScore (F1):    0.9336


# **Model selection:**
**Minstral Model:**

lighter than GPT-3 (175B) while retaining high capability.
Compatible with parameter-efficient tuning methods like:

LoRA (Low-Rank Adaptation)

QLoRA (Quantized LoRA)

PEFT (Parameter-Efficient Fine-Tuning)

Available via:
Hugging Face


# **Dataset Preparation:**
**haistudy/en_law_qa**

The dataset is tailored for legal QA tasks, which makes it highly

relevant for applications in law, legal education, or compliance.

# **Training Configuration:**

General Settings
Model Base: [mistralai/Mistral-7B-v0.1] or another LLM such as LLaMA, Falcon, etc.

Fine-Tuning Method:
(LoRA is recommended for Mistral due to lower compute and memory requirements)

# **Compute Used:**

**F1 Score:**Measures how much overlap there is between the tokens (words or subwords) in the generated text vs reference text.

**ROUGE-L (Longest Common Subsequence)**:Measures how much the longest matching sequence of words exists between prediction and reference.

**BERTScore (Semantic Similarity):**Uses BERT embeddings to check how semantically similar the generated text is to the reference — even if they use different words.
