In [4]:
!pip install -q -U bitsandbytes datasets accelerate peft transformers

In [5]:
import os
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
os.environ["CUDA_MODULE_LOADING"] = "LAZY"

import torch
from datasets import load_dataset, Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
)
from peft import LoraConfig, get_peft_model, TaskType


E0000 00:00:1746627802.673266     239 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746627802.681320     239 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [6]:
cuad = load_dataset("json", data_files="/kaggle/input/cuadv1/CUADv1.json")["train"]
sara = load_dataset("jhu-clsp/SARA", split="train")

def preprocess_cuad(dataset):
    out = []
    for item in dataset:
        for entry in item["data"]:
            for p in entry["paragraphs"]:
                ctx = p["context"]
                for qa in p["qas"]:
                    if qa["answers"]:
                        out.append({
                            "question": qa["question"],
                            "context": ctx,
                            "answer": qa["answers"][0]["text"]
                        })
    return out

cuad_data = preprocess_cuad(cuad)
sara_data = [
    {"question": x["question"], "context": x["text"], "answer": x["answer"]}
    for x in sara
]

all_examples = cuad_data + sara_data
dataset = Dataset.from_list(all_examples)
print(f"Total examples: {len(dataset)}")


Total examples: 6958


In [7]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device_idx = device.index if device.type == "cuda" else None

model_id = "NousResearch/Hermes-2-Pro-Mistral-7B"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map={"": device_idx},   # ← all layers → cuda:0
)

lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM,
)
model = get_peft_model(model, lora_config).to(device)


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [8]:
def format_example(ex):
    prompt = (
        f"### Question:\n{ex['question']}\n"
        f"### Context:\n{ex['context']}\n"
        "### Answer:\n"
    )
    full = prompt + ex["answer"]
    tok = tokenizer(full, truncation=True, max_length=512, padding="max_length")
    input_ids = tok["input_ids"]
    attn_mask = tok["attention_mask"]

    p_tok = tokenizer(prompt, truncation=True, max_length=len(input_ids))["input_ids"]
    p_len = len(p_tok)

    labels = input_ids.copy()
    for i in range(p_len):
        labels[i] = -100

    return {"input_ids": input_ids, "attention_mask": attn_mask, "labels": labels}

tokenized = dataset.map(format_example, remove_columns=dataset.column_names)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)


Map:   0%|          | 0/6958 [00:00<?, ? examples/s]

In [9]:
training_args = TrainingArguments(
    output_dir="./legal_qa_lora",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    num_train_epochs=2,
    logging_steps=10,
    save_strategy="epoch",
    fp16=True,
    report_to="none",
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized,
    tokenizer=tokenizer,
    data_collator=data_collator,
)
trainer.train()


  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss
10,0.7935
20,0.6672
30,0.6085
40,0.6227
50,0.555
60,0.5692
70,0.5433
80,0.5132
90,0.5483
100,0.5162


TrainOutput(global_step=870, training_loss=0.3099334276955703, metrics={'train_runtime': 9447.1943, 'train_samples_per_second': 1.473, 'train_steps_per_second': 0.092, 'total_flos': 3.042773538428682e+17, 'train_loss': 0.3099334276955703, 'epoch': 2.0})

In [10]:
model.save_pretrained("./final_legal_lora")
tokenizer.save_pretrained("./final_legal_lora")


('./final_legal_lora/tokenizer_config.json',
 './final_legal_lora/special_tokens_map.json',
 './final_legal_lora/tokenizer.model',
 './final_legal_lora/added_tokens.json',
 './final_legal_lora/tokenizer.json')

In [11]:
from huggingface_hub import login

login(token="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")

tokenizer.push_to_hub("NishKook/legal-qa-lora", use_auth_token=True)
model.push_to_hub("NishKook/legal-qa-lora", use_auth_token=True)




tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/27.3M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/NishKook/legal-qa-lora/commit/9a1e9dbb5660575cbc49cd84abaae9b14421f5cd', commit_message='Upload model', commit_description='', oid='9a1e9dbb5660575cbc49cd84abaae9b14421f5cd', pr_url=None, repo_url=RepoUrl('https://huggingface.co/NishKook/legal-qa-lora', endpoint='https://huggingface.co', repo_type='model', repo_id='NishKook/legal-qa-lora'), pr_revision=None, pr_num=None)

In [5]:
import torch
from huggingface_hub import login
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel

torch.cuda.empty_cache()

login(token="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")

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device_idx = device.index if device.type == "cuda" else None

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

tokenizer_inf = AutoTokenizer.from_pretrained(
    "NishKook/legal-qa-lora",
    use_auth_token=True
)

base_model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-Instruct-v0.2",
    quantization_config=bnb_config,
    device_map={"": device_idx},
    torch_dtype=torch.float16,
    use_auth_token=True
)

model_inf = PeftModel.from_pretrained(
    base_model,
    "NishKook/legal-qa-lora",
    device_map={"": device_idx},
    torch_dtype=torch.float16,
    use_auth_token=True
).to(device)
model_inf.eval()

def answer(question: str, context: str) -> str:
    prompt = (
        f"### Question:\n{question}\n\n"
        f"### Context:\n{context}\n\n"
        "### Answer:\n"
    )
    inputs = tokenizer_inf(prompt, return_tensors="pt").to(device)
    with torch.no_grad():
        output_ids = model_inf.generate(
            **inputs,
            max_new_tokens=256,  
            do_sample=False
        )
    # strip off the prompt tokens
    return tokenizer_inf.decode(
        output_ids[0][inputs.input_ids.shape[1]:],
        skip_special_tokens=True
    )


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [6]:
question = "What are the four elements of negligence?"
context  = (
    "Under tort law, negligence requires four elements: "
    "a duty of care, a breach of that duty, "
    "causation linking the breach to harm, and actual damages suffered by the plaintiff."
)

print("Question:", question)
print("Answer:  ", answer(question, context))


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Question: What are the four elements of negligence?
Answer:   Yes, that is correct. The four elements of a negligence claim under tort law are: (1) a duty of care owed by the defendant to the plaintiff, (2) a breach of that duty by the defendant, (3) causation linking the defendant's breach to harm suffered by the plaintiff, and (4) actual damages or harm suffered by the plaintiff.


In [7]:
question = "When can a contract be considered voidable?"
context  = (
    "A contract may be voidable if one party lacked capacity to contract, "
    "if there was misrepresentation or duress, or if undue influence was exercised."
)

print("Question:", question)
print("Answer:  ", answer(question, context))


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Question: When can a contract be considered voidable?
Answer:   A contract may be considered voidable if:

1. One party lacked capacity to contract at the time the contract was entered into. For example, a person who is mentally incompetent or a minor cannot enter into a contract.

2. There was misrepresentation or fraud. Misrepresentation occurs when one party makes a false statement of fact or fails to disclose a material fact, which induces the other party to enter into the contract. Fraud occurs when one party intentionally deceives another party by knowingly making a false statement or concealing a material fact.

3. There was duress. Duress occurs when one party is coerced into entering into a contract by the threat of harm or the infliction of harm upon that party or a third party.

4. Undue influence was exercised. Undue influence occurs when one party takes advantage of the vulnerability of another party to exert pressure on that party to enter into a contract. This can includ