<a href="https://colab.research.google.com/github/Midas0901/Poom/blob/main/ToS_Analyzer_FineTune.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# CELL 1: Install everything (run once)
!pip install -q transformers datasets accelerate peft bitsandbytes trl huggingface_hub sentencepiece tqdm

In [2]:
# CELL 2: Log in to Hugging Face (needed later to push the model to the Hub)
from huggingface_hub import login
login()   # prompts interactively — paste a token that has write access

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [7]:
# CELL 3-5 (rewritten): load MeeraR/legal-qa-dataset (~2,375 legal/privacy Q&A pairs)

from datasets import load_dataset
import pandas as pd

print("กำลังโหลดชุดข้อมูล MeeraR/legal-qa-dataset (2,375+ ตัวอย่างจาก Legal/Privacy Policy จริง)...")
dataset = load_dataset("MeeraR/legal-qa-dataset")

# Only the "question" and "answer" columns of the train split are used.
# Converting the split in one call replaces the per-row Python loop.
qa_columns = ["question", "answer"]
final_df = dataset["train"].select_columns(qa_columns).to_pandas()

# Drop rows where either side is missing or blank: an empty user or assistant
# turn is not a usable training example.
has_text = final_df[qa_columns].notna().all(axis=1)
for column in qa_columns:
    has_text &= final_df[column].astype(str).str.strip().ne("")
final_df = final_df[has_text].reset_index(drop=True)

# Every row gets the same default source label.
final_df["source"] = "Legal Privacy Policy"
print(f"โหลดสำเร็จ! ได้ {len(final_df)} ตัวอย่างจริงจาก Legal Q&A")

# Column names expected by the chat-formatting cell: text = question, label = answer.
final_df["text"] = final_df["question"]
final_df["label"] = final_df["answer"]

print(f"Final training size: {len(final_df)} examples (พร้อมฝึกโมเดล!)")
print("\nตัวอย่างแรก:")
print("คำถาม:", final_df.iloc[0]["text"])
print("คำตอบ:", final_df.iloc[0]["label"])

กำลังโหลดชุดข้อมูล MeeraR/legal-qa-dataset (2,375+ ตัวอย่างจาก Legal/Privacy Policy จริง)...
โหลดสำเร็จ! ได้ 2375 ตัวอย่างจริงจาก Legal Q&A
Final training size: 2375 examples (พร้อมฝึกโมเดล!)

ตัวอย่างแรก:
คำถาม: Explain the rights granted by Section 12 of the Indian Penal Code.
คำตอบ: This section ensures legal aid for individuals against sexual harassment.


In [8]:
# CELL ใหม่: แปลงเป็นรูปแบบแชท + เพิ่มข้อมูลของคุณเอง (ใช้แทน CELL 5 เดิม)

from datasets import Dataset

def make_chat(row):
    """Build one chat-format training example from a Q&A row.

    Args:
        row: Any mapping-like object indexable by "text" (the question)
            and "label" (the answer).

    Returns:
        A dict with a single "messages" key holding three turns in order:
        a fixed Thai system prompt, the user question, and the assistant
        answer. Each turn is a {"role", "content"} dict.
    """
    system_prompt = "คุณคือผู้เชี่ยวชาญด้าน Privacy Policy และ Terms of Service ตอบให้ชัดเจน อ่านง่าย และอ้างอิงแหล่งที่มาเสมอ ไม่ใช่คำแนะนำทางกฎหมาย"
    turns = (
        ("system", system_prompt),
        ("user", row["text"]),        # question
        ("assistant", row["label"]),  # answer
    )
    return {"messages": [{"role": role, "content": content} for role, content in turns]}

# Turn every Q&A row into a chat-format example (make_chat reads only "text" and "label").
chat_list = list(map(make_chat, final_df[["text", "label"]].to_dict("records")))

# **Optional: add your own data here (when ready) — e.g. ~1,200 of your own examples on top of the 2,375 from the dataset**
# Upload your file (CSV/JSONL) in the Colab file panel on the left, then uncomment these lines:
# your_df = pd.read_json("your_qa.jsonl", lines=True)  # or pd.read_csv("your_file.csv")
# your_df = your_df.rename(columns={"prompt": "text", "response": "label"})  # rename columns to match
# your_chat_list = [make_chat(row) for _, row in your_df.iterrows()]
# chat_list.extend(your_chat_list)  # merge your examples in
# print(f"เพิ่มข้อมูลของคุณ: +{len(your_df)} examples (รวมทั้งหมด {len(chat_list)})")

# Build the training Dataset and hold out 10% for evaluation (seeded split: 90% train, 10% test).
full_dataset = Dataset.from_list(chat_list).train_test_split(test_size=0.1, seed=42)

n_train, n_test = (len(full_dataset[split]) for split in ("train", "test"))
print(f"พร้อมฝึก! Train: {n_train}, Test: {n_test}")

พร้อมฝึก! Train: 2137, Test: 238


In [None]:
# CELL 6: Load the Saul-7B legal model in 4-bit (fits in free Colab)

import importlib.util

model_name = "Equall/Saul-7B-Instruct-v1"   # Hugging Face repo id of the base model

from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch

# 4-bit quantization to fit in free Colab (reduces RAM from 14GB to ~4GB)
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,  # also quantizes the quantization constants: saves memory, not time
)

# trust_remote_code is intentionally not set: it lets code shipped in the model
# repo run locally. Presumably this checkpoint uses the stock Mistral
# architecture — re-enable it only if loading explicitly asks for it.
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    padding_side="left",
)
if tokenizer.pad_token is None:
    # No dedicated padding token: reuse end-of-sequence for padding.
    tokenizer.pad_token = tokenizer.eos_token

# FlashAttention-2 needs both the `flash_attn` package (the install cell does
# not install it) and an Ampere-or-newer GPU (compute capability >= 8).
# Requesting it without either makes loading fail, so otherwise fall back to
# the library's default attention implementation.
flash_attn_installed = importlib.util.find_spec("flash_attn") is not None
gpu_supports_flash_attn = (
    torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8
)
attn_implementation = (
    "flash_attention_2" if (flash_attn_installed and gpu_supports_flash_attn) else None
)

print("Loading model... (this takes ~2-3 minutes)")
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quant_config,
    device_map="auto",  # Automatically uses GPU if available
    dtype=torch.float16,  # `torch_dtype` is deprecated in the installed transformers release
    attn_implementation=attn_implementation,
)

print("Model loaded successfully! Ready for fine-tuning.")
print(f"Model size: ~7B parameters | VRAM used: ~{torch.cuda.memory_allocated()/1e9:.1f} GB")

Loading tokenizer...


`torch_dtype` is deprecated! Use `dtype` instead!


Loading model... (this takes ~2-3 minutes)


Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

In [None]:
# CELL 7: Fine-tune with LoRA (1–3 hours)

from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
from trl import SFTConfig, SFTTrainer
import transformers

# LoRA adapter settings: rank-64 adapters on every linear layer.
peft_config = LoraConfig(
    r=64,
    lora_alpha=16,
    target_modules="all-linear",
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM"
)

# Wrap the quantized base model only once, so re-running this cell does not
# stack a second adapter on an already-wrapped model.
if not isinstance(model, PeftModel):
    model = prepare_model_for_kbit_training(model)
    model = get_peft_model(model, peft_config)

# Training hyper-parameters. SFTConfig extends TrainingArguments with the
# SFT-specific options (such as max_length) that recent TRL releases no longer
# accept directly on SFTTrainer; `eval_strategy` is the current name of
# `evaluation_strategy`.
# Note: with ~2,137 training rows and an effective batch of 4 * 8 = 32 there
# are roughly 67 optimizer steps per epoch, so 100 warmup steps cover about
# half of the 3-epoch run.
training_args = SFTConfig(
    per_device_train_batch_size=4,
    gradient_accumulation_steps=8,
    warmup_steps=100,
    num_train_epochs=3,
    learning_rate=2e-4,
    fp16=True,
    logging_steps=50,
    output_dir="raft-tos-analyzer",
    optim="paged_adamw_8bit",
    save_strategy="epoch",
    eval_strategy="epoch",
    report_to="none",
    max_length=2048,  # truncate tokenized conversations to this many tokens
)

# The dataset has a single "messages" column in chat format, which SFTTrainer
# renders with the tokenizer's chat template itself. The previous
# formatting_func iterated over the batch dict (yielding column names, not
# rows) and dataset_text_field pointed at a non-text column, so both are gone.
# NOTE(review): each example starts with a system turn — confirm the
# tokenizer's chat template accepts a system role.
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=full_dataset["train"],
    eval_dataset=full_dataset["test"],
    processing_class=tokenizer,
)

print("Starting training...")
trainer.train()

In [None]:
# CELL 8: Save the LoRA adapter, then merge it into the base model and save the result

import torch
from peft import AutoPeftModelForCausalLM

ADAPTER_DIR = "raft-tos-analyzer-final"   # adapter + tokenizer
MERGED_DIR = "raft-tos-analyzer-merged"   # merged model + tokenizer

# `model` is the LoRA-wrapped model from the training cell.
model.save_pretrained(ADAPTER_DIR)
tokenizer.save_pretrained(ADAPTER_DIR)

# Reload base model + adapter in fp16 and fold the LoRA weights into the base
# weights. (peft is already installed by the first cell and imported by the
# training cell, so no reinstall is needed here.)
# NOTE(review): the 4-bit training model is still referenced by `model` and
# `trainer` at this point, so this fp16 copy is loaded alongside it — confirm
# it fits in memory on the target runtime.
adapter_model = AutoPeftModelForCausalLM.from_pretrained(
    ADAPTER_DIR,
    device_map="auto",
    torch_dtype=torch.float16
)
merged_model = adapter_model.merge_and_unload()
merged_model.save_pretrained(MERGED_DIR)
tokenizer.save_pretrained(MERGED_DIR)

In [None]:
# CELL 9: Publish the merged model and its tokenizer to the Hugging Face Hub

# Target repo id — replace "yourusername" with your own account before running.
new_model_name = "yourusername/raft-tos-analyzer-v1"

# Upload the model first, then the tokenizer, to the same repo.
for artifact in (merged_model, tokenizer):
    artifact.push_to_hub(new_model_name)

print("FINISHED!")
print(f"Your model is now live at: https://huggingface.co/{new_model_name}")

In [None]:
# CELL 10: Quick test of the published model

import torch
from transformers import pipeline

# Load the pushed model back from the Hub, explicitly in fp16 so a 7B model is
# not materialized at a larger default precision.
pipe = pipeline(
    "text-generation",
    model=new_model_name,
    device_map="auto",
    dtype=torch.float16,
)

# The model was fine-tuned on chat-format conversations, so the prompt goes in
# as chat messages too; the pipeline then applies the tokenizer's chat
# template. The system prompt must stay identical to the one in make_chat.
messages = [
    {"role": "system", "content": "คุณคือผู้เชี่ยวชาญด้าน Privacy Policy และ Terms of Service ตอบให้ชัดเจน อ่านง่าย และอ้างอิงแหล่งที่มาเสมอ ไม่ใช่คำแนะนำทางกฎหมาย"},
    {"role": "user", "content": "Does this app sell my personal data?"},
]

test = pipe(messages, max_new_tokens=200)
# For chat input, "generated_text" is the whole conversation; the last message
# is the newly generated assistant reply.
print(test[0]["generated_text"][-1]["content"])