<a href="https://colab.research.google.com/github/Midas0901/Poom/blob/main/ToS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import shutil
import sys

folder_path = '/content/'

# Empty the working directory so the notebook starts from a clean slate.
# A missing directory (for example when this runs outside Colab) is reported
# instead of crashing inside os.listdir.
if os.path.isdir(folder_path):
    for filename in os.listdir(folder_path):
        file_path = os.path.join(folder_path, filename)
        try:
            # Links are tested before directories: os.path.isdir follows
            # symlinks, and a link must be unlinked rather than handed to
            # rmtree.
            if os.path.isfile(file_path) or os.path.islink(file_path):
                os.unlink(file_path)
            elif os.path.isdir(file_path):
                shutil.rmtree(file_path)
        except OSError as e:
            # Best effort: report the failure and keep going with the rest.
            print(f'Failed to delete {file_path}. Reason: {e}')

    # NOTE: sys.argv[0] is deliberately NOT unlinked. In a notebook kernel it
    # typically points at the kernel launcher rather than at this notebook, so
    # removing it can break the runtime; and a script that lives inside
    # folder_path has already been removed by the loop above.
    print(f"All files and folders in {folder_path} have been deleted.")
else:
    print(f"{folder_path} does not exist; nothing to delete.")


In [None]:
# Cell 2
import os
import torch
import numpy as np
import pandas as pd

# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Device:", device)

In [None]:
# Cell 3
from datasets import load_dataset

# If you need to login with huggingface-cli, do it first (not required if dataset is public)
# !huggingface-cli login

# Load the dataset by its Hugging Face Hub identifier.
ds = load_dataset("MeeraR/legal-qa-dataset")

# Keep at most the first 1200 training rows. Capping by the split size avoids
# the error select() raises when asked for indices past the end of the split.
train_split = ds["train"]
dataset = train_split.select(range(min(1200, len(train_split))))
print("Number of samples loaded:", len(dataset))
# Bare expression: displays the first row when run as a notebook cell.
dataset[0]


In [None]:
# Cell 4
# Show which columns the loaded dataset exposes.
print("Columns in dataset:", dataset.column_names)

# The text column is assumed to be "question"; change the key if the dataset
# stores its text under a different name. A missing key yields None.
first_row = dataset[0]
sample_text = first_row.get("question")

print("Example text:", sample_text)


In [None]:
# Cell 5
# Category name -> integer class id. If the dataset carries real labels,
# replace the placeholder labelling below with the actual label column.
labels_map = {
    category: class_id
    for class_id, category in enumerate((
        "Forced Monthly Payments",
        "Data Collection & Sharing with third parties",
        "Unfair Control Over Your Content",
        "No Refund Policy",
    ))
}

# Demonstration only: labels simply cycle through the class ids by row
# position, so they carry no relationship to the text itself.
questions = [row.get("question", "") for row in dataset]
df = pd.DataFrame({
    "text": questions,
    "label": [position % len(labels_map) for position in range(len(dataset))],
})

print(df.shape)
# Bare expression: displays the first rows when run as a notebook cell.
df.head()


In [None]:
# Cell 6
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation. The split is stratified on the
# label column and seeded so it is reproducible between runs.
text_array = df["text"].values
label_array = df["label"].values
train_texts, test_texts, train_labels, test_labels = train_test_split(
    text_array,
    label_array,
    test_size=0.2,
    random_state=42,
    stratify=label_array,
)

print("Train size:", len(train_texts), "Test size:", len(test_texts))


In [None]:
# Cell 7
from transformers import AutoTokenizer

# Load the tokenizer published under the "bert-base-uncased" name.
tokenizer_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)


def encode_batch(texts, max_length=128):
    """Tokenize a collection of strings into one padded tensor batch.

    Args:
        texts: Iterable of strings; it is copied into a list before being
            handed to the module-level ``tokenizer``.
        max_length: Maximum sequence length; truncation is enabled, so longer
            inputs are cut to this length.

    Returns:
        The tokenizer's output, requested as PyTorch tensors ("pt") with
        padding enabled.
    """
    tokenizer_options = {
        "padding": True,
        "truncation": True,
        "max_length": max_length,
        "return_tensors": "pt",
    }
    return tokenizer(list(texts), **tokenizer_options)


# Each partition is encoded in its own call.
train_enc = encode_batch(train_texts)
test_enc = encode_batch(test_texts)

# Check the shape
print("train input_ids shape:", train_enc["input_ids"].shape)


In [None]:
# Cell 8
import torch
from torch.utils.data import Dataset, DataLoader

class ToSDataset(Dataset):
    """Map-style dataset pairing pre-tokenized encodings with integer labels.

    Args:
        encodings: Mapping from field name to an indexable batch (one entry
            per sample along the first axis).
        labels: Indexable sequence of class ids, one per sample.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The label sequence defines how many samples exist.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return every encoding field at ``idx`` plus a ``"labels"`` tensor."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]
        # int() normalizes the stored label (e.g. a NumPy integer) before it
        # is wrapped as a 0-d long tensor.
        sample["labels"] = torch.tensor(int(self.labels[idx]), dtype=torch.long)
        return sample

batch_size = 16

# Training data: shuffling is enabled.
train_dataset = ToSDataset(train_enc, train_labels)
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)

# Evaluation data: shuffling is disabled so the order stays fixed.
test_dataset = ToSDataset(test_enc, test_labels)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=batch_size)

print("DataLoaders are ready")


In [None]:
# Cell 9
import torch.nn as nn

class LSTMClassifier(nn.Module):
    """Embedding -> LSTM -> dropout -> linear text classifier.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Size of each token embedding.
        hidden_dim: Size of the LSTM hidden state.
        num_labels: Number of output classes.
        padding_idx: Token id passed to ``nn.Embedding`` as ``padding_idx``.
            When omitted, the module-level ``tokenizer``'s ``pad_token_id``
            is used, as before.
    """

    def __init__(self, vocab_size, embed_dim=128, hidden_dim=128, num_labels=4, padding_idx=None):
        super().__init__()
        if padding_idx is None:
            padding_idx = tokenizer.pad_token_id
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim, padding_idx=padding_idx)
        self.lstm = nn.LSTM(input_size=embed_dim, hidden_size=hidden_dim, batch_first=True)
        self.dropout = nn.Dropout(0.2)
        self.fc = nn.Linear(hidden_dim, num_labels)

    def forward(self, input_ids, attention_mask=None):
        """Return class logits of shape (batch, num_labels).

        Args:
            input_ids: Token ids, (batch, seq_len).
            attention_mask: Optional (batch, seq_len) mask with 1 for real
                tokens and 0 for padding. Assumes padding is on the right
                (each row is 1s followed by 0s), which is how the tokenizer
                used in this notebook pads by default -- confirm if the
                tokenizer is changed.
        """
        x = self.embedding(input_ids)                 # (batch, seq_len, embed_dim)
        if attention_mask is not None:
            # Pack the batch so the LSTM stops at each sequence's last real
            # token. Merely zeroing the padded embeddings still lets the LSTM
            # step through the padding, which makes the final hidden state
            # depend on how much padding the batch happens to contain.
            # Lengths must live on the CPU; clamp keeps an all-zero mask row
            # from producing an invalid zero length.
            lengths = attention_mask.sum(dim=1).clamp(min=1).long().cpu()
            x = nn.utils.rnn.pack_padded_sequence(
                x, lengths, batch_first=True, enforce_sorted=False
            )
        _, (h_n, _) = self.lstm(x)                   # h_n: (num_layers * num_dirs, batch, hidden_dim)
        h_last = self.dropout(h_n[-1])               # (batch, hidden_dim)
        return self.fc(h_last)                       # (batch, num_labels)

# Size the embedding table from len(tokenizer) rather than
# tokenizer.vocab_size: the latter reports only the base vocabulary, while
# len() also counts added tokens, so every id the tokenizer can emit gets a
# row in the embedding matrix.
vocab_size = len(tokenizer)
model = LSTMClassifier(vocab_size=vocab_size, num_labels=len(labels_map)).to(device)
print(model)


In [None]:
# Cell 10
import torch.optim as optim
from tqdm.auto import tqdm
from sklearn.metrics import accuracy_score

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)
num_epochs = 3

for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0   # sum of the per-batch losses seen so far this epoch
    num_batches = 0
    all_preds = []
    all_labels = []
    loop = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}", leave=False)
    for batch in loop:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch.get("attention_mask", None)
        if attention_mask is not None:
            attention_mask = attention_mask.to(device)
        labels = batch["labels"].to(device)

        optimizer.zero_grad()
        logits = model(input_ids, attention_mask=attention_mask)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        num_batches += 1
        all_preds.extend(torch.argmax(logits, dim=1).tolist())
        all_labels.extend(labels.tolist())

        # Running mean over the batches actually processed. Dividing by
        # len(all_preds) / batch_size instead yields a fractional batch count
        # whenever the last batch is smaller than batch_size.
        loop.set_postfix(loss=total_loss / num_batches)

    train_acc = accuracy_score(all_labels, all_preds)
    # Report the mean batch loss so the number does not scale with the number
    # of batches in the epoch; max() guards an empty loader.
    mean_loss = total_loss / max(num_batches, 1)
    print(f"Epoch {epoch+1} => loss: {mean_loss:.4f}, acc: {train_acc:.4f}")


In [None]:
# Cell 11
from sklearn.metrics import classification_report

# Switch to inference mode (disables dropout) and collect predictions for the
# whole test set without tracking gradients.
model.eval()
preds = []
trues = []

with torch.no_grad():
    for batch in tqdm(test_loader, desc="Evaluating", leave=False):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch.get("attention_mask", None)
        if attention_mask is not None:
            attention_mask = attention_mask.to(device)

        logits = model(input_ids, attention_mask=attention_mask)
        preds.extend(torch.argmax(logits, dim=1).tolist())
        # Labels are only compared on the host, so they never need to move
        # to the device.
        trues.extend(batch["labels"].tolist())

# Pass the class ids explicitly and derive the names in id order. Without
# labels=, classification_report infers the classes from the data and raises
# when a class is absent from both trues and preds; and target_names must
# line up with the ids rather than with the dict's insertion order.
ordered_classes = sorted(labels_map.items(), key=lambda pair: pair[1])
print(classification_report(
    trues,
    preds,
    labels=[class_id for _, class_id in ordered_classes],
    target_names=[name for name, _ in ordered_classes],
))


In [None]:
# Cell 12
def classify_tos(text, max_length=128):
    """Return the predicted category name for one Terms-of-Service snippet.

    Args:
        text: The text to classify.
        max_length: Maximum token length passed to ``encode_batch``; longer
            inputs are truncated.

    Returns:
        The key of ``labels_map`` whose class id equals the model's argmax
        prediction.

    Note:
        Puts the module-level ``model`` into eval mode as a side effect.
    """
    model.eval()
    # Reuse the shared helper so inference tokenizes exactly like training.
    enc = encode_batch([text], max_length=max_length)
    input_ids = enc["input_ids"].to(device)
    attention_mask = enc.get("attention_mask", None)
    if attention_mask is not None:
        attention_mask = attention_mask.to(device)
    with torch.no_grad():
        logits = model(input_ids, attention_mask=attention_mask)
    pred = int(torch.argmax(logits, dim=1).item())
    # Look the name up by class id instead of by position in the dict's key
    # order, which only matches the ids by coincidence of insertion order.
    id_to_label = {class_id: name for name, class_id in labels_map.items()}
    return id_to_label[pred]

# Examples
for example_text in (
    "We may sell your personal information",
    "You must pay every month even if you don't use the service",
):
    print(classify_tos(example_text))


In [None]:
# Cell: Fast Interactive Model Testing

import torch

model.eval()  # inference mode; set once because nothing below trains

# Build the id -> name lookup once, by value, rather than indexing into the
# dict's key order on every prediction.
id_to_label = {class_id: name for name, class_id in labels_map.items()}

print("Start testing the model (type 'exit' to quit)\n")

with torch.no_grad():  # no gradients are needed anywhere in the loop
    while True:
        try:
            user_input = input("Enter Terms of Service text:\n").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or an interrupted cell ends the session cleanly
            # instead of surfacing a traceback.
            print("Exited testing.")
            break

        # Input is stripped above, so "exit" with stray whitespace also quits.
        if user_input.lower() == "exit":
            print("Exited testing.")
            break
        if not user_input:
            # Nothing to classify; prompt again.
            continue

        enc = tokenizer([user_input], padding=True, truncation=True, max_length=128, return_tensors="pt")
        input_ids = enc["input_ids"].to(device)
        attention_mask = enc.get("attention_mask", None)
        if attention_mask is not None:
            attention_mask = attention_mask.to(device)

        logits = model(input_ids, attention_mask=attention_mask)
        pred = int(torch.argmax(logits, dim=1).item())
        predicted_label = id_to_label[pred]

        print(f"Model predicted ToS category: {predicted_label}\n")
