In [None]:
import os

import torch
from datasets import load_dataset
from transformers import BertTokenizer

In [None]:
# Location of the CUAD JSON file. Set the CUAD_JSON_PATH environment variable
# to run this notebook on another machine; when it is unset, the original
# path is used so existing behavior is unchanged.
CUAD_JSON_PATH = os.environ.get(
    "CUAD_JSON_PATH", "/Users/keshavsaraogi/data/cuad/CUAD_v1.json"
)

# NOTE(review): the upstream CUAD_v1.json release is SQuAD-style, with records
# nested under a top-level "data" key. If this file is that unmodified release,
# the JSON loader will not produce flat context/question/answers rows without
# field="data" plus a flattening step -- confirm against the actual file.
cuad = load_dataset("json", data_files={"train": CUAD_JSON_PATH})

In [None]:
# Tokenizer for the LegalBERT checkpoint; used by the preprocessing step below
# to encode both the model inputs and the answer text.
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path="nlpaueb/legal-bert-base-uncased",
)

In [None]:
# Function to preprocess data
def preprocess_data(example):
    """Tokenize a single record into fixed-length inputs and labels.

    Args:
        example: Mapping that must contain the keys "context", "question"
            and "answers". ``example["answers"]["text"]`` is indexed as a
            sequence and only its first element is used; an empty sequence
            is encoded as the empty string.

    Returns:
        ``{}`` (after printing the available keys) when any required key is
        missing. Otherwise a dict with:
            "input_ids" / "attention_mask": the context/question pair,
                padded or truncated to 512 tokens.
            "labels": token ids of the first answer text, padded or
                truncated to 128 tokens.

    NOTE(review): the labels are token ids of the answer string rather than
    start/end positions in the context -- confirm that this matches the
    training head that will consume them.
    NOTE(review): contexts longer than 512 tokens are truncated, so an answer
    located past the cut-off will no longer be present in "input_ids".
    """
    required_keys = ("context", "question", "answers")
    if not all(key in example for key in required_keys):
        print(f"Missing keys in example: {example.keys()}")
        return {}

    # Options common to both tokenizer calls; only max_length differs.
    shared_options = dict(truncation=True, padding="max_length", return_tensors="pt")

    encoded_pair = tokenizer(
        example["context"],
        example["question"],
        max_length=512,
        **shared_options,
    )

    # Only the first answer is kept; records without answers get "".
    answer_texts = example["answers"]["text"]
    if answer_texts:
        first_answer = answer_texts[0]
    else:
        first_answer = ""
    encoded_answer = tokenizer(first_answer, max_length=128, **shared_options)

    # return_tensors="pt" adds a leading batch axis of size 1; squeeze drops it.
    features = {}
    features["input_ids"] = encoded_pair["input_ids"].squeeze()
    features["attention_mask"] = encoded_pair["attention_mask"].squeeze()
    features["labels"] = encoded_answer["input_ids"].squeeze()
    return features


In [None]:
# Run preprocess_data over every record of the "train" split, one example
# at a time (non-batched map).
train_dataset = cuad["train"].map(function=preprocess_data)

In [None]:
# Persist the tokenized split for the fine-tuning step.
# NOTE(review): torch.save pickles the Dataset object itself; Dataset.save_to_disk
# is the library's own serialization format -- confirm what the downstream
# loader expects before switching.
torch.save(obj=train_dataset, f="processed_cuad.pt")

print("Dataset processing complete! Ready for fine-tuning LegalBERT.")