In [1]:
pip install transformers datasets torch


Note: you may need to restart the kernel to use updated packages.


In [2]:
import json
from datasets import Dataset

# Combine datasets into a single list
files = ['constitution_qa.json', 'crpc_qa.json', 'ipc_qa.json']

def load_datasets(file_list):
    """Load and combine datasets."""
    data = []
    for file in file_list:
        try:
            with open(file, 'r', encoding='utf-8') as f:
                data.extend(json.load(f))
        except Exception as e:
            print(f"Error loading {file}: {e}")
    return data

# Load and format the dataset
data = load_datasets(files)
formatted_data = [{"question": item["question"], "answer": item["answer"]} for item in data]

# Convert to Hugging Face Dataset format
hf_dataset = Dataset.from_list(formatted_data)
hf_dataset = hf_dataset.train_test_split(test_size=0.2)

print(hf_dataset)


DatasetDict({
    train: Dataset({
        features: ['question', 'answer'],
        num_rows: 11634
    })
    test: Dataset({
        features: ['question', 'answer'],
        num_rows: 2909
    })
})


In [3]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, Trainer, TrainingArguments
from datasets import Dataset
import torch

# Load pre-trained model and tokenizer
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

# Tokenize the dataset
def preprocess_data(batch):
    inputs = tokenizer(
        batch["question"],
        max_length=128,
        truncation=True,
        padding="max_length",
    )
    outputs = tokenizer(
        batch["answer"],
        max_length=128,
        truncation=True,
        padding="max_length",
    )
    inputs["labels"] = outputs["input_ids"]
    return inputs

tokenized_dataset = hf_dataset.map(preprocess_data, batched=True)

# Set training arguments
training_args = TrainingArguments(
    output_dir="./legal_model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    save_total_limit=2,
)

# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
)

# Fine-tune the model
trainer.train()

# Save the model
trainer.save_model("./legal_fine_tuned_model")
tokenizer.save_pretrained("./legal_fine_tuned_model")


  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/11634 [00:00<?, ? examples/s]

Map:   0%|          | 0/2909 [00:00<?, ? examples/s]

TypeError: Accelerator.__init__() got an unexpected keyword argument 'dispatch_batches'

In [4]:
pip install --upgrade transformers accelerate


Collecting transformers
  Obtaining dependency information for transformers from https://files.pythonhosted.org/packages/7b/9f/92d3091c44cb19add044064af1bf1345cd35fbb84d32a3690f912800a295/transformers-4.48.1-py3-none-any.whl.metadata
Note: you may need to restart the kernel to use updated packages.


ERROR: Could not install packages due to an OSError: HTTPSConnectionPool(host='files.pythonhosted.org', port=443): Max retries exceeded with url: /packages/7b/9f/92d3091c44cb19add044064af1bf1345cd35fbb84d32a3690f912800a295/transformers-4.48.1-py3-none-any.whl.metadata (Caused by NewConnectionError('<pip._vendor.urllib3.connection.HTTPSConnection object at 0x000001685ACACD90>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))



In [5]:
pip show transformers accelerate


Name: transformers
Version: 4.32.1
Summary: State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow
Home-page: https://github.com/huggingface/transformers
Author: The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)
Author-email: transformers@huggingface.co
License: Apache 2.0 License
Location: C:\Users\mruna\anaconda3\Lib\site-packages
Requires: filelock, huggingface-hub, numpy, packaging, pyyaml, regex, requests, safetensors, tokenizers, tqdm
Required-by: 
---
Name: accelerate
Version: 1.3.0
Summary: Accelerate
Home-page: https://github.com/huggingface/accelerate
Author: The HuggingFace team
Author-email: zach.mueller@huggingface.co
License: Apache
Location: C:\Users\mruna\anaconda3\Lib\site-packages
Requires: huggingface-hub, numpy, packaging, psutil, pyyaml, safetensors, torch
Required-by: 
Note: you may need to restart the kernel to use updated packages.


In [6]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, Trainer, TrainingArguments
from datasets import Dataset

# Load pre-trained model and tokenizer
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

# Tokenize the dataset
def preprocess_data(batch):
    inputs = tokenizer(
        batch["question"],
        max_length=128,
        truncation=True,
        padding="max_length",
    )
    outputs = tokenizer(
        batch["answer"],
        max_length=128,
        truncation=True,
        padding="max_length",
    )
    inputs["labels"] = outputs["input_ids"]
    return inputs

# Sample dataset structure
dataset = {
    "train": [
        {"question": "What is Article 14?", "answer": "Article 14 states equality before the law."},
        {"question": "What is CrPC Section 41?", "answer": "Section 41 mentions arrests without a warrant."},
    ],
    "test": [
        {"question": "What is Article 21?", "answer": "Article 21 guarantees the right to life and liberty."},
    ],
}

# Convert dataset to Hugging Face Dataset format
hf_dataset = Dataset.from_dict(dataset)
hf_dataset = hf_dataset.train_test_split(test_size=0.2)

# Preprocess the dataset
tokenized_dataset = hf_dataset.map(preprocess_data, batched=True)

# Set training arguments
training_args = TrainingArguments(
    output_dir="./legal_model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    save_total_limit=2,
)

# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
)

# Fine-tune the model
trainer.train()

# Save the model
trainer.save_model("./legal_fine_tuned_model")
tokenizer.save_pretrained("./legal_fine_tuned_model")


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ArrowInvalid: Column 1 named test expected length 2 but got length 1

In [7]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, Trainer, TrainingArguments
from datasets import Dataset, DatasetDict

# Load pre-trained model and tokenizer
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

# Sample dataset
train_data = [
    {"question": "What is Article 14?", "answer": "Article 14 states equality before the law."},
    {"question": "What is CrPC Section 41?", "answer": "Section 41 mentions arrests without a warrant."},
]
test_data = [
    {"question": "What is Article 21?", "answer": "Article 21 guarantees the right to life and liberty."},
]

# Convert dataset to Hugging Face Dataset format
hf_dataset = DatasetDict({
    "train": Dataset.from_dict({"question": [d["question"] for d in train_data], "answer": [d["answer"] for d in train_data]}),
    "test": Dataset.from_dict({"question": [d["question"] for d in test_data], "answer": [d["answer"] for d in test_data]}),
})

# Preprocess the dataset
def preprocess_data(batch):
    inputs = tokenizer(
        batch["question"],
        max_length=128,
        truncation=True,
        padding="max_length",
    )
    outputs = tokenizer(
        batch["answer"],
        max_length=128,
        truncation=True,
        padding="max_length",
    )
    inputs["labels"] = outputs["input_ids"]
    return inputs

tokenized_dataset = hf_dataset.map(preprocess_data, batched=True)

# Set training arguments
training_args = TrainingArguments(
    output_dir="./legal_model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    save_total_limit=2,
)

# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
)

# Fine-tune the model
trainer.train()

# Save the model
trainer.save_model("./legal_fine_tuned_model")
tokenizer.save_pretrained("./legal_fine_tuned_model")


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

TypeError: Accelerator.__init__() got an unexpected keyword argument 'dispatch_batches'

In [8]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)


In [9]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, Trainer, TrainingArguments
from datasets import Dataset, DatasetDict
import warnings

# Suppress warnings
warnings.filterwarnings("ignore", category=FutureWarning)

# Load pre-trained model and tokenizer
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

# Sample dataset
train_data = [
    {"question": "What is Article 14?", "answer": "Article 14 states equality before the law."},
    {"question": "What is CrPC Section 41?", "answer": "Section 41 mentions arrests without a warrant."},
]
test_data = [
    {"question": "What is Article 21?", "answer": "Article 21 guarantees the right to life and liberty."},
]

# Convert dataset to Hugging Face Dataset format
hf_dataset = DatasetDict({
    "train": Dataset.from_dict({"question": [d["question"] for d in train_data], "answer": [d["answer"] for d in train_data]}),
    "test": Dataset.from_dict({"question": [d["question"] for d in test_data], "answer": [d["answer"] for d in test_data]}),
})

# Preprocess the dataset
def preprocess_data(batch):
    inputs = tokenizer(
        batch["question"],
        max_length=128,
        truncation=True,
        padding="max_length",
    )
    outputs = tokenizer(
        batch["answer"],
        max_length=128,
        truncation=True,
        padding="max_length",
    )
    inputs["labels"] = outputs["input_ids"]
    return inputs

tokenized_dataset = hf_dataset.map(preprocess_data, batched=True)

# Set training arguments
training_args = TrainingArguments(
    output_dir="./legal_model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    save_total_limit=2,
)

# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
)

# Fine-tune the model
trainer.train()

# Save the model
trainer.save_model("./legal_fine_tuned_model")
tokenizer.save_pretrained("./legal_fine_tuned_model")


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

TypeError: Accelerator.__init__() got an unexpected keyword argument 'dispatch_batches'

In [10]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from torch.utils.data import DataLoader
from torch.optim import AdamW
import torch

# Load model and tokenizer
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

# Sample dataset
train_data = [
    {"question": "What is Article 14?", "answer": "Article 14 states equality before the law."},
    {"question": "What is CrPC Section 41?", "answer": "Section 41 mentions arrests without a warrant."},
]
test_data = [
    {"question": "What is Article 21?", "answer": "Article 21 guarantees the right to life and liberty."},
]

# Preprocess dataset
def preprocess_data(data, tokenizer):
    inputs = tokenizer(
        [item["question"] for item in data],
        max_length=128,
        truncation=True,
        padding="max_length",
        return_tensors="pt",
    )
    outputs = tokenizer(
        [item["answer"] for item in data],
        max_length=128,
        truncation=True,
        padding="max_length",
        return_tensors="pt",
    )
    inputs["labels"] = outputs["input_ids"]
    return inputs

train_dataset = preprocess_data(train_data, tokenizer)
test_dataset = preprocess_data(test_data, tokenizer)

# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)

# Define optimizer and loss function
optimizer = AdamW(model.parameters(), lr=2e-5)
loss_fn = torch.nn.CrossEntropyLoss()

# Fine-tune model
model.train()
for epoch in range(3):  # Number of epochs
    for batch in train_loader:
        optimizer.zero_grad()
        outputs = model(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"],
            labels=batch["labels"],
        )
        loss = outputs.loss
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch + 1} completed.")

# Save the model
model.save_pretrained("./legal_fine_tuned_model")
tokenizer.save_pretrained("./legal_fine_tuned_model")


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


TypeError: default_collate: batch must contain tensors, numpy arrays, numbers, dicts or lists; found <class 'tokenizers.Encoding'>

In [11]:
def custom_collate_fn(batch):
    input_ids = [item["input_ids"] for item in batch]
    attention_mask = [item["attention_mask"] for item in batch]
    labels = [item["labels"] for item in batch]

    # Pad sequences to a common maximum length
    input_ids = pad_sequence(input_ids, batch_first=True, padding_value=tokenizer.pad_token_id)
    attention_mask = pad_sequence(attention_mask, batch_first=True, padding_value=0)
    labels = pad_sequence(labels, batch_first=True, padding_value=-100)  # -100 for padding

    return {
        "input_ids": torch.tensor(input_ids),
        "attention_mask": torch.tensor(attention_mask),
        "labels": torch.tensor(labels),
    }

In [12]:
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True, collate_fn=custom_collate_fn)

In [13]:
def custom_collate_fn(batch):
    input_ids = [item["input_ids"] for item in batch]
    attention_mask = [item["attention_mask"] for item in batch]
    labels = [item["labels"] for item in batch]

    # Pad sequences to a common maximum length
    input_ids = pad_sequence(input_ids, batch_first=True, padding_value=tokenizer.pad_token_id)
    attention_mask = pad_sequence(attention_mask, batch_first=True, padding_value=0)
    labels = pad_sequence(labels, batch_first=True, padding_value=-100)  # -100 for padding

    return {
        "input_ids": torch.tensor(input_ids),
        "attention_mask": torch.tensor(attention_mask),
        "labels": torch.tensor(labels),
    }

In [14]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from torch.utils.data import DataLoader, Dataset
from torch.optim import AdamW
import torch
from torch.nn.utils.rnn import pad_sequence

# Load model and tokenizer
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

# Sample dataset
train_data = [
    {"question": "What is Article 14?", "answer": "Article 14 states equality before the law."},
    {"question": "What is CrPC Section 41?", "answer": "Section 41 mentions arrests without a warrant."},
]
test_data = [
    {"question": "What is Article 21?", "answer": "Article 21 guarantees the right to life and liberty."},
]

# Option 1: Using a custom collate function (less preferred, but demonstrating the concept)
def custom_collate_fn(batch):
    input_ids = [item["input_ids"] for item in batch]
    attention_mask = [item["attention_mask"] for item in batch]
    labels = [item["labels"] for item in batch]

    # Pad sequences to a common maximum length
    input_ids = pad_sequence(input_ids, batch_first=True, padding_value=tokenizer.pad_token_id)
    attention_mask = pad_sequence(attention_mask, batch_first=True, padding_value=0)
    labels = pad_sequence(labels, batch_first=True, padding_value=-100)  # Important for CrossEntropyLoss

    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels,
    }


# Option 2: Using a custom Dataset class (Recommended)
class LegalQADataset(Dataset):
    def __init__(self, data, tokenizer):
        self.data = data
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        encoding = self.tokenizer(
            item["question"], item["answer"], truncation=True, max_length=128, padding="max_length", return_tensors="pt"
        )
        encoding["labels"] = encoding["input_ids"].clone() # Important: copy input_ids to labels
        return encoding

# Choose either Option 1 or Option 2:
# Using Option 2 (Recommended):
train_dataset = LegalQADataset(train_data, tokenizer)
test_dataset = LegalQADataset(test_data, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)

# Define optimizer and loss function
optimizer = AdamW(model.parameters(), lr=2e-5)

# Fine-tune model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device) # Move model to device
model.train()

for epoch in range(3):  # Number of epochs
    for batch in train_loader:
        batch = {k: v.to(device) for k, v in batch.items()} # Move batch to device
        optimizer.zero_grad()
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch + 1} completed.")

# Save the model
model.save_pretrained("./legal_fine_tuned_model")
tokenizer.save_pretrained("./legal_fine_tuned_model")

print("Training finished. Model saved.")

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


TypeError: BertForQuestionAnswering.forward() got an unexpected keyword argument 'labels'

In [15]:
pip install transformers datasets torch




In [19]:
import json
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from torch.utils.data import DataLoader, Dataset
from torch.optim import AdamW
import torch
from torch.nn.utils.rnn import pad_sequence

# Model name
model_name = "bert-base-uncased"  # Or any other suitable QA model

# Load and format the dataset
files = ['constitution_qa.json', 'crpc_qa.json', 'ipc_qa.json']

def load_datasets(file_list):
    """Load and combine datasets."""
    data = []
    for file in file_list:
        try:
            with open(file, 'r', encoding='utf-8') as f:
                data.extend(json.load(f))
        except FileNotFoundError:
            print(f"Error: File not found: {file}")
        except json.JSONDecodeError:
            print(f"Error: Invalid JSON format in: {file}")
        except Exception as e:
            print(f"An unexpected error occurred while loading {file}: {e}")
    return data

data = load_datasets(files)

if not data:  # Check if data loading was successful
    print("No data loaded. Exiting.")
    exit()

formatted_data = [{"question": item["question"], "answer": item["answer"]} for item in data]

# Convert to Hugging Face Dataset format
hf_dataset = Dataset.from_list(data)
hf_dataset = hf_dataset.train_test_split(test_size=0.2)

print(hf_dataset)

# Initialize tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

# Custom Dataset class
class LegalQADataset(Dataset):
    def __init__(self, data, tokenizer):
        self.data = data
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        encoding = self.tokenizer(
            item["question"], item["answer"], truncation=True, max_length=128, padding="max_length", return_tensors="pt"
        )
        encoding["labels"] = encoding["input_ids"].clone()
        return encoding

# Create datasets and dataloaders
train_dataset = LegalQADataset(hf_dataset["train"], tokenizer)
test_dataset = LegalQADataset(hf_dataset["test"], tokenizer)

train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=2) # No need to shuffle the test data

# Define optimizer
optimizer = AdamW(model.parameters(), lr=2e-5)

# Training loop
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.train()

for epoch in range(3):
    for batch in train_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        optimizer.zero_grad()
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch + 1} completed.")

# Save the model
model.save_pretrained("./legal_fine_tuned_model")
tokenizer.save_pretrained("./legal_fine_tuned_model")

print("Training finished. Model saved.")

AttributeError: type object 'Dataset' has no attribute 'from_list'

In [17]:
pip install --upgrade datasets

Collecting datasets
  Obtaining dependency information for datasets from https://files.pythonhosted.org/packages/d7/84/0df6c5981f5fc722381662ff8cfbdf8aad64bec875f75d80b55bfef394ce/datasets-3.2.0-py3-none-any.whl.metadata
Note: you may need to restart the kernel to use updated packages.


ERROR: Could not install packages due to an OSError: HTTPSConnectionPool(host='files.pythonhosted.org', port=443): Max retries exceeded with url: /packages/d7/84/0df6c5981f5fc722381662ff8cfbdf8aad64bec875f75d80b55bfef394ce/datasets-3.2.0-py3-none-any.whl.metadata (Caused by NewConnectionError('<pip._vendor.urllib3.connection.HTTPSConnection object at 0x0000024E48181A50>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))



In [20]:
import json
from datasets import Dataset, DatasetDict

# ... (your load_datasets function)
from datasets import Dataset

data = [
    {
        "question": "What is the short title, extent and commencement referred to in the Code of Criminal Procedure, 1973?",
        "answer": "Section 1 of Chapter I PRELIMINARY"
    },
    {
        "question": "Which section of the Code of Criminal Procedure, 1973 refers to the short title, extent and commencement?",
        "answer": "Section 1 of Chapter I PRELIMINARY"
    },
    {
        "question": "What does Section 2 of the Code of Criminal Procedure, 1973 define?",
        "answer": "Definitions"
    },
]

# Convert to dictionary format suitable for Dataset.from_dict
data_dict = {
    "question": [item["question"] for item in data],
    "answer": [item["answer"] for item in data],
}

# Create the Dataset
hf_dataset = Dataset.from_dict(data_dict)

# Optional: Train/test split
hf_dataset = hf_dataset.train_test_split(test_size=0.33, seed=42) # Example 33% test split with a fixed seed

print(hf_dataset)

# Access train and test splits
train_dataset = hf_dataset["train"]
test_dataset = hf_dataset["test"]

print("Train Dataset:")
print(train_dataset)
print("Test Dataset:")
print(test_dataset)

# Example to show how to access data:
print("\nExample Data Access:")
print("First question in train dataset:", train_dataset[0]["question"])
print("First answer in train dataset:", train_dataset[0]["answer"])

print("First question in test dataset:", test_dataset[0]["question"])
print("First answer in test dataset:", test_dataset[0]["answer"])

DatasetDict({
    train: Dataset({
        features: ['question', 'answer'],
        num_rows: 2
    })
    test: Dataset({
        features: ['question', 'answer'],
        num_rows: 1
    })
})
Train Dataset:
Dataset({
    features: ['question', 'answer'],
    num_rows: 2
})
Test Dataset:
Dataset({
    features: ['question', 'answer'],
    num_rows: 1
})

Example Data Access:
First question in train dataset: Which section of the Code of Criminal Procedure, 1973 refers to the short title, extent and commencement?
First answer in train dataset: Section 1 of Chapter I PRELIMINARY
First question in test dataset: What does Section 2 of the Code of Criminal Procedure, 1973 define?
First answer in test dataset: Definitions


In [21]:
import json
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from torch.utils.data import DataLoader, Dataset
from torch.optim import AdamW
import torch
from torch.nn.utils.rnn import pad_sequence

# Model name
model_name = "bert-base-uncased"  # Or a more suitable QA model like "deepset/bert-base-cased-squad2"

# Load and format the dataset
def load_datasets(file_list):
    data = []
    for file in file_list:
        try:
            with open(file, 'r', encoding='utf-8') as f:
                data.extend(json.load(f))
        except FileNotFoundError:
            print(f"Error: File not found: {file}")
        except json.JSONDecodeError:
            print(f"Error: Invalid JSON format in: {file}")
        except Exception as e:
            print(f"An unexpected error occurred while loading {file}: {e}")
    return data

files = ['constitution_qa.json', 'crpc_qa.json', 'ipc_qa.json']  # Replace with your file names
data = load_datasets(files)

if not data:
    print("No data loaded. Exiting.")
    exit()

formatted_data = [{"question": item["question"], "answer": item["answer"]} for item in data]

# Convert to Hugging Face Dataset format (using from_dict)
data_dict = {
    "question": [item["question"] for item in formatted_data],
    "answer": [item["answer"] for item in formatted_data],
}
hf_dataset = Dataset.from_dict(data_dict)
hf_dataset = hf_dataset.train_test_split(test_size=0.2, seed=42)

# Initialize tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

# Custom Dataset class
class LegalQADataset(Dataset):
    def __init__(self, data, tokenizer):
        self.data = data
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        encoding = self.tokenizer(
            item["question"], item["answer"], truncation=True, max_length=128, padding="max_length", return_tensors="pt"
        )
        encoding["labels"] = encoding["input_ids"].clone()  # Correct label creation
        return encoding

# Create datasets and dataloaders
train_dataset = LegalQADataset(hf_dataset["train"], tokenizer)
test_dataset = LegalQADataset(hf_dataset["test"], tokenizer)
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=2)

# Training loop
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.train()
optimizer = AdamW(model.parameters(), lr=2e-5)

for epoch in range(3):
    for batch in train_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        optimizer.zero_grad()
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch + 1} completed.")

# Save the model
model.save_pretrained("./legal_fine_tuned_model")
tokenizer.save_pretrained("./legal_fine_tuned_model")
print("Training finished. Model saved.")


# Inference/Question Answering function
def answer_question(question, context):
    inputs = tokenizer(question, context, return_tensors="pt", max_length=512, truncation="only_second")
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)

    answer_start_scores = outputs.start_logits
    answer_end_scores = outputs.end_logits

    answer_start = torch.argmax(answer_start_scores)
    answer_end = torch.argmax(answer_end_scores) + 1

    if answer_start >= len(inputs["input_ids"][0]) or answer_end > len(inputs["input_ids"][0]):
        return "Answer not found in context."

    answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs["input_ids"][0][answer_start:answer_end]))
    return answer

# Example Usage (after training)
context = """
Article 14 of the Indian Constitution ensures equality before the law. 
It states that the State shall not deny to any person equality before the law or the equal protection of the laws within the territory of India.
"""
question = "What does Article 14 ensure?"
answer = answer_question(question, context)
print(f"Question: {question}")
print(f"Answer: {answer}")

# ... More examples

AttributeError: type object 'Dataset' has no attribute 'from_dict'

In [24]:
import json
import os

# List of files to validate
files = ['constitution_qa.json', 'crpc_qa.json', 'ipc_qa.json']

def validate_dataset(file_path):
    """
    Validates the structure of a dataset to ensure all entries have `question` and `answer` fields.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except Exception as e:
        print(f"Error loading {file_path}: {e}")
        return False

    if not isinstance(data, list):
        print(f"Error in {file_path}: Root structure is not a list.")
        return False

    errors = []
    for idx, entry in enumerate(data):
        if not isinstance(entry, dict):
            errors.append((idx, "Entry is not a dictionary."))
            continue

        if 'question' not in entry:
            errors.append((idx, "Missing 'question' field."))
        elif not isinstance(entry['question'], str) or not entry['question'].strip():
            errors.append((idx, "'question' is empty or not a string."))

        if 'answer' not in entry:
            errors.append((idx, "Missing 'answer' field."))
        elif not isinstance(entry['answer'], str) or not entry['answer'].strip():
            errors.append((idx, "'answer' is empty or not a string."))

    if errors:
        print(f"Validation errors in {file_path}:")
        for idx, error in errors:
            print(f"  Entry {idx}: {error}")
        return False

    print(f"{file_path} is valid.")
    return True


# Run validation for all files
for file in files:
    if os.path.exists(file):
        print(f"Validating {file}...")
        validate_dataset(file)
        print("\n")
    else:
        print(f"File {file} not found.\n")



Validating constitution_qa.json...
constitution_qa.json is valid.


Validating crpc_qa.json...
crpc_qa.json is valid.


Validating ipc_qa.json...
ipc_qa.json is valid.




In [25]:
import json

# Load datasets
files = ['constitution_qa.json', 'crpc_qa.json', 'ipc_qa.json']

def load_datasets(file_list):
    """Load and combine datasets into a single list."""
    data = []
    for file in file_list:
        try:
            with open(file, 'r', encoding='utf-8') as f:
                data.extend(json.load(f))
        except Exception as e:
            print(f"Error loading {file}: {e}")
    return data

# Load all questions and answers
data = load_datasets(files)

# Normalize data for better matching
def normalize(text):
    """Normalize text for case-insensitive and whitespace-tolerant matching."""
    return ' '.join(text.lower().strip().split())

# Preprocess the dataset
qa_pairs = [(normalize(item['question']), item['answer']) for item in data]

def find_answer(user_query):
    """Find the best matching answer for the user's query."""
    query = normalize(user_query)
    for question, answer in qa_pairs:
        if query in question:  # Simple substring matching
            return answer
    return "Sorry, I couldn't find an answer to your question."

# Test the chatbot with sample queries
if __name__ == "__main__":
    print("Welcome to the Legal Chatbot! Ask your legal questions:")
    while True:
        user_query = input("You: ")
        if user_query.lower() in ['exit', 'quit']:
            print("Chatbot: Goodbye!")
            break
        response = find_answer(user_query)
        print(f"Chatbot: {response}")


Welcome to the Legal Chatbot! Ask your legal questions:
You: exit
Chatbot: Goodbye!


In [1]:
import json
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity

# Load datasets
files = ['constitution_qa.json', 'crpc_qa.json', 'ipc_qa.json']

def load_datasets(file_list):
    """Load and combine datasets into a single list."""
    data = []
    for file in file_list:
        try:
            with open(file, 'r', encoding='utf-8') as f:
                data.extend(json.load(f))
        except Exception as e:
            print(f"Error loading {file}: {e}")
    return data

# Load all questions and answers
data = load_datasets(files)
questions = [item['question'] for item in data]
answers = [item['answer'] for item in data]

# Load pre-trained model and tokenizer
model_name = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

def embed_text(texts):
    """Generate embeddings for a list of texts."""
    tokens = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**tokens)
        embeddings = outputs.last_hidden_state.mean(dim=1)  # Mean pooling
    return embeddings

# Compute embeddings for all questions
question_embeddings = embed_text(questions)

# Save embeddings and answers for later use
torch.save(question_embeddings, 'question_embeddings.pt')
with open('answers.json', 'w', encoding='utf-8') as f:
    json.dump(answers, f)


  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(


RuntimeError: [enforce fail at alloc_cpu.cpp:114] data. DefaultCPUAllocator: not enough memory: you tried to allocate 4199553024 bytes.

In [None]:
import json
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity

# Load datasets
files = ['constitution_qa.json', 'crpc_qa.json', 'ipc_qa.json']

def load_datasets(file_list):
    """Load and combine datasets into a single list."""
    data = []
    for file in file_list:
        try:
            with open(file, 'r', encoding='utf-8') as f:
                data.extend(json.load(f))
        except Exception as e:
            print(f"Error loading {file}: {e}")
    return data

# Load all questions and answers
data = load_datasets(files)
questions = [item['question'] for item in data]
answers = [item['answer'] for item in data]

# Load pre-trained model and tokenizer
model_name = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

def embed_text(texts):
    """Generate embeddings for a list of texts."""
    tokens = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**tokens)
        embeddings = outputs.last_hidden_state.mean(dim=1)  # Mean pooling
    return embeddings

# Compute embeddings for all questions
question_embeddings = embed_text(questions)

# Save embeddings and answers for later use
torch.save(question_embeddings, 'question_embeddings.pt')
with open('answers.json', 'w', encoding='utf-8') as f:
    json.dump(answers, f)


In [2]:
import torch
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm

# Function to embed texts in smaller batches
def embed_text_in_batches(texts, batch_size=32):
    """Generate embeddings for a list of texts in batches."""
    embeddings = []
    for i in tqdm(range(0, len(texts), batch_size)):
        batch = texts[i:i + batch_size]
        tokens = tokenizer(batch, padding=True, truncation=True, return_tensors="pt")
        with torch.no_grad():
            outputs = model(**tokens)
            batch_embeddings = outputs.last_hidden_state.mean(dim=1)  # Mean pooling
            embeddings.append(batch_embeddings)
    return torch.cat(embeddings, dim=0)

# Compute embeddings in batches
question_embeddings = embed_text_in_batches(questions, batch_size=32)

# Save embeddings and answers
torch.save(question_embeddings, 'question_embeddings.pt')
with open('answers.json', 'w', encoding='utf-8') as f:
    json.dump(answers, f)


100%|████████████████████████████████████████████████████████████████████████████████| 455/455 [01:33<00:00,  4.85it/s]


In [3]:
import torch
import json
from sklearn.metrics.pairwise import cosine_similarity

# Load precomputed embeddings and answers
question_embeddings = torch.load('question_embeddings.pt')
with open('answers.json', 'r', encoding='utf-8') as f:
    answers = json.load(f)


  question_embeddings = torch.load('question_embeddings.pt')


In [4]:
from transformers import AutoTokenizer, AutoModel

# Load model and tokenizer (ensure it matches the one used for embeddings)
model_name = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

def find_best_match(user_query):
    """
    Finds the most relevant answer to the user's query using cosine similarity.
    """
    # Generate embedding for the user query
    tokens = tokenizer(user_query, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**tokens)
        query_embedding = outputs.last_hidden_state.mean(dim=1)  # Mean pooling

    # Compute cosine similarity between query and question embeddings
    similarities = cosine_similarity(query_embedding.numpy(), question_embeddings.numpy())

    # Find the index of the most similar question
    best_match_idx = similarities.argmax()
    return answers[best_match_idx]




In [None]:
if __name__ == "__main__":
    print("Welcome to the Legal Chatbot! Ask your legal questions below.")
    print("Type 'exit' or 'quit' to end the session.\n")

    while True:
        user_query = input("You: ")
        if user_query.lower() in ["exit", "quit"]:
            print("Chatbot: Goodbye!")
            break

        # Retrieve and display the best answer
        response = find_best_match(user_query)
        print(f"Chatbot: {response}")


Welcome to the Legal Chatbot! Ask your legal questions below.
Type 'exit' or 'quit' to end the session.

You: what is rape law?
Chatbot: Rigorous imprisonment for a term which shall not be less than 20 years but which may extend to imprisonment for life, which shall mean imprisonment for the remainder of that person’s natural life and with fine.
You: what is murder law?
Chatbot: Section 300
You: what is article 42 of constitution?
Chatbot: Any law referred to in article 2 or article 3 is not deemed to be an amendment of the Constitution for the purposes of article 368.
You: union territories in india? 
Chatbot: Every Union territory shall be administered by the President acting, to such extent as he thinks fit, through an administrator to be appointed by him with such designation as he may specify.
You: How many states are there in india?
Chatbot: Assam, Meghalaya, Tripura and Mizoram
You: What is the subject of Section 5 in the Code of Criminal Procedure, 1973?
Chatbot: Court of Sessi