In [5]:
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
import torch
import os

class LegalSumDataset(Dataset):
    def __init__(self, texts, summaries):
        self.texts = texts
        self.summaries = summaries
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        summary = self.summaries[idx]
        encoding = self.tokenizer(text, summary, return_tensors='pt', max_length=512, truncation=True, padding=True)
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': encoding['input_ids'].flatten()
        }

# Load test data and summaries
test_data_path = 'dataset/IN-Abs/test data/judgement'
summary_path = 'dataset/IN-Abs/test data/summary'

texts = []
summaries = []

for file_name in os.listdir(test_data_path):
    if file_name.endswith('.txt'):
        with open(os.path.join(test_data_path, file_name), 'r', encoding='utf-8') as file:
            text = file.read()
            texts.append(text)

for file_name in os.listdir(summary_path):
    if file_name.endswith('.txt'):
        with open(os.path.join(summary_path, file_name), 'r', encoding='utf-8') as file:
            summary = file.read()
            summaries.append(summary)

# Create dataset and dataloader
dataset = LegalSumDataset(texts, summaries)
dataloader = DataLoader(dataset, batch_size=4, shuffle=True)

# Set up model, optimizer, and device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)  # Adjust num_labels based on your task
model.to(device)

optimizer = AdamW(model.parameters(), lr=1e-5)

# Fine-tune the model
num_epochs = 3  # Adjust as needed

for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for batch in tqdm(dataloader, desc=f'Epoch {epoch + 1}'):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    average_loss = total_loss / len(dataloader)
    print(f'Epoch {epoch + 1}, Average Loss: {average_loss}')

# Save the fine-tuned model
model.save_pretrained('fine_tuned_legal_sum_model')


FileNotFoundError: [WinError 3] The system cannot find the path specified: 'dataset/IN-Abs/test data/judgement'