In [22]:
import pandas as pd
import re
import time
from transformers import PegasusTokenizer, PegasusForConditionalGeneration, Trainer, TrainingArguments
from rouge import Rouge
from bert_score import score
import torch
import os
from transformers import TrainerCallback
import sys

In [16]:
os.environ["WANDB_MODE"] = "disabled"

In [17]:
# Set device
device = "cuda" if torch.cuda.is_available() else "cpu"

# Initialize Pegasus model and tokenizer
model_name = "nsi319/legal-pegasus"
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name).to(device)

In [18]:
# Preprocessing function
def preprocess_text(text):
    """
    Preprocess text by removing extra whitespaces, empty lines, and optionally lowercasing.
    """
    if not isinstance(text, str):  # Ensure the input is a string
        text = str(text) if pd.notna(text) else ""  # Convert NaN or non-string to empty string
    text = re.sub(r'\s+', ' ', text.strip())  # Remove extra whitespaces and line breaks
    text = text.lower()  # Normalize to lowercase (optional)
    text = re.sub(r'[^\w\s.,]', '', text)  # Remove special characters except punctuation (optional)
    return text

def preprocess_dataframe(dataframe, text_columns):
    """
    Apply text preprocessing to specified columns of a dataframe.
    """
    for col in text_columns:
        dataframe[col] = dataframe[col].apply(preprocess_text)
    return dataframe

# Load data from CSV
def load_csv_data(csv_path):
    """
    Load preprocessed data from a CSV file.
    """
    return pd.read_csv(csv_path)

In [19]:
# Paths to CSV files
train_csv = '/content/train-data.csv'
test_csv = '/content/test-data.csv'

# Load and preprocess data
print("Loading train and test data from CSV files...")
train_data = load_csv_data(train_csv)
test_data = load_csv_data(test_csv)

print("Preprocessing train and test data...")
train_data = preprocess_dataframe(train_data, ['judgment_text', 'summary_text'])
test_data = preprocess_dataframe(test_data, ['judgment_text', 'summary_text'])

# Ensure no missing or empty values in the training dataset
train_data = train_data.dropna(subset=['judgment_text', 'summary_text'])
train_data = train_data[train_data['judgment_text'].str.strip() != ""]
train_data = train_data[train_data['summary_text'].str.strip() != ""]

# Ensure no missing or empty values in the testing dataset
test_data = test_data.dropna(subset=['judgment_text', 'summary_text'])
test_data = test_data[test_data['judgment_text'].str.strip() != ""]
test_data = test_data[test_data['summary_text'].str.strip() != ""]

Loading train and test data from CSV files...
Preprocessing train and test data...


In [20]:
# Tokenization
def tokenize_data(data, tokenizer):
    """
    Tokenize judgment texts and summaries for model input.
    """
    source = tokenizer(
        list(data['judgment_text']), truncation=True, padding=True, max_length=512, return_tensors="pt"
    )
    target = tokenizer(
        list(data['summary_text']), truncation=True, padding=True, max_length=150, return_tensors="pt"
    )
    return source, target

# Custom Dataset class for PyTorch
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels["input_ids"][idx])
        return item

    def __len__(self):
        return len(self.labels["input_ids"])

# Tokenize training data
print("Tokenizing training data...")
train_encodings, train_labels = tokenize_data(train_data, tokenizer)
train_dataset = CustomDataset(train_encodings, train_labels)

Tokenizing training data...


In [23]:
# Fine-tuning the model
print("Fine-tuning the model...")
training_args = TrainingArguments(
    output_dir="./results",          # Output directory
    num_train_epochs=3,              # Number of epochs
    per_device_train_batch_size=2,   # Batch size for training
    per_device_eval_batch_size=2,    # Batch size for evaluation
    warmup_steps=500,                # Number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # Strength of weight decay
    logging_dir="./logs",            # Directory for storing logs
    save_steps=1000,                 # Save checkpoint every 1000 steps
    save_total_limit=3,              # Limit the number of saved checkpoints
    evaluation_strategy="no",        # No evaluation during training
    logging_steps=100                # Log every 100 steps
)

class TrainingLogCallback(TrainerCallback):
    def __init__(self):
        self.start_time = None

    def on_train_begin(self, args, state, control, **kwargs):
        self.start_time = time.time()

    def on_step_end(self, args, state, control, **kwargs):
        elapsed_time = time.time() - self.start_time
        steps_completed = state.global_step
        total_steps = state.max_steps
        time_per_step = elapsed_time / steps_completed if steps_completed > 0 else 0
        remaining_steps = total_steps - steps_completed
        eta = remaining_steps * time_per_step

        # Overwrite the previous ETA line in the console
        sys.stdout.write(f"\rETA: {eta / 60:.2f} minutes")
        sys.stdout.flush()

        return control

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    tokenizer=tokenizer,
    callbacks=[TrainingLogCallback()]
)

start_time = time.time()
trainer.train()

# Save the fine-tuned model
print("Saving the fine-tuned model...")
model.save_pretrained("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")

Fine-tuning the model...


  trainer = Trainer(
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item["labels"] = torch.tensor(self.labels["input_ids"][idx])


ETA: 54.17 minutes

Step,Training Loss
100,2.8186
200,2.7285
300,2.6768
400,2.5444
500,2.5055
600,2.485
700,2.3916
800,2.309
900,2.3886
1000,2.3321


ETA: 37.46 minutes

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item["labels"] = torch.tensor(self.labels["input_ids"][idx])


ETA: 34.78 minutes

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item["labels"] = torch.tensor(self.labels["input_ids"][idx])


ETA: 31.10 minutes

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item["labels"] = torch.tensor(self.labels["input_ids"][idx])


ETA: 27.13 minutes

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item["labels"] = torch.tensor(self.labels["input_ids"][idx])


ETA: 23.08 minutes

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item["labels"] = torch.tensor(self.labels["input_ids"][idx])


ETA: 18.98 minutes

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item["labels"] = torch.tensor(self.labels["input_ids"][idx])


ETA: 14.86 minutes

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item["labels"] = torch.tensor(self.labels["input_ids"][idx])


ETA: 10.68 minutes

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item["labels"] = torch.tensor(self.labels["input_ids"][idx])


ETA: 6.48 minutes

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item["labels"] = torch.tensor(self.labels["input_ids"][idx])


ETA: 2.28 minutes

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item["labels"] = torch.tensor(self.labels["input_ids"][idx])


ETA: 0.00 minutesSaving the fine-tuned model...


('./fine_tuned_model/tokenizer_config.json',
 './fine_tuned_model/special_tokens_map.json',
 './fine_tuned_model/spiece.model',
 './fine_tuned_model/added_tokens.json')

In [26]:
# Generate summaries for the test set
def summarize_texts(judgment_texts):
    """
    Generate summaries using the fine-tuned model.
    """
    summaries = []
    for text in judgment_texts:
        input_tokens = tokenizer.encode(text, return_tensors="pt", truncation=True, max_length=512).to(device)
        summary_ids = model.generate(input_tokens, max_length=150, min_length=30, num_beams=5, length_penalty=2.0)
        summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=False)
        summaries.append(summary)
    return summaries

print("Generating summaries for the test set...")
test_generated_summaries = summarize_texts(test_data['judgment_text'])

# Evaluate summaries using ROUGE
def evaluate_rouge(generated_summaries, reference_summaries):
    """
    Evaluate the generated summaries using ROUGE.
    """
    rouge = Rouge()
    # Convert reference_summaries to a list to match the type of generated_summaries
    reference_summaries = reference_summaries.tolist()
    scores = rouge.get_scores(generated_summaries, reference_summaries, avg=True)
    return scores

print("Evaluating ROUGE scores...")
rouge_scores = evaluate_rouge(test_generated_summaries, test_data['summary_text'])

# Print ROUGE scores
print("ROUGE Scores:", rouge_scores)

Generating summaries for the test set...
Evaluating ROUGE scores...
ROUGE Scores: {'rouge-1': {'r': 0.1769526085975771, 'p': 0.5900190137785425, 'f': 0.2613000948437285}, 'rouge-2': {'r': 0.0715308213402306, 'p': 0.3157863594396162, 'f': 0.11147550136442692}, 'rouge-l': {'r': 0.16137592626845934, 'p': 0.544462965770142, 'f': 0.2391108732869936}}


In [27]:
def preprocess_summaries(summaries, max_length=512):
    """
    Preprocess summaries to ensure they are within the model's limits.
    """
    processed = []
    for text in summaries:
        if isinstance(text, str):
            text = text[:max_length]  # Truncate to max_length
        processed.append(text)
    return processed

def evaluate_bert_score(generated_summaries, reference_summaries):
    """
    Evaluate the generated summaries using BERTScore.
    """
    # Preprocess and truncate summaries
    generated_summaries = preprocess_summaries(generated_summaries)
    reference_summaries = preprocess_summaries(reference_summaries.tolist())

    try:
        P, R, F1 = score(generated_summaries, reference_summaries, lang="en", verbose=True)
        return {
            "precision": P.mean().item(),
            "recall": R.mean().item(),
            "f1": F1.mean().item()
        }
    except Exception as e:
        print(f"Error during BERTScore evaluation: {e}")
        return {"precision": 0.0, "recall": 0.0, "f1": 0.0}

print("Evaluating BERTScore...")
bert_scores = evaluate_bert_score(test_generated_summaries, test_data['summary_text'])

# Print BERTScore
print("BERTScore:", bert_scores)


Evaluating BERTScore...


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/4 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/2 [00:00<?, ?it/s]

done in 1.19 seconds, 84.30 sentences/sec
BERTScore: {'precision': 0.8552790880203247, 'recall': 0.8572418093681335, 'f1': 0.8562067151069641}
