# LegalNLP Summarization

In [2]:
# get_ipython() gives access to the current IPython shell, which Jupyter uses under the hood.
from IPython import get_ipython

# Set the shell's output cache size to 0. The intent is to stop IPython from keeping
# each cell's output in RAM, leaving more memory for the batched training later on.
# NOTE(review): confirm that changing cache_size after shell start-up takes effect.
get_ipython().cache_size = 0


In [None]:
!pip install torch

In [3]:

# Now the below code snippet is meant to clean the current py workspace
import gc # our garbage collector
import torch

def clean_workspace():
    """Drop user-defined globals, release cached GPU memory and run the garbage collector.

    Names the kernel itself needs are preserved: everything starting with an
    underscore (``__name__``, ``__builtins__``, IPython's ``_`` history, ...),
    IPython's ``In``/``Out``/``get_ipython``/``exit``/``quit``, and the
    ``gc``/``torch``/``clean_workspace`` names used by this function.
    """
    print("Cleaning workspace...")

    keep = {"gc", "torch", "clean_workspace", "In", "Out", "get_ipython", "exit", "quit"}

    # Snapshot the names first: a dict must not change size while it is being iterated.
    for var in list(globals().keys()):
        if var in keep or var.startswith("_"):
            continue
        del globals()[var]

    # The GPU/GC clean-up runs once, after the names are gone (not once per variable).
    print("Clearing GPU memory")
    if torch.cuda.is_available():
        torch.cuda.synchronize()  # Wait for all pending GPU operations to finish
        torch.cuda.empty_cache()  # Release unused GPU memory that PyTorch was holding onto
    print("Running Garbage Collection...")
    gc.collect()

    print("Workspace Cleaned...")

clean_workspace()


ModuleNotFoundError: No module named 'torch'

In [None]:
!pip install sentence_transformers bert_score evaluate
!pip install rouge_score tqdm
!pip install nltk torch scikit-learn pandas


In [None]:
# We do see 2 T4 GPUs on kaggle but lets verify it
print(torch.cuda.is_available())

In [None]:
import pandas as pd
import numpy as np
import shutil
import random
from transformers import PegasusTokenizer, PegasusForConditionalGeneration
from datasets import Dataset
import nltk
import os
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import evaluate
from bert_score import score
from tqdm import tqdm

# Sentence Transformer for embeddings (it embeds a complete sentence rather than per-token wise)
sentence_model = SentenceTransformer("Stern5497/sbert-legal-xlm-roberta-base", device="cuda") # We use gpu since it was available
# As per my test its length is 768
# embedding = model.encode("The defendant shall appear before the court")
# print(embedding.shape) = (768,)

# Now lets create/get our train and test datasets

We collect data from multiple sources and merge them into one common dataset for summarization.

In [None]:
# Our final dataset
data_train = [] # Stores json of judgement + summary
data_test = []

## Civil Sum dataset 

In [None]:
df_civilSum_train = pd.read_csv("/kaggle/input/civilsum-dataset/CivilSum_train_set.csv")
df_civilSum_test = pd.read_csv("/kaggle/input/civilsum-dataset/CivilSum_test_set.csv")

## ILC dataset

In [None]:
df_ILC_train = pd.read_csv("/kaggle/input/ilc-dataset/ILC_train_set.csv")
df_ILC_test = pd.read_csv("/kaggle/input/ilc-dataset/ILC_test_set.csv")

In [None]:
# loading civilSum dataset

def load_data_civilSum(df, data):
    """Append one ``{"judgement", "summary"}`` record to ``data`` per row of a CivilSum frame.

    The ``text`` column supplies the judgement and the ``summary`` column its summary.
    ``data`` is extended in place; nothing is returned.
    """
    data.extend(
        {"judgement": record['text'], "summary": record['summary']}
        for _, record in df.iterrows()
    )

# loading ILC dataset

def load_data_ILC(df, data):
    """Append one ``{"judgement", "summary"}`` record to ``data`` per row of an ILC frame.

    The ``Case`` column supplies the judgement and the ``Summary`` column its summary.
    ``data`` is extended in place; nothing is returned.
    """
    records = (
        {"judgement": case_row['Case'], "summary": case_row['Summary']}
        for _, case_row in df.iterrows()
    )
    data.extend(records)

# Loading IN-Abs dataset

# judgement_folder = path to folder containing judgement text files
# summary_folder = path to folder containing summary text files
# max_files = max number of files to read

def load_data(judgement_folder, summary_folder, data, max_files=None):
    """Read paired judgement/summary text files and append them to ``data``.

    Files are paired by name: ``judgement_folder/X`` goes with ``summary_folder/X``.

    Args:
        judgement_folder: Path to the folder containing judgement text files.
        summary_folder: Path to the folder containing summary text files.
        data: List extended in place with ``{"judgement": ..., "summary": ...}`` dicts.
        max_files: Optional cap on the number of pairs to read (``None`` reads all).

    Raises:
        ValueError: If the two folders do not contain the same file names. Pairing by
            sorted position would otherwise silently attach summaries to the wrong
            judgements, or drop the unmatched tail.
    """
    judgement_files = sorted(os.listdir(judgement_folder))
    summary_files = sorted(os.listdir(summary_folder))

    if judgement_files != summary_files:
        unmatched = sorted(set(judgement_files) ^ set(summary_files))
        raise ValueError(
            "Judgement and summary folders do not contain the same files; "
            f"unmatched names (first 5): {unmatched[:5]}"
        )

    if max_files is not None:
        judgement_files = judgement_files[:max_files]

    for file_name in judgement_files:
        # Explicit encoding so the result does not depend on the platform default.
        with open(os.path.join(judgement_folder, file_name), 'r', encoding='utf-8') as f:
            judgement = f.read()
        with open(os.path.join(summary_folder, file_name), 'r', encoding='utf-8') as f:
            summary = f.read()

        # One {"judgement": judgement_text, "summary": summary_text} record per file pair
        data.append({"judgement": judgement, "summary": summary})

In [None]:
# now lets load our dataset

# Start from empty lists so that re-running this cell does not append every record twice.
data_train.clear()
data_test.clear()

load_data_civilSum(df_civilSum_train, data_train)
load_data_civilSum(df_civilSum_test, data_test)

load_data_ILC(df_ILC_train, data_train)
load_data_ILC(df_ILC_test, data_test)

# Load the IN-Abs training and test data from the judgement/summary text folders
load_data("/kaggle/input/legal-data-set/dataset/IN-Abs/train-data/judgement", "/kaggle/input/legal-data-set/dataset/IN-Abs/train-data/summary", data_train)
load_data("/kaggle/input/legal-data-set/dataset/IN-Abs/test-data/judgement", "/kaggle/input/legal-data-set/dataset/IN-Abs/test-data/summary", data_test)

# Copy every record, not just the list: list.copy() is shallow, so the dicts would be
# shared and any later in-place edit of train_data_in_abs / test_data_in_abs entries
# would also rewrite data_train / data_test.
train_data_in_abs = [dict(entry) for entry in data_train]
test_data_in_abs = [dict(entry) for entry in data_test]

In [None]:
print(len(train_data_in_abs))
print(len(test_data_in_abs))

# Extractive Summarization

We do not have a large enough dataset for extractive summarization, so we generate our own from the abstractive data.

In [None]:
# Below we use to store our extractive datasets

train_data_in_ext = []
test_data_in_ext = []

In [None]:
import nltk
from nltk.data import find

try:
    find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt', quiet=True)

### Now lets generate our extractive datasets

In [None]:
# We had already initialised our sentence transformer model
# sentence_model = SentenceTransformer("Stern5497/sbert-legal-xlm-roberta-base", device="cuda") # We use gpu since it was available

# cuda (gpu) is available, we had already checked earlier
device = "cuda"

# lets move the sentence_model to gpu
sentence_model = sentence_model.to(device)

# Maximum token limit for extractive summary output
MAX_TOKENS = 1024

# Function to split a judgement text into sentences using NLTK
def split_into_sentences(judgement):
    """Return the list of sentences that NLTK's ``sent_tokenize`` finds in ``judgement``."""
    sentences = nltk.sent_tokenize(judgement)
    return sentences

# Function to get embeddings for each sentence (using GPU/CPU)
def get_sentence_embeddings(sentences):
    """Encode ``sentences`` with the global ``sentence_model`` and return the result as a tensor.

    Encoding runs on the global ``device`` in batches of 32 with the progress bar off.
    """
    encode_options = {
        "convert_to_tensor": True,
        "show_progress_bar": False,
        "device": device,
        "batch_size": 32,
    }
    return sentence_model.encode(sentences, **encode_options)

# Function to calculate similarity score between sentences and the abstractive summary (using GPU/CPU)
def calculate_similarity_score(judgement_sentences, summary, sentence_embeddings):
    """Return the cosine similarity between each sentence embedding and the summary's embedding.

    ``judgement_sentences`` is accepted but not used by the computation. The summary is
    encoded as a single-element batch with the global ``sentence_model``.
    """
    reference_embedding = sentence_model.encode(
        [summary],
        convert_to_tensor=True,
        show_progress_bar=False,
        device=device,
        batch_size=32,
    )
    cosine = torch.nn.functional.cosine_similarity
    return cosine(sentence_embeddings, reference_embedding)

# Function to create extractive summarization based on similarity score
def generate_extractive_summary(judgement, summary, max_tokens=MAX_TOKENS):
    """Build an extractive summary from the judgement sentences most similar to ``summary``.

    Sentences are ranked by cosine similarity to the reference summary and picked
    greedily, best first, while the running word count stays within ``max_tokens``.
    The picked sentences are then emitted in their original document order.

    Args:
        judgement: Full judgement text.
        summary: Reference (abstractive) summary the sentences are scored against.
        max_tokens: Budget for the result, measured in whitespace-separated words as an
            approximation of the token count.

    Returns:
        The selected sentences joined by single spaces; ``""`` when the judgement
        contains no sentences.
    """
    # Step 1: Split the judgement into sentences
    sentences = split_into_sentences(judgement)
    if not sentences:
        # Nothing to embed or rank.
        return ""

    # Step 2: Get embeddings for each sentence
    sentence_embeddings = get_sentence_embeddings(sentences)

    # Step 3: Calculate similarity score between the abstractive summary and each sentence
    similarities = calculate_similarity_score(sentences, summary, sentence_embeddings)

    # Step 4: Visit sentences from most to least similar. Previously the scores were
    # computed but never used, so the result was simply the leading sentences.
    ranked_indices = similarities.argsort(descending=True).tolist()

    chosen = []
    total_tokens = 0
    for idx in ranked_indices:
        if total_tokens >= max_tokens:
            break  # Budget exhausted
        sentence_tokens = len(sentences[idx].split())  # Words approximate the token count
        if total_tokens + sentence_tokens > max_tokens:
            # Too long for the remaining budget; a shorter, lower-ranked one may still fit.
            continue
        chosen.append(idx)
        total_tokens += sentence_tokens

    # Restore document order so the extract reads coherently.
    return " ".join(sentences[idx] for idx in sorted(chosen))

In [None]:
import json  # This cell runs before the later cells that import json

BATCH_SIZE = 32
SAVE_EVERY = 1000
PARTIAL_SAVE_PATH = "partial_train.json"

# Loading existing data (resuming).
# The checkpoint stores the next source index explicitly: entries that fail are skipped,
# so len(records) is not a reliable position in data_train.
train_data_in_ext = []
failed_indices_train = []
start_idx = 0
if os.path.exists(PARTIAL_SAVE_PATH):
    with open(PARTIAL_SAVE_PATH, "r") as f:
        checkpoint = json.load(f)
    if isinstance(checkpoint, dict):
        train_data_in_ext = checkpoint["records"]
        failed_indices_train = checkpoint.get("failed_indices", [])
        start_idx = checkpoint["next_index"]
    else:
        # Legacy checkpoint: a bare list of records.
        train_data_in_ext = checkpoint
        start_idx = len(train_data_in_ext)
    print(f"Resuming from index {start_idx}")


for i in tqdm(range(start_idx, len(data_train))):
    # Only the per-entry computation is best-effort; a failing autosave must surface
    # instead of being reported as a failed entry.
    try:
        entry = data_train[i]
        judgement = entry["judgement"]
        summary = entry["summary"]

        extractive_summary = generate_extractive_summary(judgement, summary, max_tokens=MAX_TOKENS)
    except Exception as e:
        print(f"Failed at index {i}: {str(e)}")
        failed_indices_train.append(i)
    else:
        train_data_in_ext.append({
            "judgement": judgement,
            "summary": extractive_summary
        })

    # Periodic autosave, plus one after the last entry so a finished run is checkpointed
    if (i + 1) % SAVE_EVERY == 0 or i + 1 == len(data_train):
        with open(PARTIAL_SAVE_PATH, "w") as f:
            json.dump({
                "records": train_data_in_ext,
                "failed_indices": failed_indices_train,
                "next_index": i + 1
            }, f)
        print(f"Autosaved at index {i + 1}")


In [None]:
import json

# Saving our final train extractive dataset, it took 8 hours for this to run and be created hence saving it
with open("final_train_data.json", "w") as f:
    json.dump(train_data_in_ext, f)

print("All done and saved!")

We do the same for the test dataset.

In [None]:
import json
BATCH_SIZE = 32
SAVE_EVERY = 1000
PARTIAL_SAVE_PATH = "partial_test.json"

# Loading existing data (resuming).
# The checkpoint stores the next source index explicitly: entries that fail are skipped,
# so len(records) is not a reliable position in data_test.
test_data_in_ext = []
failed_indices_test = []
start_idx = 0
if os.path.exists(PARTIAL_SAVE_PATH):
    with open(PARTIAL_SAVE_PATH, "r") as f:
        checkpoint = json.load(f)
    if isinstance(checkpoint, dict):
        test_data_in_ext = checkpoint["records"]
        failed_indices_test = checkpoint.get("failed_indices", [])
        start_idx = checkpoint["next_index"]
    else:
        # Legacy checkpoint: a bare list of records.
        test_data_in_ext = checkpoint
        start_idx = len(test_data_in_ext)
    print(f"Resuming from index {start_idx}")


for i in tqdm(range(start_idx, len(data_test))):
    # Only the per-entry computation is best-effort; a failing autosave must surface
    # instead of being reported as a failed entry.
    try:
        entry = data_test[i]
        judgement = entry["judgement"]
        summary = entry["summary"]

        extractive_summary = generate_extractive_summary(judgement, summary, max_tokens=MAX_TOKENS)
    except Exception as e:
        print(f"Failed at index {i}: {str(e)}")
        failed_indices_test.append(i)
    else:
        test_data_in_ext.append({
            "judgement": judgement,
            "summary": extractive_summary
        })

    # Periodic autosave, plus one after the last entry so a finished run is checkpointed
    if (i + 1) % SAVE_EVERY == 0 or i + 1 == len(data_test):
        with open(PARTIAL_SAVE_PATH, "w") as f:
            json.dump({
                "records": test_data_in_ext,
                "failed_indices": failed_indices_test,
                "next_index": i + 1
            }, f)
        print(f"Autosaved at index {i + 1}")

In [None]:
import json

# Finally, save our test extractive dataset to disk
with open("final_test_data.json", "w") as f:
    json.dump(test_data_in_ext, f)

print("All done and saved!")

In [None]:
print(len(train_data_in_ext))
print(len(train_data_in_abs))

In [None]:
# We ran this later for loading our saved ext data

with open('/kaggle/input/final-train-data/final_train_data.json', 'r') as f:
    train_data_in_ext = json.load(f)  

with open('/kaggle/input/final-test-data/final_test_data.json', 'r') as f:
    test_data_in_ext = json.load(f)  

print("done")


In [None]:
print(train_data_in_abs[18361])
print(train_data_in_abs[6050])

In [None]:
# Drop two records from the abstractive training list by position.
# The higher index goes first so that removing it does not shift index 6050.
# NOTE(review): presumably these are the entries that failed during extractive
# generation, removed to keep this list aligned with train_data_in_ext — confirm.
# Caution: this is not idempotent; re-running the cell deletes two further records.
del train_data_in_abs[18361]
del train_data_in_abs[6050]

We now have extractive summaries of the judgements; next we replace the original judgements with these summaries.

Below we check each pair for a data mismatch; only when the judgements match do we replace the judgement in the abstractive dataset with the summary from the extractive dataset.

In [None]:
from tqdm import tqdm

# For train data

mismatch_index_train = []
mismatch_index_test = []

successful = 0
fail = 0

# The two lists are compared position by position, so only their overlapping range can
# be checked. A length difference is reported up front instead of raising IndexError
# part-way through (or silently leaving a tail unchecked).
n_ext = len(train_data_in_ext)
n_abs = len(train_data_in_abs)
if n_ext != n_abs:
    print(f"Length mismatch in training data: {n_ext} extractive vs {n_abs} abstractive entries")

# NOTE: matched judgements are overwritten in place, so running this cell a second time
# reports every previously matched index as a mismatch.
for i in tqdm(range(min(n_ext, n_abs)), desc="Matching Training Judgements"):
    if train_data_in_ext[i]["judgement"] == train_data_in_abs[i]["judgement"]:
        train_data_in_abs[i]["judgement"] = train_data_in_ext[i]["summary"]
        successful += 1
    else:
        print(f"Mismatch in judgement at index {i} in training data")
        fail += 1
        mismatch_index_train.append(i)

print(f"\nMatching complete: {successful} matched, {fail} mismatches.")

In [None]:
successful = 0
fail = 0
# Reset here so this cell does not depend on another cell for the list and does not
# accumulate indices across re-runs.
mismatch_index_test = []

# The two lists are compared position by position, so only their overlapping range can
# be checked. A length difference is reported up front instead of raising IndexError
# part-way through (or silently leaving a tail unchecked).
n_ext = len(test_data_in_ext)
n_abs = len(test_data_in_abs)
if n_ext != n_abs:
    print(f"Length mismatch in test data: {n_ext} extractive vs {n_abs} abstractive entries")

# For testing data
for i in tqdm(range(min(n_ext, n_abs)), desc="Matching Testing Judgements"):
    if test_data_in_ext[i]["judgement"] == test_data_in_abs[i]["judgement"]:
        test_data_in_abs[i]["judgement"] = test_data_in_ext[i]["summary"]
        successful += 1
    else:
        print(f"Mismatch in judgement at index {i} in test data")
        fail += 1
        mismatch_index_test.append(i)

print(f"\nMatching complete: {successful} matched, {fail} mismatches.")


In [None]:
# Just random verifying

print(train_data_in_abs[40]["judgement"])
print()
print(train_data_in_abs[40]["summary"])
print()

print(train_data_in_ext[40]["summary"])

# Training our Extractive Summarization Model

In [None]:
from sentence_transformers import util
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.model_selection import train_test_split
from datasets import Dataset
import nltk
import torch
import random
from tqdm.notebook import tqdm as notebook_tqdm
from transformers import TrainerCallback
from tqdm import tqdm

# Your pretrained sentence transformer
# sentence_model = SentenceTransformer("Stern5497/sbert-legal-xlm-roberta-base")

# Classification model (LegalBERT)
tokenizer = AutoTokenizer.from_pretrained("nlpaueb/legal-bert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained("nlpaueb/legal-bert-base-uncased", num_labels=2)

from nltk.tokenize import sent_tokenize

def create_sentence_classification_dataset(dataset, threshold=0.7, device="cuda"):
    """Turn judgement/summary pairs into sentence-level binary classification examples.

    Every judgement sentence is labelled 1 when its highest cosine similarity to any
    summary sentence is strictly greater than ``threshold``, and 0 otherwise.

    Args:
        dataset: Iterable of dicts with ``"judgement"`` and ``"summary"`` text.
        threshold: Similarity cut-off for the positive label (default 0.7).
        device: Device passed to ``sentence_model.encode`` (default ``"cuda"``).

    Returns:
        A list of ``{"text": sentence, "label": 0 or 1}`` dicts. Entries whose judgement
        or summary yields no sentences are skipped.
    """
    data = []

    # tqdm wraps the dataset to show a progress bar for the whole loop
    for entry in tqdm(dataset, desc="Building sentence classification dataset", disable=False):
        judgement_sentences = sent_tokenize(entry["judgement"])
        summary_sentences = sent_tokenize(entry["summary"])

        if not judgement_sentences or not summary_sentences:
            continue

        summary_embeddings = sentence_model.encode(summary_sentences, convert_to_tensor=True, device=device)
        judgement_embeddings = sentence_model.encode(judgement_sentences, convert_to_tensor=True, device=device)

        # Rows are judgement sentences, columns are summary sentences.
        cosine_sim_matrix = util.cos_sim(judgement_embeddings, summary_embeddings)

        # One reduction and one device-to-host transfer per document, instead of a
        # separate .max().item() call for every sentence.
        best_similarities = cosine_sim_matrix.max(dim=1).values.tolist()

        for sentence, similarity in zip(judgement_sentences, best_similarities):
            label = 1 if similarity > threshold else 0
            data.append({"text": sentence, "label": label})

    return data

def tokenize_fn(example):
    """Tokenize ``example["text"]`` with the global tokenizer, padded/truncated to 256 tokens."""
    encoding_options = dict(truncation=True, padding="max_length", max_length=256)
    return tokenizer(example["text"], **encoding_options)


"""
This tokenize_fn function takes a single data example (which is a dictionary like {"text": sentence, "label": 0 or 1}) 
and tokenizes the sentence using your LegalBERT tokenizer so it can be fed into a transformer model.
"""

print("done")

In [None]:


from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

tokenizer = AutoTokenizer.from_pretrained("nlpaueb/legal-bert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained("nlpaueb/legal-bert-base-uncased", num_labels=2)

def tokenize_fn(example):
    """Run the global tokenizer over ``example["text"]``.

    Output is truncated and padded to a fixed length of 256 tokens.
    """
    text = example["text"]
    return tokenizer(
        text,
        truncation=True,
        padding="max_length",
        max_length=256,
    )

In [None]:
# Creating sentence-level labeled data
sentence_data_train = create_sentence_classification_dataset(train_data_in_ext)
sentence_data_test = create_sentence_classification_dataset(test_data_in_ext)

print("done")

In [None]:
import json

with open("sentence_data_train.json", "w") as f:
    json.dump(sentence_data_train, f, indent=2)

with open("sentence_data_test.json", "w") as f:
    json.dump(sentence_data_test, f, indent=2)

print("done")

In [None]:

with open('/kaggle/input/sentence-tokenized-datasets/sentence_data_train.json', 'r') as f:
    sentence_data_train = json.load(f)  

with open('/kaggle/input/sentence-tokenized-datasets/sentence_data_test.json', 'r') as f:
    sentence_data_test = json.load(f)  

print("done")

In [None]:
# Converting to huggingFace dataset
train_data_in_ext = Dataset.from_list(sentence_data_train).map(tokenize_fn, batched=True)
test_data_in_ext = Dataset.from_list(sentence_data_test).map(tokenize_fn, batched=True)

print("done")

In [None]:
print(len(train_data_in_ext)) 
print(len(test_data_in_ext))   

In [None]:
from transformers import TrainingArguments, TrainerCallback
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import Trainer

In [None]:
import transformers
print(transformers.__version__)

In [None]:
import os
os.environ["WANDB_DISABLED"] = "true"

In [None]:

# Hyper-parameters for fine-tuning the LegalBERT sentence classifier.
training_args = TrainingArguments(
    output_dir="./legalbert-extractive",
    # evaluation_strategy = "epoch",  # Disabled: would evaluate on eval_dataset after every epoch
    learning_rate = 2e-5, # Optimizer step size
    per_device_train_batch_size = 16, # 16 samples per device for each training step
    per_device_eval_batch_size = 16,
    weight_decay = 0.01, # Regularization to reduce overfitting
    save_total_limit = 1, # Keep only the most recent checkpoint on disk
    save_steps=1000, # Save a checkpoint every 1000 steps
    num_train_epochs=3, # Number of epochs
    # predict_with_generate = False,  # Disabled. NOTE(review): looks like a seq2seq-only option — confirm it is not needed here
    # fp16=True,  # Disabled: mixed-precision training
    logging_dir = "./extractive/logs", # Directory for the training logs
    logging_steps = 1000, # Log the training loss every 1000 steps
    logging_strategy = "steps",
    report_to=[] # No external experiment trackers
)

def compute_metrics(eval_pred):
    """Compute accuracy, F1, precision and recall for binary sentence classification.

    Args:
        eval_pred: ``(logits, labels)`` pair; the predicted class is the arg-max over
            the last axis of ``logits``. If ``logits`` is a tuple, its first element
            is used.

    Returns:
        Dict with ``accuracy``, ``f1``, ``precision`` and ``recall``.
    """
    from sklearn.metrics import accuracy_score, precision_recall_fscore_support
    logits, labels = eval_pred
    if isinstance(logits, tuple):
        # Some models return extra outputs alongside the logits.
        logits = logits[0]
    preds = logits.argmax(-1)
    acc = accuracy_score(labels, preds)
    # zero_division=0 keeps the scores defined (and warning-free) when no sample is
    # predicted or labelled positive.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average="binary", zero_division=0
    )
    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}

class TQDMCallback(TrainerCallback):
    """Trainer callback that prints a progress line as training steps complete.

    Args:
        print_every: Print once every this many global steps. The default of 1 prints
            after every step; pass a larger value to keep long runs from flooding the
            cell output.
    """

    def __init__(self, print_every=1):
        self.print_every = max(1, int(print_every))

    def on_step_end(self, args, state, control, **kwargs):
        """Print ``state.global_step`` when it is a multiple of ``print_every``."""
        if state.global_step % self.print_every == 0:
            print(f"Step {state.global_step} completed")

# Then in Trainer
trainer_extractive = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data_in_ext,
    eval_dataset=test_data_in_ext,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[TQDMCallback()]  
)

print("done check v5")

In [None]:
print("Started")
trainer_extractive.train()
print("done")

In [None]:
# Persist the fine-tuned extractive classifier together with its tokenizer.
# The Trainer built above is named `trainer_extractive`; no `trainer` variable exists.
trainer_extractive.save_model("./legalbert-extractive/final-model")
tokenizer.save_pretrained("./legalbert-extractive/final-model")
print("model saved succesfully!")

# Training the Abstractive Summarization Model

In [None]:

# Convert to Hugging Face Dataset
train_dataset = Dataset.from_list(train_data_in_abs)
test_dataset = Dataset.from_list(test_data_in_abs)

# Hugging face is just a more smarter version of pandas dataframe and more optimized for NLP training
# Hugging face Dataset can be thought as GPU-optimized, transformer-friendly version of a pandas dataframe


In [None]:
# Loading our pretrained Pegasus Model:
tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-large")
model = PegasusForConditionalGeneration.from_pretrained("google/pegasus-large").to("cuda") # We let it use cuda since GPU available

In [None]:
# Tokenization
def preprocess_function(examples):
    """Tokenize a batch of judgement/summary pairs for seq2seq training.

    Judgements are truncated/padded to 1024 tokens and summaries to 256 tokens.
    Padding positions in the labels are replaced by -100 so the loss ignores them.

    Args:
        examples: Batch dict with ``"judgement"`` and ``"summary"`` lists of strings.

    Returns:
        The tokenizer output for the judgements with an added ``"labels"`` entry.
    """
    model_inputs = tokenizer(examples["judgement"], max_length=1024, truncation=True, padding="max_length")
    # text_target tokenizes the summaries as targets; it supersedes the deprecated
    # as_target_tokenizer() context manager.
    labels = tokenizer(text_target=examples["summary"], max_length=256, truncation=True, padding="max_length")

    # Without this mask the model is also trained to predict the padding tokens.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token if token != pad_id else -100 for token in sequence]
        for sequence in labels["input_ids"]
    ]
    return model_inputs

def preprocess_function_chunked(examples):
    """Unimplemented stub; presumably meant as a chunked variant of ``preprocess_function``.

    The definition previously had no body, which is a syntax error that stopped the
    whole cell from running. Until it is implemented, calling it raises explicitly.

    Raises:
        NotImplementedError: Always.
    """
    raise NotImplementedError("preprocess_function_chunked is not implemented yet")

# Tokenizing our datasets
tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True)
tokenized_test_dataset = test_dataset.map(preprocess_function, batched=True)


### Above input and output look like:
"""
Input:
examples["judgement"] = ["The judge ruled in favor of the plaintiff."]
examples["summary"] = ["Plaintiff wins the case."]

Output:
{
    "input_ids": [[101, 2023, 2134, 2003, 2087, 1996, 1063, 1012, 102, 0, ...]], # Generated by input data
    "attention_mask": [[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, ...]], # Generated by input data
    "labels": [[101, 2558, 2270, 1996, 2117, 102, 0, 0, ...]] # Generated by target data
}
"""

In [None]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq

# Hyper-parameters for fine-tuning Pegasus on the abstractive dataset.
# Checkpoints and logs get their own directories; previously they were copy-pasted from
# the extractive run, so both models wrote checkpoints under "./legalbert-extractive".
training_args = Seq2SeqTrainingArguments(
    output_dir="./legal-pegasus-summarizer-checkpoints",
    # evaluation_strategy = "epoch",  # Disabled: would evaluate on eval_dataset after every epoch
    learning_rate = 2e-5, # Optimizer step size
    per_device_train_batch_size = 16, # 16 samples per device for each training step
    per_device_eval_batch_size = 16,
    weight_decay = 0.01, # Regularization to reduce overfitting
    save_total_limit = 1, # Keep only the most recent checkpoint on disk
    save_steps=1000, # Save a checkpoint every 1000 steps
    num_train_epochs=3, # Number of epochs
    # predict_with_generate = False,  # Disabled: would make evaluation/prediction use .generate()
    # fp16=True,  # Disabled: mixed-precision training
    logging_dir = "./abstractive/logs", # Directory for the training logs
    logging_steps = 1000, # Log the training loss every 1000 steps
    logging_strategy = "steps",
    report_to=[] # No external experiment trackers
)

# Now lets create our data collator
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

trainer_abstractive = Seq2SeqTrainer(
    model = model,
    args = training_args,
    train_dataset = tokenized_train_dataset,
    eval_dataset = tokenized_test_dataset,
    tokenizer = tokenizer,
    data_collator = data_collator
)

In [None]:
# Now lets do our training
trainer_abstractive.train() 


# Now lets save our model so that we can use it even after kernel restart
# Save the fine-tuned model
model.save_pretrained('./legal-pegasus-summarizer')
tokenizer.save_pretrained('./legal-pegasus-summarizer')

print("Abstractive model trained and saved successfully!!!")