In [1]:
# install dependencies
# !python.exe -m pip install --upgrade pip

# %pip install pandas
# %pip install nltk
# %pip install numpy --only-binary :all:
# %pip install transformers sumy sentencepiece 
# %pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
# %pip install notebook ipywidgets --upgrade
# %pip install language_tool_python

In [2]:
# init variables
import os

# NOTE(review): CUDA_LAUNCH_BLOCKING is normally a debugging aid and may slow
# GPU work — confirm it is still needed before sharing this notebook.
os.environ.update({'CUDA_LAUNCH_BLOCKING': "1"})

# Input corpus and the row of that corpus the example cells operate on.
PROCESSING_FILE_PATH = 'resources/cases_2024.json'
PROCESSING_CASE_INDEX = 0

# Candidate summarization checkpoints; later cells pick one by index.
SUMMARY_MODELS = [
    'facebook/bart-large-cnn',
    'google/pegasus-xsum',
    't5-base',
    'allenai/led-base-16384',
]

In [3]:
# load to dataframes
import json
import pandas as pd

# Parse the JSON file named in the config cell (read as UTF-8 text).
with open(PROCESSING_FILE_PATH, encoding='utf-8') as cases_file:
    cases_data = json.load(cases_file)

# Tabulate the parsed records and preview the first five rows.
cases_df = pd.DataFrame(data=cases_data)
print(cases_df.head(5))

                                     id filename primaryLang  \
0  d66a6895-c339-4bd0-9992-790b7b5f4a17      cpa        0132   
1  4aaafdf5-8ac9-4086-b62e-485d250b02bb    court          of   
2  f81236b6-7c88-4337-9701-772651a56abe       ca        writ   
3  0fe6fe07-fd7d-4b4c-a5d3-3644e9c56b56      wrt        0201   
4  b655451f-cad0-4cc4-b5ce-6bc81dbbee30     writ         123   

                                                text  wordCount  
0  Page 1 of 11 \n In the cozy appeal of the demo...       2854  
1  CA/HCC 184/2017  \n \n1 | P a g e  \n  IN THE ...       4330  
2  Page 1 of 11 \n IN THE COURT OF APPEAL OF THE ...       3300  
3  Page 1 of 15 \n IN THE COURT OF APPEAL OF THE ...       4121  
4  1 \n IN THE COURT OF APPEAL OF THE DEMOCRATIC ...       3898  


In [4]:
# sentence tokenize
import nltk
from nltk.tokenize import sent_tokenize

# The punkt data is stored in a project-local directory. NLTK only looks in the
# directories listed in nltk.data.path, so register the directory before using
# the tokenizer; otherwise sent_tokenize only works when punkt_tab also happens
# to be installed in one of NLTK's default locations.
NLTK_DATA_DIR = 'resources/dependencies/nltk'
if NLTK_DATA_DIR not in nltk.data.path:
    nltk.data.path.append(NLTK_DATA_DIR)

nltk.download('punkt_tab', download_dir=NLTK_DATA_DIR)

# Split every case text into a list of sentences (stored as a new column).
cases_df['sentences'] = cases_df['text'].apply(sent_tokenize)
print(cases_df['sentences'])

[nltk_data] Downloading package punkt_tab to
[nltk_data]     resources/dependencies/nltk...
[nltk_data]   Package punkt_tab is already up-to-date!


0      [Page 1 of 11 \n In the cozy appeal of the dem...
1      [CA/HCC 184/2017  \n \n1 | P a g e  \n  IN THE...
2      [Page 1 of 11 \n IN THE COURT OF APPEAL OF THE...
3      [Page 1 of 15 \n IN THE COURT OF APPEAL OF THE...
4      [1 \n IN THE COURT OF APPEAL OF THE DEMOCRATIC...
                             ...                        
524    [CA/HCC/100 -2020  \n \n1 | P a g e  \n  IN TH...
525    [CP(PHC )APN  144/2022 \n \n1 | P a g e  \n IN...
526    [CA/HCC/327/19  \n \n1 | P a g e  \n IN THE CO...
527    [C.A., WRIT  88-2019 \n \n 1 \n IN THE COURT O...
528    [1 \n IN THE COURT OF APPEAL OF THE DEMOCRATIC...
Name: sentences, Length: 529, dtype: object


In [5]:
# ============== extractive summarization (unsupervised) ==============
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer

# Tokenizer handed to the plaintext parser below.
english_tokenizer = Tokenizer("english")

# Parse the selected case's raw text into a sumy document.
text = cases_df.loc[PROCESSING_CASE_INDEX, 'text']
parser = PlaintextParser.from_string(text, english_tokenizer)

# Ask LexRank for a five-sentence extract and print one sentence per line.
summarizer = LexRankSummarizer()
summary = summarizer(parser.document, sentences_count=5)

for sentence in summary:
    print(sentence)

Page 1 of 11 In the cozy appeal of the democratic socialist republican Of Sri Lanka In The MATTER OF ANPLICATION FOR REVISION in Terms of Artiction 138 of The Constitionion from the order for cancellalation of bail delished by the Learned High JUDGE of Kandy As Dated 05th October 2023, In High court in kandy case no.
VS. High court kandy Case NO.
Page 3 of 11 The Petitioner is Seeking to Challenge The Order Made by the Learned High by the Learned High Court JUDGE OF KANDY ON 5th October 2023, WHERE THE ACCUSED -RESPONDENT (Hereinafter REFERED TO ASE The ACCUDED TO BE REMANDED FURTHER TRIAL.
Page 4 of 11 WHEN THE ABOE Application Was Made, The Learned High JUDGE HAS MADE The impugned order to be challenged before on behalf of the accused.
ITPeaars from the documents TENDERED TO THIS COURT, THAT ON 31 -20220, The Attorney -law For the Accuseed has Filed A Motion Seaking To Make an Applics For Bail For Bail For The ACCUSED BEFORE The high court on 01-211-2023.


In [6]:
# ============== Extractive Summarization (Hierarchical Approach) ==============
from transformers import pipeline, AutoTokenizer
import torch

# Chunking function with overlap to maintain context
def chunk_text(text, tokenizer, max_tokens=900, overlap_tokens=100):
    """Split ``text`` into overlapping chunks of at most ``max_tokens`` tokens.

    The text is encoded once with ``tokenizer.encode``. Windows of token ids
    are then decoded back to strings with ``skip_special_tokens=True``. Each
    window starts ``max_tokens - overlap_tokens`` ids after the previous one,
    so neighbouring chunks share ``overlap_tokens`` ids of context.

    Args:
        text: The document to split.
        tokenizer: Object exposing ``encode(text)`` (returns a sequence of
            token ids) and ``decode(ids, skip_special_tokens=...)``.
        max_tokens: Maximum number of token ids per chunk; must be positive.
        overlap_tokens: Number of ids shared by consecutive chunks; must
            satisfy ``0 <= overlap_tokens < max_tokens``.

    Returns:
        A list of chunk strings in document order. The list is empty when the
        tokenizer returns no ids for ``text``.

    Raises:
        ValueError: If the window sizes cannot make forward progress. Without
            this check ``overlap_tokens >= max_tokens`` (or ``max_tokens <= 0``)
            would loop forever, and a negative overlap would silently skip
            tokens between chunks.
    """
    if max_tokens <= 0:
        raise ValueError("max_tokens must be a positive integer")
    if not 0 <= overlap_tokens < max_tokens:
        raise ValueError("overlap_tokens must satisfy 0 <= overlap_tokens < max_tokens")

    tokens = tokenizer.encode(text)
    total_tokens = len(tokens)
    stride = max_tokens - overlap_tokens  # loop-invariant step between window starts

    chunks = []
    start = 0
    while start < total_tokens:
        end = min(start + max_tokens, total_tokens)
        chunks.append(tokenizer.decode(tokens[start:end], skip_special_tokens=True))
        if end == total_tokens:
            # The last window reached the end; stop so the tail is not emitted twice.
            break
        start += stride
    return chunks

# Initialize summarizer (GPU or CPU)
# torch is already imported at the top of this cell, so it is not re-imported here.
device = 0 if torch.cuda.is_available() else -1

model_name = SUMMARY_MODELS[0]
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Hand the already-loaded tokenizer to the pipeline so the token counts used for
# chunking come from the very same tokenizer instance the model is fed with.
summarizer = pipeline("summarization", model=model_name, tokenizer=tokenizer, device=device)
text = cases_df.loc[PROCESSING_CASE_INDEX, 'text']

# Hierarchical summarization function with dynamic max_length
def _summarize_chunks(chunks, summarizer, tokenizer, chunk_summary_max_len):
    """Summarize each chunk independently and return the summaries in order."""
    summaries = []
    for i, chunk in enumerate(chunks):
        print(f"Summarizing chunk {i+1}/{len(chunks)}...")
        chunk_len = len(tokenizer.encode(chunk))
        # Cap the summary at half the chunk's token count (but allow at least 30),
        # never exceeding the caller's per-chunk limit.
        adjusted_max_len = min(chunk_summary_max_len, max(30, int(chunk_len * 0.5)))

        summary = summarizer(
            chunk,
            max_length=adjusted_max_len,
            min_length=min(20, adjusted_max_len // 2),
            do_sample=False
        )[0]['summary_text']

        summaries.append(summary)
    return summaries


def hierarchical_summary(text, summarizer, tokenizer, 
                         max_chunk_tokens=900, overlap_tokens=100,
                         chunk_summary_max_len=150, final_summary_max_len=200):
    """Summarize a long document by summarizing chunks, then their summaries.

    The document is split with ``chunk_text``, every chunk is summarized, and
    the joined chunk summaries are summarized once more into the final text.

    If the joined chunk summaries are themselves longer than
    ``max_chunk_tokens`` tokens, they are re-chunked and condensed again until
    they fit (or until a pass stops making them shorter). The summarizer is
    called without any truncation argument, so an over-long combined text would
    otherwise be passed to the model unchanged.
    NOTE(review): this assumes ``max_chunk_tokens`` is within the model's input
    limit — confirm for each model in use.

    Args:
        text: The document to summarize.
        summarizer: Callable invoked as
            ``summarizer(text, max_length=..., min_length=..., do_sample=False)``
            and returning a list whose first item has a ``'summary_text'`` key.
        tokenizer: Tokenizer with ``encode``/``decode``, used for chunking and
            for measuring lengths in tokens.
        max_chunk_tokens: Maximum tokens per chunk, and the size the combined
            summaries must fit in before the final pass.
        overlap_tokens: Tokens shared between neighbouring chunks.
        chunk_summary_max_len: Upper bound for each chunk summary's ``max_length``.
        final_summary_max_len: ``max_length`` for the final summary.

    Returns:
        The final summary string, or ``""`` when ``text`` is empty or only
        whitespace (the summarizer is not called in that case).
    """
    if not text or not text.strip():
        return ""

    # Chunk the original document
    chunks = chunk_text(text, tokenizer, max_tokens=max_chunk_tokens, overlap_tokens=overlap_tokens)

    print(f"Number of chunks: {len(chunks)}")

    # Summarize each chunk individually and combine the results
    combined_summary_text = " ".join(
        _summarize_chunks(chunks, summarizer, tokenizer, chunk_summary_max_len)
    )

    # Condense further while the combined summaries do not fit in one chunk.
    # The loop terminates because every accepted pass strictly shrinks the text.
    combined_len = len(tokenizer.encode(combined_summary_text))
    while combined_len > max_chunk_tokens:
        chunks = chunk_text(
            combined_summary_text, tokenizer,
            max_tokens=max_chunk_tokens, overlap_tokens=overlap_tokens
        )
        print(f"Combined summaries span {combined_len} tokens; condensing {len(chunks)} chunks...")
        reduced_text = " ".join(
            _summarize_chunks(chunks, summarizer, tokenizer, chunk_summary_max_len)
        )
        reduced_len = len(tokenizer.encode(reduced_text))
        if reduced_len >= combined_len:
            # No progress; keep the current text rather than looping forever.
            break
        combined_summary_text, combined_len = reduced_text, reduced_len

    # Generate final summary from intermediate summaries
    print("Generating final summary...")
    final_summary = summarizer(
        combined_summary_text, 
        max_length=final_summary_max_len, 
        # Keep min_length below max_length even for small final_summary_max_len
        # (unchanged at 100 for the default of 200).
        min_length=min(100, final_summary_max_len // 2), 
        do_sample=False
    )[0]['summary_text']

    return final_summary

# Example Usage
# Settings are spelled out explicitly so they are easy to tweak in one place.
summary_settings = dict(
    max_chunk_tokens=900,
    overlap_tokens=100,
    chunk_summary_max_len=150,
    final_summary_max_len=200,
)
final_summary = hierarchical_summary(text, summarizer, tokenizer, **summary_settings)

# Heading on one line, summary on the next.
print("Final Summary:", final_summary, sep="\n")

Device set to use cuda:0


Number of chunks: 7
Summarizing chunk 1/7...
Summarizing chunk 2/7...
Summarizing chunk 3/7...
Summarizing chunk 4/7...
Summarizing chunk 5/7...
Summarizing chunk 6/7...
Summarizing chunk 7/7...
Generating final summary...
Final Summary:
Petitioner is seeking to challenge the Order Made by the Learned High. The Attorney General, the Attorney General's department, is the respondent. The Accused has been charged with Grave Sexual Abuse o f a minor. He has been remanded on bail until the end of the trial. I am of the view that the learned high judge WAS MISDIRECTED. The onely Assumtion That CAN BE MADE is that the Remanding of The Accuses For A Period of 3 months HAD BEEN DONE AS A PUNITIVE measure. The order mode by this court previcly on 15 -12-2023, to release the Accused.


In [13]:
from transformers import pipeline
import torch

# Prefer the GPU when one is available, otherwise use device -1
if torch.cuda.is_available():
    device = 0
else:
    device = -1

# Text-to-text generation pipeline used for the refinement pass
refiner = pipeline(
    "text2text-generation",
    model="google/flan-t5-base",
    device=device,
)

def refine_legal_summary(summary_text, max_length=256, generator=None):
    """Ask a text-to-text model to clean up a generated legal summary.

    A fixed instruction prompt is placed in front of ``summary_text`` and sent
    to the generator; the generated text is returned with surrounding
    whitespace removed.

    NOTE(review): in the recorded run of this notebook the refined output is
    identical to the input summary — confirm the chosen model actually follows
    this instruction before relying on the result.

    Args:
        summary_text: The summary to refine.
        max_length: ``max_length`` forwarded to the generator (default 256, the
            value that was previously hard-coded).
        generator: Callable invoked as
            ``generator(prompt, max_length=..., do_sample=False)`` and returning
            a list whose first item has a ``'generated_text'`` key. Defaults to
            the notebook-level ``refiner`` pipeline.

    Returns:
        The refined summary, or ``""`` when ``summary_text`` is empty or only
        whitespace (the generator is not called in that case).
    """
    if not summary_text or not summary_text.strip():
        return ""

    if generator is None:
        # Fall back to the pipeline created earlier in the notebook.
        generator = refiner

    prompt = (
        "Refine the following legal summary. Correct grammar, spelling, punctuation, "
        "remove repetition, and ensure clarity without changing any legal meaning:\n\n"
        f"{summary_text}"
    )

    refined_output = generator(prompt, max_length=max_length, do_sample=False)
    refined_summary = refined_output[0]['generated_text']

    return refined_summary.strip()

# Example usage:
# Refine the summary produced by the hierarchical summarization cell.
clean_summary = refine_legal_summary(final_summary)

# Heading on one line, refined text on the next.
print("Refined Final Summary:", clean_summary, sep="\n")

Device set to use cuda:0


Refined Final Summary:
Petitioner is seeking to challenge the Order Made by the Learned High. The Attorney General, the Attorney General's department, is the respondent. The Accused has been charged with Grave Sexual Abuse o f a minor. He has been remanded on bail until the end of the trial. I am of the view that the learned high judge WAS MISDIRECTED. The onely Assumtion That CAN BE MADE is that the Remanding of The Accuses For A Period of 3 months HAD BEEN DONE AS A PUNITIVE measure. The order mode by this court previcly on 15 -12-2023, to release the Accused.
