<a href="https://colab.research.google.com/github/Krishan098/legaldocumentsimplifier/blob/main/legal_text_simplification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import nltk
# Download NLTK's 'punkt_tab' tokenizer package
# (presumably the data that sent_tokenize, used further down, relies on).
nltk.download('punkt_tab')


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
!pip install transformers




In [None]:
from transformers import BartForConditionalGeneration, BartTokenizer
# Tokenizer and model are loaded from the same pretrained checkpoint.
BART_CHECKPOINT = "facebook/bart-large-cnn"
tokenizer = BartTokenizer.from_pretrained(BART_CHECKPOINT)
model = BartForConditionalGeneration.from_pretrained(BART_CHECKPOINT)

Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

In [None]:
def bart_model(input_text):
    """Summarize ``input_text`` with the module-level ``tokenizer`` and ``model``.

    The text is tokenized (truncated at 1024 tokens), a summary is generated
    with 4-beam search, and the first generated sequence is decoded back to a
    string with special tokens removed.

    Args:
        input_text: The text to summarize.

    Returns:
        The decoded summary string.
    """
    encoded = tokenizer(
        input_text,
        max_length=1024,
        return_tensors="pt",
        truncation=True,
    )
    generation_options = dict(
        max_length=512,
        min_length=50,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )
    generated_ids = model.generate(encoded["input_ids"], **generation_options)
    return tokenizer.decode(generated_ids[0], skip_special_tokens=True)

In [None]:
import re
from nltk.tokenize import sent_tokenize

def preprocess(text):
    """Apply the pre-model clean-up steps to ``text`` and return the result.

    Steps run in this order: remove_citations, split_long_sentences,
    replace_legal_terms, standardize_structure.
    """
    steps = (
        remove_citations,
        split_long_sentences,
        replace_legal_terms,
        standardize_structure,
    )
    for step in steps:
        text = step(text)
    return text

def postprocess(text):
    """Tidy model output: fix_formatting, then ensure_consistency, then add_paragraph_breaks."""
    formatted = fix_formatting(text)
    consistent = ensure_consistency(formatted)
    return add_paragraph_breaks(consistent)

def remove_citations(text):
    """Delete parenthesised citations shaped like ``(<digits> <letters/dots> <digits>)``.

    Only the citation itself is removed; surrounding whitespace is left as is.
    """
    citation_pattern = re.compile(r'\(\d+\s+[A-Za-z\.]+\s+\d+\)')
    return citation_pattern.sub('', text)

def split_long_sentences(text, max_words=50, min_clause_words=5):
    """Break over-long sentences into shorter ones at comma/semicolon boundaries.

    The text is sentence-tokenized with ``sent_tokenize``. Sentences with at
    most ``max_words`` whitespace-separated words are kept verbatim. Longer
    sentences are cut after a word that ends in ``,`` or ``;``, and the
    trailing separator of each piece is replaced by a full stop so the pieces
    really are separate sentences.

    The previous implementation compared whole tokens with ``','``/``';'``.
    Punctuation stays attached to the preceding word after ``str.split()``,
    so that never matched. It also re-joined the pieces with spaces, which
    reproduced the original sentence.

    Args:
        text: Input text.
        max_words: Sentences longer than this many words are split.
        min_clause_words: A cut is only made once the current piece has at
            least this many words, which avoids fragments such as "WHEREAS.".

    Returns:
        The sentences (split where applicable) joined by single spaces.
    """
    processed_sentences = []

    for sentence in sent_tokenize(text):
        words = sentence.split()
        if len(words) <= max_words:
            processed_sentences.append(sentence)
            continue

        clause = []
        for token in words:
            clause.append(token)
            if token.endswith((',', ';')) and len(clause) >= min_clause_words:
                # Drop the trailing separator and close the piece as a sentence.
                clause_text = ' '.join(clause).rstrip(',; ')
                if clause_text:
                    processed_sentences.append(clause_text + '.')
                clause = []

        if clause:
            processed_sentences.append(' '.join(clause))

    return ' '.join(processed_sentences)

def replace_legal_terms(text):
    """Swap legal jargon and Latin phrases in ``text`` for plain-English wording.

    Each phrase is matched case-insensitively on word boundaries and replaced
    with the fixed lower-case wording from the table below, one phrase after
    another in table order.
    """
    legal_terms = {
        'hereinafter': 'from now on',
        'pursuant to': 'according to',
        'whereas': 'since',
        'notwithstanding': 'despite',
        'forthwith': 'immediately',
        'inter alia': 'among other things',
        'ab initio': 'from the beginning',
        'ipso facto': 'by that fact itself',
        'mutatis mutandis': 'with the necessary changes',
        'de facto': 'in fact',
        'de jure': 'by law',
        'quid pro quo': 'something for something',
        'sub judice': 'under judicial consideration',
        'prima facie': 'at first glance',
        'pro rata': 'in proportion',
        'ultra vires': 'beyond the powers',
        'res judicata': 'a matter already judged',
        'a fortiori': 'even more so',
        'ex parte': 'by one party',
        'actus reus': 'guilty act',
        'mens rea': 'guilty mind',
        'nolo contendere': 'no contest',
        'stare decisis': 'to stand by decided cases',
        'in loco parentis': 'in the place of a parent',
        'per curiam': 'by the court',
        'amicus curiae': 'friend of the court',
        'sui generis': 'unique',
        'caveat emptor': 'let the buyer beware',
        'habeas corpus': 'you shall have the body',
        'ex post facto': 'after the fact',
        'in situ': 'in its original place',
        'pari passu': 'on equal footing',
        'lex loci': 'law of the place',
        'contra proferentem': 'against the drafter',
        'pro bono': 'for the public good',
        'ad hoc': 'for this specific purpose',
        'ex officio': 'by virtue of office',
        'jus cogens': 'compelling law',
        'locus standi': 'right to bring action',
        'nullum crimen sine lege': 'no crime without law',
    }

    simplified = text
    for legal_phrase, plain_phrase in legal_terms.items():
        phrase_pattern = re.compile(r'\b' + legal_phrase + r'\b', re.IGNORECASE)
        simplified = phrase_pattern.sub(plain_phrase, simplified)
    return simplified

def standardize_structure(text):
    """Collapse whitespace and put ``Section <n>.`` headings on their own line.

    Every whitespace run becomes a single space. A newline is then inserted
    before and after each ``Section <digits>.`` heading.

    The negative lookahead ``(?!\\d)`` keeps dotted references such as
    ``Section 12.3`` intact. Without it, ``Section 12.`` matched inside the
    reference and the text was broken into ``Section 12.`` / ``3``.

    Args:
        text: Input text.

    Returns:
        The normalized text.
    """
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'Section \d+\.(?!\d)', lambda m: '\n' + m.group(0) + '\n', text)
    return text

def fix_formatting(text):
    """Capitalize sentence starts and ensure one space after ``.``, ``!`` or ``?``.

    The text is split on ``'. '``; each piece is stripped and its first
    character upper-cased. ``str.capitalize()`` is deliberately not used
    because it also lower-cases the rest of the piece, which turned
    "State of California" into "state of california".

    Afterwards, any ``.``/``!``/``?`` followed by optional whitespace and a
    letter is rewritten with exactly one space between them.

    Args:
        text: Text to tidy.

    Returns:
        The reformatted text.
    """
    def _upper_first(fragment):
        # Upper-case only the first character; leave the rest untouched.
        fragment = fragment.strip()
        return fragment[:1].upper() + fragment[1:]

    text = '. '.join(_upper_first(s) for s in text.split('. '))
    text = re.sub(r'([.!?])\s*([A-Za-z])', r'\1 \2', text)
    return text

def ensure_consistency(text):
    """Placeholder post-processing step: currently returns ``text`` unchanged."""
    return text

def add_paragraph_breaks(text):
    """Start a new paragraph after each sentence that is followed by a capital letter.

    Whitespace after ``.``, ``!`` or ``?`` is replaced by a blank line
    (two newlines) whenever the next character is an upper-case ASCII letter.
    """
    sentence_gap = re.compile(r'([.!?])\s+(?=[A-Z])')
    return sentence_gap.sub(r'\1\n\n', text)

def process_document(text, model):
    """Simplify ``text``: preprocess it, summarize with bart_model, then postprocess.

    Args:
        text: Raw legal text.
        model: Accepted but not used in this function; bart_model is called
            with the text only.

    Returns:
        The post-processed model output.
    """
    return postprocess(bart_model(preprocess(text)))


In [None]:
# Sample legal passage used as input to process_document below.
text = '''WHEREAS, the parties hereto agree to the terms and conditions set forth in this Agreement; and, pursuant to Section 12.3,
 all disputes arising hereunder shall be resolved through arbitration. NOTWITHSTANDING any provision to the contrary, the obligations
  herein shall commence forthwith. HEREINAFTER, the terms shall be interpreted according to the laws of the State of California.
   Section 14. This document also includes, inter alia, provisions for confidentiality and data protection.'''

In [None]:
# Run the whole pipeline on the sample text and print the simplified result.
res = process_document(text, model)
print(res)

Since, the parties hereto agree to the terms and conditions set forth in this agreement.

All disputes arising hereunder shall be resolved through arbitration.

From now on, the terms shall be interpreted according to the laws of the state of california.

This document also includes, among other things, provisions for confidentiality and data protection.


### Evaluation of the fine-tuned model

In [None]:
import torch

In [None]:
import zipfile
import os
def extract_zip(path):
    """Unpack the zip archive at ``path`` into a folder under ``./fine_tuned_model``.

    The target folder is named after the archive file without its extension
    and is created when missing. The target folder is printed; nothing is
    returned.
    """
    archive_stem, _extension = os.path.splitext(os.path.basename(path))
    output_dir = f"./fine_tuned_model/{archive_stem}"
    os.makedirs(output_dir, exist_ok=True)

    with zipfile.ZipFile(path, 'r') as archive:
        archive.extractall(output_dir)
    print(f"Extracted to {output_dir}")


In [None]:
# Unpack the model archive under ./fine_tuned_model.
extract_zip("/content/models-20250112T165419Z-001.zip")

Extracted to ./fine_tuned_model/models-20250112T165419Z-001


In [None]:
# Unpack the results archive under ./fine_tuned_model.
extract_zip("/content/results-20250112T165420Z-001 (1).zip")

Extracted to ./fine_tuned_model/results-20250112T165420Z-001 (1)


In [None]:
# Use the GPU when torch reports CUDA as available; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Load the fine-tuned model
def load_fined_model(model_dir='./fine_tuned_model/models-20250112T165419Z-001'):
  """Load the fine-tuned seq2seq model and its tokenizer, and move the model to ``device``.

  Fixes over the previous version:
    * ``AutoModelForSeq2SeqGeneration`` is not a transformers class; the
      seq2seq auto class is ``AutoModelForSeq2SeqLM``.
    * The classes are imported here, so the function no longer depends on a
      later cell having run.
    * The default directory matches where ``extract_zip`` unpacks the model
      archive (``./fine_tuned_model/...``, not ``./content/fine_tuned_model/...``).
    * The loaded objects are returned instead of being discarded.

  Args:
      model_dir: Directory containing the saved model and tokenizer files.
          NOTE(review): the archive may contain a nested folder; point this
          at the directory that actually holds ``config.json``.

  Returns:
      A ``(model, tokenizer)`` tuple.
  """
  from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

  model = AutoModelForSeq2SeqLM.from_pretrained(model_dir)
  tokenizer = AutoTokenizer.from_pretrained(model_dir)
  model.to(device)
  return model, tokenizer

In [None]:
# ROUGE metrics
def rogue_met():
    """Build the ROUGE scorer and the BLEU smoothing function used for evaluation.

    The previous version assigned to a local called ``rouge_scorer`` while
    also reading the ``rouge_scorer`` module on the same line. That makes the
    name local to the function and raises ``UnboundLocalError``. It also
    returned nothing. The imports are done here so the function does not
    depend on a later cell.

    Returns:
        A ``(scorer, smoothing)`` tuple: a ``RougeScorer`` for
        rouge1/rouge2/rougeL with stemming enabled, and
        ``SmoothingFunction().method1``.
    """
    from nltk.translate.bleu_score import SmoothingFunction
    from rouge_score import rouge_scorer as rouge_scorer_module

    scorer = rouge_scorer_module.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    smoothing = SmoothingFunction().method1
    return scorer, smoothing



In [None]:
!pip install rouge-score


Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=2236ee5ebf28937595aeb97ddd1d08ab65f33a986b8b0c449607986091b830b4
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


In [None]:
import torch
from transformers import AutoTokenizer, T5ForConditionalGeneration
import zipfile
import json
import numpy as np
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
import pandas as pd
from tqdm import tqdm

class ModelEvaluator:
    """Load a fine-tuned T5 model from a zip archive and score it with BLEU and ROUGE.

    Construction extracts both archives, loads the model and tokenizer from
    ``./model_files`` and prepares the metric helpers. If the results archive
    holds no readable ``test_data.json``, only inference
    (``simplify_new_text``) is available.
    """

    def __init__(self, model_zip_path, results_zip_path):
        """Extract the archives, then load the model and the metric helpers."""
        self.setup_files(model_zip_path, results_zip_path)
        self.initialize_model()
        self.initialize_metrics()

    def setup_files(self, model_zip_path, results_zip_path):
        """Extract both archives and load ``./results/test_data.json`` if present.

        ``self.test_data`` is set to the parsed JSON, or to ``None`` when the
        file is missing, unreadable or not valid JSON. Only those expected
        failures are handled; the previous bare ``except:`` also swallowed
        unrelated errors, including ``KeyboardInterrupt``.
        """
        with zipfile.ZipFile(model_zip_path, 'r') as zip_ref:
            zip_ref.extractall('./model_files')

        with zipfile.ZipFile(results_zip_path, 'r') as zip_ref:
            zip_ref.extractall('./results')

        self.test_data = None
        try:
            with open('./results/test_data.json', 'r') as f:
                self.test_data = json.load(f)
        except OSError:
            print("No test data found in results zip. Only inference will be available.")
        except ValueError as err:
            # json.JSONDecodeError and UnicodeDecodeError are both ValueErrors.
            print(f"Could not parse ./results/test_data.json ({err}). Only inference will be available.")

    def initialize_model(self):
        """Load model and tokenizer from ``./model_files`` and move the model to GPU/CPU."""
        self.model = T5ForConditionalGeneration.from_pretrained('./model_files')
        self.tokenizer = AutoTokenizer.from_pretrained('./model_files')
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model.to(self.device)
        print(f"Model loaded and running on {self.device}")

    def initialize_metrics(self):
        """Create the ROUGE scorer (rouge1/rouge2/rougeL, stemmed) and the BLEU smoothing function."""
        self.rouge_scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
        self.smoothing = SmoothingFunction().method1

    def generate_simplified_text(self, input_text, max_length=512):
        """Generate the model's output text for ``input_text``.

        Args:
            input_text: Text to simplify.
            max_length: Limit used both for tokenizing the input (truncation
                and padding) and for the generated sequence.

        Returns:
            The first generated sequence decoded without special tokens.
        """
        inputs = self.tokenizer(input_text,
                              max_length=max_length,
                              truncation=True,
                              padding='max_length',
                              return_tensors='pt')

        # Move every input tensor to the device the model lives on.
        inputs = {k: v.to(self.device) for k, v in inputs.items()}

        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                max_length=max_length,
                num_beams=4,
                length_penalty=2.0,
                early_stopping=True
            )

        return self.tokenizer.decode(outputs[0], skip_special_tokens=True)

    def calculate_metrics(self, reference, hypothesis):
        """Return BLEU and ROUGE-1/2/L F-measures of ``hypothesis`` against ``reference``.

        BLEU is computed on whitespace-split tokens with the configured
        smoothing function.
        """
        bleu = sentence_bleu([reference.split()],
                           hypothesis.split(),
                           smoothing_function=self.smoothing)
        rouge_scores = self.rouge_scorer.score(reference, hypothesis)

        return {
            'bleu': bleu,
            'rouge1': rouge_scores['rouge1'].fmeasure,
            'rouge2': rouge_scores['rouge2'].fmeasure,
            'rougeL': rouge_scores['rougeL'].fmeasure
        }

    def evaluate_model(self):
        """Score the model on every item of ``self.test_data``.

        Each item is read through its ``'input'`` and ``'target'`` keys.

        Returns:
            ``None`` when there is no test data; otherwise ``(df, avg_metrics)``
            where ``df`` has one row per item and ``avg_metrics`` maps
            ``avg_bleu``/``avg_rouge1``/``avg_rouge2``/``avg_rougeL`` to floats.
        """
        if not self.test_data:
            print("No test data available for evaluation")
            return None

        results = []
        for item in tqdm(self.test_data, desc="Evaluating"):
            input_text = item['input']
            reference = item['target']
            prediction = self.generate_simplified_text(input_text)
            metrics = self.calculate_metrics(reference, prediction)
            results.append({
                'input': input_text,
                'reference': reference,
                'prediction': prediction,
                **metrics
            })

        df = pd.DataFrame(results)
        # Plain Python floats keep the averages straightforward to serialize.
        avg_metrics = {
            'avg_bleu': float(df['bleu'].mean()),
            'avg_rouge1': float(df['rouge1'].mean()),
            'avg_rouge2': float(df['rouge2'].mean()),
            'avg_rougeL': float(df['rougeL'].mean())
        }

        return df, avg_metrics

    def save_evaluation_results(self, df, avg_metrics, output_path='evaluation_results'):
        """Write ``detailed_results.csv`` and ``average_metrics.json`` into ``output_path``.

        The directory is created when missing.
        """
        # Imported here because this cell's import list does not include os.
        import os

        os.makedirs(output_path, exist_ok=True)
        df.to_csv(f'{output_path}/detailed_results.csv', index=False)
        with open(f'{output_path}/average_metrics.json', 'w') as f:
            json.dump(avg_metrics, f, indent=4)

    def simplify_new_text(self, text):
        """Simplify an arbitrary ``text`` with the loaded model."""
        return self.generate_simplified_text(text)

def main():
    """Evaluate the fine-tuned model and simplify one sample passage.

    Builds a ModelEvaluator from the two archives. When evaluation yields
    results, they are saved and the average metrics are printed. The sample
    legal text is then simplified and printed.
    """
    evaluator = ModelEvaluator(
        model_zip_path='/content/models-20250112T165419Z-001.zip',
        results_zip_path='/content/results-20250112T165420Z-001 (1).zip'
    )

    # evaluate_model returns None when no test data is available.
    results = evaluator.evaluate_model()
    if results is not None:
        df, avg_metrics = results
        evaluator.save_evaluation_results(df, avg_metrics)
        print("\nAverage Metrics:", json.dumps(avg_metrics, indent=2))

    text = '''WHEREAS, the parties hereto agree to the terms and conditions set forth in this Agreement; and, pursuant to Section 12.3,
 all disputes arising hereunder shall be resolved through arbitration. NOTWITHSTANDING any provision to the contrary, the obligations
  herein shall commence forthwith. HEREINAFTER, the terms shall be interpreted according to the laws of the State of California.
   Section 14. This document also includes, inter alia, provisions for confidentiality and data protection.'''
    # simplify_new_text's parameter is named ``text``; the former keyword
    # ``sample_text=`` raised TypeError, so the argument is passed positionally.
    simplified = evaluator.simplify_new_text(text)
    print("\nSimplified text:", simplified)

# Run main() only when this code executes as the top-level module.
if __name__ == "__main__":
    main()


No test data found in results zip. Only inference will be available.


OSError: Error no file named pytorch_model.bin, model.safetensors, tf_model.h5, model.ckpt.index or flax_model.msgpack found in directory ./model_files.