**Legal Document Analysis and Summarization**

**Objective:** Develop a tool that automatically analyzes and summarizes lengthy legal documents.
**Details:** Use an LLM to extract key clauses, summarize legal texts, and provide context-specific suggestions. This could be helpful for lawyers and legal professionals to quickly understand contracts, agreements, and case files.
Technology Stack: Python, Hugging Face Transformers, GPT models, Streamlit/Django for the interface.

In [None]:
!pip install streamlit
!pip install pdfplumber
!pip install langchain
!pip install langchain-community
!pip install pypdf
!pip install transformers
!pip install torch
!pip install boto3
!pip install sagemaker
!pip install pyngrok

Collecting streamlit
  Downloading streamlit-1.38.0-py2.py3-none-any.whl.metadata (8.5 kB)
Collecting tenacity<9,>=8.1.0 (from streamlit)
  Downloading tenacity-8.5.0-py3-none-any.whl.metadata (1.2 kB)
Collecting gitpython!=3.1.19,<4,>=3.0.7 (from streamlit)
  Downloading GitPython-3.1.43-py3-none-any.whl.metadata (13 kB)
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Collecting watchdog<5,>=2.1.5 (from streamlit)
  Downloading watchdog-4.0.2-py3-none-manylinux2014_x86_64.whl.metadata (38 kB)
Collecting gitdb<5,>=4.0.1 (from gitpython!=3.1.19,<4,>=3.0.7->streamlit)
  Downloading gitdb-4.0.11-py3-none-any.whl.metadata (1.2 kB)
Collecting smmap<6,>=3.0.1 (from gitdb<5,>=4.0.1->gitpython!=3.1.19,<4,>=3.0.7->streamlit)
  Downloading smmap-5.0.1-py3-none-any.whl.metadata (4.3 kB)
Downloading streamlit-1.38.0-py2.py3-none-any.whl (8.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.7/8.7 MB[0m [31m27.3 MB

In [None]:
import streamlit as st
import os
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.chains.summarize import load_summarize_chain
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import pipeline
import torch
import base64
import re


In [None]:
# Ensure pypdf is installed
try:
    import pypdf
except ImportError:
    # Install with the interpreter that is running this notebook so the
    # package lands in the environment we are importing from; a bare
    # `pip` on PATH may belong to a different Python. check_call raises
    # if the install fails instead of silently continuing.
    import subprocess
    import sys

    subprocess.check_call([sys.executable, "-m", "pip", "install", "pypdf"])
    import pypdf

In [None]:
# MODEL AND TOKENIZER
# Hugging Face Hub id of the checkpoint used for summarization.
checkpoint = "MBZUAI/LaMini-Flan-T5-248M"

# Tokenizer and model are both loaded from the same checkpoint.
tokenizer = T5Tokenizer.from_pretrained(checkpoint)
base_model = T5ForConditionalGeneration.from_pretrained(
    checkpoint,
    torch_dtype=torch.float32,
    device_map='auto',
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.50k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.53k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/142 [00:00<?, ?B/s]

In [None]:
def tokenize_function(examples):
    """Tokenize the ``'text'`` entry of *examples* with the module tokenizer.

    Inputs are truncated and padded to the tokenizer's maximum length.
    """
    raw_text = examples['text']
    encoded = tokenizer(raw_text, truncation=True, padding="max_length")
    return encoded

In [None]:
import pdfplumber

# Function to extract text from a PDF file
def extract_text_from_pdf(pdf_file):
    """Return the text of every page of *pdf_file* as one string.

    Args:
        pdf_file: Whatever ``pdfplumber.open`` accepts (it is passed through
            unchanged).

    Returns:
        The page texts joined with a newline. Pages for which
        ``extract_text()`` returns nothing are skipped.
    """
    with pdfplumber.open(pdf_file) as pdf:
        page_texts = [page.extract_text() for page in pdf.pages]
    # Join with a newline rather than "": otherwise the last word of one
    # page is fused with the first word of the next.
    return "\n".join(text for text in page_texts if text)



In [None]:
# FILE LOADER AND PROCESSING
def file_preprocessing(file):
    """Load the PDF at *file* and return its text as a single string.

    Args:
        file: Path of the PDF, passed to ``PyPDFLoader``.

    Returns:
        The ``page_content`` of every loaded page, joined with newlines.

    The text used to be cut into 200-character chunks with a 50-character
    overlap and then concatenated again. Because the chunks were glued back
    together, every overlap appeared twice in the result and words at chunk
    boundaries ran into each other. Since the caller needs one continuous
    string, the pages are loaded unsplit and joined directly.
    """
    loader = PyPDFLoader(file)
    pages = loader.load()
    return "\n".join(page.page_content for page in pages)


In [None]:
!pip install transformers
# Import the necessary modules
from transformers import TrainingArguments, Trainer



In [None]:
# LM PIPELINE
def llm_pipeline(filepath):
    """Summarize the PDF at *filepath* with the module-level model.

    Args:
        filepath: Path of the PDF to summarize.

    Returns:
        The generated summary text, or ``""`` when no text could be
        extracted from the PDF.

    Note:
        The input is truncated to the tokenizer's maximum length, so only
        the beginning of a long document contributes to the summary.
    """
    input_text = file_preprocessing(filepath)
    if not input_text.strip():
        # Nothing to summarize (e.g. an image-only PDF); running the model
        # on empty input would only produce made-up text.
        return ""

    pipe_sum = pipeline(
        'summarization',
        model = base_model,
        tokenizer = tokenizer,
        max_length = 500,
        min_length = 50
    )
    print(f"Input text for summarization: {input_text[:500]}...")
    # truncation=True keeps the input within the length the model supports
    # instead of feeding it an arbitrarily long token sequence.
    result = pipe_sum(input_text, truncation=True)
    return result[0]['summary_text']

In [None]:
#FUNCTION TO DISPLAY THE PDF OF A GIVEN FILE
def displayPDF(file):
    """Render the PDF stored at path *file* inside the Streamlit page.

    The file is read, base64-encoded and embedded in an ``<iframe>`` through
    a ``data:`` URL. Nothing is returned.

    This function is intentionally not wrapped in ``st.cache_data``: the
    cache key would be the path only, so uploading a different document
    under the same file name would replay the previously rendered PDF.
    """
    # opening file from file path
    with open(file, "rb") as f:
        base64_pdf = base64.b64encode(f.read()).decode('utf-8')

    # embedding PDF in HTML
    pdf_display = f'<iframe src="data:application/pdf;base64,{base64_pdf}" width="100%" height="600" type="application/pdf"></iframe>'

    # displaying file
    st.markdown(pdf_display, unsafe_allow_html=True)



In [None]:
# STREAMLIT CODE
# Page-level settings: wide layout and the page title.
st.set_page_config(layout= 'wide', page_title="Summarization App")

def main():
    """Streamlit entry point: upload a PDF, show it, and summarize it.

    After a file is uploaded and "Summarize" is clicked, the upload is
    saved under ``data/``, displayed in the left column, and its summary is
    shown in the right column.
    """
    st.title("DOCUMENT SUMMARIZATION APP USING LANGUAGE MODEL")

    uploaded_file = st.file_uploader("Upload your PDF file", type = ['pdf'])

    if uploaded_file is None:
        return
    if not st.button("Summarize"):
        return

    col1, col2 = st.columns(2)

    # The file name comes from the client; keep only its final component so
    # a name containing directory parts cannot write outside data/.
    safe_name = os.path.basename(uploaded_file.name)
    filepath = os.path.join("data", safe_name)

    # ensure the directory exist
    os.makedirs(os.path.dirname(filepath), exist_ok=True)

    # save the file; getvalue() returns the whole upload regardless of the
    # current read position of the buffer
    with open(filepath, "wb") as temp_file:
        temp_file.write(uploaded_file.getvalue())

    with col1:
        st.info("Uploaded PDF File")
        displayPDF(filepath)

    with col2:
        st.info("Summarized Your PDf Below")

        summary = llm_pipeline(filepath)
        st.success(summary)

# Run the app only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()


2024-09-12 13:31:16.873 
  command:

    streamlit run /usr/local/lib/python3.10/dist-packages/colab_kernel_launcher.py [ARGUMENTS]


In [None]:
# Install necessary packages
!pip install streamlit pyngrok
!pip install streamlit
!pip install pyngrok
from pyngrok import ngrok



In [None]:
# Save the Streamlit script
# The literal below is written verbatim to app.py. It is a raw string so that
# escapes such as "\n" inside the generated code reach the file unchanged.
# The file is written as UTF-8 explicitly rather than with the platform default.
with open('app.py', 'w', encoding='utf-8') as f:
    f.write(r"""
import streamlit as st
import os
import subprocess
import sys
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.chains.summarize import load_summarize_chain
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import pipeline
import torch
import base64

# STREAMLIT CODE
# Page configuration comes first so that no other Streamlit command
# (including the cached model load below) runs before it.
st.set_page_config(layout= 'wide', page_title="Summarization App")

# MODEL AND TOKENIZER
checkpoint = "MBZUAI/LaMini-Flan-T5-248M"

@st.cache_resource
def load_model():
    '''Load the tokenizer and model once and reuse them across reruns.'''
    # Streamlit re-executes this script on every interaction; without the
    # cache the checkpoint would be reloaded on each button click.
    tokenizer = T5Tokenizer.from_pretrained(checkpoint)
    base_model = T5ForConditionalGeneration.from_pretrained(checkpoint, device_map = 'auto', torch_dtype = torch.float32)
    return tokenizer, base_model

tokenizer, base_model = load_model()

import pdfplumber

# Function to extract text from a PDF file
def extract_text_from_pdf(pdf_file):
    '''Return the text of every page of pdf_file, one page per line block.'''
    with pdfplumber.open(pdf_file) as pdf:
        page_texts = [page.extract_text() for page in pdf.pages]
    # Newline between pages so words at page boundaries are not fused.
    return "\n".join(text for text in page_texts if text)

# Ensure pypdf is installed
try:
    import pypdf
except ImportError:
    # Install into the interpreter that is running this app.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "pypdf"])
    import pypdf

# FILE LOADER AND PROCESSING
def file_preprocessing(file):
    '''Load the PDF at path file and return its text as one string.'''
    # Pages are loaded unsplit: concatenating overlapping chunks would
    # duplicate every overlap in the text handed to the model.
    loader = PyPDFLoader(file)
    pages = loader.load()
    return "\n".join(page.page_content for page in pages)

# LM PIPELINE
def llm_pipeline(filepath):
    '''Summarize the PDF at filepath; return "" if it has no extractable text.'''
    input_text = file_preprocessing(filepath)
    if not input_text.strip():
        return ""
    pipe_sum = pipeline(
        'summarization',
        model = base_model,
        tokenizer = tokenizer,
        max_length = 500,
        min_length = 50
    )
    print(f"Input text for summarization: {input_text[:500]}...")
    # truncation=True keeps the input within the length the model supports.
    result = pipe_sum(input_text, truncation=True)
    result = result[0]['summary_text']
    return result

#FUNCTION TO DISPLAY THE PDF OF A GIVEN FILE
# Not cached: a cache keyed on the path would replay a stale PDF when a
# different document is uploaded under the same file name.
def displayPDF(file):
    # opening file from file path
    with open(file, "rb") as f:
        base64_pdf = base64.b64encode(f.read()).decode('utf-8')

    # embedding PDF in HTML
    pdf_display = f'<iframe src="data:application/pdf;base64,{base64_pdf}" width="100%" height="600" type="application/pdf"></iframe>'

    # displaying file
    st.markdown(pdf_display, unsafe_allow_html=True)

def main():
    st.title("DOCUMENT SUMMARIZATION APP USING LANGUAGE MODEL")

    uploaded_file = st.file_uploader("Upload your PDF file", type = ['pdf'])

    if uploaded_file is not None:
        if st.button("Summarize"):
            col1, col2 = st.columns(2)
            # The file name is client-supplied: keep only its last path
            # component so it cannot point outside data/.
            filepath = os.path.join("data", os.path.basename(uploaded_file.name))
            # ensure the directory exist
            os.makedirs(os.path.dirname(filepath), exist_ok=True)

            # save the file
            with open(filepath, "wb") as temp_file:
                temp_file.write(uploaded_file.getvalue())
            with col1:
                st.info("Uploaded PDF File")
                displayPDF(filepath)

            with col2:
                st.info("Summarized Your PDf Below")

                summary = llm_pipeline(filepath)
                st.success(summary)

if __name__ == '__main__':
    main()

""")


In [None]:
!pip install streamlit -q

In [None]:
!streamlit run app.py &>/dev/null&

1. Sign up for an ngrok account.
2. Copy your authtoken from the ngrok dashboard and supply it in the next cell.

In [None]:
!ngrok config add-authtoken 2lxjTdePIhpmmwK4wlg5BlN1qTQ_7TT67qFuKaWXcTLSwYh65
!streamlit run app.py &>/dev/null&
public_url = ngrok.connect(8501)
print(f"Streamlit app is live at {public_url}")

Note: If your documents contain any links, the server will be interrupted and the app will not function properly. Make sure that your document only contains text.