In [29]:
import os
import re
import tabula
import pdfplumber
import pytesseract
from PIL import Image
import io
import spacy
import nltk
import string
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag

# Fetch the NLTK resources the helpers below rely on
# (tokenizer models, stopword list, POS tagger, WordNet).
for nltk_resource in ('punkt', 'stopwords', 'averaged_perceptron_tagger', 'wordnet'):
    nltk.download(nltk_resource)

# spaCy pipeline used by named_entity_recognition() to read `doc.ents`
nlp = spacy.load('en_core_web_sm')

# Function for text normalization
def text_normalization(text):
    """Return a lowercase, punctuation-free, digit-free version of ``text``.

    Steps, in order: lowercase; drop every character listed in
    ``string.punctuation`` (ASCII punctuation only); remove runs of digits;
    collapse whitespace runs to a single space and trim both ends.
    """
    punctuation_stripper = str.maketrans("", "", string.punctuation)
    lowered = text.lower().translate(punctuation_stripper)
    without_digits = re.sub(r'\d+', '', lowered)
    return re.sub(r'\s+', ' ', without_digits).strip()

# Function to lemmatize tokens
def lemmatize_tokens(tokens):
    """Lemmatize ``tokens`` with WordNet, guided by their part-of-speech tags.

    The tokens are tagged with ``pos_tag``; each tag is mapped to a WordNet
    POS constant by ``get_wordnet_pos`` before lemmatizing.

    Returns a list of lemmas in the same order as the input tokens.
    """
    wn_lemmatizer = WordNetLemmatizer()
    lemmas = []
    for token, treebank_tag in pos_tag(tokens):
        lemmas.append(wn_lemmatizer.lemmatize(token, get_wordnet_pos(treebank_tag)))
    return lemmas

# Function to get WordNet POS tags
def get_wordnet_pos(treebank_tag):
    """Map a Penn Treebank POS tag to the matching WordNet POS constant.

    Only the first character of the tag is significant:
    ``J`` -> adjective, ``V`` -> verb, ``N`` -> noun, ``R`` -> adverb.
    Any other tag (including an empty string) falls back to noun.
    """
    first_letter_to_pos = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    return first_letter_to_pos.get(treebank_tag[:1], wordnet.NOUN)

# Function for Named Entity Recognition (NER) using Spacy
def named_entity_recognition(text):
    """Run the module-level spaCy pipeline ``nlp`` over ``text``.

    Returns a list of ``(entity_text, entity_label)`` tuples, one per entry
    in the resulting document's ``ents``.
    """
    found = []
    for span in nlp(text).ents:
        found.append((span.text, span.label_))
    return found

# Function to remove stopwords
def remove_stopwords(words):
    """Return ``words`` without NLTK's English stopwords, preserving order.

    Matching is exact (case-sensitive) against ``stopwords.words('english')``.
    """
    english_stopwords = frozenset(stopwords.words('english'))
    kept = []
    for token in words:
        if token not in english_stopwords:
            kept.append(token)
    return kept

# Function to extract tables using Tabula-py and capture context around the table
def extract_tables_and_context(pdf_path, page_text, page_num, previous_page_text=None, lines_above=3, lines_below=3):
    """Extract the tables on one PDF page with a few lines of surrounding text.

    Parameters:
    pdf_path: Path of the PDF, passed to ``tabula.read_pdf``.
    page_text: Text of the page being processed.
    page_num: Page number passed to tabula as ``pages``.
    previous_page_text: Text of the preceding page, or None. When truthy it
        supplies the context above the first table on this page.
    lines_above: Number of context lines to capture above each table.
    lines_below: Number of context lines to capture below each table.

    Returns:
    A list with one dict per table, with keys ``table_number`` (1-based),
    ``table_data`` (the object returned by tabula), ``context_above`` and
    ``context_below`` (newline-joined strings). Empty list when tabula finds
    no tables.

    NOTE(review): tabula is not asked for table positions here, so the
    context is only a heuristic: the table's index ``i`` and its row count
    ``len(table)`` are used as line offsets into ``page_text``. The returned
    context may therefore not be the text that actually surrounds the table.
    """
    tables = tabula.read_pdf(pdf_path, pages=page_num, multiple_tables=True)
    if not tables:
        return []

    all_lines = page_text.splitlines()
    table_list = []

    for i, table in enumerate(tables):
        if i == 0 and previous_page_text:
            previous_page_lines = previous_page_text.splitlines()
            # A bare `[-lines_above:]` is `[0:]` when lines_above == 0 and would
            # return the whole previous page instead of no context at all.
            tail = previous_page_lines[-lines_above:] if lines_above > 0 else []
            context_above = "\n".join(tail)
        else:
            context_above = "\n".join(all_lines[max(0, i - lines_above):i])

        first_line_after = i + len(table)
        context_below = "\n".join(all_lines[first_line_after:first_line_after + lines_below])

        table_list.append({
            "table_number": i + 1,
            "table_data": table,
            "context_above": context_above,
            "context_below": context_below
        })

    return table_list

# Function to extract PDF content with OCR and tables with context
def extract_pdf_content_with_ocr(pdf_path, lines_above=3, lines_below=3, ocr_resolution=300):
    """Extract the text of every page of a PDF, using OCR for pages without text.

    Pages for which ``page.extract_text()`` returns text also have their
    tables extracted via ``extract_tables_and_context``. Pages without
    extractable text are rendered to an image and passed to Tesseract; no
    table extraction is attempted for those pages.

    Parameters:
    pdf_path: Path of the PDF to read.
    lines_above: Context lines above each table (forwarded).
    lines_below: Context lines below each table (forwarded).
    ocr_resolution: Resolution (DPI) used when rendering a page for OCR.

    Returns:
    ``(full_text, table_context_data)`` where ``full_text`` is every page's
    text, each followed by a blank line, and ``table_context_data`` is a
    list of ``{"page_number": int, "tables": list}`` dicts for the pages
    that had extractable text (page numbers are 1-based).
    """
    text_parts = []
    table_context_data = []
    previous_page_text = None

    with pdfplumber.open(pdf_path) as pdf:
        for page_number, page in enumerate(pdf.pages, start=1):
            page_text = page.extract_text()

            if page_text:
                text_parts.append(page_text)
                tables_with_context = extract_tables_and_context(
                    pdf_path, page_text, page_number, previous_page_text, lines_above, lines_below
                )
                table_context_data.append({
                    "page_number": page_number,
                    "tables": tables_with_context
                })
                previous_page_text = page_text
            else:
                # `PageImage.original` is already a PIL image; wrapping it in
                # io.BytesIO (as the previous version did) raises a TypeError.
                page_image = page.to_image(resolution=ocr_resolution).original
                ocr_text = pytesseract.image_to_string(page_image)
                text_parts.append(ocr_text)
                # Keep the "previous page" context adjacent to the next page
                # rather than leaving it pointing at an older text page.
                previous_page_text = ocr_text or None

    full_text = "".join(part + "\n\n" for part in text_parts)
    return full_text, table_context_data

# Function to process PDF files and extract tables with context
def process_files(pdf_directory):
    """Preprocess every ``.pdf`` file (case-insensitive extension) in a directory.

    For each file: extract text and tables, normalize the text, tokenize,
    drop stopwords, lemmatize, and run named-entity recognition on the
    normalized text. A file that raises any exception is reported with
    ``print`` and skipped; processing continues with the next file.

    Returns a list of dicts with keys ``file_name``, ``normalized_text``,
    ``lemmatized_words``, ``named_entities`` and ``table_context_data``.

    NOTE(review): NER receives the lowercased, punctuation-stripped text;
    a cased statistical model may find fewer entities that way -- confirm
    this is intended.
    """
    pdf_names = [name for name in os.listdir(pdf_directory) if name.lower().endswith('.pdf')]
    results = []

    for file_name in pdf_names:
        pdf_path = os.path.join(pdf_directory, file_name)

        try:
            raw_text, table_context_data = extract_pdf_content_with_ocr(pdf_path)
            normalized_text = text_normalization(raw_text)
            content_tokens = remove_stopwords(word_tokenize(normalized_text))

            record = {
                "file_name": file_name,
                "normalized_text": normalized_text,
                "lemmatized_words": list(lemmatize_tokens(content_tokens)),
                "named_entities": named_entity_recognition(normalized_text),
                "table_context_data": table_context_data
            }
            results.append(record)
        except Exception as exc:
            # Best effort: one unreadable PDF must not abort the whole batch.
            print(f"Error processing file {file_name}: {exc}")

    return results


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [31]:
import faiss
import numpy as np
from sklearn.cluster import KMeans

# Function to initialize FAISS index
def initialize_faiss_index(embedding_dimension):
    """Create and return an empty ``faiss.IndexFlatL2`` of the given dimension."""
    return faiss.IndexFlatL2(embedding_dimension)

# Function to add document embeddings to the FAISS index
def add_embeddings_to_faiss(index, embeddings):
    """Add ``embeddings`` to ``index`` as a float32 array and report the count.

    ``embeddings`` is converted with ``np.array`` and cast to ``np.float32``
    before being handed to ``index.add``. Prints how many were added;
    returns None.
    """
    embedding_matrix = np.array(embeddings)
    index.add(embedding_matrix.astype(np.float32))
    print(f"Added {len(embeddings)} embeddings to the FAISS index.")

# Function to search within FAISS index for top-k documents
def search_faiss(index, query_embedding, top_k=5):
    """Query ``index`` for the ``top_k`` nearest neighbours of one embedding.

    The single ``query_embedding`` is wrapped into a one-row float32 batch
    before calling ``index.search``.

    Returns ``(distances, indices)`` exactly as produced by ``index.search``.
    """
    query_batch = np.array([query_embedding]).astype(np.float32)
    neighbour_distances, neighbour_ids = index.search(query_batch, top_k)
    return neighbour_distances, neighbour_ids

# Function to perform K-Means clustering and return clusters
def perform_kmeans_clustering(embeddings, num_clusters):
    """Cluster ``embeddings`` into ``num_clusters`` groups with K-Means.

    The estimator is created with ``random_state=42`` for repeatable runs.

    Returns ``(labels, model)``: the result of ``fit_predict`` and the
    fitted ``KMeans`` instance.
    """
    model = KMeans(n_clusters=num_clusters, random_state=42)
    labels = model.fit_predict(embeddings)
    return labels, model


In [33]:
from transformers import AutoTokenizer, TFAutoModel
import tensorflow as tf

# Load BERT tokenizer and model.
# `tokenizer` and `bert_model` are module-level objects read by
# get_document_embedding() and get_bert_embeddings(). Both are loaded from the
# same checkpoint name (`model_name`).
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
bert_model = TFAutoModel.from_pretrained(model_name)

# Function to get document embedding
def get_document_embedding(text):
    """Embed ``text`` as the mean of BERT's last-layer token states.

    The text is tokenized with ``truncation=True``, so input beyond the
    tokenizer's maximum length does not contribute to the embedding.
    Token states are averaged over the sequence axis and singleton
    dimensions are squeezed away before returning the NumPy array.
    """
    encoded = tokenizer(text, return_tensors="tf", padding=True, truncation=True)
    token_states = bert_model(encoded['input_ids']).last_hidden_state

    pooled = tf.reduce_mean(token_states, axis=1).numpy()
    return np.squeeze(pooled)

# Step to prepare FAISS index and K-Means clusters
def prepare_faiss_and_clusters(all_preprocessed_data, num_clusters):
    """Embed every document, index the embeddings in FAISS and cluster them.

    Parameters:
    all_preprocessed_data: List of dicts, each with a ``normalized_text`` key.
    num_clusters: Number of K-Means clusters to fit.

    Returns:
    ``(faiss_index, all_document_embeddings, clusters, kmeans_model)``.
    ``all_document_embeddings`` and ``clusters`` are aligned with the order
    of ``all_preprocessed_data``.

    Raises:
    ValueError: if ``all_preprocessed_data`` is empty.
    """
    if not all_preprocessed_data:
        raise ValueError("No preprocessed documents to embed; cannot build the index or clusters.")

    all_document_embeddings = [get_document_embedding(data['normalized_text']) for data in all_preprocessed_data]

    # Size the index from the embeddings themselves instead of hard-coding 768,
    # so a different encoder checkpoint does not silently break the index.
    embedding_dimension = len(all_document_embeddings[0])
    faiss_index = initialize_faiss_index(embedding_dimension=embedding_dimension)
    add_embeddings_to_faiss(faiss_index, all_document_embeddings)

    # Perform K-Means clustering on the embeddings
    clusters, kmeans_model = perform_kmeans_clustering(all_document_embeddings, num_clusters)

    return faiss_index, all_document_embeddings, clusters, kmeans_model


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [35]:
from sklearn.metrics.pairwise import cosine_similarity

# Function to get BERT embeddings for sentences
def get_bert_embeddings(sentences):
    """Embed a batch of sentences as masked means of BERT's last-layer states.

    Sentences are tokenized together with ``padding=True``, so shorter
    sentences are padded to the longest one. The attention mask is passed
    to the model and used for pooling, so padding tokens neither influence
    the token states nor dilute the per-sentence average.

    Parameters:
    sentences: List of strings (a single string is also accepted by the
        tokenizer).

    Returns:
    NumPy array with one row per sentence. For an empty list, an array of
    shape ``(0, hidden_size)`` is returned without calling the model.
    """
    if len(sentences) == 0:
        return np.empty((0, bert_model.config.hidden_size), dtype=np.float32)

    inputs = tokenizer(sentences, return_tensors="tf", padding=True, truncation=True)
    attention_mask = inputs['attention_mask']
    outputs = bert_model(input_ids=inputs['input_ids'], attention_mask=attention_mask)
    token_states = outputs.last_hidden_state

    # Zero out padded positions, then divide by the number of real tokens.
    mask = tf.cast(tf.expand_dims(attention_mask, axis=-1), token_states.dtype)
    summed_states = tf.reduce_sum(token_states * mask, axis=1)
    token_counts = tf.maximum(tf.reduce_sum(mask, axis=1), 1.0)

    return (summed_states / token_counts).numpy()

# Function for BERT-based extractive summarization with Named Entities Priority
def bert_extractive_summary(normalized_text, named_entities, max_sentences=3):
    """Pick the highest-scoring sentences of a text as an extractive summary.

    Each sentence is scored as its cosine similarity to the whole-document
    embedding plus the number of named entities whose text occurs in the
    sentence (substring match). The top ``max_sentences`` sentences are
    joined with spaces in descending score order, not document order.

    Parameters:
    normalized_text: Text to summarize; split with ``sent_tokenize``.
    named_entities: Iterable of ``(entity_text, label)`` pairs; only the
        text (element 0) is used.
    max_sentences: Maximum number of sentences in the summary.

    Returns:
    The summary string, or ``""`` when the text yields no sentences.

    NOTE(review): ``text_normalization`` strips punctuation, so text that
    went through it may come back from ``sent_tokenize`` as very few (or
    one) sentences -- confirm the intended input.
    """
    sentences = sent_tokenize(normalized_text)
    if not sentences:
        # Nothing to rank; embedding an empty batch would fail downstream.
        return ""

    # Get embeddings for sentences and document
    sentence_embeddings = get_bert_embeddings(sentences)
    document_embedding = get_document_embedding(normalized_text)

    # Cosine similarity between each sentence and the document embedding
    similarity_scores = cosine_similarity(sentence_embeddings, document_embedding.reshape(1, -1)).flatten()

    # One extra point per named entity mentioned in the sentence
    entity_texts = [entity[0] for entity in named_entities]
    entity_scores = np.array([
        sum(1 for entity_text in entity_texts if entity_text in sentence)
        for sentence in sentences
    ])

    final_scores = similarity_scores + entity_scores
    best_first = np.argsort(final_scores)[::-1][:max_sentences]
    return " ".join(sentences[i] for i in best_first)


In [37]:
import requests
import json

# Endpoint that generate_llama_summary() POSTs to for the abstractive summary.
# Points at a server on this machine (127.0.0.1:11434), which must be running
# before any summary is requested.
OLLAMA_API_URL = "http://127.0.0.1:11434/api/generate"

# Function to generate an abstractive summary using LLaMA 3.1
def generate_llama_summary(text, model="llama3.1", timeout=(10, 600)):
    """Generate an abstractive summary of ``text`` via the API at OLLAMA_API_URL.

    The response is read as a stream of JSON lines; the ``response`` field
    of each line is concatenated until a line with a truthy ``done`` field
    is seen. Lines that are not valid JSON are reported and skipped.

    Parameters:
    text: Text to summarize (embedded in the prompt).
    model: Model name sent in the request payload.
    timeout: ``(connect, read)`` timeout in seconds passed to ``requests``,
        so an unreachable or stalled server cannot hang the notebook forever.

    Returns:
    The stripped summary text, or the string
    ``"Error in generating summary."`` when the server answers with a
    non-200 status or the request itself fails.
    """
    payload = {
        "model": model,
        "prompt": f"Summarize the following text:\n\n{text}\n\nAbstract Summary:"
    }

    try:
        # The context manager releases the streamed connection even when we
        # stop reading early (break on "done") or an error occurs.
        with requests.post(OLLAMA_API_URL, json=payload, stream=True, timeout=timeout) as response:
            if response.status_code != 200:
                print(f"Error: {response.status_code}, {response.text}")
                return "Error in generating summary."

            fragments = []
            for line in response.iter_lines():
                if not line:
                    continue
                try:
                    chunk = json.loads(line.decode('utf-8'))
                except json.JSONDecodeError as e:
                    print(f"Error decoding JSON line: {line}, Error: {e}")
                    continue
                fragments.append(chunk.get("response", ""))
                if chunk.get("done", False):
                    break

            return "".join(fragments).strip()
    except requests.RequestException as e:
        # Same sentinel as the non-200 path so callers see one failure shape.
        print(f"Error: request to {OLLAMA_API_URL} failed: {e}")
        return "Error in generating summary."


In [70]:
def dynamic_summary_mode(all_preprocessed_data, faiss_index, clusters, kmeans_model, mode="query", query=None, category="IP", top_k=2, document_embeddings=None):
    """
    Generates and prints summaries for a user query or for every document.

    In "query" mode the query is embedded, assigned to a K-Means cluster, and
    the nearest documents inside that cluster are summarized abstractively.
    In "full" mode every document gets an extractive and an abstractive
    summary. Results are printed; the function returns None.

    Parameters:
    all_preprocessed_data: List of preprocessed document dicts
        (uses ``file_name``, ``normalized_text``, ``named_entities``).
    faiss_index: FAISS index over all documents. Currently not used by this
        function; kept for interface compatibility.
    clusters: Cluster label per document, aligned with all_preprocessed_data.
    kmeans_model: Trained K-Means model used to place the query in a cluster.
    mode: "query" for query-based summary, "full" for full-document summaries.
    query: User query text; required in "query" mode.
    category: Currently not used by this function.
    top_k: Maximum number of documents to summarize in "query" mode.
    document_embeddings: Optional precomputed embeddings aligned with
        all_preprocessed_data. When given they are reused; otherwise the
        embeddings of the cluster's documents are recomputed.
    """

    if mode == "query" and query:
        print(f"Processing query: {query}")
        query_embedding = get_document_embedding(query)

        # Predict the cluster the query belongs to
        query_cluster = kmeans_model.predict([query_embedding])[0]
        print(f"Query belongs to cluster: {query_cluster}")

        # Positions (in all_preprocessed_data) of documents in the same cluster
        relevant_indices = [i for i, cluster in enumerate(clusters) if cluster == query_cluster]

        # FAISS pads missing neighbours with -1 when asked for more results than
        # it holds, so never request more than the cluster contains.
        effective_top_k = min(top_k, len(relevant_indices))
        if effective_top_k < 1:
            print(f"No documents to retrieve for cluster {query_cluster} with top_k={top_k}.")
            return

        if document_embeddings is not None:
            # Reuse the precomputed embeddings instead of re-running the encoder
            cluster_embeddings = np.array([document_embeddings[i] for i in relevant_indices])
        else:
            cluster_embeddings = np.array([
                get_document_embedding(all_preprocessed_data[i]['normalized_text'])
                for i in relevant_indices
            ])

        # Build a FAISS index holding only this cluster's documents
        cluster_faiss_index = initialize_faiss_index(embedding_dimension=cluster_embeddings.shape[1])
        add_embeddings_to_faiss(cluster_faiss_index, cluster_embeddings)

        # Search within this cluster in FAISS
        _, cluster_specific_indices = search_faiss(cluster_faiss_index, query_embedding, top_k=effective_top_k)

        for i in cluster_specific_indices[0]:
            if i < 0:
                # -1 means "no neighbour"; as a list index it would silently
                # select the last document of the cluster.
                continue
            print(f"Cluster-specific FAISS index: {i}")
            original_index = relevant_indices[i]
            doc = all_preprocessed_data[original_index]

            print(f"\nDocument: {doc['file_name']}\n")

            # Only the abstractive summary is produced in query mode.
            abstractive_summary = generate_llama_summary(doc['normalized_text'])
            print(f"Abstractive Summary: {abstractive_summary}\n")

    elif mode == "full":
        print("Generating full summaries for all documents...")

        for doc in all_preprocessed_data:
            extractive_summary = bert_extractive_summary(doc['normalized_text'], doc['named_entities'])
            abstractive_summary = generate_llama_summary(doc['normalized_text'])
            print(f"\nDocument: {doc['file_name']}\nExtractive Summary: {extractive_summary}\nAbstractive Summary: {abstractive_summary}\n")

    else:
        print(f"Invalid mode '{mode}' or missing query.")


In [45]:
# Example usage for "IP" category of documents.
# The folder can be overridden with the LEGAL_PDF_DIR environment variable so the
# notebook is not tied to one machine; the default is the original local path.
pdf_directory = os.environ.get(
    "LEGAL_PDF_DIR",
    "D:\\AI_ML - PG\\Capstone Project - Automated Legal document Segmentation\\files_export\\data\\Documents\\IP",
)

# Step 1: Preprocess the files (text/table extraction, normalization, lemmatization, NER)
all_preprocessed_data = process_files(pdf_directory)



# # Step 4: Full-document summarization without querying FAISS
# dynamic_summary_mode(all_preprocessed_data=all_preprocessed_data, faiss_index=faiss_index, clusters=clusters, kmeans_model=kmeans_model, mode="full")


Failed to import jpype dependencies. Fallback to subprocess.
No module named 'jpype'
Got stderr: Sep 21, 2024 4:46:31 PM org.apache.pdfbox.pdmodel.font.PDFont loadUnicodeCmap
Sep 21, 2024 4:46:31 PM org.apache.pdfbox.pdmodel.font.PDFont loadUnicodeCmap
Sep 21, 2024 4:46:31 PM org.apache.pdfbox.pdmodel.font.PDFont loadUnicodeCmap
Sep 21, 2024 4:46:31 PM org.apache.pdfbox.pdmodel.font.PDFont loadUnicodeCmap

Got stderr: Sep 21, 2024 4:46:32 PM org.apache.pdfbox.pdmodel.font.PDFont loadUnicodeCmap
Sep 21, 2024 4:46:32 PM org.apache.pdfbox.pdmodel.font.PDFont loadUnicodeCmap
Sep 21, 2024 4:46:33 PM org.apache.pdfbox.pdmodel.font.PDFont loadUnicodeCmap
Sep 21, 2024 4:46:33 PM org.apache.pdfbox.pdmodel.font.PDFont loadUnicodeCmap

Got stderr: Sep 21, 2024 4:46:34 PM org.apache.pdfbox.pdmodel.font.PDFont loadUnicodeCmap
Sep 21, 2024 4:46:34 PM org.apache.pdfbox.pdmodel.font.PDFont loadUnicodeCmap
Sep 21, 2024 4:46:34 PM org.apache.pdfbox.pdmodel.font.PDFont loadUnicodeCmap
Sep 21, 2024 4:46:3

In [47]:
# Preview each preprocessed document: file name plus the first 500 characters
# of its normalized text.
for record in all_preprocessed_data:
    print(f"\nFile: {record['file_name']}")
    text_preview = record['normalized_text'][:500]
    print(f"Normalized Text:\n{text_preview}")
    # Optional deeper inspection (disabled):
    # print(f"Lemmatized Words:\n{record['lemmatized_words'][:20]}")
    # print(f"Named Entities:\n{record['named_entities']}")
    # for context_data in record['table_context_data']:
    #     print(f"Page {context_data['page_number']}:")
    #     for table in context_data['tables']:
    #         print(f"Table {table['table_number']}:\n{table['table_data']}")
    #         print(f"Context Above:\n{table['context_above']}")
    #         print(f"Context Below:\n{table['context_below']}")


File: ARMSTRONGFLOORING,INC_01_07_2019-EX-10.2-INTELLECTUAL PROPERTY AGREEMENT.PDF
Normalized Text:
exhibit execution version intellectual property agreement this intellectual property agreement this “agreement” dated as of december the “effective date” is entered into by and between armstrong flooring inc a delaware corporation “seller” and afi licensing llc a delaware limited liability company “licensing” and together with seller “arizona” and ahf holding inc formerly known as tarzan holdco inc a delaware corporation “buyer” and armstrong hardwood flooring company a tennessee corporation the

File: ArmstrongFlooringInc_20190107_8-K_EX-10.2_11471795_EX-10.2_Intellectual Property Agreement.pdf
Normalized Text:
exhibit execution version intellectual property agreement this intellectual property agreement this “agreement” dated as of december the “effective date” is entered into by and between armstrong flooring inc a delaware corporation “seller” and afi licensing llc a delaware limite

In [49]:
# Step 2: Prepare FAISS index and K-Means clusters.
# Requires `all_preprocessed_data` from Step 1. The four results are notebook-level
# names consumed by the query step below.
num_clusters = 2  # number of K-Means clusters (example value)
faiss_index, all_document_embeddings, clusters, kmeans_model = prepare_faiss_and_clusters(all_preprocessed_data, num_clusters)

Added 14 embeddings to the FAISS index.




In [72]:
# Step 3: Run dynamic summary mode using FAISS and clusters for query-based summarization.
# Requires the outputs of Step 2. Each retrieved document triggers a call to the
# summarization server, so this cell can take a long time to finish.
query = "intellectual property law"
dynamic_summary_mode(all_preprocessed_data=all_preprocessed_data, faiss_index=faiss_index, clusters=clusters, kmeans_model=kmeans_model, mode="query", query=query, top_k=5)


Processing query: intellectual property law
Query belongs to cluster: 1
Added 11 embeddings to the FAISS index.
luster-specific FAISS index: 3

Document: INGEVITYCORP_05_16_2016-EX-10.5-INTELLECTUAL PROPERTY AGREEMENT.PDF

Abstractive Summary: This Intellectual Property Agreement is between WestRock Company (Parent) and Ingenuity Corporation (Spinco), their respective parent groups, and other related entities. The agreement governs the transfer of intellectual property assets from Parent to Spinco.

Key Points:

1. **Trade Secrets**: Information that is a trade secret under applicable law will have its five-year confidentiality period extended until such time as the received information is no longer considered trade secret.
2. **Further Assurances**: Each party agrees to take actions necessary to consummate and make effective the transactions contemplated by this agreement, including cooperating with each other and executing or delivering all required instruments and filings.
3. **Term


KeyboardInterrupt



In [74]:
# Export this notebook to HTML (shell command; writes Capstone_w3w4_VK.html next to the notebook)
!jupyter nbconvert --to html Capstone_w3w4_VK.ipynb

[NbConvertApp] Converting notebook Capstone_w3w4_VK.ipynb to html
[NbConvertApp] Writing 370588 bytes to Capstone_w3w4_VK.html
