In [1]:
import io
import os
import re
import string

import nltk
import pandas as pd
import pdfplumber
import pytesseract
import spacy
import tabula
from nltk import pos_tag
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize
from PIL import Image
from sklearn.metrics.pairwise import cosine_similarity

# Fetch the NLTK resources this notebook relies on: sentence/word tokenizer data,
# the stop-word lists, the POS tagger model and the WordNet database.
for _nltk_resource in ('punkt', 'stopwords', 'averaged_perceptron_tagger', 'wordnet'):
    nltk.download(_nltk_resource)

# spaCy's small English pipeline supplies the named-entity recognizer used below.
nlp = spacy.load('en_core_web_sm')

# Function for text normalization
def text_normalization(text):
    """Lowercase ``text`` and strip punctuation, digits and surplus whitespace.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string: lowercase, with every ``string.punctuation``
        character and every run of digits removed, whitespace runs collapsed
        to a single space, and no leading or trailing whitespace.
    """
    punctuation_remover = str.maketrans("", "", string.punctuation)
    cleaned = text.lower().translate(punctuation_remover)
    # Order matters: digits are dropped first so the gaps they leave behind
    # are squeezed by the whitespace pass.
    for pattern, replacement in ((r'\d+', ''), (r'\s+', ' ')):
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

# Function to lemmatize tokens
def lemmatize_tokens(tokens):
    """Lemmatize each token using its part-of-speech tag.

    Args:
        tokens: List of word strings; it is tagged as a whole by ``pos_tag``.

    Returns:
        A list with one lemma per input token, in the same order. Each token
        is lemmatized with the WordNet POS that ``get_wordnet_pos`` derives
        from its tag.
    """
    wn_lemmatizer = WordNetLemmatizer()
    lemmas = []
    for token, treebank_tag in pos_tag(tokens):
        lemmas.append(wn_lemmatizer.lemmatize(token, get_wordnet_pos(treebank_tag)))
    return lemmas

# Function to get WordNet POS tags
def get_wordnet_pos(treebank_tag):
    """Translate a POS tag into the matching WordNet POS constant.

    Only the first character of the tag is inspected: ``J`` maps to adjective,
    ``V`` to verb, ``N`` to noun and ``R`` to adverb.

    Args:
        treebank_tag: POS tag string, as produced by ``pos_tag``.

    Returns:
        The corresponding ``wordnet`` constant; ``wordnet.NOUN`` for any tag
        with a different (or missing) first character.
    """
    prefix_to_pos = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    # Slicing (rather than indexing) keeps the empty tag on the noun fallback.
    return prefix_to_pos.get(treebank_tag[:1], wordnet.NOUN)

# Function for Named Entity Recognition (NER) using Spacy
def named_entity_recognition(text):
    """Run the module-level spaCy pipeline over ``text`` and list its entities.

    Args:
        text: The string to analyse.

    Returns:
        A list of ``(entity_text, entity_label)`` tuples, one per entity in
        the parsed document's ``ents``, in document order.
    """
    found = []
    for span in nlp(text).ents:
        found.append((span.text, span.label_))
    return found

# Function to extract tables using Tabula-py and capture context around the table
def extract_tables_and_context(pdf_path, page_text, page_num, previous_page_text=None, lines_above=3, lines_below=3):
    """Extract the tables on one PDF page and attach nearby text lines as context.

    Args:
        pdf_path: Path of the PDF, passed to ``tabula.read_pdf``.
        page_text: Text of the page; ``None`` or ``""`` is treated as a page
            without text lines.
        page_num: Page number passed to tabula as ``pages``.
        previous_page_text: Optional text of the preceding page. When given,
            the first table's ``context_above`` is taken from the end of it.
        lines_above: Maximum number of context lines placed before a table.
            Negative values are treated as 0.
        lines_below: Maximum number of context lines placed after a table.
            Negative values are treated as 0.

    Returns:
        A list with one dict per table, holding ``table_number`` (1-based),
        ``table_data`` (the object tabula returned), ``context_above`` and
        ``context_below`` (newline-joined lines). Empty when tabula finds no
        table.
    """
    tables = tabula.read_pdf(pdf_path, pages=page_num, multiple_tables=True)
    if not tables:
        return []

    all_lines = (page_text or "").splitlines()
    previous_page_lines = previous_page_text.splitlines() if previous_page_text else []
    lines_above = max(0, lines_above)
    lines_below = max(0, lines_below)

    table_list = []
    for i, table in enumerate(tables):
        # NOTE(review): the table's ordinal ``i`` and its row count stand in for
        # its position in the page text. Nothing here ties the table to the
        # lines it actually occupies, so the context is only approximate.
        if i == 0 and previous_page_text:
            # ``seq[-0:]`` is the whole sequence, so a zero-line window has to
            # be handled explicitly or the entire previous page is returned.
            above = previous_page_lines[-lines_above:] if lines_above > 0 else []
        else:
            above = all_lines[max(0, i - lines_above):i]
        below_start = i + len(table)
        below = all_lines[below_start:below_start + lines_below]
        table_list.append({
            "table_number": i + 1,
            "table_data": table,
            "context_above": "\n".join(above),
            "context_below": "\n".join(below)
        })
    return table_list

# Function to drop stop words from a token list
def remove_stopwords(tokens, stop_words=None):
    """Return ``tokens`` without stop words, keeping the original order.

    Args:
        tokens: Iterable of word strings.
        stop_words: Optional collection of lowercase words to drop. When
            omitted, NLTK's English stop-word list is loaded.

    Returns:
        A list of the tokens whose lowercase form is not in ``stop_words``.
    """
    if stop_words is None:
        stop_words = set(stopwords.words('english'))
    return [token for token in tokens if token.lower() not in stop_words]


# Function to process PDF files and extract tables with context
def process_files(pdf_directory):
    """Preprocess every PDF in a directory, producing one record per page with text.

    For each page whose text can be extracted, the page is normalized,
    tokenized, stripped of stop words and lemmatized; named entities and
    tables (with surrounding context lines) are collected as well.

    Args:
        pdf_directory: Directory scanned (non-recursively) for files whose
            name ends in ``.pdf``, case-insensitively.

    Returns:
        A list of dicts with the keys ``file_name``, ``page_number`` (1-based),
        ``normalized_text``, ``lemmatized_words``, ``named_entities`` and
        ``table_context_data``. Pages without extractable text are skipped.
        A file that raises while being processed is reported on stdout and
        skipped; records gathered from it before the failure are kept.
    """
    # Sorted so that record order (and anything derived from it, such as
    # clustering) does not depend on the directory listing order.
    file_names = sorted(f for f in os.listdir(pdf_directory) if f.lower().endswith('.pdf'))
    all_preprocessed_data = []
    if not file_names:
        return all_preprocessed_data

    # Loaded once, outside the per-file error handling: a missing stop-word
    # corpus should fail loudly rather than be reported once per file.
    english_stop_words = set(stopwords.words('english'))

    for file_name in file_names:
        pdf_path = os.path.join(pdf_directory, file_name)
        try:
            with pdfplumber.open(pdf_path) as pdf:
                for page_num, page in enumerate(pdf.pages):
                    page_text = page.extract_text()
                    if not page_text:
                        continue
                    table_context_data = extract_tables_and_context(pdf_path, page_text, page_num + 1)
                    normalized_text = text_normalization(page_text)
                    words = word_tokenize(normalized_text)
                    words = remove_stopwords(words, english_stop_words)
                    lemmatized_words = lemmatize_tokens(words)
                    # Entities are taken from the raw page text: normalization
                    # lowercases it and strips punctuation, which removes the
                    # casing cues the NER model depends on.
                    named_entities = named_entity_recognition(page_text)
                    all_preprocessed_data.append({
                        "file_name": file_name,
                        "page_number": page_num + 1,
                        "normalized_text": normalized_text,
                        "lemmatized_words": lemmatized_words,
                        "named_entities": named_entities,
                        "table_context_data": table_context_data
                    })
        except Exception as e:
            # Best effort per file: report the failure and move on.
            print(f"Error processing file {file_name}: {e}")
    return all_preprocessed_data


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
from transformers import AutoTokenizer, TFAutoModel
import tensorflow as tf
import numpy as np

# The tokenizer and the encoder must come from the same pretrained checkpoint,
# so the checkpoint name is spelled out once and shared by both loads.
_BERT_CHECKPOINT = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(_BERT_CHECKPOINT)
bert_model = TFAutoModel.from_pretrained(_BERT_CHECKPOINT)

# Function to generate document embeddings
def get_document_embedding(normalized_text, named_entities, table_context_data):
    """Embed a document by mean-pooling the BERT output over its tokens.

    The text fed to the tokenizer is the normalized text, followed by the
    entity strings and the context lines recorded around each table.

    Args:
        normalized_text: Main document text.
        named_entities: Sequence of entity tuples; only element 0 (the entity
            text) is used. ``None`` is treated as empty.
        table_context_data: Table context entries. Each entry is either a
            table dict with ``context_above`` / ``context_below`` (the shape
            ``process_files`` produces) or a page dict holding such table
            dicts under ``"tables"``. ``None`` is treated as empty.

    Returns:
        The NumPy array obtained by averaging ``last_hidden_state`` over
        axis 1, i.e. one row per tokenized input (a single row here).
    """
    entity_text = " ".join(entity[0] for entity in named_entities or [])

    context_parts = []
    for entry in table_context_data or []:
        # Accept both the flat list of tables and the page -> "tables" nesting.
        tables_in_entry = entry["tables"] if "tables" in entry else [entry]
        for table in tables_in_entry:
            context_parts.append(table.get("context_above", "") + " " + table.get("context_below", ""))
    table_text = " ".join(context_parts)

    combined_text = normalized_text + " " + entity_text + " " + table_text
    # truncation=True cuts the input at the tokenizer's maximum length, so
    # text beyond that limit does not contribute to the embedding.
    inputs = tokenizer(combined_text, return_tensors="tf", padding=True, truncation=True)
    outputs = bert_model(inputs['input_ids'])
    embedding = tf.reduce_mean(outputs.last_hidden_state, axis=1).numpy()
    return embedding


In [None]:
import faiss
from sklearn.cluster import KMeans

# Function to initialize FAISS and perform clustering
def prepare_faiss_and_clusters(all_preprocessed_data, num_clusters):
    """Embed every record, index the embeddings with FAISS and cluster them.

    Args:
        all_preprocessed_data: Non-empty list of records with the keys
            ``normalized_text``, ``named_entities`` and ``table_context_data``.
        num_clusters: Number of K-Means clusters.

    Returns:
        A tuple ``(faiss_index, all_document_embeddings, clusters, kmeans_model)``
        where ``all_document_embeddings`` is a 2-D float32 array with one row
        per record and ``clusters`` holds the label K-Means assigned to each
        row.

    Raises:
        ValueError: If ``all_preprocessed_data`` is empty.
    """
    if not all_preprocessed_data:
        raise ValueError("all_preprocessed_data is empty; there is nothing to index or cluster")

    # get_document_embedding returns a (1, hidden) array per record. Each one
    # is flattened so the stack is a 2-D (records, hidden) matrix; stacking
    # them as-is yields a 3-D array whose second axis has length 1.
    embedding_rows = [
        np.asarray(
            get_document_embedding(
                data['normalized_text'],
                data['named_entities'],
                data['table_context_data']
            ),
            dtype=np.float32,
        ).reshape(-1)
        for data in all_preprocessed_data
    ]
    all_document_embeddings = np.vstack(embedding_rows)

    # Initialize FAISS index and add embeddings
    embedding_dimension = all_document_embeddings.shape[1]
    faiss_index = faiss.IndexFlatL2(embedding_dimension)
    faiss_index.add(all_document_embeddings)

    # Perform K-Means clustering (fixed seed for reproducible assignments)
    kmeans_model = KMeans(n_clusters=num_clusters, random_state=42)
    clusters = kmeans_model.fit_predict(all_document_embeddings)

    return faiss_index, all_document_embeddings, clusters, kmeans_model


In [None]:
import requests
import json

# Function for BERT-based extractive summarization
def bert_extractive_summary(normalized_text, named_entities, max_sentences=3):
    """Build an extractive summary from the sentences closest to the document.

    Every sentence is embedded on its own, compared with the embedding of the
    whole document by cosine similarity, and the best-scoring sentences are
    joined with single spaces.

    Args:
        normalized_text: Text to summarize. Sentences are found with
            ``sent_tokenize``.
            NOTE(review): text that went through ``text_normalization`` has no
            punctuation left, so it will presumably come back as one single
            sentence — confirm what callers pass here.
        named_entities: Entity tuples forwarded to ``get_document_embedding``
            for the document-level embedding.
        max_sentences: Upper bound on the number of sentences returned.

    Returns:
        The selected sentences in descending similarity order, joined by
        spaces; ``""`` when the text contains no sentence.
    """
    sentences = sent_tokenize(normalized_text) if normalized_text else []
    if not sentences:
        return ""

    # Sentence vectors come from the same encoder as the document vector so
    # the two live in the same space; each is reshaped to a single row.
    sentence_embeddings = np.vstack([
        np.asarray(get_document_embedding(sentence, [], [])).reshape(1, -1)
        for sentence in sentences
    ])
    document_embedding = np.asarray(get_document_embedding(normalized_text, named_entities, []))
    similarity_scores = cosine_similarity(sentence_embeddings, document_embedding.reshape(1, -1)).flatten()
    ranked_sentences = [sentences[i] for i in np.argsort(similarity_scores)[::-1]]
    return " ".join(ranked_sentences[:max_sentences])

# Function for LLaMA-based abstractive summarization
def generate_llama_summary_with_context(normalized_text, named_entities, table_context_data):
    """Ask the local LLaMA endpoint for an abstractive summary of a document.

    The prompt contains the normalized text, the entity strings and the
    context lines recorded around each table. The endpoint's streamed reply
    is read line by line and its ``response`` fragments are concatenated.

    Args:
        normalized_text: Main document text.
        named_entities: Sequence of entity tuples; only element 0 (the entity
            text) is used. ``None`` is treated as empty.
        table_context_data: Table context entries. Each entry is either a
            table dict with ``context_above`` / ``context_below`` (the shape
            ``process_files`` produces) or a page dict holding such table
            dicts under ``"tables"``. ``None`` is treated as empty.

    Returns:
        The concatenated response text with surrounding whitespace stripped.

    Raises:
        requests.HTTPError: If the endpoint answers with an error status.
    """
    entity_text = " ".join(entity[0] for entity in named_entities or [])

    context_parts = []
    for entry in table_context_data or []:
        # Accept both the flat list of tables and the page -> "tables" nesting.
        tables_in_entry = entry["tables"] if "tables" in entry else [entry]
        for table in tables_in_entry:
            context_parts.append(table.get("context_above", "") + " " + table.get("context_below", ""))
    table_text = " ".join(context_parts)

    combined_text = normalized_text + "\n\nNamed Entities: " + entity_text + "\n\nTable Context: " + table_text

    headers = {'Content-Type': 'application/json'}
    payload = {"model": "llama3.1", "prompt": f"Summarize the following text:\n\n{combined_text}\n\nAbstract Summary:"}

    summary_parts = []
    # The context manager releases the streamed connection even when reading
    # stops early (at the "done" marker) or an exception is raised.
    with requests.post("http://127.0.0.1:11434/api/generate", headers=headers, data=json.dumps(payload), stream=True) as response:
        # Surface HTTP failures instead of returning an empty summary.
        response.raise_for_status()
        for line in response.iter_lines():
            if not line:
                continue
            try:
                chunk = json.loads(line.decode('utf-8'))
            except json.JSONDecodeError:
                # Skip lines that are not valid JSON and keep reading.
                continue
            summary_parts.append(chunk.get("response", ""))
            if chunk.get("done", False):
                break

    return "".join(summary_parts).strip()


In [None]:
def dynamic_summary_mode(all_preprocessed_data, faiss_index, clusters, kmeans_model, mode="query", query=None, top_k=5):
    """Print LLaMA summaries, either for the best matches of a query or for every record.

    In ``"query"`` mode the query is assigned to a K-Means cluster, the
    records of that cluster are searched for the nearest ``top_k`` and each
    hit is summarized. In ``"full"`` mode every record is summarized.

    NOTE(review): ``preprocess_query``, ``initialize_faiss_index``,
    ``add_embeddings_to_faiss`` and ``search_faiss`` are called here but are
    not defined in the visible notebook cells — confirm where they come from.

    Args:
        all_preprocessed_data: List of records as produced by ``process_files``.
        faiss_index: Global FAISS index; not used by this function.
        clusters: Cluster label of each record, aligned with
            ``all_preprocessed_data``.
        kmeans_model: Fitted model whose ``predict`` assigns the query to a
            cluster.
        mode: ``"query"`` or ``"full"``. Any other value, or ``"query"``
            without a query, does nothing.
        query: Query string, used in ``"query"`` mode only.
        top_k: Maximum number of records to report in ``"query"`` mode.

    Returns:
        None. All results are printed.
    """
    if mode == "query" and query:
        query_embedding, _ = preprocess_query(query)

        # Predict the cluster for the query
        query_cluster = kmeans_model.predict([query_embedding])[0]
        print(f"Query belongs to cluster: {query_cluster}")

        # Get relevant documents from the predicted cluster
        relevant_indices = [i for i, cluster in enumerate(clusters) if cluster == query_cluster]
        if not relevant_indices:
            print("No documents found in the query's cluster.")
            return

        # Create cluster-specific FAISS index for fast search
        cluster_embeddings = np.array([
            get_document_embedding(
                all_preprocessed_data[i]['normalized_text'],
                all_preprocessed_data[i]['named_entities'],
                all_preprocessed_data[i]['table_context_data']
            )
            for i in relevant_indices
        ])
        # The index dimension follows the embeddings instead of a fixed 768,
        # so a different encoder size does not need a code change.
        cluster_faiss_index = initialize_faiss_index(embedding_dimension=cluster_embeddings.shape[-1])
        add_embeddings_to_faiss(cluster_faiss_index, cluster_embeddings)

        # Perform FAISS search within the cluster to find the top-K similar documents.
        # Asking for more neighbours than the cluster holds only produces padding.
        distances, cluster_specific_indices = search_faiss(
            cluster_faiss_index, query_embedding, top_k=min(top_k, len(relevant_indices))
        )

        # Retrieve and display the top-K results, ranked by similarity (smallest distance = most similar)
        print("\nTop documents ranked by similarity to the query:")
        for rank, i in enumerate(cluster_specific_indices[0]):
            position = int(i)
            # FAISS marks missing neighbours with -1; used as a Python index
            # that would silently select the last record of the cluster.
            # (Assumes search_faiss forwards FAISS's result arrays — TODO confirm.)
            if position < 0 or position >= len(relevant_indices):
                continue
            original_index = relevant_indices[position]
            doc = all_preprocessed_data[original_index]
            distance = distances[0][rank]  # Get the corresponding distance for the document

            print(f"Rank {rank+1}, Document: {doc['file_name']}, Distance: {distance}")
            abstractive_summary = generate_llama_summary_with_context(doc['normalized_text'], doc['named_entities'], doc['table_context_data'])
            print(f"Abstractive Summary: {abstractive_summary}\n")

    elif mode == "full":
        for doc in all_preprocessed_data:
            abstractive_summary = generate_llama_summary_with_context(doc['normalized_text'], doc['named_entities'], doc['table_context_data'])
            print(f"Document: {doc['file_name']}\nAbstractive Summary: {abstractive_summary}\n")


In [None]:
# Preprocess the files
pdf_directory = "path/to/pdf/documents"  # placeholder: point this at the folder that holds the PDFs
all_preprocessed_data = process_files(pdf_directory)

# Stop here with a clear message when nothing was extracted; indexing and
# clustering an empty collection would only fail later with an opaque error.
if not all_preprocessed_data:
    raise RuntimeError(f"No page text could be extracted from the PDFs in {pdf_directory!r}")

# Prepare FAISS index and K-Means clusters
# K-Means cannot form more clusters than there are records.
num_clusters = min(2, len(all_preprocessed_data))
faiss_index, all_document_embeddings, clusters, kmeans_model = prepare_faiss_and_clusters(all_preprocessed_data, num_clusters)

# Run query-based summarization
query = "Patent infringement in intellectual property law"
dynamic_summary_mode(all_preprocessed_data, faiss_index, clusters, kmeans_model, mode="query", query=query, top_k=5)

# Run full-document summarization
dynamic_summary_mode(all_preprocessed_data, faiss_index, clusters, kmeans_model, mode="full")
