## Basic PDF Questioning

In [2]:
import fitz  # PyMuPDF
from transformers import pipeline

# Function to extract text from a PDF
def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = ""
    for page_num in range(doc.page_count):
        page = doc[page_num]
        text += page.get_text()
    return text

# Function to answer questions from the extracted text
def answer_question_from_text(text, question):
    # Load a pre-trained QA model from Hugging Face
    qa_pipeline = pipeline("question-answering", model="distilbert-base-cased-distilled-squad")

    # Use the QA model to find the answer
    answer = qa_pipeline(question=question, context=text)
    return answer['answer']

# Usage example
pdf_path = "D://Cross Search Automation//Previous Cross//VL63425 Spec Sheet.pdf"  # Replace with the path to your PDF file
text = extract_text_from_pdf(pdf_path)

question = "Total Lumens"  # Replace with your question
answer = answer_question_from_text(text, question)

print(f"Answer: {answer}")

Answer: 1270lm
Delivered Lumens


## Improvements:

But the following code is not providing the desirable results for the second pdf.

In [8]:
import fitz  # PyMuPDF
from transformers import pipeline
from difflib import SequenceMatcher

# Function to extract text from a PDF
def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = ""
    for page_num in range(doc.page_count):
        page = doc[page_num]
        text += page.get_text()
    return text

# Function to answer questions from the extracted text
def answer_question_from_text(text, question):
    # Load a pre-trained QA model from Hugging Face
    qa_pipeline = pipeline("question-answering", model="distilbert-base-cased-distilled-squad")

    # Use the QA model to find the answer
    answer = qa_pipeline(question=question, context=text)
    return answer['answer']

# Function to compare two texts and return similarities/differences
def compare_texts(text1, text2):
    # Use SequenceMatcher to compute a similarity ratio
    similarity_ratio = SequenceMatcher(None, text1, text2).ratio()
    return similarity_ratio

# Main function to handle the comparison and QA
if __name__ == "__main__":
    # Paths to your PDF files
    pdf_path1 = 'Linear Emergency Egress Light 1.pdf'   # Replace with the path to the first PDF file
    pdf_path2 = 'ol2 mullion mount.pdf'  # Replace with the path to the second PDF file

    # Extract text from both PDFs
    text1 = extract_text_from_pdf(pdf_path1)
    text2 = extract_text_from_pdf(pdf_path2)

    # Ask a question to one of the PDFs (you can choose which one)
    question = "What is the warranty duration?"  # Replace with your question
    answer_from_pdf1 = answer_question_from_text(text1, question)
    answer_from_pdf2 = answer_question_from_text(text2, question)

    print(f"Answer from PDF 1: {answer_from_pdf1}")
    print(f"Answer from PDF 2: {answer_from_pdf2}")

    # Compare texts from both PDFs
    similarity = compare_texts(text1, text2)
    print(f"Similarity Ratio between PDF 1 and PDF 2: {similarity:.2f}")

    # Optional: Output differences or other comparisons as needed

Answer from PDF 1: 10 year
Answer from PDF 2: 120 minute
Similarity Ratio between PDF 1 and PDF 2: 0.02


Updated code, in leu of improvement.

In [11]:
import fitz  # PyMuPDF
from transformers import pipeline
from difflib import SequenceMatcher
from pdfminer.high_level import extract_text
import pytesseract
from PIL import Image

# Function to extract text from a PDF
def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = ""
    for page_num in range(doc.page_count):
        page = doc[page_num]
        text += page.get_text()
    return text

# Function to extract text using OCR
def extract_text_from_image(pdf_path):
    doc = fitz.open(pdf_path)
    text = ""
    for page_num in range(doc.page_count):
        page = doc[page_num]
        pix = page.get_pixmap()
        img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
        text += pytesseract.image_to_string(img)
    return text

# Function to preprocess text (example: remove extra spaces)
def preprocess_text(text):
    text = ' '.join(text.split())
    return text

# Function to answer questions from the extracted text
def answer_question_from_text(text, question):
    # Load a pre-trained QA model from Hugging Face
    qa_pipeline = pipeline("question-answering", model="distilbert-base-cased-distilled-squad")

    # Use the QA model to find the answer
    answer = qa_pipeline(question=question, context=text)
    return answer['answer']

# Function to compare two texts and return similarities/differences
def compare_texts(text1, text2):
    # Use SequenceMatcher to compute a similarity ratio
    similarity_ratio = SequenceMatcher(None, text1, text2).ratio()
    return similarity_ratio

# Main function to handle the comparison and QA
if __name__ == "__main__":
    # Paths to your PDF files
    pdf_path1 = 'Linear Emergency Egress Light 1.pdf'   # Replace with the path to the first PDF file
    pdf_path2 = 'ol2 mullion mount.pdf'  # Replace with the path to the second PDF file

    # Extract text from both PDFs (consider using OCR if PDFs are scanned)
    text1 = extract_text_from_pdf(pdf_path1)
    text2 = extract_text_from_pdf(pdf_path2)

    # Optionally, if text extraction is not accurate, use OCR
    # text2 = extract_text_from_image(pdf_path2)

    # Preprocess texts
    text1 = preprocess_text(text1)
    text2 = preprocess_text(text2)

    # Ask a question to one of the PDFs (you can choose which one)
    question = "What is the warranty duration?"  # Replace with your question
    answer_from_pdf1 = answer_question_from_text(text1, question)
    answer_from_pdf2 = answer_question_from_text(text2, question)

    print(f"Answer from PDF 1: {answer_from_pdf1}")
    print(f"Answer from PDF 2: {answer_from_pdf2}")

    # Compare texts from both PDFs
    similarity = compare_texts(text1, text2)
    print(f"Similarity Ratio between PDF 1 and PDF 2: {similarity:.2f}")

    # Optional: Output differences or other comparisons as needed

Answer from PDF 1: 10 year
Answer from PDF 2: 120 minute
Similarity Ratio between PDF 1 and PDF 2: 0.02


The code is still not working as per requirements. Another attempt is made below.

In [2]:
import fitz  # PyMuPDF
from transformers import pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re

# Function to extract text from a PDF
def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = ""
    for page_num in range(doc.page_count):
        page = doc[page_num]
        text += page.get_text()
    return text

# Function to preprocess text (example: remove extra spaces and newlines)
def preprocess_text(text):
    text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces with a single space
    text = re.sub(r'\n+', ' ', text)  # Replace newlines with spaces
    text = text.strip()  # Remove leading and trailing spaces
    return text

# Function to answer questions from the extracted text
def answer_question_from_text(text, question):
    # Load a pre-trained QA model from Hugging Face
    qa_pipeline = pipeline("question-answering", model="distilbert-base-cased-distilled-squad")

    # Use the QA model to find the answer
    answer = qa_pipeline(question=question, context=text)
    return answer['answer']

# Function to compare two texts and return similarity using cosine similarity
def compare_texts(text1, text2):
    vectorizer = TfidfVectorizer().fit_transform([text1, text2])
    vectors = vectorizer.toarray()
    similarity = cosine_similarity([vectors[0]], [vectors[1]])[0][0]
    return similarity

# Main function to handle the comparison and QA
if __name__ == "__main__":
    # Paths to your PDF files
    pdf_path1 = "D://Cross Search Automation//Previous Cross//IKIO Lights//Delphi_PL_2x4FT_504030W_504035K_DLC_TDS.pdf"   # Replace with the path to the first PDF file
    pdf_path2 = "D://Cross Search Automation//Previous Cross//IKIO Lights//Delphi_PL_2x2FT_403020W_504035K_DLC_TDS.pdf"  # Replace with the path to the second PDF file

    # Extract text from both PDFs
    text1 = extract_text_from_pdf(pdf_path1)
    text2 = extract_text_from_pdf(pdf_path2)

    # Preprocess texts
    text1 = preprocess_text(text1)
    text2 = preprocess_text(text2)

    # Ask a question to one of the PDFs (you can choose which one)
    question = "Wattage?"  # Replace with your question
    answer_from_pdf1 = answer_question_from_text(text1, question)
    answer_from_pdf2 = answer_question_from_text(text2, question)

    print(f"Answer from PDF 1: {answer_from_pdf1}")
    print(f"Answer from PDF 2: {answer_from_pdf2}")

    # Compare texts from both PDFs using cosine similarity
    similarity = compare_texts(text1, text2)
    print(f"Cosine Similarity between PDF 1 and PDF 2: {similarity:.2f}")

Answer from PDF 1: 662187549422
Answer from PDF 2: +1 844-533-4546
Cosine Similarity between PDF 1 and PDF 2: 0.90


## Backbone 1: Now, an attempt to find similar pdf in folder based on text

In [1]:
import fitz  # PyMuPDF
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
import os

# Function to extract text from a PDF
def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = ""
    for page_num in range(doc.page_count):
        page = doc[page_num]
        text += page.get_text()
    return text

# Function to preprocess text (example: remove extra spaces and newlines)
def preprocess_text(text):
    text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces with a single space
    text = re.sub(r'\n+', ' ', text)  # Replace newlines with spaces
    text = text.strip()  # Remove leading and trailing spaces
    return text

# Function to compare two texts and return similarity using cosine similarity
def compare_texts(text1, text2):
    vectorizer = TfidfVectorizer().fit_transform([text1, text2])
    vectors = vectorizer.toarray()
    similarity = cosine_similarity([vectors[0]], [vectors[1]])[0][0]
    return similarity

# Function to find the most similar PDF in a folder
def find_most_similar_pdf(input_pdf_path, folder_path):
    # Extract text from the input PDF
    input_text = preprocess_text(extract_text_from_pdf(input_pdf_path))

    # Initialize variables to track the most similar PDF
    max_similarity = -1
    most_similar_pdf = None

    # Iterate over each PDF in the folder
    for filename in os.listdir(folder_path):
        if filename.endswith('.pdf'):
            pdf_path = os.path.join(folder_path, filename)
            # Extract text from the current PDF
            folder_pdf_text = preprocess_text(extract_text_from_pdf(pdf_path))
            # Compute similarity
            similarity = compare_texts(input_text, folder_pdf_text)
            # Update most similar PDF if needed
            if similarity > max_similarity:
                max_similarity = similarity
                most_similar_pdf = filename

    return most_similar_pdf, max_similarity

# Main function to handle the PDF similarity search
if __name__ == "__main__":
    # Path to the input PDF and folder containing other PDFs
    input_pdf_path = "D://Cross Search Automation//Previous Cross//Vendor Lights//lbk-configurable.pdf"  # Replace with the path to the input PDF
    folder_path = "D://Cross Search Automation//Previous Cross//IKIO Lights"  # Replace with the path to the folder containing PDFs

    # Find the most similar PDF
    most_similar_pdf, similarity = find_most_similar_pdf(input_pdf_path, folder_path)
    
    if most_similar_pdf:
        print(f"The most similar PDF is: {most_similar_pdf} with a similarity score of {similarity:.2f}")
    else:
        print("No similar PDF found.")

The most similar PDF is: Delphi_FPCL_PS.pdf with a similarity score of 0.30


In [52]:
from sentence_transformers import SentenceTransformer, util
import fitz  # PyMuPDF
import os
import re

# Initialize the Sentence-BERT model
model = SentenceTransformer('all-MiniLM-L6-v2')  # Use a lightweight model for fast performance

# Function to extract text from a PDF
def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = ""
    for page_num in range(doc.page_count):
        page = doc[page_num]
        text += page.get_text()
    return text

# Function to preprocess text (example: remove extra spaces and newlines)
def preprocess_text(text):
    text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces with a single space
    text = re.sub(r'\n+', ' ', text)  # Replace newlines with spaces
    text = text.strip()  # Remove leading and trailing spaces
    return text

# Function to find the most similar PDF in a folder using Sentence-BERT embeddings
def find_most_similar_pdf(input_pdf_path, folder_path):
    # Extract and preprocess text from the input PDF
    input_text = preprocess_text(extract_text_from_pdf(input_pdf_path))
    input_embedding = model.encode(input_text, convert_to_tensor=True)

    # Initialize variables to track the most similar PDF
    max_similarity = -1
    most_similar_pdf = None

    # Iterate over each PDF in the folder
    for filename in os.listdir(folder_path):
        if filename.endswith('.pdf'):
            pdf_path = os.path.join(folder_path, filename)
            # Extract and preprocess text from the current PDF
            folder_pdf_text = preprocess_text(extract_text_from_pdf(pdf_path))
            folder_pdf_embedding = model.encode(folder_pdf_text, convert_to_tensor=True)
            
            # Compute cosine similarity using Sentence-BERT
            similarity = util.pytorch_cos_sim(input_embedding, folder_pdf_embedding).item()

            # Update most similar PDF if needed
            if similarity > max_similarity:
                max_similarity = similarity
                most_similar_pdf = filename

    return most_similar_pdf, max_similarity

# Main function to handle the PDF similarity search
if __name__ == "__main__":
    # Path to the input PDF and folder containing other PDFs
    input_pdf_path = "D://Cross Search Automation//Previous Cross//Vendor Lights//configurable-cpx.pdf"  # Replace with the path to the input PDF
    folder_path = "D://Cross Search Automation//Previous Cross//IKIO Lights"  # Replace with the path to the folder containing PDFs

    # Find the most similar PDF
    most_similar_pdf, similarity = find_most_similar_pdf(input_pdf_path, folder_path)
    
    if most_similar_pdf:
        print(f"The most similar PDF is: {most_similar_pdf} with a similarity score of {similarity:.2f}")
    else:
        print("No similar PDF found.")

The most similar PDF is: Delphi_BLPL_PS.pdf with a similarity score of 0.57


In [51]:
from transformers import BertTokenizer, BertModel
import torch
import fitz  # PyMuPDF
import os
import re
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Initialize the BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Function to extract text from a PDF
def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = ""
    for page_num in range(doc.page_count):
        page = doc[page_num]
        text += page.get_text()
    return text

# Function to preprocess text (example: remove extra spaces and newlines)
def preprocess_text(text):
    text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces with a single space
    text = re.sub(r'\n+', ' ', text)  # Replace newlines with spaces
    text = text.strip()  # Remove leading and trailing spaces
    return text

# Function to encode text using BERT model
def encode_text(text):
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    # Use the [CLS] token's embedding as the sentence representation
    return outputs.last_hidden_state[:, 0, :].numpy()

# Function to find the most similar PDF in a folder using BERT embeddings
def find_most_similar_pdf(input_pdf_path, folder_path):
    # Extract and preprocess text from the input PDF
    input_text = preprocess_text(extract_text_from_pdf(input_pdf_path))
    input_embedding = encode_text(input_text)

    # Initialize variables to track the most similar PDF
    max_similarity = -1
    most_similar_pdf = None

    # Preprocess all PDFs in the folder and batch encode
    pdf_embeddings = {}
    pdf_files = [f for f in os.listdir(folder_path) if f.endswith('.pdf')]
    
    for filename in pdf_files:
        pdf_path = os.path.join(folder_path, filename)
        folder_pdf_text = preprocess_text(extract_text_from_pdf(pdf_path))
        pdf_embeddings[filename] = encode_text(folder_pdf_text)

    # Compute cosine similarity for each PDF against the input PDF
    for filename, folder_pdf_embedding in pdf_embeddings.items():
        similarity = cosine_similarity(input_embedding, folder_pdf_embedding).item()
        if similarity > max_similarity:
            max_similarity = similarity
            most_similar_pdf = filename

    return most_similar_pdf, max_similarity

# Main function to handle the PDF similarity search
if __name__ == "__main__":
    # Path to the input PDF and folder containing other PDFs
    input_pdf_path = "D://Cross Search Automation//Previous Cross//Vendor Lights//lbk-configurable.pdf"  # Replace with the path to the input PDF
    folder_path = "D://Cross Search Automation//Previous Cross//IKIO Lights"  # Replace with the path to the folder containing PDFs

    # Find the most similar PDF
    most_similar_pdf, similarity = find_most_similar_pdf(input_pdf_path, folder_path)
    
    if most_similar_pdf:
        print(f"The most similar PDF is: {most_similar_pdf} with a similarity score of {similarity:.2f}")
    else:
        print("No similar PDF found.")

The most similar PDF is: Sigma_EMTUBE_TypeB_PS.pdf with a similarity score of 0.89


In [50]:
from transformers import BertTokenizer, BertModel, pipeline
from sentence_transformers import SentenceTransformer
import torch
import fitz  # PyMuPDF
import os
import re
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict

# Initialize the BERT model for NER and Sentence-BERT for embeddings
ner_model = pipeline("ner", model="dslim/bert-base-NER")
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# Function to extract text from a PDF
def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text_blocks = []
    for page_num in range(doc.page_count):
        page = doc[page_num]
        text_blocks.extend(page.get_text("blocks"))  # Extract text by blocks (headings, paragraphs, etc.)
    return text_blocks

# Function to preprocess text
def preprocess_text(text):
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\n+', ' ', text)
    text = text.strip()
    return text

# Function to categorize and extract key entities from text
def extract_key_sections(text_blocks):
    sections = {'headlines': [], 'specifications': []}
    for block in text_blocks:
        text = block[4]
        text = preprocess_text(text)
        # Using heuristic: short and uppercase texts as headlines
        if len(text.split()) <= 10 and text.isupper():
            sections['headlines'].append(text)
        elif len(text.split()) > 10:
            # Apply NER for extracting specifications
            entities = ner_model(text)
            specifications = [entity['word'] for entity in entities if entity['entity'].startswith("B-")]  # Get only Beginning of entities
            sections['specifications'].extend(specifications)
    return sections

# Function to encode sections into embeddings
def encode_sections(sections):
    all_text = sections['headlines'] + sections['specifications']
    embeddings = embedding_model.encode(all_text)
    return embeddings

# Function to find the most similar PDF in a folder using enhanced embeddings
def find_most_similar_pdf(input_pdf_path, folder_path):
    # Extract and preprocess text from the input PDF
    input_text_blocks = extract_text_from_pdf(input_pdf_path)
    input_sections = extract_key_sections(input_text_blocks)
    input_embedding = encode_sections(input_sections)

    # Initialize variables to track the most similar PDF
    max_similarity = -1
    most_similar_pdf = None

    # Preprocess all PDFs in the folder and compute embeddings
    pdf_embeddings = {}
    pdf_files = [f for f in os.listdir(folder_path) if f.endswith('.pdf')]
    
    for filename in pdf_files:
        pdf_path = os.path.join(folder_path, filename)
        folder_pdf_text_blocks = extract_text_from_pdf(pdf_path)
        folder_pdf_sections = extract_key_sections(folder_pdf_text_blocks)
        folder_pdf_embedding = encode_sections(folder_pdf_sections)
        pdf_embeddings[filename] = folder_pdf_embedding

    # Compute cosine similarity for each PDF against the input PDF
    for filename, folder_pdf_embedding in pdf_embeddings.items():
        similarity = cosine_similarity([input_embedding.mean(axis=0)], [folder_pdf_embedding.mean(axis=0)]).item()  # Mean pooling to get document-level similarity
        if similarity > max_similarity:
            max_similarity = similarity
            most_similar_pdf = filename

    return most_similar_pdf, max_similarity

# Main function to handle the PDF similarity search
if __name__ == "__main__":
    # Path to the input PDF and folder containing other PDFs
    input_pdf_path = "D://Cross Search Automation//Previous Cross//Vendor Lights//configurable-cpx.pdf"  # Replace with the path to the input PDF
    folder_path = "D://Cross Search Automation//Previous Cross//IKIO Lights"  # Replace with the path to the folder containing PDFs

    # Find the most similar PDF
    most_similar_pdf, similarity = find_most_similar_pdf(input_pdf_path, folder_path)
    
    if most_similar_pdf:
        print(f"The most similar PDF is: {most_similar_pdf} with a similarity score of {similarity:.2f}")
    else:
        print("No similar PDF found.")




Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


The most similar PDF is: Sigma_EMTUBE_TypeB_PS.pdf with a similarity score of 0.87


The final most similar pdf on the basis of text is now here....

In [58]:
from sentence_transformers import SentenceTransformer, util
import fitz  # PyMuPDF
import os
import re
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from concurrent.futures import ThreadPoolExecutor
from nltk import ngrams
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import nltk

# Download necessary NLTK data
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')

# Initialize the Sentence-BERT model once at the start
model = SentenceTransformer('all-MiniLM-L6-v2')

lemmatizer = WordNetLemmatizer()

# Function to extract text from a PDF
def extract_text_from_pdf(pdf_path):
    with fitz.open(pdf_path) as doc:
        text = ""
        for page_num in range(doc.page_count):
            text += doc[page_num].get_text()
    return text

# Function to preprocess text
def preprocess_text(text):
    # Remove punctuation, convert to lowercase
    text = re.sub(r'[^\w\s]', '', text.lower())
    
    # Remove extra spaces and newlines
    text = re.sub(r'\s+', ' ', text)
    
    # Tokenize and remove stopwords
    tokens = [word for word in word_tokenize(text) if word not in ENGLISH_STOP_WORDS]
    
    # Apply lemmatization
    lemmatized_text = ' '.join([lemmatizer.lemmatize(token) for token in tokens])
    
    return lemmatized_text.strip()

# Function to generate n-grams from text
def generate_ngrams(text, n=2):
    tokens = word_tokenize(text)
    ngrams_list = list(ngrams(tokens, n))
    return [' '.join(ngram) for ngram in ngrams_list]

# Function to compute embeddings
def compute_embedding(text):
    # Split text into smaller chunks for more granular embeddings
    sentences = text.split('. ')
    embeddings = model.encode(sentences, convert_to_tensor=True)
    return embeddings

# Function to compute similarity score between two sets of embeddings
def compute_similarity(embedding1, embedding2):
    # Compute cosine similarity
    cosine_sim = util.cos_sim(embedding1, embedding2)
    return cosine_sim.max().item()  # Use max similarity across chunks

# Function to process a single PDF file and calculate its similarity score
def process_pdf(file_info):
    input_embedding, input_pdf_path, pdf_path = file_info
    folder_pdf_text = preprocess_text(extract_text_from_pdf(pdf_path))
    
    # Enrich text with bigrams
    bigrams = generate_ngrams(folder_pdf_text, 2)
    enriched_text = ' '.join([folder_pdf_text] + bigrams)  # Concatenate original text with bigrams
    
    folder_pdf_embedding = compute_embedding(enriched_text)
    
    # Compute similarity score
    similarity = compute_similarity(input_embedding, folder_pdf_embedding)
    return (pdf_path, similarity)

# Function to find the most similar PDF in a folder using Sentence-BERT embeddings
def find_most_similar_pdf(input_pdf_path, folder_path):
    # Extract and preprocess text from the input PDF
    input_text = preprocess_text(extract_text_from_pdf(input_pdf_path))
    
    # Enrich text with bigrams
    bigrams = generate_ngrams(input_text, 2)
    enriched_text = ' '.join([input_text] + bigrams)  # Concatenate original text with bigrams
    
    input_embedding = compute_embedding(enriched_text)

    # List all PDF files in the folder
    pdf_files = [os.path.join(folder_path, f) for f in os.listdir(folder_path) if f.endswith('.pdf')]

    # Use ThreadPoolExecutor for parallel processing
    max_similarity = -1
    most_similar_pdf = None
    with ThreadPoolExecutor() as executor:
        results = executor.map(process_pdf, [(input_embedding, input_pdf_path, pdf_path) for pdf_path in pdf_files])

    # Process results to find the most similar PDF
    for pdf_path, similarity in results:
        if similarity > max_similarity:
            max_similarity = similarity
            most_similar_pdf = os.path.basename(pdf_path)

    return most_similar_pdf, max_similarity

# Main function to handle the PDF similarity search
if __name__ == "__main__":
    # Path to the input PDF and folder containing other PDFs
    input_pdf_path = "D://Cross Search Automation//Previous Cross//Vendor Lights//lbk-configurable.pdf"  # Replace with the path to the input PDF
    folder_path = "D://Cross Search Automation//Previous Cross//IKIO Lights"  # Replace with the path to the folder containing PDFs

    # Find the most similar PDF
    most_similar_pdf, similarity = find_most_similar_pdf(input_pdf_path, folder_path)

    if most_similar_pdf:
        print(f"The most similar PDF is: {most_similar_pdf} with a similarity score of {similarity:.2f}")
    else:
        print("No similar PDF found.")

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


The most similar PDF is: T8_Tube_Type_C_PS.pdf with a similarity score of 0.82


The results are really appreciable!!!!

## Searching Image in PDF

In [5]:
import fitz  # PyMuPDF
from PIL import Image
import numpy as np
import torch
from torchvision import models, transforms
import os
import hashlib

# Initialize image model and transform
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = models.resnet50(pretrained=True).to(device)
model.eval()
preprocess = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Function to compute image hash
def hash_image(image_path):
    with Image.open(image_path) as img:
        hash_value = hashlib.md5(img.tobytes()).hexdigest()
    return hash_value

# Function to extract image features
def extract_image_features(image_path):
    image = Image.open(image_path)
    image = preprocess(image).unsqueeze(0).to(device)
    with torch.no_grad():
        features = model(image)
    return features.cpu().numpy().flatten()

# Function to extract images from a PDF and save them
def extract_images_from_pdf(pdf_path, image_folder):
    doc = fitz.open(pdf_path)
    image_paths = []
    for i, page in enumerate(doc):
        images = page.get_images(full=True)
        for img_index, img in enumerate(images):
            xref = img[0]
            base_image = doc.extract_image(xref)
            image_data = base_image["image"]
            image_path = os.path.join(image_folder, f"page_{i}_img_{img_index}.png")
            with open(image_path, "wb") as f:
                f.write(image_data)
            image_paths.append(image_path)
    return image_paths

# Function to find the PDF with an image similar to or exactly the same as the input image
def find_pdf_with_similar_image(input_image_path, folder_path, image_folder, threshold=0.9):
    input_image_hash = hash_image(input_image_path)
    input_features = extract_image_features(input_image_path)
    
    most_similar_pdf = None
    max_similarity = -1
    exact_match_pdf = None

    for filename in os.listdir(folder_path):
        if filename.endswith('.pdf'):
            pdf_path = os.path.join(folder_path, filename)
            pdf_images = extract_images_from_pdf(pdf_path, image_folder)

            for pdf_image in pdf_images:
                pdf_image_hash = hash_image(pdf_image)
                if pdf_image_hash == input_image_hash:
                    exact_match_pdf = filename
                    break

                pdf_image_features = extract_image_features(pdf_image)
                similarity = cosine_similarity([input_features], [pdf_image_features])[0][0]
                if similarity > max_similarity:
                    max_similarity = similarity
                    most_similar_pdf = filename
            
            if exact_match_pdf:
                break
    
    return exact_match_pdf, most_similar_pdf, max_similarity

# Main function to handle the image similarity search
if __name__ == "__main__":
    # Paths to your input image and folder containing other PDFs
    input_image_path = "D://Cross Search Automation//Previous Cross//Capture.JPG"  # Replace with the path to the input image
    folder_path = "D://Cross Search Automation//Previous Cross"  # Replace with the path to the folder containing PDFs
    image_folder = "D://Cross Search Automation//Previous Cross"  # Replace with the path to a folder for saving extracted images

    # Find the PDF with an image similar to or exactly the same as the input image
    exact_match_pdf, most_similar_pdf, similarity = find_pdf_with_similar_image(input_image_path, folder_path, image_folder)
    
    if exact_match_pdf:
        print(f"Exact match found in PDF: {exact_match_pdf}")
    else:
        print(f"No exact match found. Most similar PDF is: {most_similar_pdf} with a similarity score of {similarity:.2f}")

NameError: name 'cosine_similarity' is not defined

RGB error

In [7]:
import fitz  # PyMuPDF
from PIL import Image
import numpy as np
import torch
from torchvision import models, transforms
import os
import hashlib
from sklearn.metrics.pairwise import cosine_similarity  # Import for cosine similarity

# Initialize image model and transform
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = models.resnet50(pretrained=True).to(device)
model.eval()
preprocess = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Function to compute image hash
def hash_image(image_path):
    with Image.open(image_path) as img:
        hash_value = hashlib.md5(img.tobytes()).hexdigest()
    return hash_value

# Function to convert image to RGB if it's not
def convert_to_rgb(image):
    if image.mode != 'RGB':
        image = image.convert('RGB')
    return image

# Function to extract image features
def extract_image_features(image_path):
    image = Image.open(image_path)
    image = convert_to_rgb(image)  # Ensure image is in RGB format
    image = preprocess(image).unsqueeze(0).to(device)
    with torch.no_grad():
        features = model(image)
    return features.cpu().numpy().flatten()

# Function to extract images from a PDF and save them
def extract_images_from_pdf(pdf_path, image_folder):
    doc = fitz.open(pdf_path)
    image_paths = []
    for i, page in enumerate(doc):
        images = page.get_images(full=True)
        for img_index, img in enumerate(images):
            xref = img[0]
            base_image = doc.extract_image(xref)
            image_data = base_image["image"]
            image_path = os.path.join(image_folder, f"page_{i}_img_{img_index}.png")
            with open(image_path, "wb") as f:
                f.write(image_data)
            image_paths.append(image_path)
    return image_paths

# Function to find the PDF with an image similar to or exactly the same as the input image
def find_pdf_with_similar_image(input_image_path, folder_path, image_folder, threshold=0.9):
    input_image_hash = hash_image(input_image_path)
    input_features = extract_image_features(input_image_path)
    
    most_similar_pdf = None
    max_similarity = -1
    exact_match_pdf = None

    for filename in os.listdir(folder_path):
        if filename.endswith('.pdf'):
            pdf_path = os.path.join(folder_path, filename)
            pdf_images = extract_images_from_pdf(pdf_path, image_folder)

            for pdf_image in pdf_images:
                pdf_image_hash = hash_image(pdf_image)
                if pdf_image_hash == input_image_hash:
                    exact_match_pdf = filename
                    break

                pdf_image_features = extract_image_features(pdf_image)
                similarity = cosine_similarity([input_features], [pdf_image_features])[0][0]
                if similarity > max_similarity:
                    max_similarity = similarity
                    most_similar_pdf = filename
            
            if exact_match_pdf:
                break
    
    return exact_match_pdf, most_similar_pdf, max_similarity

# Main function to handle the image similarity search
if __name__ == "__main__":
    # Paths to your input image and folder containing other PDFs
    input_image_path = "D://Cross Search Automation//Previous Cross//Capture.JPG"  # Replace with the path to the input image
    folder_path = "D://Cross Search Automation//Previous Cross"  # Replace with the path to the folder containing PDFs
    image_folder = "D://Cross Search Automation//Previous Cross"  # Replace with the path to a folder for saving extracted images

    # Find the PDF with an image similar to or exactly the same as the input image
    exact_match_pdf, most_similar_pdf, similarity = find_pdf_with_similar_image(input_image_path, folder_path, image_folder)
    
    if exact_match_pdf:
        print(f"Exact match found in PDF: {exact_match_pdf}")
    else:
        print(f"Most similar PDF is: {most_similar_pdf} with a similarity score of {similarity:.2f}")



Most similar PDF is: Spec for LED Panel Light with Back-lite[1].pdf with a similarity score of 0.86


## Now editing to check the image similarities

### Test Run 1

In [3]:
import fitz  # PyMuPDF
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
import os
from PIL import Image
import numpy as np
import torch
from torchvision import models, transforms

# Initialize image model and transform
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = models.resnet50(pretrained=True).to(device)
model.eval()
preprocess = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Function to extract text from a PDF
def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = ""
    for page_num in range(doc.page_count):
        page = doc[page_num]
        text += page.get_text()
    return text

# Function to preprocess text (example: remove extra spaces and newlines)
def preprocess_text(text):
    text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces with a single space
    text = re.sub(r'\n+', ' ', text)  # Replace newlines with spaces
    text = text.strip()  # Remove leading and trailing spaces
    return text

# Function to compare two texts and return similarity using cosine similarity
def compare_texts(text1, text2):
    vectorizer = TfidfVectorizer().fit_transform([text1, text2])
    vectors = vectorizer.toarray()
    similarity = cosine_similarity([vectors[0]], [vectors[1]])[0][0]
    return similarity

# Function to extract image features
def extract_image_features(image_path):
    image = Image.open(image_path)
    image = preprocess(image).unsqueeze(0).to(device)
    with torch.no_grad():
        features = model(image)
    return features.cpu().numpy().flatten()

# Function to extract images from a PDF and save them
def extract_images_from_pdf(pdf_path, image_folder):
    doc = fitz.open(pdf_path)
    image_paths = []
    for i, page in enumerate(doc):
        images = page.get_images(full=True)
        for img_index, img in enumerate(images):
            xref = img[0]
            base_image = doc.extract_image(xref)
            image_data = base_image["image"]
            image_path = os.path.join(image_folder, f"page_{i}_img_{img_index}.png")
            with open(image_path, "wb") as f:
                f.write(image_data)
            image_paths.append(image_path)
    return image_paths

# Function to find the most similar PDF based on image similarity
def find_most_similar_pdf_image(input_image_path, folder_path, image_folder):
    input_features = extract_image_features(input_image_path)
    
    # Initialize variables to track the most similar PDF
    max_similarity = -1
    most_similar_pdf = None
    
    for filename in os.listdir(folder_path):
        if filename.endswith('.pdf'):
            pdf_path = os.path.join(folder_path, filename)
            
            # Extract images from the PDF
            pdf_images = extract_images_from_pdf(pdf_path, image_folder)
            
            for pdf_image in pdf_images:
                pdf_image_features = extract_image_features(pdf_image)
                similarity = cosine_similarity([input_features], [pdf_image_features])[0][0]
                if similarity > max_similarity:
                    max_similarity = similarity
                    most_similar_pdf = filename

    return most_similar_pdf, max_similarity

# Function to find the most similar PDF based on text similarity
def find_most_similar_pdf_text(input_pdf_path, folder_path):
    # Extract text from the input PDF
    input_text = preprocess_text(extract_text_from_pdf(input_pdf_path))
    
    # Initialize variables to track the most similar PDF
    max_similarity = -1
    most_similar_pdf = None
    
    for filename in os.listdir(folder_path):
        if filename.endswith('.pdf'):
            pdf_path = os.path.join(folder_path, filename)
            folder_pdf_text = preprocess_text(extract_text_from_pdf(pdf_path))
            similarity = compare_texts(input_text, folder_pdf_text)
            if similarity > max_similarity:
                max_similarity = similarity
                most_similar_pdf = filename

    return most_similar_pdf, max_similarity

# Main function to handle the PDF similarity search
if __name__ == "__main__":
    # Paths to your input files and folder containing other PDFs
    input_pdf_path = "D://Cross Search Automation//Previous Cross//ol2 mullion mount.pdf"  # Replace with the path to the input PDF
    input_image_path = "D://Cross Search Automation//Previous Cross//Capture.JPG"  # Replace with the path to the input image
    folder_path = "D://Cross Search Automation//Previous Cross//Test1"  # Replace with the path to the folder containing PDFs
    image_folder = "D://Cross Search Automation//Previous Cross//save"  # Replace with the path to a folder for saving extracted images

    # Find the most similar PDF based on text
    most_similar_pdf_text, text_similarity = find_most_similar_pdf_text(input_pdf_path, folder_path)
    
    # Find the most similar PDF based on image
    most_similar_pdf_image, image_similarity = find_most_similar_pdf_image(input_image_path, folder_path, image_folder)
    
    # Print results
    print(f"Most similar PDF based on text: {most_similar_pdf_text} with a similarity score of {text_similarity:.2f}")
    print(f"Most similar PDF based on image: {most_similar_pdf_image} with a similarity score of {image_similarity:.2f}")



Most similar PDF based on text: mount.pdf with a similarity score of 1.00
Most similar PDF based on image: mount.pdf with a similarity score of 0.72


### Test Run 2

In [6]:
import fitz  # PyMuPDF
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
import os
from PIL import Image
import numpy as np
import torch
from torchvision.models import resnet50, ResNet50_Weights

# Initialize image model and transform
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
weights = ResNet50_Weights.IMAGENET1K_V1  # Use specific weights or ResNet50_Weights.DEFAULT for the latest
model = resnet50(weights=weights).to(device)
model.eval()
preprocess = weights.transforms()  # Use the recommended transforms for the weights

# Function to extract text from a PDF
def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = ""
    for page_num in range(doc.page_count):
        page = doc[page_num]
        text += page.get_text()
    return text

# Function to preprocess text (example: remove extra spaces and newlines)
def preprocess_text(text):
    text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces with a single space
    text = re.sub(r'\n+', ' ', text)  # Replace newlines with spaces
    text = text.strip()  # Remove leading and trailing spaces
    return text

# Function to compare two texts and return similarity using cosine similarity
def compare_texts(text1, text2):
    vectorizer = TfidfVectorizer().fit_transform([text1, text2])
    vectors = vectorizer.toarray()
    similarity = cosine_similarity([vectors[0]], [vectors[1]])[0][0]
    return similarity

# Function to extract image features
def extract_image_features(image_path):
    image = Image.open(image_path)
    image = preprocess(image).unsqueeze(0).to(device)
    with torch.no_grad():
        features = model(image)
    return features.cpu().numpy().flatten()

# Function to extract images from a PDF and save them
def extract_images_from_pdf(pdf_path, image_folder):
    doc = fitz.open(pdf_path)
    image_paths = []
    for i, page in enumerate(doc):
        images = page.get_images(full=True)
        for img_index, img in enumerate(images):
            xref = img[0]
            base_image = doc.extract_image(xref)
            image_data = base_image["image"]
            image_path = os.path.join(image_folder, f"page_{i}_img_{img_index}.png")
            with open(image_path, "wb") as f:
                f.write(image_data)
            image_paths.append(image_path)
    return image_paths

# Function to find the most similar PDF based on image similarity
def find_most_similar_pdf_image(input_image_path, folder_path, image_folder):
    input_features = extract_image_features(input_image_path)
    
    # Initialize variables to track the most similar PDF
    max_similarity = -1
    most_similar_pdf = None
    
    for filename in os.listdir(folder_path):
        if filename.endswith('.pdf'):
            pdf_path = os.path.join(folder_path, filename)
            
            # Extract images from the PDF
            pdf_images = extract_images_from_pdf(pdf_path, image_folder)
            
            for pdf_image in pdf_images:
                pdf_image_features = extract_image_features(pdf_image)
                similarity = cosine_similarity([input_features], [pdf_image_features])[0][0]
                if similarity > max_similarity:
                    max_similarity = similarity
                    most_similar_pdf = filename

    return most_similar_pdf, max_similarity

# Function to find the most similar PDF based on text similarity
def find_most_similar_pdf_text(input_pdf_path, folder_path):
    # Extract text from the input PDF
    input_text = preprocess_text(extract_text_from_pdf(input_pdf_path))
    
    # Initialize variables to track the most similar PDF
    max_similarity = -1
    most_similar_pdf = None
    
    for filename in os.listdir(folder_path):
        if filename.endswith('.pdf'):
            pdf_path = os.path.join(folder_path, filename)
            folder_pdf_text = preprocess_text(extract_text_from_pdf(pdf_path))
            similarity = compare_texts(input_text, folder_pdf_text)
            if similarity > max_similarity:
                max_similarity = similarity
                most_similar_pdf = filename

    return most_similar_pdf, max_similarity

# Main function to handle the PDF similarity search
if __name__ == "__main__":
    # Paths to your input files and folder containing other PDFs
    input_pdf_path = "D://Cross Search Automation//Previous Cross//Linear Emergency Egress Light 1.pdf"  # Replace with the path to the input PDF
    input_image_path = "D://Cross Search Automation//Previous Cross//Capture2.JPG"  # Replace with the path to the input image
    folder_path = "D://Cross Search Automation//Previous Cross//Test1"  # Replace with the path to the folder containing PDFs
    image_folder = "D://Cross Search Automation//Previous Cross//save"  # Replace with the path to a folder for saving extracted images

    # Find the most similar PDF based on text
    most_similar_pdf_text, text_similarity = find_most_similar_pdf_text(input_pdf_path, folder_path)
    
    # Find the most similar PDF based on image
    most_similar_pdf_image, image_similarity = find_most_similar_pdf_image(input_image_path, folder_path, image_folder)
    
    # Print results
    print(f"Most similar PDF based on text: {most_similar_pdf_text} with a similarity score of {text_similarity:.2f}")
    print(f"Most similar PDF based on image: {most_similar_pdf_image} with a similarity score of {image_similarity:.2f}")

Most similar PDF based on text: Egress.pdf with a similarity score of 1.00
Most similar PDF based on image: Egress.pdf with a similarity score of 0.95


### Test 3 with a slight change in the code.

Adding the following:

1. Ensured Correct Image Processing: The extract_image_features function now converts images to RGB format to ensure they are correctly processed.
2. Normalized Image Features: Feature vectors are normalized before comparison to provide more accurate similarity measurements.
3. Accurate Feature Comparison: The code compares all images extracted from each PDF to find the most similar one.
4. Improved Consistency in Preprocessing: The preprocessing function is now more consistent and uses recommended transforms for the pretrained model.

In [18]:
import fitz  # PyMuPDF
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
import os
from PIL import Image
import numpy as np
import torch
from torchvision.models import resnet50, ResNet50_Weights
from sklearn.preprocessing import normalize

# Initialize image model and transform
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
weights = ResNet50_Weights.IMAGENET1K_V1  # Use specific weights or ResNet50_Weights.DEFAULT for the latest
model = resnet50(weights=weights).to(device)
model.eval()
preprocess = weights.transforms()  # Use the recommended transforms for the weights

# Function to extract text from a PDF
def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = ""
    for page_num in range(doc.page_count):
        page = doc[page_num]
        text += page.get_text()
    return text

# Function to preprocess text (example: remove extra spaces and newlines)
def preprocess_text(text):
    text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces with a single space
    text = re.sub(r'\n+', ' ', text)  # Replace newlines with spaces
    text = text.strip()  # Remove leading and trailing spaces
    return text

# Function to compare two texts and return similarity using cosine similarity
def compare_texts(text1, text2):
    vectorizer = TfidfVectorizer().fit_transform([text1, text2])
    vectors = vectorizer.toarray()
    similarity = cosine_similarity([vectors[0]], [vectors[1]])[0][0]
    return similarity

# Function to extract and normalize image features
def extract_image_features(image_path):
    image = Image.open(image_path).convert('RGB')  # Ensure image is in RGB format
    image = preprocess(image).unsqueeze(0).to(device)  # Apply preprocessing transform
    with torch.no_grad():
        features = model(image)
    features = features.cpu().numpy().flatten()
    normalized_features = normalize([features])[0]  # Normalize the features
    return normalized_features

# Function to extract images from a PDF and save them
def extract_images_from_pdf(pdf_path, image_folder):
    doc = fitz.open(pdf_path)
    image_paths = []
    for i, page in enumerate(doc):
        images = page.get_images(full=True)
        for img_index, img in enumerate(images):
            xref = img[0]
            base_image = doc.extract_image(xref)
            image_data = base_image["image"]
            image_path = os.path.join(image_folder, f"page_{i}_img_{img_index}.png")
            with open(image_path, "wb") as f:
                f.write(image_data)
            image_paths.append(image_path)
    return image_paths

# Function to find the most similar PDF based on image similarity
def find_most_similar_pdf_image(input_image_path, folder_path, image_folder):
    input_features = extract_image_features(input_image_path)
    
    # Initialize variables to track the most similar PDF
    max_similarity = -1
    most_similar_pdf = None
    
    for filename in os.listdir(folder_path):
        if filename.endswith('.pdf'):
            pdf_path = os.path.join(folder_path, filename)
            
            # Extract images from the PDF
            pdf_images = extract_images_from_pdf(pdf_path, image_folder)
            
            for pdf_image in pdf_images:
                pdf_image_features = extract_image_features(pdf_image)
                similarity = cosine_similarity([input_features], [pdf_image_features])[0][0]
                if similarity > max_similarity:
                    max_similarity = similarity
                    most_similar_pdf = filename

    return most_similar_pdf, max_similarity

# Function to find the most similar PDF based on text similarity
def find_most_similar_pdf_text(input_pdf_path, folder_path):
    # Extract text from the input PDF
    input_text = preprocess_text(extract_text_from_pdf(input_pdf_path))
    
    # Initialize variables to track the most similar PDF
    max_similarity = -1
    most_similar_pdf = None
    
    for filename in os.listdir(folder_path):
        if filename.endswith('.pdf'):
            pdf_path = os.path.join(folder_path, filename)
            folder_pdf_text = preprocess_text(extract_text_from_pdf(pdf_path))
            similarity = compare_texts(input_text, folder_pdf_text)
            if similarity > max_similarity:
                max_similarity = similarity
                most_similar_pdf = filename

    return most_similar_pdf, max_similarity

# Main function to handle the PDF similarity search
if __name__ == "__main__":
    # Paths to your input files and folder containing other PDFs
    input_pdf_path = "D://Cross Search Automation//Previous Cross//ol2 mullion mount.pdf"  # Replace with the path to the input PDF
    input_image_path = "D://Cross Search Automation//Previous Cross//Capture.JPG"  # Replace with the path to the input image
    folder_path = "D://Cross Search Automation//Previous Cross//Test1"  # Replace with the path to the folder containing PDFs
    image_folder = "D://Cross Search Automation//Previous Cross//save"  # Replace with the path to a folder for saving extracted images

    # Find the most similar PDF based on text
    most_similar_pdf_text, text_similarity = find_most_similar_pdf_text(input_pdf_path, folder_path)
    
    # Find the most similar PDF based on image
    most_similar_pdf_image, image_similarity = find_most_similar_pdf_image(input_image_path, folder_path, image_folder)
    
    # Print results
    print(f"Most similar PDF based on text: {most_similar_pdf_text} with a similarity score of {text_similarity:.2f}")
    print(f"Most similar PDF based on image: {most_similar_pdf_image} with a similarity score of {image_similarity:.2f}")

Most similar PDF based on text: mount.pdf with a similarity score of 1.00
Most similar PDF based on image: Egress.pdf with a similarity score of 0.73


### The results of the most similar PDF search based on image is not accurate, so we will try and run the code for that functionality separately and then try and integrate it.

## Backbone 2: Most similar PDF search based on image

In [17]:
import os
import io
import fitz  # PyMuPDF
from PIL import Image
import numpy as np
import cv2

def extract_images_from_pdf(pdf_path):
    """Extracts images from a PDF file."""
    images = []
    doc = fitz.open(pdf_path)

    for page_num in range(doc.page_count):
        page = doc.load_page(page_num)
        image_list = page.get_images(full=True)

        for img_index, img in enumerate(image_list):
            xref = img[0]
            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]
            image = Image.open(io.BytesIO(image_bytes))
            images.append(np.array(image))

    doc.close()
    return images

def calculate_image_similarity(img1, img2):
    """Calculates the similarity between two images using histogram comparison."""
    # Convert images to BGR format if they are not already
    if len(img1.shape) == 2:  # Grayscale image
        img1 = cv2.cvtColor(img1, cv2.COLOR_GRAY2BGR)
    if len(img2.shape) == 2:  # Grayscale image
        img2 = cv2.cvtColor(img2, cv2.COLOR_GRAY2BGR)
        
    # Convert to grayscale for histogram comparison
    img1_gray = cv2.cvtColor(img1, cv2.COLOR_BGR2GRAY)
    img2_gray = cv2.cvtColor(img2, cv2.COLOR_BGR2GRAY)

    # Calculate histogram and similarity
    hist1 = cv2.calcHist([img1_gray], [0], None, [256], [0, 256])
    hist2 = cv2.calcHist([img2_gray], [0], None, [256], [0, 256])
    
    similarity = cv2.compareHist(hist1, hist2, cv2.HISTCMP_CORREL)
    return similarity

def find_most_similar_pdf(input_image_path, folder_path):
    """Finds the PDF with the most similar image to the input image."""
    input_image = cv2.imread(input_image_path)
    most_similar_pdf = None
    highest_similarity = -1

    for pdf_file in os.listdir(folder_path):
        if pdf_file.endswith(".pdf"):
            pdf_path = os.path.join(folder_path, pdf_file)
            extracted_images = extract_images_from_pdf(pdf_path)

            for img in extracted_images:
                similarity = calculate_image_similarity(input_image, img)

                if similarity > highest_similarity:
                    highest_similarity = similarity
                    most_similar_pdf = pdf_file

    return most_similar_pdf, highest_similarity

# Usage example
input_image_path = "D://Cross Search Automation//Previous Cross//Vendor Lights//test.png"  # Provide the path to the input image
folder_path = "D://Cross Search Automation//Previous Cross//IKIO Lights"  # Provide the path to the folder with PDFs

most_similar_pdf, similarity_score = find_most_similar_pdf(input_image_path, folder_path)

if most_similar_pdf:
    print(f"The most similar PDF is: {most_similar_pdf} with a similarity score of: {similarity_score:.2f}")
else:
    print("No similar images found in the PDFs.")

The most similar PDF is: Sigma_EMTUBE_TypeB_PS.pdf with a similarity score of: 1.00


But this result is faulty.

In [42]:
import os
import io
import fitz  # PyMuPDF
from PIL import Image
import numpy as np
import torch
import torchvision.transforms as transforms
from torchvision import models
from sklearn.metrics.pairwise import cosine_similarity

# Load pre-trained deep learning model (ResNet-50)
resnet_model = models.resnet50(pretrained=True)
resnet_model.eval()  # Set model to evaluation mode

# Image preprocessing transformations for ResNet
preprocess = transforms.Compose([
    transforms.Resize((224, 224)),  # Resize image to 224x224 as expected by ResNet
    transforms.ToTensor(),          # Convert image to PyTorch tensor
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  # Normalize using ImageNet standards
])

def extract_images_from_pdf(pdf_path):
    """Extracts images from a PDF file."""
    images = []
    doc = fitz.open(pdf_path)

    for page_num in range(doc.page_count):
        page = doc.load_page(page_num)
        image_list = page.get_images(full=True)

        for img_index, img in enumerate(image_list):
            xref = img[0]
            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]
            image = Image.open(io.BytesIO(image_bytes))

            # Convert the image to RGB format
            if image.mode != 'RGB':
                image = image.convert('RGB')
            
            images.append(image)

    doc.close()
    return images

def extract_image_features(image):
    """Extracts deep learning features from an image using ResNet."""
    # Convert PIL Image to PyTorch Tensor
    input_tensor = preprocess(image)
    input_batch = input_tensor.unsqueeze(0)  # Add batch dimension

    with torch.no_grad():
        # Extract features using ResNet
        features = resnet_model(input_batch)
    
    return features.numpy().flatten()

def calculate_image_similarity(features1, features2):
    """Calculates the similarity between two image feature vectors using cosine similarity."""
    similarity = cosine_similarity([features1], [features2])[0][0]
    return similarity

def find_most_similar_pdf(input_image_path, folder_path):
    """Finds the PDF with the most similar image to the input image."""
    input_image = Image.open(input_image_path)
    
    # Convert input image to RGB if necessary
    if input_image.mode != 'RGB':
        input_image = input_image.convert('RGB')

    input_image_features = extract_image_features(input_image)
    
    most_similar_pdf = None
    highest_similarity = -1

    for pdf_file in os.listdir(folder_path):
        if pdf_file.endswith(".pdf"):
            pdf_path = os.path.join(folder_path, pdf_file)
            extracted_images = extract_images_from_pdf(pdf_path)

            for img in extracted_images:
                img_features = extract_image_features(img)
                similarity = calculate_image_similarity(input_image_features, img_features)

                if similarity > highest_similarity:
                    highest_similarity = similarity
                    most_similar_pdf = pdf_file

    return most_similar_pdf, highest_similarity

# Usage example
input_image_path = "D://Cross Search Automation//Previous Cross//Vendor Lights//test.png"  # Provide the path to the input image
folder_path = "D://Cross Search Automation//Previous Cross//IKIO Lights"  # Provide the path to the folder with PDFs

most_similar_pdf, similarity_score = find_most_similar_pdf(input_image_path, folder_path)

if most_similar_pdf:
    print(f"The most similar PDF is: {most_similar_pdf} with a similarity score of: {similarity_score:.2f}")
else:
    print("No similar images found in the PDFs.")



The most similar PDF is: T8_Tube_Type_C_PS.pdf with a similarity score of: 0.71


The result needs to be more accurate. Testing with multiple images

In [40]:
import os
import io
import fitz  # PyMuPDF
from PIL import Image
import numpy as np
import torch
import torchvision.transforms as transforms
from torchvision import models
from sklearn.metrics.pairwise import cosine_similarity

# Load pre-trained deep learning model (ResNet-50)
resnet_model = models.resnet50(pretrained=True)
resnet_model.eval()  # Set model to evaluation mode

# Image preprocessing transformations for ResNet
preprocess = transforms.Compose([
    transforms.Resize((224, 224)),  # Resize image to 224x224 as expected by ResNet
    transforms.ToTensor(),          # Convert image to PyTorch tensor
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  # Normalize using ImageNet standards
])

def extract_images_from_pdf(pdf_path):
    """Extracts images from a PDF file."""
    images = []
    doc = fitz.open(pdf_path)

    for page_num in range(doc.page_count):
        page = doc.load_page(page_num)
        image_list = page.get_images(full=True)

        for img_index, img in enumerate(image_list):
            xref = img[0]
            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]
            image = Image.open(io.BytesIO(image_bytes))

            # Convert the image to RGB format
            if image.mode != 'RGB':
                image = image.convert('RGB')
            
            images.append(image)

    doc.close()
    return images

def extract_image_features(image):
    """Extracts deep learning features from an image using ResNet."""
    # Convert PIL Image to PyTorch Tensor
    input_tensor = preprocess(image)
    input_batch = input_tensor.unsqueeze(0)  # Add batch dimension

    with torch.no_grad():
        # Extract features using ResNet
        features = resnet_model(input_batch)
    
    return features.numpy().flatten()

def calculate_image_similarity(features1, features2):
    """Calculates the similarity between two image feature vectors using cosine similarity."""
    similarity = cosine_similarity([features1], [features2])[0][0]
    return similarity

def find_most_similar_pdfs(input_image_paths, folder_path):
    """Finds the most similar PDFs for multiple input images."""
    results = {}  # Dictionary to store results for each input image

    for input_image_path in input_image_paths:
        input_image = Image.open(input_image_path)
        
        # Convert input image to RGB if necessary
        if input_image.mode != 'RGB':
            input_image = input_image.convert('RGB')

        input_image_features = extract_image_features(input_image)
        
        most_similar_pdf = None
        highest_similarity = -1

        for pdf_file in os.listdir(folder_path):
            if pdf_file.endswith(".pdf"):
                pdf_path = os.path.join(folder_path, pdf_file)
                extracted_images = extract_images_from_pdf(pdf_path)

                for img in extracted_images:
                    img_features = extract_image_features(img)
                    similarity = calculate_image_similarity(input_image_features, img_features)

                    if similarity > highest_similarity:
                        highest_similarity = similarity
                        most_similar_pdf = pdf_file

        # Store the result for the current input image
        results[input_image_path] = (most_similar_pdf, highest_similarity)

    return results

# Usage example
input_image_paths = [
    "D://Cross Search Automation//Previous Cross//Vendor Lights//test.png",  # Provide paths to the input images
    "D://Cross Search Automation//Previous Cross//Vendor Lights//test2.jpg",
    "D://Cross Search Automation//Previous Cross//Vendor Lights//test4.png",
]
folder_path = "D://Cross Search Automation//Previous Cross//IKIO Lights"  # Provide the path to the folder with PDFs

similar_pdfs = find_most_similar_pdfs(input_image_paths, folder_path)

# Print the results
for image_path, (pdf, score) in similar_pdfs.items():
    if pdf:
        print(f"For image '{image_path}', the most similar PDF is: '{pdf}' with a similarity score of: {score:.2f}")
    else:
        print(f"No similar images found in the PDFs for image '{image_path}'.")



For image 'D://Cross Search Automation//Previous Cross//Vendor Lights//test.png', the most similar PDF is: 'T8_Tube_Type_C_PS.pdf' with a similarity score of: 0.71
For image 'D://Cross Search Automation//Previous Cross//Vendor Lights//test2.jpg', the most similar PDF is: 'Coloris_RGBW_ELPL_PS.pdf' with a similarity score of: 0.84
For image 'D://Cross Search Automation//Previous Cross//Vendor Lights//test4.png', the most similar PDF is: 'Delphi_Mini_WP_PS.pdf' with a similarity score of: 0.79


The code still is not able to accurately get all the images right. THE BASE MAIN CODE FOR NOW IS THIS:

In [43]:
import os
import io
import fitz  # PyMuPDF
from PIL import Image
import numpy as np
import torch
import torchvision.transforms as transforms
from torchvision import models
from sklearn.metrics.pairwise import cosine_similarity
from skimage.feature import ORB, match_descriptors, BRIEF
from skimage.transform import integral_image
from skimage.color import rgb2gray
import cv2

# Load pre-trained deep learning model (ResNet-50)
resnet_model = models.resnet50(pretrained=True)
resnet_model.eval()  # Set model to evaluation mode

# Image preprocessing transformations for ResNet
preprocess = transforms.Compose([
    transforms.Resize((224, 224)),  # Resize image to 224x224 as expected by ResNet
    transforms.ToTensor(),          # Convert image to PyTorch tensor
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  # Normalize using ImageNet standards
])

def extract_images_from_pdf(pdf_path):
    """Extracts images from a PDF file."""
    images = []
    doc = fitz.open(pdf_path)

    for page_num in range(doc.page_count):
        page = doc.load_page(page_num)
        image_list = page.get_images(full=True)

        for img_index, img in enumerate(image_list):
            xref = img[0]
            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]
            image = Image.open(io.BytesIO(image_bytes))

            # Convert the image to RGB format
            if image.mode != 'RGB':
                image = image.convert('RGB')
            
            images.append(image)

    doc.close()
    return images

def extract_image_features(image):
    """Extracts deep learning features from an image using ResNet."""
    # Convert PIL Image to PyTorch Tensor
    input_tensor = preprocess(image)
    input_batch = input_tensor.unsqueeze(0)  # Add batch dimension

    with torch.no_grad():
        # Extract features using ResNet
        features = resnet_model(input_batch)
    
    return features.numpy().flatten()

def calculate_image_similarity(features1, features2):
    """Calculates the similarity between two image feature vectors using cosine similarity."""
    similarity = cosine_similarity([features1], [features2])[0][0]
    return similarity

def match_keypoints(image1, image2):
    """Match keypoints between two images using ORB."""
    orb = ORB(n_keypoints=1000)
    image1_gray = rgb2gray(np.array(image1))
    image2_gray = rgb2gray(np.array(image2))
    
    orb.detect_and_extract(image1_gray)
    keypoints1 = orb.keypoints
    descriptors1 = orb.descriptors
    
    orb.detect_and_extract(image2_gray)
    keypoints2 = orb.keypoints
    descriptors2 = orb.descriptors
    
    matches = match_descriptors(descriptors1, descriptors2, cross_check=True)
    return len(matches)

def find_most_similar_pdf(input_image_path, folder_path):
    """Finds the PDF with the most similar image to the input image."""
    input_image = Image.open(input_image_path)
    
    # Convert input image to RGB if necessary
    if input_image.mode != 'RGB':
        input_image = input_image.convert('RGB')

    input_image_features = extract_image_features(input_image)
    most_similar_pdf = None
    highest_similarity = -1
    best_match_count = 0

    for pdf_file in os.listdir(folder_path):
        if pdf_file.endswith(".pdf"):
            pdf_path = os.path.join(folder_path, pdf_file)
            extracted_images = extract_images_from_pdf(pdf_path)

            for img in extracted_images:
                img_features = extract_image_features(img)
                similarity = calculate_image_similarity(input_image_features, img_features)
                keypoint_match_count = match_keypoints(input_image, img)

                # Combine similarity and keypoint matches
                combined_score = similarity + (keypoint_match_count / 1000)  # Normalize keypoint matches
                if combined_score > highest_similarity:
                    highest_similarity = combined_score
                    most_similar_pdf = pdf_file

    return most_similar_pdf, highest_similarity

# Usage example
input_image_path = "D://Cross Search Automation//Previous Cross//Vendor Lights//test.png"  # Provide the path to the input image
folder_path = "D://Cross Search Automation//Previous Cross//IKIO Lights"  # Provide the path to the folder with PDFs

most_similar_pdf, similarity_score = find_most_similar_pdf(input_image_path, folder_path)

if most_similar_pdf:
    print(f"The most similar PDF is: {most_similar_pdf} with a similarity score of: {similarity_score:.2f}")
else:
    print("No similar images found in the PDFs.")



The most similar PDF is: T8_Tube_Type_C_PS.pdf with a similarity score of: 0.72


The results look a little more better. Considering this to be the final code for the backbone 2. Now we try again for multiple images.

In [60]:
import os
import io
import fitz  # PyMuPDF
from PIL import Image
import numpy as np
import torch
import torchvision.transforms as transforms
from torchvision import models
from sklearn.metrics.pairwise import cosine_similarity
from skimage.feature import ORB, match_descriptors
from skimage.color import rgb2gray

# Load pre-trained deep learning model (ResNet-50)
resnet_model = models.resnet50(pretrained=True)
resnet_model.eval()  # Set model to evaluation mode

# Image preprocessing transformations for ResNet
preprocess = transforms.Compose([
    transforms.Resize((224, 224)),  # Resize image to 224x224 as expected by ResNet
    transforms.ToTensor(),          # Convert image to PyTorch tensor
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  # Normalize using ImageNet standards
])

def extract_images_from_pdf(pdf_path):
    """Extracts images from a PDF file."""
    images = []
    doc = fitz.open(pdf_path)

    for page_num in range(doc.page_count):
        page = doc.load_page(page_num)
        image_list = page.get_images(full=True)

        for img_index, img in enumerate(image_list):
            xref = img[0]
            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]
            image = Image.open(io.BytesIO(image_bytes))

            # Convert the image to RGB format
            if image.mode != 'RGB':
                image = image.convert('RGB')
            
            images.append(image)

    doc.close()
    return images

def extract_image_features(image):
    """Extracts deep learning features from an image using ResNet."""
    # Convert PIL Image to PyTorch Tensor
    input_tensor = preprocess(image)
    input_batch = input_tensor.unsqueeze(0)  # Add batch dimension

    with torch.no_grad():
        # Extract features using ResNet
        features = resnet_model(input_batch)
    
    return features.numpy().flatten()

def calculate_image_similarity(features1, features2):
    """Calculates the similarity between two image feature vectors using cosine similarity."""
    similarity = cosine_similarity([features1], [features2])[0][0]
    return similarity

def match_keypoints(image1, image2):
    """Match keypoints between two images using ORB."""
    orb = ORB(n_keypoints=1000)
    image1_gray = rgb2gray(np.array(image1))
    image2_gray = rgb2gray(np.array(image2))
    
    orb.detect_and_extract(image1_gray)
    keypoints1 = orb.keypoints
    descriptors1 = orb.descriptors
    
    orb.detect_and_extract(image2_gray)
    keypoints2 = orb.keypoints
    descriptors2 = orb.descriptors
    
    matches = match_descriptors(descriptors1, descriptors2, cross_check=True)
    return len(matches)

def find_most_similar_pdfs(input_image_paths, folder_path):
    """Finds the most similar PDF for each input image."""
    results = {}

    for input_image_path in input_image_paths:
        input_image = Image.open(input_image_path)
        
        # Convert input image to RGB if necessary
        if input_image.mode != 'RGB':
            input_image = input_image.convert('RGB')

        input_image_features = extract_image_features(input_image)
        most_similar_pdf = None
        highest_similarity = -1

        for pdf_file in os.listdir(folder_path):
            if pdf_file.endswith(".pdf"):
                pdf_path = os.path.join(folder_path, pdf_file)
                extracted_images = extract_images_from_pdf(pdf_path)

                for img in extracted_images:
                    img_features = extract_image_features(img)
                    similarity = calculate_image_similarity(input_image_features, img_features)
                    keypoint_match_count = match_keypoints(input_image, img)

                    # Combine similarity and keypoint matches
                    combined_score = similarity + (keypoint_match_count / 1000)  # Normalize keypoint matches
                    if combined_score > highest_similarity:
                        highest_similarity = combined_score
                        most_similar_pdf = pdf_file

        results[input_image_path] = (most_similar_pdf, highest_similarity)

    return results

# Usage example
input_image_paths = [
    "D://Cross Search Automation//Previous Cross//Vendor Lights//test.png",
    "D://Cross Search Automation//Previous Cross//Vendor Lights//test2.jpg",
    "D://Cross Search Automation//Previous Cross//Vendor Lights//test4.png",
    # Add more image paths here
]
folder_path = "D://Cross Search Automation//Previous Cross//IKIO Lights"  # Provide the path to the folder with PDFs

results = find_most_similar_pdfs(input_image_paths, folder_path)

for input_image_path, (most_similar_pdf, similarity_score) in results.items():
    if most_similar_pdf:
        print(f"For input image {input_image_path}, the most similar PDF is: {most_similar_pdf} with a similarity score of: {similarity_score:.2f}")
    else:
        print(f"For input image {input_image_path}, no similar images found in the PDFs.")



For input image D://Cross Search Automation//Previous Cross//Vendor Lights//test.png, the most similar PDF is: T8_Tube_Type_C_PS.pdf with a similarity score of: 0.72
For input image D://Cross Search Automation//Previous Cross//Vendor Lights//test2.jpg, the most similar PDF is: Coloris_RGBW_ELPL_PS.pdf with a similarity score of: 0.85
For input image D://Cross Search Automation//Previous Cross//Vendor Lights//test4.png, the most similar PDF is: Delphi_Mini_WP_PS.pdf with a similarity score of: 0.89


The Database has increased and the error of ORB persists.... handling it here:

In [14]:
import os
import io
import fitz  # PyMuPDF
from PIL import Image
import numpy as np
import torch
import torchvision.transforms as transforms
from torchvision import models
from sklearn.metrics.pairwise import cosine_similarity
from skimage.color import rgb2gray
import cv2

# Load pre-trained deep learning model (ResNet-50)
resnet_model = models.resnet50(pretrained=True)
resnet_model.eval()  # Set model to evaluation mode

# Image preprocessing transformations for ResNet
preprocess = transforms.Compose([
    transforms.Resize((224, 224)),  # Resize image to 224x224 as expected by ResNet
    transforms.ToTensor(),          # Convert image to PyTorch tensor
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  # Normalize using ImageNet standards
])

def extract_images_from_pdf(pdf_path):
    """Extracts images from a PDF file."""
    images = []
    doc = fitz.open(pdf_path)

    for page_num in range(doc.page_count):
        page = doc.load_page(page_num)
        image_list = page.get_images(full=True)

        for img_index, img in enumerate(image_list):
            xref = img[0]
            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]
            image = Image.open(io.BytesIO(image_bytes))

            # Convert the image to RGB format
            if image.mode != 'RGB':
                image = image.convert('RGB')
            
            images.append(image)

    doc.close()
    return images

def extract_image_features(image):
    """Extracts deep learning features from an image using ResNet."""
    # Convert PIL Image to PyTorch Tensor
    input_tensor = preprocess(image)
    input_batch = input_tensor.unsqueeze(0)  # Add batch dimension

    with torch.no_grad():
        # Extract features using ResNet
        features = resnet_model(input_batch)
    
    return features.numpy().flatten()

def calculate_image_similarity(features1, features2):
    """Calculates the similarity between two image feature vectors using cosine similarity."""
    similarity = cosine_similarity([features1], [features2])[0][0]
    return similarity

def preprocess_image(image):
    """Preprocess the image for SIFT."""
    image_np = np.array(image)
    
    # Convert to grayscale
    gray_image = rgb2gray(image_np)
    
    # Convert to uint8 format
    gray_image = (gray_image * 255).astype(np.uint8)

    return gray_image

def match_keypoints(image1, image2):
    """Match keypoints between two images using SIFT."""
    # Initialize SIFT detector
    sift = cv2.SIFT_create()
    
    # Preprocess images
    image1_gray = preprocess_image(image1)
    image2_gray = preprocess_image(image2)
    
    # Detect keypoints and compute descriptors
    keypoints1, descriptors1 = sift.detectAndCompute(image1_gray, None)
    keypoints2, descriptors2 = sift.detectAndCompute(image2_gray, None)
    
    # If no descriptors are found, return 0 matches
    if descriptors1 is None or descriptors2 is None:
        return 0
    
    # Use BFMatcher to match descriptors
    bf = cv2.BFMatcher(cv2.NORM_L2, crossCheck=True)
    matches = bf.match(descriptors1, descriptors2)
    
    return len(matches)

def find_most_similar_pdf(input_image_path, folder_path):
    """Finds the PDF with the most similar image to the input image."""
    input_image = Image.open(input_image_path)
    
    # Convert input image to RGB if necessary
    if input_image.mode != 'RGB':
        input_image = input_image.convert('RGB')

    input_image_features = extract_image_features(input_image)
    most_similar_pdf = None
    highest_similarity = -1

    for pdf_file in os.listdir(folder_path):
        if pdf_file.endswith(".pdf"):
            pdf_path = os.path.join(folder_path, pdf_file)
            extracted_images = extract_images_from_pdf(pdf_path)

            for img in extracted_images:
                img_features = extract_image_features(img)
                similarity = calculate_image_similarity(input_image_features, img_features)
                
                # Use keypoint matching with SIFT
                keypoint_match_count = match_keypoints(input_image, img)

                # Combine similarity and keypoint matches
                combined_score = similarity + (keypoint_match_count / 1000)  # Normalize keypoint matches
                if combined_score > highest_similarity:
                    highest_similarity = combined_score
                    most_similar_pdf = pdf_file

    return most_similar_pdf, highest_similarity

# Usage example
input_image_path = "D://Cross Search Automation//Previous Cross//Vendor Lights//test.png"  # Provide the path to the input image
folder_path = "D://Cross Search Automation//Previous Cross//IKIO Lights"  # Provide the path to the folder with PDFs

most_similar_pdf, similarity_score = find_most_similar_pdf(input_image_path, folder_path)

if most_similar_pdf:
    print(f"The most similar PDF is: {most_similar_pdf} with a similarity score of: {similarity_score:.2f}")
else:
    print("No similar images found in the PDFs.")



The most similar PDF is: Simplex_LLB_4'_40W_4000K_100-277 V_TDS.pdf with a similarity score of: 0.86


Now this is what I call accuracy and the code for the Backbone 2!!!!!!

## Most similar PDF based on text and image

In [7]:
import os
import re
import io
import fitz  # PyMuPDF
import numpy as np
import cv2
from PIL import Image
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Function to extract text from a PDF
def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = ""
    for page_num in range(doc.page_count):
        page = doc[page_num]
        text += page.get_text()
    doc.close()
    return text

# Function to preprocess text (example: remove extra spaces and newlines)
def preprocess_text(text):
    text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces with a single space
    text = re.sub(r'\n+', ' ', text)  # Replace newlines with spaces
    text = text.strip()  # Remove leading and trailing spaces
    return text

# Function to compare two texts and return similarity using cosine similarity
def compare_texts(text1, text2):
    vectorizer = TfidfVectorizer().fit_transform([text1, text2])
    vectors = vectorizer.toarray()
    similarity = cosine_similarity([vectors[0]], [vectors[1]])[0][0]
    return similarity

# Function to find the most similar PDF based on text
def find_most_similar_pdf_text(input_pdf_path, folder_path):
    # Extract text from the input PDF
    input_text = preprocess_text(extract_text_from_pdf(input_pdf_path))

    # Initialize variables to track the most similar PDF
    max_similarity = -1
    most_similar_pdf = None

    # Iterate over each PDF in the folder
    for filename in os.listdir(folder_path):
        if filename.endswith('.pdf'):
            pdf_path = os.path.join(folder_path, filename)
            # Extract text from the current PDF
            folder_pdf_text = preprocess_text(extract_text_from_pdf(pdf_path))
            # Compute similarity
            similarity = compare_texts(input_text, folder_pdf_text)
            # Update most similar PDF if needed
            if similarity > max_similarity:
                max_similarity = similarity
                most_similar_pdf = filename

    return most_similar_pdf, max_similarity

# Function to extract images from a PDF file
def extract_images_from_pdf(pdf_path):
    images = []
    doc = fitz.open(pdf_path)

    for page_num in range(doc.page_count):
        page = doc.load_page(page_num)
        image_list = page.get_images(full=True)

        for img_index, img in enumerate(image_list):
            xref = img[0]
            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]
            image = Image.open(io.BytesIO(image_bytes))
            images.append(np.array(image))

    doc.close()
    return images

# Function to calculate the similarity between two images using histogram comparison
def calculate_image_similarity(img1, img2):
    # Convert images to BGR format if they are not already
    if len(img1.shape) == 2:  # Grayscale image
        img1 = cv2.cvtColor(img1, cv2.COLOR_GRAY2BGR)
    if len(img2.shape) == 2:  # Grayscale image
        img2 = cv2.cvtColor(img2, cv2.COLOR_GRAY2BGR)
        
    # Convert to grayscale for histogram comparison
    img1_gray = cv2.cvtColor(img1, cv2.COLOR_BGR2GRAY)
    img2_gray = cv2.cvtColor(img2, cv2.COLOR_BGR2GRAY)

    # Calculate histogram and similarity
    hist1 = cv2.calcHist([img1_gray], [0], None, [256], [0, 256])
    hist2 = cv2.calcHist([img2_gray], [0], None, [256], [0, 256])
    
    similarity = cv2.compareHist(hist1, hist2, cv2.HISTCMP_CORREL)
    return similarity

# Function to find the most similar PDF based on an image
def find_most_similar_pdf_image(input_image_path, folder_path):
    input_image = cv2.imread(input_image_path)
    most_similar_pdf = None
    highest_similarity = -1

    for pdf_file in os.listdir(folder_path):
        if pdf_file.endswith(".pdf"):
            pdf_path = os.path.join(folder_path, pdf_file)
            extracted_images = extract_images_from_pdf(pdf_path)

            for img in extracted_images:
                similarity = calculate_image_similarity(input_image, img)

                if similarity > highest_similarity:
                    highest_similarity = similarity
                    most_similar_pdf = pdf_file

    return most_similar_pdf, highest_similarity

# Main function to handle both text-based and image-based PDF similarity search
if __name__ == "__main__":
    # Paths to the input PDF/image and the folder containing other PDFs
    input_pdf_path = "D://Cross Search Automation//Previous Cross//Vendor Lights//lbk-configurable.pdf"  # Replace with the path to the input PDF
    input_image_path = "D://Cross Search Automation//Previous Cross//Vendor Lights//test.png"  # Provide the path to the input image
    folder_path = "D://Cross Search Automation//Previous Cross//IKIO Lights"  # Replace with the path to the folder containing PDFs

    # Find the most similar PDF based on text
    most_similar_pdf_text, similarity_text = find_most_similar_pdf_text(input_pdf_path, folder_path)
    
    if most_similar_pdf_text:
        print(f"The most similar PDF based on text is: {most_similar_pdf_text} with a similarity score of {similarity_text:.2f}")
    else:
        print("No similar PDF found based on text.")

    # Find the most similar PDF based on image
    most_similar_pdf_image, similarity_image = find_most_similar_pdf_image(input_image_path, folder_path)

    if most_similar_pdf_image:
        print(f"The most similar PDF based on image is: {most_similar_pdf_image} with a similarity score of: {similarity_image:.2f}")
    else:
        print("No similar images found in the PDFs.")

The most similar PDF based on text is: Delphi_FPCL_PS.pdf with a similarity score of 0.30
The most similar PDF based on image is: Sigma_EMTUBE_TypeB_PS.pdf with a similarity score of: 1.00


In [9]:
import os
import re
import io
import fitz  # PyMuPDF
import numpy as np
import cv2
from PIL import Image
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Function to extract text from a PDF
def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = ""
    for page_num in range(doc.page_count):
        page = doc[page_num]
        text += page.get_text()
    doc.close()
    return text

# Function to preprocess text (example: remove extra spaces and newlines)
def preprocess_text(text):
    text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces with a single space
    text = re.sub(r'\n+', ' ', text)  # Replace newlines with spaces
    text = text.strip()  # Remove leading and trailing spaces
    return text

# Function to compare two texts and return similarity using cosine similarity
def compare_texts(text1, text2):
    vectorizer = TfidfVectorizer().fit_transform([text1, text2])
    vectors = vectorizer.toarray()
    similarity = cosine_similarity([vectors[0]], [vectors[1]])[0][0]
    return similarity

# Function to find all similar PDFs based on text
def find_similar_pdfs_text(input_pdf_path, folder_path, similarity_threshold=0.1):
    input_text = preprocess_text(extract_text_from_pdf(input_pdf_path))
    similar_pdfs = []

    for filename in os.listdir(folder_path):
        if filename.endswith('.pdf'):
            pdf_path = os.path.join(folder_path, filename)
            folder_pdf_text = preprocess_text(extract_text_from_pdf(pdf_path))
            similarity = compare_texts(input_text, folder_pdf_text)

            if similarity >= similarity_threshold:
                similar_pdfs.append((filename, similarity))

    return similar_pdfs

# Function to extract images from a PDF file
def extract_images_from_pdf(pdf_path):
    images = []
    doc = fitz.open(pdf_path)

    for page_num in range(doc.page_count):
        page = doc.load_page(page_num)
        image_list = page.get_images(full=True)

        for img_index, img in enumerate(image_list):
            xref = img[0]
            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]
            image = Image.open(io.BytesIO(image_bytes))
            images.append(np.array(image))

    doc.close()
    return images

# Function to calculate the similarity between two images using histogram comparison
def calculate_image_similarity(img1, img2):
    if len(img1.shape) == 2:  # Grayscale image
        img1 = cv2.cvtColor(img1, cv2.COLOR_GRAY2BGR)
    if len(img2.shape) == 2:  # Grayscale image
        img2 = cv2.cvtColor(img2, cv2.COLOR_GRAY2BGR)

    img1_gray = cv2.cvtColor(img1, cv2.COLOR_BGR2GRAY)
    img2_gray = cv2.cvtColor(img2, cv2.COLOR_BGR2GRAY)

    hist1 = cv2.calcHist([img1_gray], [0], None, [256], [0, 256])
    hist2 = cv2.calcHist([img2_gray], [0], None, [256], [0, 256])

    similarity = cv2.compareHist(hist1, hist2, cv2.HISTCMP_CORREL)
    return similarity

# Function to find all similar PDFs based on an image
def find_similar_pdfs_image(input_image_path, folder_path, similarity_threshold=0.1):
    input_image = cv2.imread(input_image_path)
    similar_pdfs = []

    for pdf_file in os.listdir(folder_path):
        if pdf_file.endswith(".pdf"):
            pdf_path = os.path.join(folder_path, pdf_file)
            extracted_images = extract_images_from_pdf(pdf_path)

            for img in extracted_images:
                similarity = calculate_image_similarity(input_image, img)

                if similarity >= similarity_threshold:
                    similar_pdfs.append((pdf_file, similarity))
                    break  # Break to avoid duplicate entries for the same PDF

    return similar_pdfs

# Main function to handle both text-based and image-based PDF similarity search
if __name__ == "__main__":
    input_pdf_path = "D://Cross Search Automation//Previous Cross//Vendor Lights//lbk-configurable.pdf"
    input_image_path = "D://Cross Search Automation//Previous Cross//Vendor Lights//test.png"
    folder_path = "D://Cross Search Automation//Previous Cross//IKIO Lights"
    similarity_threshold = 0.1  # Adjust this threshold as needed

    # Find all similar PDFs based on text
    similar_pdfs_text = find_similar_pdfs_text(input_pdf_path, folder_path, similarity_threshold)
    if similar_pdfs_text:
        print("Similar PDFs based on text:")
        for pdf, score in similar_pdfs_text:
            print(f"{pdf} with a similarity score of {score:.2f}")
    else:
        print("No similar PDFs found based on text.")

    # Find all similar PDFs based on image
    similar_pdfs_image = find_similar_pdfs_image(input_image_path, folder_path, similarity_threshold)
    if similar_pdfs_image:
        print("\nSimilar PDFs based on image:")
        for pdf, score in similar_pdfs_image:
            print(f"{pdf} with a similarity score of {score:.2f}")
    else:
        print("No similar PDFs found based on image.")

Similar PDFs based on text:
Coloris_RGBW_ELPL_PS.pdf with a similarity score of 0.26
Delphi_BLPL_PS.pdf with a similarity score of 0.25
Delphi_FPCL_PS.pdf with a similarity score of 0.30
Delphi_Mini_WP_PS.pdf with a similarity score of 0.28
ol2 mullion mount.pdf with a similarity score of 0.22
Orwin_DL_PS.pdf with a similarity score of 0.23
Sigma_EMTUBE_TypeB_PS.pdf with a similarity score of 0.30
T8_Tube_Type_C_PS.pdf with a similarity score of 0.26

Similar PDFs based on image:
Coloris_RGBW_ELPL_PS.pdf with a similarity score of 0.77
Delphi_BLPL_PS.pdf with a similarity score of 0.97
Delphi_FPCL_PS.pdf with a similarity score of 0.71
Delphi_Mini_WP_PS.pdf with a similarity score of 0.99
Orwin_DL_PS.pdf with a similarity score of 1.00
Sigma_EMTUBE_TypeB_PS.pdf with a similarity score of 1.00


Lets apply the changes and updated codes in Backbone 1 and Backbone 2 combined to give the desired changes.

In [49]:
import os
import io
import fitz  # PyMuPDF
from PIL import Image
import numpy as np
import torch
import torchvision.transforms as transforms
from torchvision import models
from sklearn.metrics.pairwise import cosine_similarity
from skimage.feature import ORB, match_descriptors
from skimage.color import rgb2gray
import re
from transformers import BertTokenizer, BertModel

# Load pre-trained deep learning model (ResNet-50)
resnet_model = models.resnet50(pretrained=True)
resnet_model.eval()  # Set model to evaluation mode

# Image preprocessing transformations for ResNet
preprocess = transforms.Compose([
    transforms.Resize((224, 224)),  # Resize image to 224x224 as expected by ResNet
    transforms.ToTensor(),          # Convert image to PyTorch tensor
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  # Normalize using ImageNet standards
])

# Initialize the BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')

def extract_images_from_pdf(pdf_path):
    """Extracts images from a PDF file."""
    images = []
    doc = fitz.open(pdf_path)

    for page_num in range(doc.page_count):
        page = doc.load_page(page_num)
        image_list = page.get_images(full=True)

        for img_index, img in enumerate(image_list):
            xref = img[0]
            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]
            image = Image.open(io.BytesIO(image_bytes))

            # Convert the image to RGB format
            if image.mode != 'RGB':
                image = image.convert('RGB')
            
            images.append(image)

    doc.close()
    return images

def extract_image_features(image):
    """Extracts deep learning features from an image using ResNet."""
    # Convert PIL Image to PyTorch Tensor
    input_tensor = preprocess(image)
    input_batch = input_tensor.unsqueeze(0)  # Add batch dimension

    with torch.no_grad():
        # Extract features using ResNet
        features = resnet_model(input_batch)
    
    return features.numpy().flatten()

def calculate_image_similarity(features1, features2):
    """Calculates the similarity between two image feature vectors using cosine similarity."""
    similarity = cosine_similarity([features1], [features2])[0][0]
    return similarity

def match_keypoints(image1, image2):
    """Match keypoints between two images using ORB."""
    orb = ORB(n_keypoints=1000)
    image1_gray = rgb2gray(np.array(image1))
    image2_gray = rgb2gray(np.array(image2))
    
    orb.detect_and_extract(image1_gray)
    keypoints1 = orb.keypoints
    descriptors1 = orb.descriptors
    
    orb.detect_and_extract(image2_gray)
    keypoints2 = orb.keypoints
    descriptors2 = orb.descriptors
    
    matches = match_descriptors(descriptors1, descriptors2, cross_check=True)
    return len(matches)

def extract_text_from_pdf(pdf_path):
    """Extracts text from a PDF file."""
    doc = fitz.open(pdf_path)
    text = ""
    for page_num in range(doc.page_count):
        page = doc[page_num]
        text += page.get_text()
    return text

def preprocess_text(text):
    """Preprocesses text by removing extra spaces and newlines."""
    text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces with a single space
    text = re.sub(r'\n+', ' ', text)  # Replace newlines with spaces
    text = text.strip()  # Remove leading and trailing spaces
    return text

def encode_text(text):
    """Encodes text using BERT model."""
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = bert_model(**inputs)
    # Use the [CLS] token's embedding as the sentence representation
    return outputs.last_hidden_state[:, 0, :].numpy()

def find_most_similar_pdf(input_path, folder_path, use_text=False):
    """Finds the PDF with the most similar image or text to the input."""
    if use_text:
        # Extract and preprocess text from the input PDF
        input_text = preprocess_text(extract_text_from_pdf(input_path))
        input_embedding = encode_text(input_text)
        
        # Initialize variables to track the most similar PDF
        max_similarity = -1
        most_similar_pdf = None

        # Preprocess all PDFs in the folder and batch encode
        pdf_embeddings = {}
        pdf_files = [f for f in os.listdir(folder_path) if f.endswith('.pdf')]
        
        for filename in pdf_files:
            pdf_path = os.path.join(folder_path, filename)
            folder_pdf_text = preprocess_text(extract_text_from_pdf(pdf_path))
            pdf_embeddings[filename] = encode_text(folder_pdf_text)

        # Compute cosine similarity for each PDF against the input PDF
        for filename, folder_pdf_embedding in pdf_embeddings.items():
            similarity = cosine_similarity(input_embedding, folder_pdf_embedding).item()
            if similarity > max_similarity:
                max_similarity = similarity
                most_similar_pdf = filename

    else:
        # For image similarity
        input_image = Image.open(input_path)
        
        # Convert input image to RGB if necessary
        if input_image.mode != 'RGB':
            input_image = input_image.convert('RGB')

        input_image_features = extract_image_features(input_image)
        most_similar_pdf = None
        highest_similarity = -1
        best_match_count = 0

        for pdf_file in os.listdir(folder_path):
            if pdf_file.endswith(".pdf"):
                pdf_path = os.path.join(folder_path, pdf_file)
                extracted_images = extract_images_from_pdf(pdf_path)

                for img in extracted_images:
                    img_features = extract_image_features(img)
                    similarity = calculate_image_similarity(input_image_features, img_features)
                    keypoint_match_count = match_keypoints(input_image, img)

                    # Combine similarity and keypoint matches
                    combined_score = similarity + (keypoint_match_count / 1000)  # Normalize keypoint matches
                    if combined_score > highest_similarity:
                        highest_similarity = combined_score
                        most_similar_pdf = pdf_file

    return most_similar_pdf, max_similarity if use_text else highest_similarity

# Usage example for image similarity
input_image_path = "D://Cross Search Automation//Previous Cross//Vendor Lights//test2.jpg"
folder_path = "D://Cross Search Automation//Previous Cross//IKIO Lights"

most_similar_pdf, similarity_score = find_most_similar_pdf(input_image_path, folder_path, use_text=False)

if most_similar_pdf:
    print(f"The most similar PDF on the basis of Image is: {most_similar_pdf} with a similarity score of: {similarity_score:.2f}")
else:
    print("No similar images found in the PDFs.")

# Usage example for text similarity
input_pdf_path = "D://Cross Search Automation//Previous Cross//Vendor Lights//configurable-cpx.pdf"

most_similar_pdf, similarity = find_most_similar_pdf(input_pdf_path, folder_path, use_text=True)

if most_similar_pdf:
    print(f"The most similar PDF on the basis of Text is: {most_similar_pdf} with a similarity score of {similarity:.2f}")
else:
    print("No similar PDFs found.")



The most similar PDF on the basis of Image is: Coloris_RGBW_ELPL_PS.pdf with a similarity score of: 0.85
The most similar PDF on the basis of Text is: Delphi_FPCL_PS.pdf with a similarity score of 0.90


## Final Product

In [2]:
import os
import io
import re
import fitz  # PyMuPDF
import torch
import numpy as np
import cv2
import nltk
from PIL import Image
from sklearn.metrics.pairwise import cosine_similarity
from skimage.feature import ORB, match_descriptors
from skimage.color import rgb2gray
from sentence_transformers import SentenceTransformer, util
from concurrent.futures import ThreadPoolExecutor
from nltk import ngrams
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import torchvision.transforms as transforms
from torchvision import models
from torchvision.models import ResNet50_Weights

# Download necessary NLTK data
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')

# Initialize ResNet model with updated weights argument
resnet_model = models.resnet50(weights=ResNet50_Weights.IMAGENET1K_V1)
resnet_model.eval()  # Set model to evaluation mode

# Initialize the Sentence-BERT model once at the start
sentence_model = SentenceTransformer('all-MiniLM-L6-v2')
lemmatizer = WordNetLemmatizer()

# Image preprocessing transformations for ResNet
preprocess = transforms.Compose([
    transforms.Resize((224, 224)),  # Resize image to 224x224 as expected by ResNet
    transforms.ToTensor(),          # Convert image to PyTorch tensor
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  # Normalize using ImageNet standards
])

def extract_images_from_pdf(pdf_path):
    """Extracts images from a PDF file."""
    images = []
    doc = fitz.open(pdf_path)

    for page_num in range(doc.page_count):
        page = doc.load_page(page_num)
        image_list = page.get_images(full=True)

        for img_index, img in enumerate(image_list):
            xref = img[0]
            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]
            image = Image.open(io.BytesIO(image_bytes))

            # Convert the image to RGB format
            if image.mode != 'RGB':
                image = image.convert('RGB')
            
            images.append(image)

    doc.close()
    return images

def extract_image_features(image):
    """Extracts deep learning features from an image using ResNet."""
    # Convert PIL Image to PyTorch Tensor
    input_tensor = preprocess(image)
    input_batch = input_tensor.unsqueeze(0)  # Add batch dimension

    with torch.no_grad():
        # Extract features using ResNet
        features = resnet_model(input_batch)
    
    return features.numpy().flatten()

def calculate_image_similarity(features1, features2):
    """Calculates the similarity between two image feature vectors using cosine similarity."""
    similarity = cosine_similarity([features1], [features2])[0][0]
    return similarity

def match_keypoints(image1, image2):
    """Match keypoints between two images using ORB."""
    orb = ORB(n_keypoints=1000)
    image1_gray = rgb2gray(np.array(image1))
    image2_gray = rgb2gray(np.array(image2))
    
    orb.detect_and_extract(image1_gray)
    keypoints1 = orb.keypoints
    descriptors1 = orb.descriptors
    
    orb.detect_and_extract(image2_gray)
    keypoints2 = orb.keypoints
    descriptors2 = orb.descriptors
    
    matches = match_descriptors(descriptors1, descriptors2, cross_check=True)
    return len(matches)

def extract_text_from_pdf(pdf_path):
    """Extracts text from a PDF file."""
    with fitz.open(pdf_path) as doc:
        text = ""
        for page_num in range(doc.page_count):
            text += doc[page_num].get_text()
    return text

def preprocess_text(text):
    """Preprocesses text for NLP processing."""
    # Remove punctuation, convert to lowercase
    text = re.sub(r'[^\w\s]', '', text.lower())
    
    # Remove extra spaces and newlines
    text = re.sub(r'\s+', ' ', text)
    
    # Tokenize and remove stopwords
    tokens = [word for word in word_tokenize(text) if word not in nltk.corpus.stopwords.words('english')]
    
    # Apply lemmatization
    lemmatized_text = ' '.join([lemmatizer.lemmatize(token) for token in tokens])
    
    return lemmatized_text.strip()

def generate_ngrams(text, n=2):
    """Generates n-grams from text."""
    tokens = word_tokenize(text)
    ngrams_list = list(ngrams(tokens, n))
    return [' '.join(ngram) for ngram in ngrams_list]

def compute_embedding(text):
    """Computes embeddings for a given text using Sentence-BERT."""
    # Split text into smaller chunks for more granular embeddings
    sentences = text.split('. ')
    embeddings = sentence_model.encode(sentences, convert_to_tensor=True)
    return embeddings

def compute_similarity(embedding1, embedding2):
    """Computes similarity score between two sets of embeddings."""
    cosine_sim = util.cos_sim(embedding1, embedding2)
    return cosine_sim.max().item()

def process_pdf(file_info):
    """Processes a single PDF file and calculates its similarity score."""
    input_embedding, input_pdf_path, pdf_path = file_info
    folder_pdf_text = preprocess_text(extract_text_from_pdf(pdf_path))
    
    # Enrich text with bigrams
    bigrams = generate_ngrams(folder_pdf_text, 2)
    enriched_text = ' '.join([folder_pdf_text] + bigrams)
    
    folder_pdf_embedding = compute_embedding(enriched_text)
    
    # Compute similarity score
    similarity = compute_similarity(input_embedding, folder_pdf_embedding)
    return (pdf_path, similarity)

def find_most_similar_pdf_image(input_image_path, folder_path):
    """Finds the PDF with the most similar image to the input image."""
    input_image = Image.open(input_image_path)
    
    # Convert input image to RGB if necessary
    if input_image.mode != 'RGB':
        input_image = input_image.convert('RGB')

    input_image_features = extract_image_features(input_image)
    most_similar_pdf = None
    highest_similarity = -1
    best_match_count = 0

    for pdf_file in os.listdir(folder_path):
        if pdf_file.endswith(".pdf"):
            pdf_path = os.path.join(folder_path, pdf_file)
            extracted_images = extract_images_from_pdf(pdf_path)

            for img in extracted_images:
                img_features = extract_image_features(img)
                similarity = calculate_image_similarity(input_image_features, img_features)
                keypoint_match_count = match_keypoints(input_image, img)

                # Combine similarity and keypoint matches
                combined_score = similarity + (keypoint_match_count / 1000)  # Normalize keypoint matches
                if combined_score > highest_similarity:
                    highest_similarity = combined_score
                    most_similar_pdf = pdf_file

    return most_similar_pdf, highest_similarity

def find_most_similar_pdf_text(input_pdf_path, folder_path):
    """Finds the most similar PDF in a folder using Sentence-BERT embeddings."""
    input_text = preprocess_text(extract_text_from_pdf(input_pdf_path))
    
    # Enrich text with bigrams
    bigrams = generate_ngrams(input_text, 2)
    enriched_text = ' '.join([input_text] + bigrams)
    
    input_embedding = compute_embedding(enriched_text)

    pdf_files = [os.path.join(folder_path, f) for f in os.listdir(folder_path) if f.endswith('.pdf')]

    max_similarity = -1
    most_similar_pdf = None
    with ThreadPoolExecutor() as executor:
        results = executor.map(process_pdf, [(input_embedding, input_pdf_path, pdf_path) for pdf_path in pdf_files])

    for pdf_path, similarity in results:
        if similarity > max_similarity:
            max_similarity = similarity
            most_similar_pdf = os.path.basename(pdf_path)

    return most_similar_pdf, max_similarity

if __name__ == "__main__":
    # Paths for input files and folders
    input_image_path = "D://Cross Search Automation//Previous Cross//Vendor Lights//test3.jpg"
    input_pdf_path = "D://Cross Search Automation//Previous Cross//Vendor Lights//configurable-cpx.pdf"
    folder_path = "D://Cross Search Automation//Previous Cross//IKIO Lights"

    # Find the most similar PDF by image
    most_similar_pdf_image, image_similarity_score = find_most_similar_pdf_image(input_image_path, folder_path)
    if most_similar_pdf_image:
        print(f"The most similar PDF by image is: {most_similar_pdf_image} with a similarity score of: {image_similarity_score:.2f}")
    else:
        print("No similar images found in the PDFs.")

    # Find the most similar PDF by text
    most_similar_pdf_text, text_similarity_score = find_most_similar_pdf_text(input_pdf_path, folder_path)
    if most_similar_pdf_text:
        print(f"The most similar PDF by text is: {most_similar_pdf_text} with a similarity score of {text_similarity_score:.2f}")
    else:
        print("No similar PDF found by text.")

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


RuntimeError: ORB found no features. Try passing in an image containing greater intensity contrasts between adjacent pixels.

The image program is not performing well due to increase in the database of the our light's pdf's. Therefore, certain changes in the contract enhancements and AI models were elaborately hindered with to get the following code vwith corresponding results:

In [16]:
import os
import io
import re
import fitz  # PyMuPDF
from PIL import Image
import numpy as np
import torch
import torchvision.transforms as transforms
from torchvision import models
from sklearn.metrics.pairwise import cosine_similarity
from skimage.color import rgb2gray
import cv2
from sentence_transformers import SentenceTransformer, util
from concurrent.futures import ThreadPoolExecutor
from nltk import ngrams
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import nltk
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# Download necessary NLTK data
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')

# Initialize Sentence-BERT model and other required objects
model = SentenceTransformer('all-MiniLM-L6-v2')
lemmatizer = WordNetLemmatizer()

# Load pre-trained deep learning model (ResNet-50)
resnet_model = models.resnet50(pretrained=True)
resnet_model.eval()  # Set model to evaluation mode

# Image preprocessing transformations for ResNet
preprocess = transforms.Compose([
    transforms.Resize((224, 224)),  # Resize image to 224x224 as expected by ResNet
    transforms.ToTensor(),          # Convert image to PyTorch tensor
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  # Normalize using ImageNet standards
])

### Text Processing Functions ###

def extract_text_from_pdf(pdf_path):
    """Extracts text from a PDF file."""
    with fitz.open(pdf_path) as doc:
        text = ""
        for page_num in range(doc.page_count):
            text += doc[page_num].get_text()
    return text

def preprocess_text(text):
    """Preprocesses text by removing punctuation, lowercasing, and lemmatizing."""
    # Remove punctuation, convert to lowercase
    text = re.sub(r'[^\w\s]', '', text.lower())
    
    # Remove extra spaces and newlines
    text = re.sub(r'\s+', ' ', text)
    
    # Tokenize and remove stopwords
    tokens = [word for word in word_tokenize(text) if word not in ENGLISH_STOP_WORDS]
    
    # Apply lemmatization
    lemmatized_text = ' '.join([lemmatizer.lemmatize(token) for token in tokens])
    
    return lemmatized_text.strip()

def generate_ngrams(text, n=2):
    """Generates n-grams from text."""
    tokens = word_tokenize(text)
    ngrams_list = list(ngrams(tokens, n))
    return [' '.join(ngram) for ngram in ngrams_list]

def compute_embedding(text):
    """Computes the embeddings of text using Sentence-BERT."""
    # Split text into smaller chunks for more granular embeddings
    sentences = text.split('. ')
    embeddings = model.encode(sentences, convert_to_tensor=True)
    return embeddings

def compute_similarity(embedding1, embedding2):
    """Computes the similarity score between two sets of embeddings."""
    # Compute cosine similarity
    cosine_sim = util.cos_sim(embedding1, embedding2)
    return cosine_sim.max().item()  # Use max similarity across chunks

def process_pdf(file_info):
    """Processes a single PDF file and calculates its similarity score."""
    input_embedding, input_pdf_path, pdf_path = file_info
    folder_pdf_text = preprocess_text(extract_text_from_pdf(pdf_path))
    
    # Enrich text with bigrams
    bigrams = generate_ngrams(folder_pdf_text, 2)
    enriched_text = ' '.join([folder_pdf_text] + bigrams)  # Concatenate original text with bigrams
    
    folder_pdf_embedding = compute_embedding(enriched_text)
    
    # Compute similarity score
    similarity = compute_similarity(input_embedding, folder_pdf_embedding)
    return (pdf_path, similarity)

### Image Processing Functions ###

def extract_images_from_pdf(pdf_path):
    """Extracts images from a PDF file."""
    images = []
    doc = fitz.open(pdf_path)

    for page_num in range(doc.page_count):
        page = doc.load_page(page_num)
        image_list = page.get_images(full=True)

        for img_index, img in enumerate(image_list):
            xref = img[0]
            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]
            image = Image.open(io.BytesIO(image_bytes))

            # Convert the image to RGB format
            if image.mode != 'RGB':
                image = image.convert('RGB')
            
            images.append(image)

    doc.close()
    return images

def extract_image_features(image):
    """Extracts deep learning features from an image using ResNet."""
    # Convert PIL Image to PyTorch Tensor
    input_tensor = preprocess(image)
    input_batch = input_tensor.unsqueeze(0)  # Add batch dimension

    with torch.no_grad():
        # Extract features using ResNet
        features = resnet_model(input_batch)
    
    return features.numpy().flatten()

def calculate_image_similarity(features1, features2):
    """Calculates the similarity between two image feature vectors using cosine similarity."""
    similarity = cosine_similarity([features1], [features2])[0][0]
    return similarity

def preprocess_image(image):
    """Preprocess the image for SIFT."""
    image_np = np.array(image)
    
    # Convert to grayscale
    gray_image = rgb2gray(image_np)
    
    # Convert to uint8 format
    gray_image = (gray_image * 255).astype(np.uint8)

    return gray_image

def match_keypoints(image1, image2):
    """Match keypoints between two images using SIFT."""
    # Initialize SIFT detector
    sift = cv2.SIFT_create()
    
    # Preprocess images
    image1_gray = preprocess_image(image1)
    image2_gray = preprocess_image(image2)
    
    # Detect keypoints and compute descriptors
    keypoints1, descriptors1 = sift.detectAndCompute(image1_gray, None)
    keypoints2, descriptors2 = sift.detectAndCompute(image2_gray, None)
    
    # If no descriptors are found, return 0 matches
    if descriptors1 is None or descriptors2 is None:
        return 0
    
    # Use BFMatcher to match descriptors
    bf = cv2.BFMatcher(cv2.NORM_L2, crossCheck=True)
    matches = bf.match(descriptors1, descriptors2)
    
    return len(matches)

### Usage Examples ###

def find_most_similar_pdf_by_text(input_pdf_path, folder_path):
    """Finds the PDF with the most similar text content to the input PDF using Sentence-BERT embeddings."""
    # Extract and preprocess text from the input PDF
    input_text = preprocess_text(extract_text_from_pdf(input_pdf_path))
    
    # Enrich text with bigrams
    bigrams = generate_ngrams(input_text, 2)
    enriched_text = ' '.join([input_text] + bigrams)  # Concatenate original text with bigrams
    
    input_embedding = compute_embedding(enriched_text)

    # List all PDF files in the folder
    pdf_files = [os.path.join(folder_path, f) for f in os.listdir(folder_path) if f.endswith('.pdf')]

    # Use ThreadPoolExecutor for parallel processing
    max_similarity = -1
    most_similar_pdf = None
    with ThreadPoolExecutor() as executor:
        results = executor.map(process_pdf, [(input_embedding, input_pdf_path, pdf_path) for pdf_path in pdf_files])

    # Process results to find the most similar PDF
    for pdf_path, similarity in results:
        if similarity > max_similarity:
            max_similarity = similarity
            most_similar_pdf = os.path.basename(pdf_path)

    return most_similar_pdf, max_similarity

def find_most_similar_pdf_by_image(input_image_path, folder_path):
    """Finds the PDF with the most similar image to the input image."""
    input_image = Image.open(input_image_path)
    
    # Convert input image to RGB if necessary
    if input_image.mode != 'RGB':
        input_image = input_image.convert('RGB')

    input_image_features = extract_image_features(input_image)
    most_similar_pdf = None
    highest_similarity = -1

    for pdf_file in os.listdir(folder_path):
        if pdf_file.endswith(".pdf"):
            pdf_path = os.path.join(folder_path, pdf_file)
            extracted_images = extract_images_from_pdf(pdf_path)

            for img in extracted_images:
                img_features = extract_image_features(img)
                similarity = calculate_image_similarity(input_image_features, img_features)
                
                # Use keypoint matching with SIFT
                keypoint_match_count = match_keypoints(input_image, img)

                # Combine similarity and keypoint matches
                combined_score = similarity + (keypoint_match_count / 1000)  # Normalize keypoint matches
                if combined_score > highest_similarity:
                    highest_similarity = combined_score
                    most_similar_pdf = pdf_file

    return most_similar_pdf, highest_similarity

# Main function to demonstrate usage
if __name__ == "__main__":
    # Paths for the input PDF, input image, and the folder containing other PDFs
    input_pdf_path = "D://Cross Search Automation//Previous Cross//Vendor Lights//lbk-configurable.pdf"
    input_image_path = "D://Cross Search Automation//Previous Cross//Vendor Lights//test.png"
    folder_path = "D://Cross Search Automation//Previous Cross//IKIO Lights"

    # Find the most similar PDF by text content
    most_similar_pdf_text, similarity_text = find_most_similar_pdf_by_text(input_pdf_path, folder_path)
    if most_similar_pdf_text:
        print(f"The most similar PDF by text is: {most_similar_pdf_text} with a similarity score of {similarity_text:.2f}")
    else:
        print("No similar PDF found by text.")

    # Find the most similar PDF by image content
    most_similar_pdf_image, similarity_image = find_most_similar_pdf_by_image(input_image_path, folder_path)
    if most_similar_pdf_image:
        print(f"The most similar PDF by image is: {most_similar_pdf_image} with a similarity score of {similarity_image:.2f}")
    else:
        print("No similar images found in the PDFs.")

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


The most similar PDF by text is: T8_Tube_Type_C_PS.pdf with a similarity score of 0.82
The most similar PDF by image is: Simplex_LLB_4'_40W_4000K_100-277 V_TDS.pdf with a similarity score of 0.86


## Extracting details of the similarities of the final pdf search through text.

In [67]:
from sentence_transformers import SentenceTransformer, util
import fitz  # PyMuPDF
import os
import re
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from concurrent.futures import ThreadPoolExecutor
from nltk import ngrams
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize
import nltk

# Download necessary NLTK data
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')

# Initialize the Sentence-BERT model once at the start
model = SentenceTransformer('all-MiniLM-L6-v2')

lemmatizer = WordNetLemmatizer()

# Function to extract text from a PDF
def extract_text_from_pdf(pdf_path):
    with fitz.open(pdf_path) as doc:
        text = ""
        for page_num in range(doc.page_count):
            text += doc[page_num].get_text()
    return text

# Function to preprocess text
def preprocess_text(text):
    # Remove punctuation, convert to lowercase
    text = re.sub(r'[^\w\s]', '', text.lower())
    
    # Remove extra spaces and newlines
    text = re.sub(r'\s+', ' ', text)
    
    # Tokenize and remove stopwords
    tokens = [word for word in word_tokenize(text) if word not in ENGLISH_STOP_WORDS]
    
    # Apply lemmatization
    lemmatized_text = ' '.join([lemmatizer.lemmatize(token) for token in tokens])
    
    return lemmatized_text.strip()

# Function to generate n-grams from text
def generate_ngrams(text, n=2):
    tokens = word_tokenize(text)
    ngrams_list = list(ngrams(tokens, n))
    return [' '.join(ngram) for ngram in ngrams_list]

# Function to compute embeddings for a list of sentences
def compute_embeddings(sentences):
    embeddings = model.encode(sentences, convert_to_tensor=True)
    return embeddings

# Function to compute similarity score between all sentences in two documents
def find_similar_sentences(embedding1, sentences1, embedding2, sentences2):
    # Compute cosine similarity matrix
    cosine_sim_matrix = util.cos_sim(embedding1, embedding2)

    # Find the highest similarity score pairs
    most_similar_pairs = []
    for i in range(len(sentences1)):
        for j in range(len(sentences2)):
            similarity_score = cosine_sim_matrix[i][j].item()
            most_similar_pairs.append((similarity_score, sentences1[i], sentences2[j]))

    # Sort the pairs by similarity score in descending order
    most_similar_pairs = sorted(most_similar_pairs, key=lambda x: x[0], reverse=True)

    return most_similar_pairs

# Function to process a single PDF file and calculate its similarity details
def process_pdf(file_info):
    input_sentences, input_embedding, input_pdf_path, pdf_path = file_info
    folder_pdf_text = preprocess_text(extract_text_from_pdf(pdf_path))
    
    # Split text into sentences
    folder_sentences = sent_tokenize(folder_pdf_text)

    # Compute embeddings for the folder PDF sentences
    folder_pdf_embedding = compute_embeddings(folder_sentences)
    
    # Find similar sentences between the input PDF and the current folder PDF
    similar_sentences = find_similar_sentences(input_embedding, input_sentences, folder_pdf_embedding, folder_sentences)
    
    # Return the most similar sentences and their similarity scores
    return (pdf_path, similar_sentences[:5])  # Return top 5 most similar sentences for brevity

# Function to find the most similar PDF in a folder using Sentence-BERT embeddings
def find_most_similar_pdf(input_pdf_path, folder_path):
    # Extract and preprocess text from the input PDF
    input_text = preprocess_text(extract_text_from_pdf(input_pdf_path))
    
    # Split input text into sentences
    input_sentences = sent_tokenize(input_text)
    
    # Compute embeddings for input PDF sentences
    input_embedding = compute_embeddings(input_sentences)

    # List all PDF files in the folder
    pdf_files = [os.path.join(folder_path, f) for f in os.listdir(folder_path) if f.endswith('.pdf')]

    # Use ThreadPoolExecutor for parallel processing
    max_similarity = -1
    most_similar_pdf = None
    most_similar_details = None
    with ThreadPoolExecutor() as executor:
        results = executor.map(process_pdf, [(input_sentences, input_embedding, input_pdf_path, pdf_path) for pdf_path in pdf_files])

    # Process results to find the most similar PDF
    for pdf_path, similar_sentences in results:
        avg_similarity = sum([score for score, _, _ in similar_sentences]) / len(similar_sentences)
        if avg_similarity > max_similarity:
            max_similarity = avg_similarity
            most_similar_pdf = os.path.basename(pdf_path)
            most_similar_details = similar_sentences

    return most_similar_pdf, max_similarity, most_similar_details

# Main function to handle the PDF similarity search
if __name__ == "__main__":
    # Path to the input PDF and folder containing other PDFs
    input_pdf_path = "D://Cross Search Automation//Previous Cross//Vendor Lights//lbk-configurable.pdf"  # Replace with the path to the input PDF
    folder_path = "D://Cross Search Automation//Previous Cross//IKIO Lights"  # Replace with the path to the folder containing PDFs

    # Find the most similar PDF
    most_similar_pdf, similarity, similar_details = find_most_similar_pdf(input_pdf_path, folder_path)

    if most_similar_pdf:
        print(f"The most similar PDF is: {most_similar_pdf} with an average similarity score of {similarity:.2f}")
        print("\nTop 5 most similar sentences or details:")
        for score, sent1, sent2 in similar_details:
            print(f"Score: {score:.2f}")
            print(f"Input PDF sentence: {sent1}")
            print(f"Similar PDF sentence: {sent2}\n")
    else:
        print("No similar PDF found.")

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


The most similar PDF is: T8_Tube_Type_C_PS.pdf with an average similarity score of 0.82

Top 5 most similar sentences or details:
Score: 0.82
Input PDF sentence: lbk lightbar kit led retrofit lbk commercial indoor lbk configurable catalog number note type feature specification intended use lbk kit provides simple costeffective integrated led solution retrofit nearly fluorescent fixture t8 t12 t5 lamp lightbar kit provide reliability thermal performance longevity integrated led system simplicity installing fluorescent lamp used existing fluorescent lampholders ballast used operation greatly increasing reliability construction optic rigid formed steel channel highperformance led board mounted directly optimal thermal performance diffuse acrylic lens increase uniformity control brightness lbk available standard 4 2 3 configuration increased flexibility linear application electrical longlife led coupled highefficiency driver provide extended service life lbk rated deliver greater l80 perfo

The output is not really understandable.

In [75]:
from sentence_transformers import SentenceTransformer, util
import fitz  # PyMuPDF
import os
import re
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from concurrent.futures import ThreadPoolExecutor
from nltk import ngrams
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize
import nltk

# Download necessary NLTK data
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')

# Initialize the Sentence-BERT model once at the start
model = SentenceTransformer('all-MiniLM-L6-v2')
lemmatizer = WordNetLemmatizer()

# Function to extract text from a PDF
def extract_text_from_pdf(pdf_path):
    with fitz.open(pdf_path) as doc:
        text = ""
        for page_num in range(doc.page_count):
            text += doc[page_num].get_text()
    return text

# Function to preprocess text
def preprocess_text(text):
    # Remove punctuation, convert to lowercase
    text = re.sub(r'[^\w\s]', '', text.lower())
    
    # Remove extra spaces and newlines
    text = re.sub(r'\s+', ' ', text)
    
    # Tokenize and remove stopwords
    tokens = [word for word in word_tokenize(text) if word not in ENGLISH_STOP_WORDS]
    
    # Apply lemmatization
    lemmatized_text = ' '.join([lemmatizer.lemmatize(token) for token in tokens])
    
    return lemmatized_text.strip()

# Function to generate n-grams from text
def generate_ngrams(text, n=2):
    tokens = word_tokenize(text)
    ngrams_list = list(ngrams(tokens, n))
    return [' '.join(ngram) for ngram in ngrams_list]

# Function to compute embeddings for a list of sentences
def compute_embeddings(sentences):
    embeddings = model.encode(sentences, convert_to_tensor=True)
    return embeddings

# Function to compute similarity score between all sentences in two documents
def find_similar_sentences(embedding1, sentences1, embedding2, sentences2):
    # Compute cosine similarity matrix
    cosine_sim_matrix = util.cos_sim(embedding1, embedding2)

    # Find the highest similarity score pairs
    most_similar_pairs = []
    for i in range(len(sentences1)):
        for j in range(len(sentences2)):
            similarity_score = cosine_sim_matrix[i][j].item()
            most_similar_pairs.append((similarity_score, sentences1[i], sentences2[j]))

    # Sort the pairs by similarity score in descending order
    most_similar_pairs = sorted(most_similar_pairs, key=lambda x: x[0], reverse=True)

    return most_similar_pairs

# Function to process a single PDF file and calculate its similarity details
def process_pdf(file_info):
    input_sentences, input_embedding, input_pdf_path, pdf_path = file_info
    folder_pdf_text = preprocess_text(extract_text_from_pdf(pdf_path))
    
    # Split text into sentences
    folder_sentences = sent_tokenize(folder_pdf_text)

    # Compute embeddings for the folder PDF sentences
    folder_pdf_embedding = compute_embeddings(folder_sentences)
    
    # Find similar sentences between the input PDF and the current folder PDF
    similar_sentences = find_similar_sentences(input_embedding, input_sentences, folder_pdf_embedding, folder_sentences)
    
    # Return the most similar sentences and their similarity scores
    return (pdf_path, similar_sentences[:5])  # Return top 5 most similar sentences for brevity

# Function to find the most similar PDF in a folder using Sentence-BERT embeddings
def find_most_similar_pdf(input_pdf_path, folder_path):
    # Extract and preprocess text from the input PDF
    input_text = preprocess_text(extract_text_from_pdf(input_pdf_path))
    
    # Split input text into sentences
    input_sentences = sent_tokenize(input_text)
    
    # Compute embeddings for input PDF sentences
    input_embedding = compute_embeddings(input_sentences)

    # List all PDF files in the folder
    pdf_files = [os.path.join(folder_path, f) for f in os.listdir(folder_path) if f.endswith('.pdf')]

    # Use ThreadPoolExecutor for parallel processing
    max_similarity = -1
    most_similar_pdf = None
    most_similar_details = None
    with ThreadPoolExecutor() as executor:
        # Map the process_pdf function to each PDF file
        results = executor.map(process_pdf, [(input_sentences, input_embedding, input_pdf_path, pdf_path) for pdf_path in pdf_files])

        # Process results to find the most similar PDF
        for result in results:
            pdf_path, similar_sentences = result
            avg_similarity = sum([score for score, _, _ in similar_sentences]) / len(similar_sentences) if similar_sentences else 0
            if avg_similarity > max_similarity:
                max_similarity = avg_similarity
                most_similar_pdf = os.path.basename(pdf_path)
                most_similar_details = similar_sentences

    return most_similar_pdf, max_similarity, most_similar_details

# Function to format and display the similarity results
def format_similarity_results(input_pdf_path, most_similar_pdf, similarity, similar_details):
    print(f"The most similar PDF is: {most_similar_pdf} with an average similarity score of {similarity:.2f}\n")
    
    print("Top 5 most similar sentences or details:\n")
    
    for idx, (score, sent1, sent2) in enumerate(similar_details[:5], start=1):  # Limit to top 5
        print(f"Match {idx}:")
        print(f"Similarity Score: {score:.2f}")
        print(f"Input PDF Sentence: {sent1[:300]}...")  # Print the first 300 characters for brevity
        print(f"Similar PDF Sentence: {sent2[:300]}...")  # Print the first 300 characters for brevity
        print("-" * 80)

# Main function to handle the PDF similarity search
if __name__ == "__main__":
    # Path to the input PDF and folder containing other PDFs
    input_pdf_path = "D://Cross Search Automation//Previous Cross//Vendor Lights//lbk-configurable.pdf"  # Replace with the path to the input PDF
    folder_path = "D://Cross Search Automation//Previous Cross//IKIO Lights"  # Replace with the path to the folder containing PDFs

    # Find the most similar PDF
    most_similar_pdf, similarity, similar_details = find_most_similar_pdf(input_pdf_path, folder_path)

    if most_similar_pdf:
        format_similarity_results(input_pdf_path, most_similar_pdf, similarity, similar_details)
    else:
        print("No similar PDF found.")

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


The most similar PDF is: T8_Tube_Type_C_PS.pdf with an average similarity score of 0.82

Top 5 most similar sentences or details:

Match 1:
Similarity Score: 0.82
Input PDF Sentence: lbk lightbar kit led retrofit lbk commercial indoor lbk configurable catalog number note type feature specification intended use lbk kit provides simple costeffective integrated led solution retrofit nearly fluorescent fixture t8 t12 t5 lamp lightbar kit provide reliability thermal performance longe...
Similar PDF Sentence: feature product sheet ikios t8 tube type c perfect looking retrofit outdated fluorescent ballast enhanced efficiency opting type b light luminaire includes toptier external driver integrated dimming functionality offsetting labor installation expense feature like highefficacy lighting assured compat...
--------------------------------------------------------------------------------


## UI

In [2]:
import os
import io
import tkinter as tk
from tkinter import filedialog, messagebox
from tkinter import ttk
from threading import Thread
from PIL import Image, ImageTk
import numpy as np
import torch
import torchvision.transforms as transforms
from torchvision import models
from sklearn.metrics.pairwise import cosine_similarity
from skimage.color import rgb2gray
import cv2
from sentence_transformers import SentenceTransformer, util
from concurrent.futures import ThreadPoolExecutor
from nltk import ngrams
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import nltk
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import fitz  # PyMuPDF
import re

# Download necessary NLTK data
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')

# Initialize Sentence-BERT model and other required objects
model = SentenceTransformer('all-MiniLM-L6-v2')
lemmatizer = WordNetLemmatizer()

# Load pre-trained deep learning model (ResNet-50)
resnet_model = models.resnet50(pretrained=True)
resnet_model.eval()  # Set model to evaluation mode

# Image preprocessing transformations for ResNet
preprocess = transforms.Compose([
    transforms.Resize((224, 224)),  # Resize image to 224x224 as expected by ResNet
    transforms.ToTensor(),          # Convert image to PyTorch tensor
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  # Normalize using ImageNet standards
])

def extract_images_from_pdf(pdf_path):
    """Extracts images from a PDF file."""
    images = []
    doc = fitz.open(pdf_path)

    for page_num in range(doc.page_count):
        page = doc.load_page(page_num)
        image_list = page.get_images(full=True)

        for img_index, img in enumerate(image_list):
            xref = img[0]
            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]
            image = Image.open(io.BytesIO(image_bytes))

            # Convert the image to RGB format
            if image.mode != 'RGB':
                image = image.convert('RGB')
            
            images.append(image)

    doc.close()
    return images

def extract_image_features(image):
    """Extracts deep learning features from an image using ResNet."""
    input_tensor = preprocess(image)
    input_batch = input_tensor.unsqueeze(0)  # Add batch dimension

    with torch.no_grad():
        features = resnet_model(input_batch)
    
    return features.numpy().flatten()

def calculate_image_similarity(features1, features2):
    """Calculates the similarity between two image feature vectors using cosine similarity."""
    similarity = cosine_similarity([features1], [features2])[0][0]
    return similarity

def preprocess_image(image):
    """Preprocess the image for SIFT."""
    image_np = np.array(image)
    gray_image = rgb2gray(image_np)
    gray_image = (gray_image * 255).astype(np.uint8)
    return gray_image

def match_keypoints(image1, image2):
    """Match keypoints between two images using SIFT."""
    sift = cv2.SIFT_create()
    image1_gray = preprocess_image(image1)
    image2_gray = preprocess_image(image2)
    keypoints1, descriptors1 = sift.detectAndCompute(image1_gray, None)
    keypoints2, descriptors2 = sift.detectAndCompute(image2_gray, None)
    
    if descriptors1 is None or descriptors2 is None:
        return 0
    
    bf = cv2.BFMatcher(cv2.NORM_L2, crossCheck=True)
    matches = bf.match(descriptors1, descriptors2)
    return len(matches)

def extract_text_from_pdf(pdf_path):
    """Extracts text from a PDF file."""
    with fitz.open(pdf_path) as doc:
        text = ""
        for page_num in range(doc.page_count):
            text += doc[page_num].get_text()
    return text

def preprocess_text(text):
    """Preprocesses text by removing punctuation, lowercasing, and lemmatizing."""
    text = re.sub(r'[^\w\s]', '', text.lower())
    text = re.sub(r'\s+', ' ', text)
    tokens = [word for word in word_tokenize(text) if word not in ENGLISH_STOP_WORDS]
    lemmatized_text = ' '.join([lemmatizer.lemmatize(token) for token in tokens])
    return lemmatized_text.strip()

def generate_ngrams(text, n=2):
    """Generates n-grams from text."""
    tokens = word_tokenize(text)
    ngrams_list = list(ngrams(tokens, n))
    return [' '.join(ngram) for ngram in ngrams_list]

def compute_embedding(text):
    """Computes the embeddings of text using Sentence-BERT."""
    sentences = text.split('. ')
    embeddings = model.encode(sentences, convert_to_tensor=True)
    return embeddings

def compute_similarity(embedding1, embedding2):
    """Computes the similarity score between two sets of embeddings."""
    cosine_sim = util.cos_sim(embedding1, embedding2)
    return cosine_sim.max().item()

def process_pdf(file_info, input_embedding, update_progress, total_pdfs):
    """Processes a single PDF file and calculates its similarity score."""
    input_pdf_path, pdf_path, idx = file_info
    folder_pdf_text = preprocess_text(extract_text_from_pdf(pdf_path))
    bigrams = generate_ngrams(folder_pdf_text, 2)
    enriched_text = ' '.join([folder_pdf_text] + bigrams)
    folder_pdf_embedding = compute_embedding(enriched_text)
    similarity = compute_similarity(input_embedding, folder_pdf_embedding)
    
    # Update progress
    progress = (idx + 1) / total_pdfs * 100
    update_progress(progress)
    
    return (pdf_path, similarity)

def find_most_similar_pdf_by_text(input_pdf_path, folder_path, update_progress):
    """Finds the PDF with the most similar text content to the input PDF using Sentence-BERT embeddings."""
    input_text = preprocess_text(extract_text_from_pdf(input_pdf_path))
    bigrams = generate_ngrams(input_text, 2)
    enriched_text = ' '.join([input_text] + bigrams)
    input_embedding = compute_embedding(enriched_text)
    pdf_files = [os.path.join(folder_path, f) for f in os.listdir(folder_path) if f.endswith('.pdf')]
    max_similarity = -1
    most_similar_pdf = None
    total_pdfs = len(pdf_files)
    
    with ThreadPoolExecutor() as executor:
        results = executor.map(lambda file_info: process_pdf(file_info, input_embedding, update_progress, total_pdfs),
                               [(input_pdf_path, pdf_path, idx) for idx, pdf_path in enumerate(pdf_files)])
    
    for pdf_path, similarity in results:
        if similarity > max_similarity:
            max_similarity = similarity
            most_similar_pdf = os.path.basename(pdf_path)
    
    return most_similar_pdf, max_similarity

def find_most_similar_pdf_by_image(input_image_path, folder_path, update_progress):
    """Finds the PDF with the most similar image to the input image."""
    input_image = Image.open(input_image_path)
    if input_image.mode != 'RGB':
        input_image = input_image.convert('RGB')
    input_image_features = extract_image_features(input_image)
    
    most_similar_pdf = None
    highest_similarity = -1

    # Use os.walk to find all PDFs in the folder and its subfolders
    pdf_files = []
    for dirpath, _, filenames in os.walk(folder_path):
        for filename in filenames:
            if filename.endswith(".pdf"):
                pdf_files.append(os.path.join(dirpath, filename))

    total_pdfs = len(pdf_files)

    for idx, pdf_file in enumerate(pdf_files):
        pdf_path = os.path.join(folder_path, pdf_file)
        extracted_images = extract_images_from_pdf(pdf_path)
        for img in extracted_images:
            img_features = extract_image_features(img)
            similarity = calculate_image_similarity(input_image_features, img_features)
            keypoint_match_count = match_keypoints(input_image, img)
            combined_score = similarity + (keypoint_match_count / 1000)
            if combined_score > highest_similarity:
                highest_similarity = combined_score
                most_similar_pdf = pdf_file
        
        # Update progress
        progress = (idx + 1) / total_pdfs * 100
        update_progress(progress)

    return most_similar_pdf, highest_similarity

def select_image_file():
    file_path = filedialog.askopenfilename(
        title="Select Image File",
        filetypes=[("Image Files", "*.png;*.jpg;*.jpeg;*.bmp;*.tiff")]
    )
    if file_path:
        image_file_entry.delete(0, tk.END)
        image_file_entry.insert(0, file_path)
        display_image(file_path)

def select_pdf_file():
    file_path = filedialog.askopenfilename(
        title="Select PDF File",
        filetypes=[("PDF Files", "*.pdf")]
    )
    if file_path:
        pdf_file_entry.delete(0, tk.END)
        pdf_file_entry.insert(0, file_path)

def select_folder():
    folder_path = filedialog.askdirectory(title="Select Folder Containing PDFs")
    if folder_path:
        folder_entry.delete(0, tk.END)
        folder_entry.insert(0, folder_path)

def display_image(image_path):
    """Display the selected image on the UI."""
    image = Image.open(image_path)
    image.thumbnail((200, 200))  # Resize image to fit within 200x200 pixels
    photo = ImageTk.PhotoImage(image)
    image_label.config(image=photo)
    image_label.image = photo  # Keep a reference to avoid garbage collection

def update_progress_bar(progress):
    """Update the progress bar and status label."""
    progress_bar['value'] = progress
    status_label.config(text=f"Processing: {progress:.2f}%")

def start_image_similarity_search():
    input_image_path = image_file_entry.get()
    folder_path = folder_entry.get()
    if not input_image_path or not folder_path:
        messagebox.showwarning("Input Error", "Please provide both the image file and folder paths.")
        return
    
    def run_search():
        status_label.config(text="Processing image similarity...")
        progress_bar.config(mode="determinate")
        progress_bar['value'] = 0
        result, similarity = find_most_similar_pdf_by_image(input_image_path, folder_path, update_progress_bar)
        result_label.config(text=f"Most similar PDF on the basis of input image: {result} (Similarity: {similarity:.2f})")
        status_label.config(text="Search Complete!")
    
    Thread(target=run_search).start()

def start_text_similarity_search():
    input_pdf_path = pdf_file_entry.get()
    folder_path = folder_entry.get()
    if not input_pdf_path or not folder_path:
        messagebox.showwarning("Input Error", "Please provide both the PDF file and folder paths.")
        return
    
    def run_search():
        status_label.config(text="Processing text similarity...")
        progress_bar.config(mode="determinate")
        progress_bar['value'] = 0
        result, similarity = find_most_similar_pdf_by_text(input_pdf_path, folder_path, update_progress_bar)
        result_label.config(text=f"Most similar PDF on the basis of input text: {result} (Similarity: {similarity:.2f})")
        status_label.config(text="Search Complete!")
    
    Thread(target=run_search).start()

# Create the main Tkinter window
root = tk.Tk()
root.title("PDF Similarity Search")
root.geometry("600x500")
root.resizable(True, True)

# Image similarity section
image_file_label = ttk.Label(root, text="Select Image File:")
image_file_label.grid(row=0, column=0, padx=10, pady=10, sticky="e")
image_file_entry = ttk.Entry(root, width=40)
image_file_entry.grid(row=0, column=1, padx=10, pady=10)
image_file_button = ttk.Button(root, text="Browse", command=select_image_file)
image_file_button.grid(row=0, column=2, padx=10, pady=10)

# Display image
image_label = ttk.Label(root)
image_label.grid(row=1, column=0, columnspan=3, padx=10, pady=10)

# Text similarity section
pdf_file_label = ttk.Label(root, text="Select PDF File:")
pdf_file_label.grid(row=2, column=0, padx=10, pady=10, sticky="e")
pdf_file_entry = ttk.Entry(root, width=40)
pdf_file_entry.grid(row=2, column=1, padx=10, pady=10)
pdf_file_button = ttk.Button(root, text="Browse", command=select_pdf_file)
pdf_file_button.grid(row=2, column=2, padx=10, pady=10)

# Folder selection section
folder_label = ttk.Label(root, text="Select Folder Containing PDFs:")
folder_label.grid(row=3, column=0, padx=10, pady=10, sticky="e")
folder_entry = ttk.Entry(root, width=40)
folder_entry.grid(row=3, column=1, padx=10, pady=10)
folder_button = ttk.Button(root, text="Browse", command=select_folder)
folder_button.grid(row=3, column=2, padx=10, pady=10)

# Buttons to start similarity searches
image_similarity_button = ttk.Button(root, text="Find Most Similar PDF by Image", command=start_image_similarity_search)
image_similarity_button.grid(row=4, column=0, columnspan=3, padx=10, pady=10)

text_similarity_button = ttk.Button(root, text="Find Most Similar PDF by Text", command=start_text_similarity_search)
text_similarity_button.grid(row=5, column=0, columnspan=3, padx=10, pady=10)

# Result label
result_label = ttk.Label(root, text="")
result_label.grid(row=6, column=0, columnspan=3, padx=10, pady=10)

# Progress bar
progress_bar = ttk.Progressbar(root, orient="horizontal", mode="determinate", length=280)
progress_bar.grid(row=7, column=0, columnspan=3, padx=10, pady=10)

# Status label
status_label = ttk.Label(root, text="", wraplength=400)
status_label.grid(row=8, column=0, columnspan=3, padx=10, pady=10)

# Run the Tkinter main loop
root.mainloop()

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# THE PROJECT IS COMPLETE

###### TnC: Consistent improvements are being made.