## What Product Finder version 1 does:

###### Creates a graphical user interface (GUI) for finding the PDF most similar to a given image or PDF, using text-content similarity and image-content similarity, each evaluated separately.

### Version 2 aims to combine the two approaches.

In [2]:
import os
import io
import tkinter as tk
from tkinter import filedialog, messagebox
from tkinter import ttk
from threading import Thread
from PIL import Image, ImageTk
import numpy as np
import torch
import torchvision.transforms as transforms
from torchvision import models
from sklearn.metrics.pairwise import cosine_similarity
from skimage.color import rgb2gray
import cv2
from sentence_transformers import SentenceTransformer, util
from concurrent.futures import ThreadPoolExecutor
from nltk import ngrams
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import nltk
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import fitz  # PyMuPDF
import re
import multiprocessing

# Fetch the NLTK resources used by the tokenizer and lemmatizer imports above
for _nltk_resource in ('wordnet', 'omw-1.4', 'punkt'):
    nltk.download(_nltk_resource)

# Sentence-BERT encoder shared by every text-embedding call in this module
model = SentenceTransformer('paraphrase-mpnet-base-v2')

# ResNet-101 with pretrained weights; eval() returns the module itself,
# already switched to evaluation mode
resnet_model = models.resnet101(pretrained=True).eval()

# Per-channel statistics handed to transforms.Normalize below
_NORMALIZE_MEAN = [0.485, 0.456, 0.406]
_NORMALIZE_STD = [0.229, 0.224, 0.225]

# Image pipeline: fixed 224x224 resize -> tensor -> per-channel normalization
preprocess = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=_NORMALIZE_MEAN, std=_NORMALIZE_STD)
])

# Worker pool with one process per CPU core.
# NOTE(review): no function visible in this section uses `pool` -- confirm it
# is needed before keeping a process pool alive at module level.
pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())

def extract_images_from_pdf(pdf_path):
    """Extract the images embedded in a PDF as RGB PIL images.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        A list of PIL images in RGB mode, ordered by page and, within a page,
        by the order reported by ``page.get_images(full=True)``. An image
        referenced on several pages appears once per referencing page.
    """
    images = []

    # The context manager closes the document even when extraction or
    # decoding of an image raises part-way through.
    with fitz.open(pdf_path) as doc:
        for page_num in range(doc.page_count):
            page = doc.load_page(page_num)

            for img in page.get_images(full=True):
                xref = img[0]  # first field of each image record is its xref
                image_bytes = doc.extract_image(xref)["image"]
                image = Image.open(io.BytesIO(image_bytes))

                # Normalize every image to RGB so downstream code sees 3 channels
                if image.mode != 'RGB':
                    image = image.convert('RGB')

                images.append(image)

    return images

def extract_image_features(image):
    """Run an image through the module's ResNet-101 and return a flat vector.

    The image goes through the module-level ``preprocess`` pipeline, is given
    a leading batch dimension, and is fed to ``resnet_model`` with gradient
    tracking disabled. The model output is returned as a flattened NumPy array.
    """
    batch = preprocess(image).unsqueeze(0)  # batch of one

    with torch.no_grad():  # inference only, no autograd bookkeeping
        output = resnet_model(batch)

    flat_features = output.numpy().flatten()
    return flat_features

def calculate_image_similarity(features1, features2):
    """Return the cosine similarity of two image feature vectors as a scalar."""
    # cosine_similarity operates on 2-D inputs, so each vector is wrapped as a
    # one-row matrix; the single entry of the 1x1 result is the score.
    pairwise = cosine_similarity([features1], [features2])
    return pairwise[0][0]

def _to_gray_uint8(image):
    """Return ``image`` as a single-channel array, keeping its 8-bit dtype."""
    pixels = np.asarray(image)
    if pixels.ndim == 3:
        pixels = cv2.cvtColor(pixels, cv2.COLOR_RGB2GRAY)
    return pixels


def match_keypoints(image1, image2):
    """Count cross-checked SIFT descriptor matches between two images.

    Args:
        image1: First image; an 8-bit RGB PIL image (or equivalent array).
        image2: Second image, same expectations as ``image1``.

    Returns:
        The number of brute-force L2 matches that survive cross-checking, or
        0 when either image yields no SIFT descriptors.
    """
    sift = cv2.SIFT_create()

    # OpenCV's SIFT only accepts 8-bit images. skimage's rgb2gray returns
    # floats in [0, 1], which detectAndCompute rejects, so the grayscale
    # conversion is done with cv2 to stay in uint8.
    image1_gray = _to_gray_uint8(image1)
    image2_gray = _to_gray_uint8(image2)

    _keypoints1, descriptors1 = sift.detectAndCompute(image1_gray, None)
    _keypoints2, descriptors2 = sift.detectAndCompute(image2_gray, None)

    # A featureless image produces no descriptors; nothing can be matched.
    if descriptors1 is None or descriptors2 is None:
        return 0

    # crossCheck=True keeps only pairs that are each other's best match
    bf = cv2.BFMatcher(cv2.NORM_L2, crossCheck=True)
    matches = bf.match(descriptors1, descriptors2)
    return len(matches)

def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order."""
    with fitz.open(pdf_path) as doc:
        page_texts = [doc[index].get_text() for index in range(doc.page_count)]
    return "".join(page_texts)

def preprocess_text(text):
    """Normalize raw text for embedding.

    The text is lowercased, stripped of every character that is neither a
    word character nor whitespace, whitespace-collapsed, tokenized, filtered
    against scikit-learn's English stop-word list, and lemmatized with
    WordNet.

    Args:
        text: Raw input string.

    Returns:
        The lemmatized tokens joined by single spaces, without leading or
        trailing whitespace.
    """
    # Instantiated locally so the function does not depend on a module-level
    # `lemmatizer` global; construction is cheap.
    lemmatizer = WordNetLemmatizer()

    text = re.sub(r'[^\w\s]', '', text.lower())
    text = re.sub(r'\s+', ' ', text)
    tokens = [word for word in word_tokenize(text) if word not in ENGLISH_STOP_WORDS]
    lemmatized_text = ' '.join(lemmatizer.lemmatize(token) for token in tokens)
    return lemmatized_text.strip()

def generate_ngrams(text, n=3):
    """Return the word n-grams of ``text`` as space-joined strings.

    ``n`` is the n-gram length; the default of 3 yields trigrams.
    """
    words = word_tokenize(text)
    return [' '.join(gram) for gram in ngrams(words, n)]

def compute_embedding(text):
    """Encode ``text`` with the module's Sentence-BERT model.

    The text is split on the literal separator '. ' and every resulting piece
    is encoded; the encodings are returned as a tensor.
    """
    pieces = text.split('. ')
    return model.encode(pieces, convert_to_tensor=True)

def compute_similarity(embedding1, embedding2):
    """Return the largest pairwise cosine similarity between two embedding sets."""
    similarity_matrix = util.cos_sim(embedding1, embedding2)
    best_score = similarity_matrix.max()
    return best_score.item()  # plain Python number instead of a tensor

def process_pdf(file_info, input_embedding, input_image_features, update_progress, total_pdfs, weight_text=0.7, weight_image=0.3, input_image=None):
    """Score one candidate PDF against the query text and query image.

    Args:
        file_info: Tuple ``(input_pdf_path, pdf_path, idx)``; only the
            candidate ``pdf_path`` and its index ``idx`` are used here.
        input_embedding: Embedding of the query text, as produced by
            ``compute_embedding``.
        input_image_features: Feature vector of the query image, as produced
            by ``extract_image_features``.
        update_progress: Callable receiving a completion percentage.
        total_pdfs: Number of candidate PDFs, used for the percentage.
        weight_text: Weight of the text similarity in the combined score.
        weight_image: Weight of the image similarity in the combined score.
        input_image: Optional query image. When given, SIFT keypoint matches
            between it and each candidate image add a small bonus to the
            image score; when omitted, no keypoint bonus is applied.

    Returns:
        Tuple ``(pdf_path, combined_similarity)``.
    """
    _input_pdf_path, pdf_path, idx = file_info
    folder_pdf_text = preprocess_text(extract_text_from_pdf(pdf_path))
    trigrams = generate_ngrams(folder_pdf_text, 3)
    enriched_text = ' '.join([folder_pdf_text] + trigrams)
    folder_pdf_embedding = compute_embedding(enriched_text)

    # Text similarity
    text_similarity = compute_similarity(input_embedding, folder_pdf_embedding)

    # Image similarity: best score over all images embedded in the candidate.
    # A PDF without images keeps the floor value of -1.
    highest_image_similarity = -1

    for img in extract_images_from_pdf(pdf_path):
        img_features = extract_image_features(img)
        similarity = calculate_image_similarity(input_image_features, img_features)

        # Keypoints must be matched against the QUERY image; matching a
        # candidate image against itself only rewards images with many
        # keypoints and says nothing about similarity.
        if input_image is not None:
            keypoint_match_count = match_keypoints(input_image, img)
        else:
            keypoint_match_count = 0
        combined_image_score = similarity + (keypoint_match_count / 1000)

        if combined_image_score > highest_image_similarity:
            highest_image_similarity = combined_image_score

    # Combine weighted scores
    combined_similarity = (text_similarity * weight_text) + (highest_image_similarity * weight_image)

    # Report progress based on this PDF's position in the candidate list
    progress = (idx + 1) / total_pdfs * 100
    update_progress(progress)

    return (pdf_path, combined_similarity)


def find_most_similar_pdf(input_pdf_path, input_image_path, folder_path, update_progress, weight_text=0.7, weight_image=0.3):
    """Find the PDF in a folder that best matches a query PDF and query image.

    Args:
        input_pdf_path: Path of the query PDF (source of the query text).
        input_image_path: Path of the query image.
        folder_path: Folder whose PDF files are the candidates.
        update_progress: Callable receiving a completion percentage; it is
            invoked from the worker threads.
        weight_text: Weight of the text similarity in the combined score.
        weight_image: Weight of the image similarity in the combined score.

    Returns:
        Tuple ``(pdf_path, combined_similarity)`` of the best candidate.

    Raises:
        ValueError: If ``folder_path`` contains no PDF files.
    """
    input_text = preprocess_text(extract_text_from_pdf(input_pdf_path))
    trigrams = generate_ngrams(input_text, 3)
    enriched_text = ' '.join([input_text] + trigrams)
    input_embedding = compute_embedding(enriched_text)

    # convert() returns an independent, fully loaded image, so the underlying
    # file handle can be released as soon as the block exits.
    with Image.open(input_image_path) as opened_image:
        input_image = opened_image.convert('RGB')
    input_image_features = extract_image_features(input_image)

    # Case-insensitive extension check so files named e.g. "X.PDF" are included
    folder_pdfs = [
        os.path.join(folder_path, name)
        for name in os.listdir(folder_path)
        if name.lower().endswith('.pdf')
    ]
    if not folder_pdfs:
        raise ValueError("No PDF files found in folder: {}".format(folder_path))
    total_pdfs = len(folder_pdfs)

    with ThreadPoolExecutor(max_workers=5) as executor:
        futures = [
            executor.submit(
                process_pdf,
                (input_pdf_path, pdf, idx),
                input_embedding,
                input_image_features,
                update_progress,
                total_pdfs,
                weight_text,   # forward the caller's weights instead of
                weight_image,  # silently falling back to the defaults
                input_image,
            )
            for idx, pdf in enumerate(folder_pdfs)
        ]
        results = [future.result() for future in futures]

    # Find PDF with the highest combined similarity
    most_similar_pdf = max(results, key=lambda x: x[1])

    return most_similar_pdf

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
