## Finetuning the Image Similarity Part.

This is the initial Image Similarity finder code. It is apparently not in its optimum condition.

In [1]:
import os
import io
import fitz  # PyMuPDF
from PIL import Image
import numpy as np
import torch
import torchvision.transforms as transforms
from torchvision import models
from sklearn.metrics.pairwise import cosine_similarity
from skimage.color import rgb2gray
import cv2

# Load pre-trained deep learning model (ResNet-50)
resnet_model = models.resnet50(pretrained=True)
resnet_model.eval()  # Set model to evaluation mode

# Image preprocessing transformations for ResNet
preprocess = transforms.Compose([
    transforms.Resize((224, 224)),  # Resize image to 224x224 as expected by ResNet
    transforms.ToTensor(),          # Convert image to PyTorch tensor
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  # Normalize using ImageNet standards
])

def extract_images_from_pdf(pdf_path):
    """Extracts images from a PDF file."""
    images = []
    doc = fitz.open(pdf_path)

    for page_num in range(doc.page_count):
        page = doc.load_page(page_num)
        image_list = page.get_images(full=True)

        for img_index, img in enumerate(image_list):
            xref = img[0]
            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]
            image = Image.open(io.BytesIO(image_bytes))

            # Convert the image to RGB format
            if image.mode != 'RGB':
                image = image.convert('RGB')
            
            images.append(image)

    doc.close()
    return images

def extract_image_features(image):
    """Extracts deep learning features from an image using ResNet."""
    # Convert PIL Image to PyTorch Tensor
    input_tensor = preprocess(image)
    input_batch = input_tensor.unsqueeze(0)  # Add batch dimension

    with torch.no_grad():
        # Extract features using ResNet
        features = resnet_model(input_batch)
    
    return features.numpy().flatten()

def calculate_image_similarity(features1, features2):
    """Calculates the similarity between two image feature vectors using cosine similarity."""
    similarity = cosine_similarity([features1], [features2])[0][0]
    return similarity

def preprocess_image(image):
    """Preprocess the image for SIFT."""
    image_np = np.array(image)
    
    # Convert to grayscale
    gray_image = rgb2gray(image_np)
    
    # Convert to uint8 format
    gray_image = (gray_image * 255).astype(np.uint8)

    return gray_image

def match_keypoints(image1, image2):
    """Match keypoints between two images using SIFT."""
    # Initialize SIFT detector
    sift = cv2.SIFT_create()
    
    # Preprocess images
    image1_gray = preprocess_image(image1)
    image2_gray = preprocess_image(image2)
    
    # Detect keypoints and compute descriptors
    keypoints1, descriptors1 = sift.detectAndCompute(image1_gray, None)
    keypoints2, descriptors2 = sift.detectAndCompute(image2_gray, None)
    
    # If no descriptors are found, return 0 matches
    if descriptors1 is None or descriptors2 is None:
        return 0
    
    # Use BFMatcher to match descriptors
    bf = cv2.BFMatcher(cv2.NORM_L2, crossCheck=True)
    matches = bf.match(descriptors1, descriptors2)
    
    return len(matches)

def find_most_similar_pdf(input_image_path, folder_path):
    """Finds the PDF with the most similar image to the input image."""
    input_image = Image.open(input_image_path)
    
    # Convert input image to RGB if necessary
    if input_image.mode != 'RGB':
        input_image = input_image.convert('RGB')

    input_image_features = extract_image_features(input_image)
    most_similar_pdf = None
    highest_similarity = -1

    for pdf_file in os.listdir(folder_path):
        if pdf_file.endswith(".pdf"):
            pdf_path = os.path.join(folder_path, pdf_file)
            extracted_images = extract_images_from_pdf(pdf_path)

            for img in extracted_images:
                img_features = extract_image_features(img)
                similarity = calculate_image_similarity(input_image_features, img_features)
                
                # Use keypoint matching with SIFT
                keypoint_match_count = match_keypoints(input_image, img)

                # Combine similarity and keypoint matches
                combined_score = similarity + (keypoint_match_count / 1000)  # Normalize keypoint matches
                if combined_score > highest_similarity:
                    highest_similarity = combined_score
                    most_similar_pdf = pdf_file

    return most_similar_pdf, highest_similarity

# Usage example
input_image_path = "D://Cross Search Automation//Previous Cross//Vendor Lights//Wall_Pack_Two.jpg"  # Provide the path to the input image
folder_path = "D://Cross Search Automation//Previous Cross//IKIO Lights"  # Provide the path to the folder with PDFs

most_similar_pdf, similarity_score = find_most_similar_pdf(input_image_path, folder_path)

if most_similar_pdf:
    print(f"The most similar PDF is: {most_similar_pdf} with a similarity score of: {similarity_score:.2f}")
else:
    print("No similar images found in the PDFs.")



The most similar PDF is: Rodella_UFO_Black_150W_3500K_100-277 V_TDS.pdf with a similarity score of: 1.29


## ORB instead of SIFT

In [4]:
import os
import io
import fitz  # PyMuPDF
from PIL import Image
import numpy as np
import torch
import torchvision.transforms as transforms
from torchvision import models
from sklearn.metrics.pairwise import cosine_similarity
from skimage.color import rgb2gray
import cv2
from concurrent.futures import ThreadPoolExecutor

# Load pre-trained deep learning model (ResNet-50)
resnet_model = models.resnet50(pretrained=True)
resnet_model.eval()  # Set model to evaluation mode

# Image preprocessing transformations for ResNet
preprocess = transforms.Compose([
    transforms.Resize((224, 224)),  # Resize image to 224x224 as expected by ResNet
    transforms.ToTensor(),          # Convert image to PyTorch tensor
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  # Normalize using ImageNet standards
])

def extract_images_from_pdf(pdf_path):
    """Extracts images from a PDF file."""
    images = []
    doc = fitz.open(pdf_path)

    for page_num in range(doc.page_count):
        page = doc.load_page(page_num)
        image_list = page.get_images(full=True)

        for img_index, img in enumerate(image_list):
            xref = img[0]
            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]
            image = Image.open(io.BytesIO(image_bytes))

            # Convert the image to RGB format
            if image.mode != 'RGB':
                image = image.convert('RGB')
            
            images.append(image)

    doc.close()
    return images

def extract_image_features(image):
    """Extracts deep learning features from an image using ResNet."""
    # Convert PIL Image to PyTorch Tensor
    input_tensor = preprocess(image)
    input_batch = input_tensor.unsqueeze(0)  # Add batch dimension

    with torch.no_grad():
        # Extract features using ResNet
        features = resnet_model(input_batch)
    
    return features.numpy().flatten()

def extract_image_features_batch(images):
    """Extracts deep learning features from a batch of images using ResNet."""
    input_tensors = torch.stack([preprocess(image) for image in images])  # Create a batch of images
    with torch.no_grad():
        features_batch = resnet_model(input_tensors)
    return features_batch.numpy()

def calculate_image_similarity(features1, features2):
    """Calculates the similarity between two image feature vectors using cosine similarity."""
    similarity = cosine_similarity([features1], [features2])[0][0]
    return similarity

def preprocess_image(image):
    """Preprocess the image for ORB."""
    image_np = np.array(image)
    
    # Convert to grayscale
    gray_image = rgb2gray(image_np)
    
    # Convert to uint8 format
    gray_image = (gray_image * 255).astype(np.uint8)

    return gray_image

def match_keypoints(image1, image2):
    """Match keypoints between two images using ORB."""
    # Initialize ORB detector
    orb = cv2.ORB_create()
    
    # Preprocess images
    image1_gray = preprocess_image(image1)
    image2_gray = preprocess_image(image2)
    
    # Detect keypoints and compute descriptors
    keypoints1, descriptors1 = orb.detectAndCompute(image1_gray, None)
    keypoints2, descriptors2 = orb.detectAndCompute(image2_gray, None)
    
    # If no descriptors are found, return 0 matches
    if descriptors1 is None or descriptors2 is None:
        return 0
    
    # Use BFMatcher with Hamming distance (since ORB uses binary descriptors)
    bf = cv2.BFMatcher(cv2.NORM_HAMMING, crossCheck=True)
    matches = bf.match(descriptors1, descriptors2)
    
    return len(matches)

def process_pdf_in_parallel(pdf_file, input_image_features, input_image, folder_path):
    """Processes a single PDF file to find the most similar image."""
    pdf_path = os.path.join(folder_path, pdf_file)
    extracted_images = extract_images_from_pdf(pdf_path)

    for img in extracted_images:
        img_features = extract_image_features(img)
        similarity = calculate_image_similarity(input_image_features, img_features)

        # Use keypoint matching with ORB
        keypoint_match_count = match_keypoints(input_image, img)

        # Combine similarity and keypoint matches
        combined_score = similarity + (keypoint_match_count / 1000)  # Normalize keypoint matches
        return pdf_file, combined_score
    
    return pdf_file, -1  # Default score if no image found

def find_most_similar_pdf(input_image_path, folder_path):
    """Finds the PDF with the most similar image to the input image."""
    input_image = Image.open(input_image_path)
    
    # Convert input image to RGB if necessary
    if input_image.mode != 'RGB':
        input_image = input_image.convert('RGB')

    input_image_features = extract_image_features(input_image)
    most_similar_pdf = None
    highest_similarity = -1

    with ThreadPoolExecutor() as executor:
        futures = [executor.submit(process_pdf_in_parallel, pdf_file, input_image_features, input_image, folder_path)
                   for pdf_file in os.listdir(folder_path) if pdf_file.endswith(".pdf")]
        results = [f.result() for f in futures]

    # Get the PDF with the highest combined score
    most_similar_pdf, highest_similarity = max(results, key=lambda x: x[1])

    return most_similar_pdf, highest_similarity

# Usage example
input_image_path = "D://Cross Search Automation//Previous Cross//Vendor Lights//81YkGfZe7PL._AC_UF1000,1000_QL80_.jpg"  # Provide the path to the input image
folder_path = "D://Cross Search Automation//Previous Cross//IKIO Lights"  # Provide the path to the folder with PDFs

most_similar_pdf, similarity_score = find_most_similar_pdf(input_image_path, folder_path)

if most_similar_pdf:
    print(f"The most similar PDF is: {most_similar_pdf} with a similarity score of: {similarity_score:.2f}")
else:
    print("No similar images found in the PDFs.")



The most similar PDF is: Delphi_WP_SC_21W28W34.98W_DLC_TDS.pdf with a similarity score of: 0.91


## AKAZE instead of SIFT or ORB

In [5]:
import os
import io
import fitz  # PyMuPDF
from PIL import Image
import numpy as np
import torch
import torchvision.transforms as transforms
from torchvision import models
from sklearn.metrics.pairwise import cosine_similarity
from skimage.color import rgb2gray
import cv2
from concurrent.futures import ThreadPoolExecutor

# Load pre-trained deep learning model (ResNet-50)
resnet_model = models.resnet50(pretrained=True)
resnet_model.eval()  # Set model to evaluation mode

# Image preprocessing transformations for ResNet
preprocess = transforms.Compose([
    transforms.Resize((224, 224)),  # Resize image to 224x224 as expected by ResNet
    transforms.ToTensor(),          # Convert image to PyTorch tensor
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  # Normalize using ImageNet standards
])

def extract_images_from_pdf(pdf_path):
    """Extracts images from a PDF file."""
    images = []
    doc = fitz.open(pdf_path)

    for page_num in range(doc.page_count):
        page = doc.load_page(page_num)
        image_list = page.get_images(full=True)

        for img_index, img in enumerate(image_list):
            xref = img[0]
            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]
            image = Image.open(io.BytesIO(image_bytes))

            # Convert the image to RGB format
            if image.mode != 'RGB':
                image = image.convert('RGB')
            
            images.append(image)

    doc.close()
    return images

def extract_image_features(image):
    """Extracts deep learning features from an image using ResNet."""
    # Convert PIL Image to PyTorch Tensor
    input_tensor = preprocess(image)
    input_batch = input_tensor.unsqueeze(0)  # Add batch dimension

    with torch.no_grad():
        # Extract features using ResNet
        features = resnet_model(input_batch)
    
    return features.numpy().flatten()

def extract_image_features_batch(images):
    """Extracts deep learning features from a batch of images using ResNet."""
    input_tensors = torch.stack([preprocess(image) for image in images])  # Create a batch of images
    with torch.no_grad():
        features_batch = resnet_model(input_tensors)
    return features_batch.numpy()

def calculate_image_similarity(features1, features2):
    """Calculates the similarity between two image feature vectors using cosine similarity."""
    similarity = cosine_similarity([features1], [features2])[0][0]
    return similarity

def preprocess_image(image):
    """Preprocess the image for AKAZE."""
    image_np = np.array(image)
    
    # Convert to grayscale
    gray_image = rgb2gray(image_np)
    
    # Convert to uint8 format
    gray_image = (gray_image * 255).astype(np.uint8)

    return gray_image

def match_keypoints(image1, image2):
    """Match keypoints between two images using AKAZE."""
    # Initialize AKAZE detector
    akaze = cv2.AKAZE_create()
    
    # Preprocess images
    image1_gray = preprocess_image(image1)
    image2_gray = preprocess_image(image2)
    
    # Detect keypoints and compute descriptors
    keypoints1, descriptors1 = akaze.detectAndCompute(image1_gray, None)
    keypoints2, descriptors2 = akaze.detectAndCompute(image2_gray, None)
    
    # If no descriptors are found, return 0 matches
    if descriptors1 is None or descriptors2 is None:
        return 0
    
    # Use BFMatcher to match descriptors (Hamming norm is used for binary descriptors)
    bf = cv2.BFMatcher(cv2.NORM_HAMMING, crossCheck=True)
    matches = bf.match(descriptors1, descriptors2)
    
    return len(matches)

def process_pdf_in_parallel(pdf_file, input_image_features, input_image, folder_path):
    """Processes a single PDF file to find the most similar image."""
    pdf_path = os.path.join(folder_path, pdf_file)
    extracted_images = extract_images_from_pdf(pdf_path)

    for img in extracted_images:
        img_features = extract_image_features(img)
        similarity = calculate_image_similarity(input_image_features, img_features)

        # Use keypoint matching with AKAZE
        keypoint_match_count = match_keypoints(input_image, img)

        # Combine similarity and keypoint matches
        combined_score = similarity + (keypoint_match_count / 1000)  # Normalize keypoint matches
        return pdf_file, combined_score
    
    return pdf_file, -1  # Default score if no image found

def find_most_similar_pdf(input_image_path, folder_path):
    """Finds the PDF with the most similar image to the input image."""
    input_image = Image.open(input_image_path)
    
    # Convert input image to RGB if necessary
    if input_image.mode != 'RGB':
        input_image = input_image.convert('RGB')

    input_image_features = extract_image_features(input_image)
    most_similar_pdf = None
    highest_similarity = -1

    with ThreadPoolExecutor() as executor:
        futures = [executor.submit(process_pdf_in_parallel, pdf_file, input_image_features, input_image, folder_path)
                   for pdf_file in os.listdir(folder_path) if pdf_file.endswith(".pdf")]
        results = [f.result() for f in futures]

    # Get the PDF with the highest combined score
    most_similar_pdf, highest_similarity = max(results, key=lambda x: x[1])

    return most_similar_pdf, highest_similarity

# Usage example
input_image_path = "D://Cross Search Automation//Previous Cross//Vendor Lights//81YkGfZe7PL._AC_UF1000,1000_QL80_.jpg"  # Provide the path to the input image
folder_path = "D://Cross Search Automation//Previous Cross//IKIO Lights"  # Provide the path to the folder with PDFs

most_similar_pdf, similarity_score = find_most_similar_pdf(input_image_path, folder_path)

if most_similar_pdf:
    print(f"The most similar PDF is: {most_similar_pdf} with a similarity score of: {similarity_score:.2f}")
else:
    print("No similar images found in the PDFs.")



The most similar PDF is: Rodella_UFO_Black_150W_3500K_100-277 V_TDS.pdf with a similarity score of: 1.07


## BRISK

In [7]:
import os
import io
import fitz  # PyMuPDF
from PIL import Image
import numpy as np
import torch
import torchvision.transforms as transforms
from torchvision import models
from sklearn.metrics.pairwise import cosine_similarity
from skimage.color import rgb2gray
import cv2
from concurrent.futures import ThreadPoolExecutor

# Load pre-trained deep learning model (ResNet-50)
resnet_model = models.resnet50(pretrained=True)
resnet_model.eval()  # Set model to evaluation mode

# Image preprocessing transformations for ResNet
preprocess = transforms.Compose([
    transforms.Resize((224, 224)),  # Resize image to 224x224 as expected by ResNet
    transforms.ToTensor(),          # Convert image to PyTorch tensor
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  # Normalize using ImageNet standards
])

def extract_images_from_pdf(pdf_path):
    """Extracts images from a PDF file."""
    images = []
    doc = fitz.open(pdf_path)

    for page_num in range(doc.page_count):
        page = doc.load_page(page_num)
        image_list = page.get_images(full=True)

        for img_index, img in enumerate(image_list):
            xref = img[0]
            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]
            image = Image.open(io.BytesIO(image_bytes))

            # Convert the image to RGB format
            if image.mode != 'RGB':
                image = image.convert('RGB')
            
            images.append(image)

    doc.close()
    return images

def extract_image_features(image):
    """Extracts deep learning features from an image using ResNet."""
    # Convert PIL Image to PyTorch Tensor
    input_tensor = preprocess(image)
    input_batch = input_tensor.unsqueeze(0)  # Add batch dimension

    with torch.no_grad():
        # Extract features using ResNet
        features = resnet_model(input_batch)
    
    return features.numpy().flatten()

def extract_image_features_batch(images):
    """Extracts deep learning features from a batch of images using ResNet."""
    input_tensors = torch.stack([preprocess(image) for image in images])  # Create a batch of images
    with torch.no_grad():
        features_batch = resnet_model(input_tensors)
    return features_batch.numpy()

def calculate_image_similarity(features1, features2):
    """Calculates the similarity between two image feature vectors using cosine similarity."""
    similarity = cosine_similarity([features1], [features2])[0][0]
    return similarity

def preprocess_image(image):
    """Preprocess the image for BRISK."""
    image_np = np.array(image)
    
    # Convert to grayscale
    gray_image = rgb2gray(image_np)
    
    # Convert to uint8 format
    gray_image = (gray_image * 255).astype(np.uint8)

    return gray_image

def match_keypoints(image1, image2):
    """Match keypoints between two images using BRISK."""
    # Initialize BRISK detector
    brisk = cv2.BRISK_create()
    
    # Preprocess images
    image1_gray = preprocess_image(image1)
    image2_gray = preprocess_image(image2)
    
    # Detect keypoints and compute descriptors
    keypoints1, descriptors1 = brisk.detectAndCompute(image1_gray, None)
    keypoints2, descriptors2 = brisk.detectAndCompute(image2_gray, None)
    
    # If no descriptors are found, return 0 matches
    if descriptors1 is None or descriptors2 is None:
        return 0
    
    # Use BFMatcher to match descriptors (Hamming norm is used for binary descriptors)
    bf = cv2.BFMatcher(cv2.NORM_HAMMING, crossCheck=True)
    matches = bf.match(descriptors1, descriptors2)
    
    return len(matches)

def process_pdf_in_parallel(pdf_file, input_image_features, input_image, folder_path):
    """Processes a single PDF file to find the most similar image."""
    pdf_path = os.path.join(folder_path, pdf_file)
    extracted_images = extract_images_from_pdf(pdf_path)

    for img in extracted_images:
        img_features = extract_image_features(img)
        similarity = calculate_image_similarity(input_image_features, img_features)

        # Use keypoint matching with BRISK
        keypoint_match_count = match_keypoints(input_image, img)

        # Combine similarity and keypoint matches
        combined_score = similarity + (keypoint_match_count / 1000)  # Normalize keypoint matches
        return pdf_file, combined_score
    
    return pdf_file, -1  # Default score if no image found

def find_most_similar_pdf(input_image_path, folder_path):
    """Finds the PDF with the most similar image to the input image."""
    input_image = Image.open(input_image_path)
    
    # Convert input image to RGB if necessary
    if input_image.mode != 'RGB':
        input_image = input_image.convert('RGB')

    input_image_features = extract_image_features(input_image)
    most_similar_pdf = None
    highest_similarity = -1

    with ThreadPoolExecutor() as executor:
        futures = [executor.submit(process_pdf_in_parallel, pdf_file, input_image_features, input_image, folder_path)
                   for pdf_file in os.listdir(folder_path) if pdf_file.endswith(".pdf")]
        results = [f.result() for f in futures]

    # Get the PDF with the highest combined score
    most_similar_pdf, highest_similarity = max(results, key=lambda x: x[1])

    return most_similar_pdf, highest_similarity

# Usage example
input_image_path = "D://Cross Search Automation//Previous Cross//Vendor Lights//81YkGfZe7PL._AC_UF1000,1000_QL80_.jpg"  # Provide the path to the input image
folder_path = "D://Cross Search Automation//Previous Cross//IKIO Lights"  # Provide the path to the folder with PDFs

most_similar_pdf, similarity_score = find_most_similar_pdf(input_image_path, folder_path)

if most_similar_pdf:
    print(f"The most similar PDF is: {most_similar_pdf} with a similarity score of: {similarity_score:.2f}")
else:
    print("No similar images found in the PDFs.")



The most similar PDF is: Forza Explosion Proof Square High Bay.pdf with a similarity score of: 1.80


## FAST and FREAK

In [8]:
import os
import io
import fitz  # PyMuPDF
from PIL import Image
import numpy as np
import torch
import torchvision.transforms as transforms
from torchvision import models
from sklearn.metrics.pairwise import cosine_similarity
from skimage.color import rgb2gray
import cv2
from concurrent.futures import ThreadPoolExecutor

# Load pre-trained deep learning model (ResNet-50)
resnet_model = models.resnet50(pretrained=True)
resnet_model.eval()  # Set model to evaluation mode

# Image preprocessing transformations for ResNet
preprocess = transforms.Compose([
    transforms.Resize((224, 224)),  # Resize image to 224x224 as expected by ResNet
    transforms.ToTensor(),          # Convert image to PyTorch tensor
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  # Normalize using ImageNet standards
])

def extract_images_from_pdf(pdf_path):
    """Extracts images from a PDF file."""
    images = []
    doc = fitz.open(pdf_path)

    for page_num in range(doc.page_count):
        page = doc.load_page(page_num)
        image_list = page.get_images(full=True)

        for img_index, img in enumerate(image_list):
            xref = img[0]
            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]
            image = Image.open(io.BytesIO(image_bytes))

            # Convert the image to RGB format
            if image.mode != 'RGB':
                image = image.convert('RGB')
            
            images.append(image)

    doc.close()
    return images

def extract_image_features(image):
    """Extracts deep learning features from an image using ResNet."""
    # Convert PIL Image to PyTorch Tensor
    input_tensor = preprocess(image)
    input_batch = input_tensor.unsqueeze(0)  # Add batch dimension

    with torch.no_grad():
        # Extract features using ResNet
        features = resnet_model(input_batch)
    
    return features.numpy().flatten()

def extract_image_features_batch(images):
    """Extracts deep learning features from a batch of images using ResNet."""
    input_tensors = torch.stack([preprocess(image) for image in images])  # Create a batch of images
    with torch.no_grad():
        features_batch = resnet_model(input_tensors)
    return features_batch.numpy()

def calculate_image_similarity(features1, features2):
    """Calculates the similarity between two image feature vectors using cosine similarity."""
    similarity = cosine_similarity([features1], [features2])[0][0]
    return similarity

def preprocess_image(image):
    """Preprocess the image for FAST and FREAK."""
    image_np = np.array(image)
    
    # Convert to grayscale
    gray_image = rgb2gray(image_np)
    
    # Convert to uint8 format
    gray_image = (gray_image * 255).astype(np.uint8)

    return gray_image

def match_keypoints(image1, image2):
    """Match keypoints between two images using FAST and FREAK."""
    # Initialize FAST detector
    fast = cv2.FastFeatureDetector_create()

    # Initialize FREAK descriptor
    freak = cv2.xfeatures2d.FREAK_create()
    
    # Preprocess images
    image1_gray = preprocess_image(image1)
    image2_gray = preprocess_image(image2)
    
    # Detect keypoints using FAST
    keypoints1 = fast.detect(image1_gray, None)
    keypoints2 = fast.detect(image2_gray, None)

    # Compute FREAK descriptors
    keypoints1, descriptors1 = freak.compute(image1_gray, keypoints1)
    keypoints2, descriptors2 = freak.compute(image2_gray, keypoints2)
    
    # If no descriptors are found, return 0 matches
    if descriptors1 is None or descriptors2 is None:
        return 0
    
    # Use BFMatcher to match descriptors (Hamming norm is used for binary descriptors)
    bf = cv2.BFMatcher(cv2.NORM_HAMMING, crossCheck=True)
    matches = bf.match(descriptors1, descriptors2)
    
    return len(matches)

def process_pdf_in_parallel(pdf_file, input_image_features, input_image, folder_path):
    """Processes a single PDF file to find the most similar image."""
    pdf_path = os.path.join(folder_path, pdf_file)
    extracted_images = extract_images_from_pdf(pdf_path)

    for img in extracted_images:
        img_features = extract_image_features(img)
        similarity = calculate_image_similarity(input_image_features, img_features)

        # Use keypoint matching with FAST and FREAK
        keypoint_match_count = match_keypoints(input_image, img)

        # Combine similarity and keypoint matches
        combined_score = similarity + (keypoint_match_count / 1000)  # Normalize keypoint matches
        return pdf_file, combined_score
    
    return pdf_file, -1  # Default score if no image found

def find_most_similar_pdf(input_image_path, folder_path):
    """Finds the PDF with the most similar image to the input image."""
    input_image = Image.open(input_image_path)
    
    # Convert input image to RGB if necessary
    if input_image.mode != 'RGB':
        input_image = input_image.convert('RGB')

    input_image_features = extract_image_features(input_image)
    most_similar_pdf = None
    highest_similarity = -1

    with ThreadPoolExecutor() as executor:
        futures = [executor.submit(process_pdf_in_parallel, pdf_file, input_image_features, input_image, folder_path)
                   for pdf_file in os.listdir(folder_path) if pdf_file.endswith(".pdf")]
        results = [f.result() for f in futures]

    # Get the PDF with the highest combined score
    most_similar_pdf, highest_similarity = max(results, key=lambda x: x[1])

    return most_similar_pdf, highest_similarity

# Usage example
input_image_path = "D://Cross Search Automation//Previous Cross//Vendor Lights//Wall_Pack_Two.jpg"  # Provide the path to the input image
folder_path = "D://Cross Search Automation//Previous Cross//IKIO Lights"  # Provide the path to the folder with PDFs

most_similar_pdf, similarity_score = find_most_similar_pdf(input_image_path, folder_path)

if most_similar_pdf:
    print(f"The most similar PDF is: {most_similar_pdf} with a similarity score of: {similarity_score:.2f}")
else:
    print("No similar images found in the PDFs.")



The most similar PDF is: Rodella_UFO_Black_150W_3500K_100-277 V_TDS.pdf with a similarity score of: 2.53


## KAZE

In [10]:
import os
import io
import fitz  # PyMuPDF
from PIL import Image
import numpy as np
import torch
import torchvision.transforms as transforms
from torchvision import models
from sklearn.metrics.pairwise import cosine_similarity
from skimage.color import rgb2gray
import cv2
from concurrent.futures import ThreadPoolExecutor

# Load pre-trained deep learning model (ResNet-50)
resnet_model = models.resnet50(pretrained=True)
resnet_model.eval()  # Set model to evaluation mode

# Image preprocessing transformations for ResNet
preprocess = transforms.Compose([
    transforms.Resize((224, 224)),  # Resize image to 224x224 as expected by ResNet
    transforms.ToTensor(),          # Convert image to PyTorch tensor
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  # Normalize using ImageNet standards
])

def extract_images_from_pdf(pdf_path):
    """Extracts images from a PDF file."""
    images = []
    doc = fitz.open(pdf_path)

    for page_num in range(doc.page_count):
        page = doc.load_page(page_num)
        image_list = page.get_images(full=True)

        for img_index, img in enumerate(image_list):
            xref = img[0]
            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]
            image = Image.open(io.BytesIO(image_bytes))

            # Convert the image to RGB format
            if image.mode != 'RGB':
                image = image.convert('RGB')
            
            images.append(image)

    doc.close()
    return images

def extract_image_features(image):
    """Extracts deep learning features from an image using ResNet."""
    # Convert PIL Image to PyTorch Tensor
    input_tensor = preprocess(image)
    input_batch = input_tensor.unsqueeze(0)  # Add batch dimension

    with torch.no_grad():
        # Extract features using ResNet
        features = resnet_model(input_batch)
    
    return features.numpy().flatten()

def extract_image_features_batch(images):
    """Extracts deep learning features from a batch of images using ResNet."""
    input_tensors = torch.stack([preprocess(image) for image in images])  # Create a batch of images
    with torch.no_grad():
        features_batch = resnet_model(input_tensors)
    return features_batch.numpy()

def calculate_image_similarity(features1, features2):
    """Calculates the similarity between two image feature vectors using cosine similarity."""
    similarity = cosine_similarity([features1], [features2])[0][0]
    return similarity

def preprocess_image(image):
    """Preprocess the image for KAZE."""
    image_np = np.array(image)
    
    # Convert to grayscale
    gray_image = rgb2gray(image_np)
    
    # Convert to uint8 format
    gray_image = (gray_image * 255).astype(np.uint8)

    return gray_image

def match_keypoints(image1, image2):
    """Match keypoints between two images using KAZE."""
    # Initialize KAZE detector
    kaze = cv2.KAZE_create()

    # Preprocess images
    image1_gray = preprocess_image(image1)
    image2_gray = preprocess_image(image2)

    # Detect keypoints and compute descriptors
    keypoints1, descriptors1 = kaze.detectAndCompute(image1_gray, None)
    keypoints2, descriptors2 = kaze.detectAndCompute(image2_gray, None)

    # If no descriptors are found, return 0 matches
    if descriptors1 is None or descriptors2 is None:
        return 0
    
    # Use BFMatcher to match descriptors (L2 norm is typically used for KAZE)
    bf = cv2.BFMatcher(cv2.NORM_L2, crossCheck=True)
    matches = bf.match(descriptors1, descriptors2)

    return len(matches)

def process_pdf_in_parallel(pdf_file, input_image_features, input_image, folder_path):
    """Processes a single PDF file to find the most similar image."""
    pdf_path = os.path.join(folder_path, pdf_file)
    extracted_images = extract_images_from_pdf(pdf_path)

    for img in extracted_images:
        img_features = extract_image_features(img)
        similarity = calculate_image_similarity(input_image_features, img_features)

        # Use keypoint matching with KAZE
        keypoint_match_count = match_keypoints(input_image, img)

        # Combine similarity and keypoint matches
        combined_score = similarity + (keypoint_match_count / 1000)  # Normalize keypoint matches
        return pdf_file, combined_score
    
    return pdf_file, -1  # Default score if no image found

def find_most_similar_pdf(input_image_path, folder_path):
    """Finds the PDF with the most similar image to the input image."""
    input_image = Image.open(input_image_path)
    
    # Convert input image to RGB if necessary
    if input_image.mode != 'RGB':
        input_image = input_image.convert('RGB')

    input_image_features = extract_image_features(input_image)
    most_similar_pdf = None
    highest_similarity = -1

    with ThreadPoolExecutor() as executor:
        futures = [executor.submit(process_pdf_in_parallel, pdf_file, input_image_features, input_image, folder_path)
                   for pdf_file in os.listdir(folder_path) if pdf_file.endswith(".pdf")]
        results = [f.result() for f in futures]

    # Get the PDF with the highest combined score
    most_similar_pdf, highest_similarity = max(results, key=lambda x: x[1])

    return most_similar_pdf, highest_similarity

# Usage example
input_image_path = "D://Cross Search Automation//Previous Cross//Vendor Lights//Wall_Pack_Two.jpg"  # Provide the path to the input image
folder_path = "D://Cross Search Automation//Previous Cross//IKIO Lights"  # Provide the path to the folder with PDFs

most_similar_pdf, similarity_score = find_most_similar_pdf(input_image_path, folder_path)

if most_similar_pdf:
    print(f"The most similar PDF is: {most_similar_pdf} with a similarity score of: {similarity_score:.2f}")
else:
    print("No similar images found in the PDFs.")



The most similar PDF is: Magnus_AFL_100W150W205W_TDS.pdf with a similarity score of: 0.77


## BACKBONE 1: Image Similarity
#### The other models are not getting the right results, therefore we start working on the existing model itself.

In [4]:
import os
import io
import fitz  # PyMuPDF
from PIL import Image
import numpy as np
import torch
import torchvision.transforms as transforms
from torchvision import models
from sklearn.metrics.pairwise import cosine_similarity
from skimage.color import rgb2gray
from skimage.feature import hog
import cv2

# Load the pre-trained VGG-19 model
vgg_model = models.vgg19(pretrained=True)
vgg_model.eval()  # Set the model to evaluation mode

# Image preprocessing transformations for VGG
preprocess = transforms.Compose([
    transforms.Resize((224, 224)),  # Resize image to 224x224 as expected by the model
    transforms.ToTensor(),          # Convert image to PyTorch tensor
    transforms.Normalize(mean=[0.485, 0.456, 0.406],  # Normalize using ImageNet standards
                         std=[0.229, 0.224, 0.225])
])

def extract_images_from_pdf(pdf_path):
    """Extracts images from a PDF file."""
    images = []
    doc = fitz.open(pdf_path)

    for page_num in range(doc.page_count):
        page = doc.load_page(page_num)
        image_list = page.get_images(full=True)

        for img_index, img in enumerate(image_list):
            xref = img[0]
            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]
            image = Image.open(io.BytesIO(image_bytes))

            # Convert the image to RGB format
            if image.mode != 'RGB':
                image = image.convert('RGB')
            
            images.append(image)

    doc.close()
    return images

def extract_intermediate_features(image, model, layer):
    """Extract intermediate deep learning features from an image."""
    # Dictionary to store the output
    activation = {}

    def hook_fn(module, input, output):
        activation['output'] = output

    # Register hook for intermediate layer
    handle = model._modules.get(layer).register_forward_hook(hook_fn)

    # Preprocess image
    input_tensor = preprocess(image).unsqueeze(0)

    # Forward pass
    with torch.no_grad():
        model(input_tensor)

    # Get the features from the hook
    intermediate_features = activation['output']

    # Remove the hook
    handle.remove()

    return intermediate_features.flatten().numpy()

def extract_image_features(image):
    """Extracts deep and hand-crafted features."""
    vgg_features = extract_intermediate_features(image, vgg_model, 'features')  # Early VGG features
    hog_features = extract_hog_features(image)                                  # HOG features
    return np.concatenate([vgg_features, hog_features])

def calculate_image_similarity(features1, features2):
    """Calculates the similarity between two image feature vectors using cosine similarity."""
    similarity = cosine_similarity([features1], [features2])[0][0]
    return similarity

def preprocess_image(image):
    """Preprocess the image for keypoint detectors."""
    # Resize image to a fixed size
    image = image.resize((300, 300))  # You can adjust the size as needed
    image_np = np.array(image)
    
    # Convert to grayscale
    gray_image = rgb2gray(image_np)
    
    # Convert to uint8 format
    gray_image = (gray_image * 255).astype(np.uint8)

    return gray_image

def match_keypoints(image1, image2):
    """Match keypoints between two images using KAZE and ORB detectors."""
    # Initialize KAZE and ORB detectors
    kaze = cv2.KAZE_create()
    orb = cv2.ORB_create()

    # Preprocess images
    image1_gray = preprocess_image(image1)
    image2_gray = preprocess_image(image2)

    # Detect keypoints and compute descriptors with both KAZE and ORB
    keypoints1_kaze, descriptors1_kaze = kaze.detectAndCompute(image1_gray, None)
    keypoints2_kaze, descriptors2_kaze = kaze.detectAndCompute(image2_gray, None)

    keypoints1_orb, descriptors1_orb = orb.detectAndCompute(image1_gray, None)
    keypoints2_orb, descriptors2_orb = orb.detectAndCompute(image2_gray, None)

    # Initialize match counts
    matches_kaze_count = 0
    matches_orb_count = 0

    # Match KAZE descriptors
    if descriptors1_kaze is not None and descriptors2_kaze is not None:
        bf_kaze = cv2.BFMatcher(cv2.NORM_L2, crossCheck=True)
        matches_kaze = bf_kaze.match(descriptors1_kaze, descriptors2_kaze)
        matches_kaze_count = len(matches_kaze)

    # Match ORB descriptors
    if descriptors1_orb is not None and descriptors2_orb is not None:
        bf_orb = cv2.BFMatcher(cv2.NORM_HAMMING, crossCheck=True)
        matches_orb = bf_orb.match(descriptors1_orb, descriptors2_orb)
        matches_orb_count = len(matches_orb)

    # Return the combined number of matches
    return matches_kaze_count + matches_orb_count

def extract_hog_features(image):
    """Extracts Histogram of Oriented Gradients (HOG) features from an image."""
    # Resize image to a fixed size
    image = image.resize((128, 128))  # Ensure all images are the same size
    image_np = np.array(image)
    gray_image = rgb2gray(image_np)

    # Compute HOG descriptors
    hog_features = hog(gray_image, orientations=8, pixels_per_cell=(16, 16),
                       cells_per_block=(1, 1), visualize=False, feature_vector=True)
    
    return hog_features

def find_most_similar_pdf(input_image_path, folder_path):
    """Finds the PDF with the most similar image to the input image."""
    input_image = Image.open(input_image_path)

    # Convert input image to RGB if necessary
    if input_image.mode != 'RGB':
        input_image = input_image.convert('RGB')

    # Extract features from the input image using VGG and HOG
    input_image_features = extract_image_features(input_image)

    most_similar_pdf = None
    highest_similarity = -1

    for pdf_file in os.listdir(folder_path):
        if pdf_file.endswith(".pdf"):
            pdf_path = os.path.join(folder_path, pdf_file)
            extracted_images = extract_images_from_pdf(pdf_path)

            for img in extracted_images:
                # Convert image to RGB if necessary
                if img.mode != 'RGB':
                    img = img.convert('RGB')

                # Extract features using VGG and HOG
                img_features = extract_image_features(img)

                # Calculate cosine similarity
                similarity = calculate_image_similarity(input_image_features, img_features)
                
                # Use keypoint matching with KAZE and ORB
                keypoint_match_count = match_keypoints(input_image, img)

                # Combine similarity and keypoint matches (with appropriate weights)
                combined_score = (0.7 * similarity +
                                  0.3 * (keypoint_match_count / 1000))  # Normalized keypoint matches

                if combined_score > highest_similarity:
                    highest_similarity = combined_score
                    most_similar_pdf = pdf_file

    return most_similar_pdf, highest_similarity

# Usage example
input_image_path = "D://Cross Search Automation//Previous Cross//Vendor Lights//test3.jpg"  # Provide the path to the input image
folder_path = "D://Cross Search Automation//Previous Cross//IKIO Lights"  # Provide the path to the folder with PDFs

most_similar_pdf, similarity_score = find_most_similar_pdf(input_image_path, folder_path)

if most_similar_pdf:
    print(f"The most similar PDF is: {most_similar_pdf} with a similarity score of: {similarity_score:.4f}")
else:
    print("No similar images found in the PDFs.")



The most similar PDF is: Orwin_DL_10inch_TDS.pdf with a similarity score of: 0.5072


The results have less accuracy but now they are much better performing in terms of light category.

In [19]:
import os
import io
import fitz  # PyMuPDF
from PIL import Image
import numpy as np
import torch
import torchvision.transforms as transforms
from torchvision import models
from sklearn.metrics.pairwise import cosine_similarity
from skimage.color import rgb2gray
from skimage.feature import hog
import cv2

# Load the pre-trained VGG-19 and ResNet models
vgg_model = models.vgg19(pretrained=True)
resnet_model = models.resnet50(pretrained=True)
vgg_model.eval()
resnet_model.eval()

# Image preprocessing transformations
preprocess = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

def extract_images_from_pdf(pdf_path):
    images = []
    doc = fitz.open(pdf_path)

    for page_num in range(doc.page_count):
        page = doc.load_page(page_num)
        image_list = page.get_images(full=True)

        for img_index, img in enumerate(image_list):
            xref = img[0]
            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]
            image = Image.open(io.BytesIO(image_bytes))

            if image.mode != 'RGB':
                image = image.convert('RGB')
            
            images.append(image)

    doc.close()
    return images

def extract_intermediate_features(image, model, layer):
    activation = {}

    def hook_fn(module, input, output):
        activation['output'] = output

    handle = model._modules.get(layer).register_forward_hook(hook_fn)
    input_tensor = preprocess(image).unsqueeze(0)

    with torch.no_grad():
        model(input_tensor)

    intermediate_features = activation['output']
    handle.remove()

    return intermediate_features.flatten().numpy()

def extract_resnet_features(image):
    """Extract features using ResNet."""
    input_tensor = preprocess(image).unsqueeze(0)
    with torch.no_grad():
        resnet_features = resnet_model(input_tensor)
    return resnet_features.flatten().numpy()

def extract_image_features(image):
    """Extract combined VGG, ResNet, and HOG features."""
    vgg_features = extract_intermediate_features(image, vgg_model, 'features')
    resnet_features = extract_resnet_features(image)
    hog_features = extract_hog_features(image)

    # Assign different weights to each feature set
    vgg_weight = 0.5
    resnet_weight = 0.3
    hog_weight = 0.2

    # Combine all features with weights
    combined_features = np.concatenate([
        vgg_features * vgg_weight, 
        resnet_features * resnet_weight, 
        hog_features * hog_weight
    ])

    return combined_features

def extract_hog_features(image):
    image = image.resize((128, 128))
    image_np = np.array(image)
    gray_image = rgb2gray(image_np)

    hog_features = hog(gray_image, orientations=8, pixels_per_cell=(16, 16),
                       cells_per_block=(1, 1), visualize=False, feature_vector=True)
    return hog_features

def calculate_image_similarity(features1, features2):
    if features1.size == 0 or features2.size == 0:
        return 0.0

    features1 = features1.reshape(1, -1)
    features2 = features2.reshape(1, -1)

    similarity = cosine_similarity(features1, features2)[0][0]
    return similarity

def preprocess_image(image):
    image = image.resize((300, 300))
    image_np = np.array(image)
    gray_image = rgb2gray(image_np)
    gray_image = (gray_image * 255).astype(np.uint8)
    return gray_image

def match_keypoints(image1, image2):
    sift = cv2.SIFT_create()
    image1_gray = preprocess_image(image1)
    image2_gray = preprocess_image(image2)

    keypoints1, descriptors1 = sift.detectAndCompute(image1_gray, None)
    keypoints2, descriptors2 = sift.detectAndCompute(image2_gray, None)

    if descriptors1 is None or descriptors2 is None:
        return 0

    bf = cv2.BFMatcher()
    matches = bf.knnMatch(descriptors1, descriptors2, k=2)

    good_matches = []
    for m, n in matches:
        if m.distance < 0.75 * n.distance:
            good_matches.append(m)

    return len(good_matches)

def find_most_similar_pdf(input_image_path, folder_path):
    input_image = Image.open(input_image_path)

    if input_image.mode != 'RGB':
        input_image = input_image.convert('RGB')

    input_image_features = extract_image_features(input_image)

    most_similar_pdf = None
    highest_similarity = -1

    for pdf_file in os.listdir(folder_path):
        if pdf_file.endswith(".pdf"):
            pdf_path = os.path.join(folder_path, pdf_file)
            extracted_images = extract_images_from_pdf(pdf_path)

            for img in extracted_images:
                if img.mode != 'RGB':
                    img = img.convert('RGB')

                img_features = extract_image_features(img)
                similarity = calculate_image_similarity(input_image_features, img_features)
                keypoint_match_count = match_keypoints(input_image, img)

                # Adjust the weighting of similarity and keypoints for better matching
                combined_score = (0.7 * similarity + 0.3 * (keypoint_match_count / 1000))

                if combined_score > highest_similarity:
                    highest_similarity = combined_score
                    most_similar_pdf = pdf_file

    return most_similar_pdf, highest_similarity

# Usage example
input_image_path = "D://Cross Search Automation//Previous Cross//Vendor Lights//Wall_Pack_Two.jpg"
folder_path = "D://Cross Search Automation//Previous Cross//IKIO Lights"

most_similar_pdf, similarity_score = find_most_similar_pdf(input_image_path, folder_path)

if most_similar_pdf:
    print(f"The most similar PDF is: {most_similar_pdf} with a similarity score of: {similarity_score:.4f}")
else:
    print("No similar images found in the PDFs.")



The most similar PDF is: Edin_WP_SC_30W40W50W60W_30K40K50K_TDS.pdf with a similarity score of: 0.2744


The image similarity finder is way better than what was used in V1.

## Adding feature of finding more than 1 similar pdf

In [3]:
import os
import io
import fitz  # PyMuPDF
from PIL import Image
import numpy as np
import torch
import torchvision.transforms as transforms
from torchvision import models
from sklearn.metrics.pairwise import cosine_similarity
from skimage.color import rgb2gray
from skimage.feature import hog
import cv2

# Load the pre-trained VGG-19 and ResNet models
vgg_model = models.vgg19(pretrained=True)
resnet_model = models.resnet50(pretrained=True)
vgg_model.eval()
resnet_model.eval()

# Image preprocessing transformations
preprocess = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

def extract_images_from_pdf(pdf_path):
    images = []
    doc = fitz.open(pdf_path)

    for page_num in range(doc.page_count):
        page = doc.load_page(page_num)
        image_list = page.get_images(full=True)

        for img_index, img in enumerate(image_list):
            xref = img[0]
            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]
            image = Image.open(io.BytesIO(image_bytes))

            if image.mode != 'RGB':
                image = image.convert('RGB')
            
            images.append(image)

    doc.close()
    return images

def extract_intermediate_features(image, model, layer):
    activation = {}

    def hook_fn(module, input, output):
        activation['output'] = output

    handle = model._modules.get(layer).register_forward_hook(hook_fn)
    input_tensor = preprocess(image).unsqueeze(0)

    with torch.no_grad():
        model(input_tensor)

    intermediate_features = activation['output']
    handle.remove()

    return intermediate_features.flatten().numpy()

def extract_resnet_features(image):
    """Extract features using ResNet."""
    input_tensor = preprocess(image).unsqueeze(0)
    with torch.no_grad():
        resnet_features = resnet_model(input_tensor)
    return resnet_features.flatten().numpy()

def extract_image_features(image):
    """Extract combined VGG, ResNet, and HOG features."""
    vgg_features = extract_intermediate_features(image, vgg_model, 'features')
    resnet_features = extract_resnet_features(image)
    hog_features = extract_hog_features(image)

    # Assign different weights to each feature set
    vgg_weight = 0.5
    resnet_weight = 0.3
    hog_weight = 0.2

    # Combine all features with weights
    combined_features = np.concatenate([
        vgg_features * vgg_weight, 
        resnet_features * resnet_weight, 
        hog_features * hog_weight
    ])

    return combined_features

def extract_hog_features(image):
    image = image.resize((128, 128))
    image_np = np.array(image)
    gray_image = rgb2gray(image_np)

    hog_features = hog(gray_image, orientations=8, pixels_per_cell=(16, 16),
                       cells_per_block=(1, 1), visualize=False, feature_vector=True)
    return hog_features

def calculate_image_similarity(features1, features2):
    if features1.size == 0 or features2.size == 0:
        return 0.0

    features1 = features1.reshape(1, -1)
    features2 = features2.reshape(1, -1)

    similarity = cosine_similarity(features1, features2)[0][0]
    return similarity

def preprocess_image(image):
    image = image.resize((300, 300))
    image_np = np.array(image)
    gray_image = rgb2gray(image_np)
    gray_image = (gray_image * 255).astype(np.uint8)
    return gray_image

def match_keypoints(image1, image2):
    sift = cv2.SIFT_create()
    image1_gray = preprocess_image(image1)
    image2_gray = preprocess_image(image2)

    keypoints1, descriptors1 = sift.detectAndCompute(image1_gray, None)
    keypoints2, descriptors2 = sift.detectAndCompute(image2_gray, None)

    if descriptors1 is None or descriptors2 is None:
        return 0

    bf = cv2.BFMatcher()
    matches = bf.knnMatch(descriptors1, descriptors2, k=2)

    good_matches = []
    for m, n in matches:
        if m.distance < 0.75 * n.distance:
            good_matches.append(m)

    return len(good_matches)

def find_top_similar_pdfs(input_image_path, folder_path, top_n=10):
    input_image = Image.open(input_image_path)

    if input_image.mode != 'RGB':
        input_image = input_image.convert('RGB')

    input_image_features = extract_image_features(input_image)

    pdf_similarity_scores = []

    for pdf_file in os.listdir(folder_path):
        if pdf_file.endswith(".pdf"):
            pdf_path = os.path.join(folder_path, pdf_file)
            extracted_images = extract_images_from_pdf(pdf_path)

            for img in extracted_images:
                if img.mode != 'RGB':
                    img = img.convert('RGB')

                img_features = extract_image_features(img)
                similarity = calculate_image_similarity(input_image_features, img_features)
                keypoint_match_count = match_keypoints(input_image, img)

                # Adjust the weighting of similarity and keypoints for better matching
                combined_score = (0.7 * similarity + 0.3 * (keypoint_match_count / 1000))

                pdf_similarity_scores.append((pdf_file, combined_score))

    # Sort by similarity score in descending order and return the top N results
    pdf_similarity_scores.sort(key=lambda x: x[1], reverse=True)
    top_similar_pdfs = pdf_similarity_scores[:top_n]

    return top_similar_pdfs

# Usage example
input_image_path = "D://Cross Search Automation//Previous Cross//Vendor Lights//Capture3.jpg"
folder_path = "D://Cross Search Automation//Previous Cross//IKIO Lights"

top_similar_pdfs = find_top_similar_pdfs(input_image_path, folder_path)

if top_similar_pdfs:
    for pdf, score in top_similar_pdfs:
        print(f"PDF: {pdf} with similarity score: {score:.4f}")
else:
    print("No similar images found in the PDFs.")



PDF: Coloris_RGBW_ELPL_PS.pdf with similarity score: 0.5315
PDF: Delphi_PL_2x4FT_504030W_504035K_DLC_TDS.pdf with similarity score: 0.4395


This gives us top two similar lights based on the image search.

## Backbone 2: Text Similarity

In [5]:
import PyPDF2
import re
from transformers import pipeline

# Step 1: Extract Text from PDF
def extract_text_from_pdf(pdf_path):
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        text = ""
        for page_num in range(len(reader.pages)):
            page = reader.pages[page_num]
            page_text = page.extract_text()
            if page_text:  # Only add text if extraction was successful
                text += page_text
    return clean_text(text)

# Step 2: Clean the extracted text by removing unnecessary formatting
def clean_text(text):
    text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces/newlines with single space
    text = re.sub(r'\n+', ' ', text)  # Replace newlines with spaces
    text = re.sub(r'[^\x00-\x7F]+', '', text)  # Remove non-ASCII characters
    return text.strip()

# Step 3: Extract specific data (Lumens, Wattages, CCTs, Beam Angles) using regex
def extract_lumens_wattage_cct_beam_angles(text):
    # Patterns to match Lumens, Wattages, CCTs, and Beam Angles
    lumens_pattern = r'\b(\d{3,5})\s*Lumens?\b'  # Matches 'XXX Lumens' or 'XXXXX Lumens'
    wattage_pattern = r'\b(\d{1,3})\s*W\b'  # Matches 'X W', 'XX W', or 'XXX W'
    cct_pattern = r'\b(\d{4})\s*K\b'  # Matches 'XXXX K' (CCT in Kelvin)
    beam_angle_pattern = r'\b(\d{1,3})\s*[°°]\b'  # Matches 'XX°' or 'XXX°' for beam angle

    # Extract data using regex
    lumens = re.findall(lumens_pattern, text)
    wattages = re.findall(wattage_pattern, text)
    ccts = re.findall(cct_pattern, text)
    beam_angles = re.findall(beam_angle_pattern, text)

    # Format extracted data into a dictionary
    extracted_data = {
        "Lumens": lumens,
        "Wattages": wattages,
        "CCTs (K)": ccts,
        "Beam Angles (°)": beam_angles
    }

    return extracted_data

# Step 4: Summarize the extracted data
def summarize_extracted_data(extracted_data):
    summary = ""
    
    if extracted_data["Lumens"]:
        summary += f"Lumens: {', '.join(extracted_data['Lumens'])}\n"
    
    if extracted_data["Wattages"]:
        summary += f"Wattages: {', '.join(extracted_data['Wattages'])} W\n"
    
    if extracted_data["CCTs (K)"]:
        summary += f"CCTs: {', '.join(extracted_data['CCTs (K)'])} K\n"
    
    if extracted_data["Beam Angles (°)"]:
        summary += f"Beam Angles: {', '.join(extracted_data['Beam Angles (°)'])}°\n"
    
    return summary.strip()

# Step 5: Execute the Pipeline to Read and Extract Relevant Data from the PDF
pdf_path = "D://Cross Search Automation//Previous Cross//IKIO Lights//Delphi_PL_2x2FT_403020W_504035K_DLC_TDS.pdf"
pdf_text = extract_text_from_pdf(pdf_path)

# Extract Lumens, Wattages, CCTs, and Beam Angles
extracted_data = extract_lumens_wattage_cct_beam_angles(pdf_text)

# Summarize the extracted data
summary = summarize_extracted_data(extracted_data)

print(f"Extracted Summary:\n{summary}")

Extracted Summary:
Wattages: 20, 30, 40, 20, 18 W
CCTs: 3500, 4000, 5000 K


In [7]:
import PyPDF2
import re

# Step 1: Extract Text from PDF
def extract_text_from_pdf(pdf_path):
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        text = ""
        for page_num in range(len(reader.pages)):
            page = reader.pages[page_num]
            page_text = page.extract_text()
            if page_text:  # Only add text if extraction was successful
                text += page_text
    return clean_text(text)

# Step 2: Clean the extracted text by removing unnecessary formatting
def clean_text(text):
    text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces/newlines with single space
    text = re.sub(r'\n+', ' ', text)  # Replace newlines with spaces
    text = re.sub(r'[^\x00-\x7F]+', '', text)  # Remove non-ASCII characters
    return text.strip()

# Step 3: Extract specific data (Power, Voltage, Current, Lumens, Efficacy, CCT, Beam Angle) using regex
def extract_specifications(text):
    # Patterns to match Power, Voltage, Current, Lumens, Efficacy, CCT, Beam Angles
    power_pattern = r'\b(\d{1,4})\s*W\b'  # Matches 'X W', 'XX W', or 'XXX W' for Power
    voltage_pattern = r'\b(\d{1,3})\s*V\b'  # Matches 'XX V' or 'XXX V' for Voltage
    current_pattern = r'\b(\d{1,3}\.\d{1,3})\s*A\b'  # Matches 'X.XX A' for Current
    lumens_pattern = r'\b(\d{3,6})\s*Lumens?\b'  # Matches 'XXX Lumens' or 'XXXXX Lumens'
    efficacy_pattern = r'\b(\d{2,4})\s*lm/W\b'  # Matches 'XXX lm/W' for efficacy
    cct_pattern = r'\b(\d{4})\s*K\b'  # Matches 'XXXX K' (CCT in Kelvin)
    beam_angle_pattern = r'\b(\d{1,3})\s*[°°]\b'  # Matches 'XX°' or 'XXX°' for beam angle

    # Extract data using regex
    power = re.findall(power_pattern, text)
    voltage = re.findall(voltage_pattern, text)
    current = re.findall(current_pattern, text)
    lumens = re.findall(lumens_pattern, text)
    efficacy = re.findall(efficacy_pattern, text)
    cct = re.findall(cct_pattern, text)
    beam_angles = re.findall(beam_angle_pattern, text)

    # Format extracted data into a dictionary
    extracted_data = {
        "Power (W)": power,
        "Voltage (V)": voltage,
        "Current (A)": current,
        "Lumens": lumens,
        "Efficacy (lm/W)": efficacy,
        "CCT (K)": cct,
        "Beam Angles (°)": beam_angles
    }

    return extracted_data

# Step 4: Summarize the extracted data
def summarize_extracted_data(extracted_data):
    summary = ""
    
    if extracted_data["Power (W)"]:
        summary += f"Power: {', '.join(extracted_data['Power (W)'])} W\n"
    
    if extracted_data["Voltage (V)"]:
        summary += f"Voltage: {', '.join(extracted_data['Voltage (V)'])} V\n"
    
    if extracted_data["Current (A)"]:
        summary += f"Current: {', '.join(extracted_data['Current (A)'])} A\n"
    
    if extracted_data["Lumens"]:
        summary += f"Lumens: {', '.join(extracted_data['Lumens'])}\n"
    
    if extracted_data["Efficacy (lm/W)"]:
        summary += f"Efficacy: {', '.join(extracted_data['Efficacy (lm/W)'])} lm/W\n"
    
    if extracted_data["CCT (K)"]:
        summary += f"CCT: {', '.join(extracted_data['CCT (K)'])} K\n"
    
    if extracted_data["Beam Angles (°)"]:
        summary += f"Beam Angles: {', '.join(extracted_data['Beam Angles (°)'])}°\n"
    
    return summary.strip()

# Step 5: Execute the Pipeline to Read and Extract Relevant Data from the PDF
pdf_path = "D://Cross Search Automation//Previous Cross//Vendor Lights//configurable-cpx.pdf"
pdf_text = extract_text_from_pdf(pdf_path)

# Extract Power, Voltage, Current, Lumens, Efficacy, CCT, Beam Angles
extracted_data = extract_specifications(pdf_text)

# Summarize the extracted data
summary = summarize_extracted_data(extracted_data)

print(f"Extracted Summary:\n{summary}")

Extracted Summary:
Power: 10, 7, 10, 7, 10, 10, 10, 15, 20, 20 W
Voltage: 10, 10, 10, 277, 120, 277, 277, 347, 10, 10 V
Lumens: 2000, 3200, 4000, 5000, 3200, 2000, 3200, 4000, 5000, 3200, 4000, 3000, 4000, 5000, 6000, 7200, 8500, 10000, 4000, 5000, 2000, 120
CCT: 3000, 3500, 4000, 3500, 3500, 3500 K


In [4]:
import PyPDF2
import re

# Step 1: Extract Text from PDF
def extract_text_from_pdf(pdf_path):
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        text = ""
        for page_num in range(len(reader.pages)):
            page = reader.pages[page_num]
            page_text = page.extract_text()
            if page_text:  # Only add text if extraction was successful
                text += page_text
    return clean_text(text)

# Step 2: Clean the extracted text by removing unnecessary formatting
def clean_text(text):
    text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces/newlines with single space
    text = re.sub(r'\n+', ' ', text)  # Replace newlines with spaces
    text = re.sub(r'[^\x00-\x7F]+', '', text)  # Remove non-ASCII characters
    return text.strip()

# Step 3: Extract specific data including Ordering Part Number, Power, Voltage, Current, etc.
def extract_specifications(text):
    # Patterns to match specifications
    ordering_part_number_pattern = r'IK-[A-Z0-9-]+-[0-9]{2,4}W-[0-9/]+K-[A-Z]+-[A-Z]+'  # Ordering Part Number
    power_pattern = r'\b(\d{1,4})\s*W\b'  # Matches 'X W', 'XX W', or 'XXX W' for Power
    voltage_pattern = r'(\d{2,3})\s*-\s*(\d{2,3})\s*V|\b(\d{1,3})\s*V\b'  # Matches 'XX-XXX V' or 'XX V'
    current_pattern = r'(\d{1,3}\.\d{1,3})\s*A'  # Matches 'X.XX A'
    lumens_pattern = r'(\d{3,6}(?:\.\d+)?)\s*Lumens?\b'  # Matches 'XXX Lumens' or 'XXXXX Lumens'
    efficacy_pattern = r'\b(\d{2,4}(?:\.\d+)?)\s*lm/W\b'  # Matches 'XXX lm/W' for efficacy
    cct_pattern = r'\b(\d{4})\s*K\b'  # Matches 'XXXX K' (CCT in Kelvin)
    beam_angle_pattern = r'(\d{1,3})\s*[°°]\b'  # Matches 'XX°' or 'XXX°' for beam angle
    finish_pattern = r'Finish:\s*([A-Za-z0-9\s]+)'  # Matches 'Finish: [value]'
    warranty_pattern = r'Warranty:\s*(\d+)\s*years?'  # Matches 'Warranty: X years'

    # Extract data using regex
    ordering_part_number = re.findall(ordering_part_number_pattern, text)
    power = re.findall(power_pattern, text)
    voltage = re.findall(voltage_pattern, text)
    current = re.findall(current_pattern, text)
    lumens = re.findall(lumens_pattern, text)
    efficacy = re.findall(efficacy_pattern, text)
    cct = re.findall(cct_pattern, text)
    beam_angles = re.findall(beam_angle_pattern, text)
    finish = re.findall(finish_pattern, text)
    warranty = re.findall(warranty_pattern, text)

    # Process voltage to handle ranges (e.g., 120-277 V)
    voltage_ranges = []
    for volt in voltage:
        if volt[0] and volt[1]:  # Handles '120-277 V' case
            voltage_ranges.append(f"{volt[0]}-{volt[1]}")
        elif volt[2]:  # Handles individual voltages like '277 V'
            voltage_ranges.append(volt[2])

    # Remove duplicates by converting lists to sets and then back to lists
    extracted_data = {
        "Ordering Part Number": sorted(set(ordering_part_number)),
        "Power (W)": sorted(set(power), key=int),  # Sort numerically and remove duplicates
        "Voltage (V)": sorted(set(voltage_ranges), key=lambda x: int(x.split('-')[0]) if '-' in x else int(x)),
        "Current (A)": sorted(set(current), key=float),
        "Lumens": sorted(set(lumens), key=float),  # Extract the numeric part from the tuples
        "Efficacy (lm/W)": sorted(set(efficacy), key=float),
        "CCT (K)": sorted(set(cct), key=int),
        "Beam Angles (°)": sorted(set(beam_angles), key=int),
        "Finish": sorted(set(finish)),
        "Warranty": sorted(set(warranty), key=lambda x: int(x))  # Sort warranty years numerically
    }

    return extracted_data

# Step 4: Summarize the extracted data
def summarize_extracted_data(extracted_data):
    summary = ""
    
    if "Ordering Part Number" in extracted_data and extracted_data["Ordering Part Number"]:
        summary += f"Ordering Part Number: {', '.join(extracted_data['Ordering Part Number'])}\n"
    
    if "Power (W)" in extracted_data and extracted_data["Power (W)"]:
        summary += f"Power: {', '.join(extracted_data['Power (W)'])} W\n"
    
    if "Voltage (V)" in extracted_data and extracted_data["Voltage (V)"]:
        summary += f"Voltage: {', '.join(extracted_data['Voltage (V)'])} V\n"
    
    if "Current (A)" in extracted_data and extracted_data["Current (A)"]:
        summary += f"Current: {', '.join(extracted_data['Current (A)'])} A\n"
    
    if "Lumens" in extracted_data and extracted_data["Lumens"]:
        summary += f"Lumens: {', '.join(extracted_data['Lumens'])} lm\n"
    
    if "Efficacy (lm/W)" in extracted_data and extracted_data["Efficacy (lm/W)"]:
        summary += f"Efficacy: {', '.join(extracted_data['Efficacy (lm/W)'])} lm/W\n"
    
    if "CCT (K)" in extracted_data and extracted_data["CCT (K)"]:
        summary += f"CCT: {', '.join(extracted_data['CCT (K)'])} K\n"
    
    if "Beam Angles (°)" in extracted_data and extracted_data["Beam Angles (°)"]:
        summary += f"Beam Angle: {', '.join(extracted_data['Beam Angles (°)'])}°\n"
    
    if "Finish" in extracted_data and extracted_data["Finish"]:
        summary += f"Finish: {', '.join(extracted_data['Finish'])}\n"
    
    if "Warranty" in extracted_data and extracted_data["Warranty"]:
        summary += f"Warranty: {', '.join(extracted_data['Warranty'])} years\n"
    
    return summary.strip()

# Step 5: Process a single PDF
def process_single_pdf(pdf_path):
    print(f"Processing {pdf_path}...")
    
    pdf_text = extract_text_from_pdf(pdf_path)
    extracted_data = extract_specifications(pdf_text)
    summary = summarize_extracted_data(extracted_data)
    
    print(f"Extracted Summary for {pdf_path}:\n{summary}\n")

# Step 6: Execute the pipeline for a single PDF
pdf_path = "D://Cross Search Automation//Previous Cross//IKIO Lights//Area Luminaires//Areon_AL_Bronze_70W100W150_MV_TDS.pdf"  # Update this to your file path
process_single_pdf(pdf_path)

Processing D://Cross Search Automation//Previous Cross//IKIO Lights//Area Luminaires//Areon_AL_Bronze_70W100W150_MV_TDS.pdf...
Extracted Summary for D://Cross Search Automation//Previous Cross//IKIO Lights//Area Luminaires//Areon_AL_Bronze_70W100W150_MV_TDS.pdf:
Ordering Part Number: IK-SBSLG2-150W-30/40/50K-MV-BR, IK-SBSLG2-150W-30/40/50K-MV-D
Power: 22, 150 W
Voltage: 100-277, 120-277, 480 V
Current: 0.61 A
Lumens: 482453 lm
Efficacy: 133.98 lm/W
CCT: 3000, 4000, 5000 K



Now this is something that we can achieve for all the pdfs. I will now be implementing it for multiple pdf's and then converting it into a text similarity program.

In [15]:
import PyPDF2
import re
import os

# Step 1: Extract Text from PDF
def extract_text_from_pdf(pdf_path):
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        text = ""
        for page_num in range(len(reader.pages)):
            page = reader.pages[page_num]
            page_text = page.extract_text()
            if page_text:  # Only add text if extraction was successful
                text += page_text
    return clean_text(text)

# Step 2: Clean the extracted text by removing unnecessary formatting
def clean_text(text):
    text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces/newlines with single space
    text = re.sub(r'\n+', ' ', text)  # Replace newlines with spaces
    text = re.sub(r'[^\x00-\x7F]+', '', text)  # Remove non-ASCII characters
    return text.strip()

# Step 3: Extract specific data (Power, Voltage, Current, Lumens, Efficacy, CCT, Beam Angle) using regex
def extract_specifications(text):
    # Patterns to match Power, Voltage, Current, Lumens, Efficacy, CCT, Beam Angles
    power_pattern = r'\b(\d{1,4})\s*W\b'  # Matches 'X W', 'XX W', or 'XXX W' for Power
    voltage_pattern = r'(\d{2,3})\s*-\s*(\d{2,3})\s*V|\b(\d{1,3})\s*V\b'  # Matches 'XX-XXX V' or 'XX V'
    current_pattern = r'(\d{1,3}\.\d{1,3})\s*A'  # Matches 'X.XX A'
    lumens_pattern = r'(\d{3,6}(\.\d+)?)\s*Lumens?\b'  # Matches 'XXX Lumens' or 'XXXXX Lumens'
    efficacy_pattern = r'\b(\d{2,4}\.\d+|\d{2,4})\s*lm/W\b'  # Matches 'XXX lm/W' for efficacy
    cct_pattern = r'\b(\d{4})\s*K\b'  # Matches 'XXXX K' (CCT in Kelvin)
    beam_angle_pattern = r'\b(\d{1,3})\s*[°°]\b'  # Matches 'XX°' or 'XXX°' for beam angle

    # Extract data using regex
    power = re.findall(power_pattern, text)
    voltage = re.findall(voltage_pattern, text)
    current = re.findall(current_pattern, text)
    lumens = re.findall(lumens_pattern, text)
    efficacy = re.findall(efficacy_pattern, text)
    cct = re.findall(cct_pattern, text)
    beam_angles = re.findall(beam_angle_pattern, text)

    # Process voltage to handle range (e.g., 120-277 V)
    voltage_ranges = []
    for volt in voltage:
        if volt[0] and volt[1]:  # Handles '120-277 V' case
            voltage_ranges.append(f"{volt[0]}-{volt[1]}")
        elif volt[2]:  # Handles individual voltages like '277 V'
            voltage_ranges.append(volt[2])

    # Remove duplicates by converting lists to sets and then back to lists
    extracted_data = {
        "Power (W)": list(set(power)),
        "Voltage (V)": list(set(voltage_ranges)),
        "Current (A)": list(set(current)),
        "Lumens": list(set([lum[0] for lum in lumens])),  # Extract the numeric part from tuples
        "Efficacy (lm/W)": list(set(efficacy)),
        "CCT (K)": list(set(cct)),
        "Beam Angles (°)": list(set(beam_angles))
    }

    return extracted_data

# Step 4: Summarize the extracted data
def summarize_extracted_data(extracted_data):
    summary = ""
    
    if extracted_data["Power (W)"]:
        summary += f"Power: {', '.join(extracted_data['Power (W)'])} W\n"
    
    if extracted_data["Voltage (V)"]:
        summary += f"Voltage: {', '.join(extracted_data['Voltage (V)'])} V\n"
    
    if extracted_data["Current (A)"]:
        summary += f"Current: {', '.join(extracted_data['Current (A)'])} A\n"
    
    if extracted_data["Lumens"]:
        summary += f"Lumens: {', '.join(extracted_data['Lumens'])} lm\n"
    
    if extracted_data["Efficacy (lm/W)"]:
        summary += f"Efficacy: {', '.join(extracted_data['Efficacy (lm/W)'])} lm/W\n"
    
    if extracted_data["CCT (K)"]:
        summary += f"CCT: {', '.join(extracted_data['CCT (K)'])} K\n"
    
    if extracted_data["Beam Angles (°)"]:
        summary += f"Beam Angle: {', '.join(extracted_data['Beam Angles (°)'])}°\n"
    
    return summary.strip()

# Step 5: Process multiple PDFs in the folder
def process_pdfs_in_folder(folder_path):
    # Get all PDF files in the folder
    pdf_files = [f for f in os.listdir(folder_path) if f.endswith('.pdf')]
    
    # Loop through each PDF and extract information
    for pdf_file in pdf_files:
        pdf_path = os.path.join(folder_path, pdf_file)
        print(f"Processing {pdf_file}...")
        
        pdf_text = extract_text_from_pdf(pdf_path)
        extracted_data = extract_specifications(pdf_text)
        summary = summarize_extracted_data(extracted_data)
        
        print(f"Extracted Summary for {pdf_file}:\n{summary}\n")

# Step 6: Execute the pipeline for all PDFs in a folder
folder_path = "D://Cross Search Automation//Previous Cross//Vendor Lights"  # Update this to your folder path
process_pdfs_in_folder(folder_path)

Processing configurable-cpx.pdf...
Extracted Summary for configurable-cpx.pdf:
Power: 20, 15, 7, 10 W
Voltage: 120-277, 347, 10, 277, 120 V
Lumens: 2000, 10000, 4000, 7200, 3200, 5000, 3000, 120, 6000, 8500 lm
CCT: 3000, 4000, 3500 K

Processing lbk-configurable.pdf...
Extracted Summary for lbk-configurable.pdf:
Power: 3, 13, 77, 0, 1, 4, 5, 15, 20, 9, 10, 7, 2, 6, 51, 39, 26 W
Voltage: 120-277, 347, 10 V
Lumens: 120 lm
CCT: 5000, 4000, 3500 K



Now I am converting the above code to do the same for IKIO Lights folder and Vendor Lights folder and provide the results for all the pdf's in the same format. Which will act as a backbone for the next step which would be comparing and finding similarity aongst the pdf's of the two folders.

In [17]:
import PyPDF2
import re
import os

# Step 1: Extract Text from PDF
def extract_text_from_pdf(pdf_path):
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        text = ""
        for page_num in range(len(reader.pages)):
            page = reader.pages[page_num]
            page_text = page.extract_text()
            if page_text:  # Only add text if extraction was successful
                text += page_text
    return clean_text(text)

# Step 2: Clean the extracted text by removing unnecessary formatting
def clean_text(text):
    text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces/newlines with single space
    text = re.sub(r'\n+', ' ', text)  # Replace newlines with spaces
    text = re.sub(r'[^\x00-\x7F]+', '', text)  # Remove non-ASCII characters
    return text.strip()

# Step 3: Extract specific data (Power, Voltage, Current, Lumens, Efficacy, CCT, Beam Angle) using regex
def extract_specifications(text):
    # Patterns to match Power, Voltage, Current, Lumens, Efficacy, CCT, Beam Angles
    power_pattern = r'\b(\d{1,4})\s*W\b'
    voltage_pattern = r'(\d{2,3})\s*-\s*(\d{2,3})\s*V|\b(\d{1,3})\s*V\b'
    current_pattern = r'(\d{1,3}\.\d{1,3})\s*A'
    lumens_pattern = r'(\d{3,6}(\.\d+)?)\s*Lumens?\b'
    efficacy_pattern = r'\b(\d{2,4}\.\d+|\d{2,4})\s*lm/W\b'
    cct_pattern = r'\b(\d{4})\s*K\b'
    beam_angle_pattern = r'\b(\d{1,3})\s*[°°]\b'

    # Extract data using regex
    power = re.findall(power_pattern, text)
    voltage = re.findall(voltage_pattern, text)
    current = re.findall(current_pattern, text)
    lumens = re.findall(lumens_pattern, text)
    efficacy = re.findall(efficacy_pattern, text)
    cct = re.findall(cct_pattern, text)
    beam_angles = re.findall(beam_angle_pattern, text)

    # Process voltage to handle range (e.g., 120-277 V)
    voltage_ranges = []
    for volt in voltage:
        if volt[0] and volt[1]:
            voltage_ranges.append(f"{volt[0]}-{volt[1]}")
        elif volt[2]:
            voltage_ranges.append(volt[2])

    # Remove duplicates by converting lists to sets and then back to lists
    extracted_data = {
        "Power (W)": list(set(power)),
        "Voltage (V)": list(set(voltage_ranges)),
        "Current (A)": list(set(current)),
        "Lumens": list(set([lum[0] for lum in lumens])),
        "Efficacy (lm/W)": list(set(efficacy)),
        "CCT (K)": list(set(cct)),
        "Beam Angles (°)": list(set(beam_angles))
    }

    return extracted_data

# Step 4: Summarize the extracted data
def summarize_extracted_data(extracted_data):
    summary = ""
    
    if extracted_data["Power (W)"]:
        summary += f"Power: {', '.join(extracted_data['Power (W)'])} W\n"
    
    if extracted_data["Voltage (V)"]:
        summary += f"Voltage: {', '.join(extracted_data['Voltage (V)'])} V\n"
    
    if extracted_data["Current (A)"]:
        summary += f"Current: {', '.join(extracted_data['Current (A)'])} A\n"
    
    if extracted_data["Lumens"]:
        summary += f"Lumens: {', '.join(extracted_data['Lumens'])} lm\n"
    
    if extracted_data["Efficacy (lm/W)"]:
        summary += f"Efficacy: {', '.join(extracted_data['Efficacy (lm/W)'])} lm/W\n"
    
    if extracted_data["CCT (K)"]:
        summary += f"CCT: {', '.join(extracted_data['CCT (K)'])} K\n"
    
    if extracted_data["Beam Angles (°)"]:
        summary += f"Beam Angle: {', '.join(extracted_data['Beam Angles (°)'])}°\n"
    
    return summary.strip()

# Step 5: Process PDFs in a specific folder
def process_pdfs_in_folder(folder_path, folder_name):
    print(f"PDFs from {folder_name} folder:")
    pdf_files = [f for f in os.listdir(folder_path) if f.endswith('.pdf')]
    
    for pdf_file in pdf_files:
        pdf_path = os.path.join(folder_path, pdf_file)
        print(f"\nProcessing {pdf_file}...")
        
        pdf_text = extract_text_from_pdf(pdf_path)
        extracted_data = extract_specifications(pdf_text)
        summary = summarize_extracted_data(extracted_data)
        
        print(f"Extracted Summary for {pdf_file}:\n{summary}\n")

# Step 6: Execute the pipeline for both folders
ikio_lights_folder = "D://Cross Search Automation//Previous Cross//IKIO Lights"  # Update this to your IKIO folder path
vendor_lights_folder = "D://Cross Search Automation//Previous Cross//Vendor Lights"  # Update this to your Vendor folder path

# Process PDFs in both folders
process_pdfs_in_folder(ikio_lights_folder, "IKIO Lights")
process_pdfs_in_folder(vendor_lights_folder, "Vendor Lights")

PDFs from IKIO Lights folder:

Processing 1 strip Aimant_MRSK_24W_4000K_120-277V_PC_lens_1strip_TDS.pdf...
Extracted Summary for 1 strip Aimant_MRSK_24W_4000K_120-277V_PC_lens_1strip_TDS.pdf:
Power: 24, 8 W
Voltage: 120-277, 10 V
Current: 0.10 A
Efficacy: 130 lm/W
CCT: 4000 K


Processing 1 strip Aimant_MRSK_24W_5000K_120-277V_PC_lens_1strip_TDS.pdf...
Extracted Summary for 1 strip Aimant_MRSK_24W_5000K_120-277V_PC_lens_1strip_TDS.pdf:
Power: 24, 8 W
Voltage: 120-277, 10 V
Current: 0.10 A
Efficacy: 130 lm/W
CCT: 5000 K


Processing Ace_CB_36W45W54W_TDS.pdf...
Extracted Summary for Ace_CB_36W45W54W_TDS.pdf:
Power: 54 W
Voltage: 120-277 V
Current: 0.22 A
Lumens: 482439 lm
Efficacy: 135.35 lm/W
CCT: 5000, 3000, 4000 K


Processing Ace_CB_80W100W120W_TDS.pdf...
Extracted Summary for Ace_CB_80W100W120W_TDS.pdf:
Power: 120, 33 W
Voltage: 120-277 V
Current: 0.22 A
Efficacy: 134.33 lm/W
CCT: 5000, 3000, 4000 K


Processing Aimant_MRSK_24W_4000K_120-277V_PC_lens_2strip_TDS.pdf...
Extracted Summ

Now we are able to extract the exact data from the pdf's of the specific mentioned parameters. We will now integrate this with the image similarity to give the data of the 2 pdf's after the image similarity is done.

## Final Output

In [27]:
import os
import io
import fitz  # PyMuPDF
from PIL import Image
import numpy as np
import torch
import torchvision.transforms as transforms
from torchvision import models
from sklearn.metrics.pairwise import cosine_similarity
from skimage.color import rgb2gray
from skimage.feature import hog
import cv2
import PyPDF2
import re

# Load the pre-trained VGG-19 and ResNet models
vgg_model = models.vgg19(pretrained=True)
resnet_model = models.resnet50(pretrained=True)
vgg_model.eval()
resnet_model.eval()

# Image preprocessing transformations
preprocess = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Program 1: Image Similarity Functions
def extract_images_from_pdf(pdf_path):
    images = []
    doc = fitz.open(pdf_path)

    for page_num in range(doc.page_count):
        page = doc.load_page(page_num)
        image_list = page.get_images(full=True)

        for img_index, img in enumerate(image_list):
            xref = img[0]
            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]
            image = Image.open(io.BytesIO(image_bytes))

            if image.mode != 'RGB':
                image = image.convert('RGB')
            
            images.append(image)

    doc.close()
    return images

def extract_intermediate_features(image, model, layer):
    activation = {}

    def hook_fn(module, input, output):
        activation['output'] = output

    handle = model._modules.get(layer).register_forward_hook(hook_fn)
    input_tensor = preprocess(image).unsqueeze(0)

    with torch.no_grad():
        model(input_tensor)

    intermediate_features = activation['output']
    handle.remove()

    return intermediate_features.flatten().numpy()

def extract_resnet_features(image):
    input_tensor = preprocess(image).unsqueeze(0)
    with torch.no_grad():
        resnet_features = resnet_model(input_tensor)
    return resnet_features.flatten().numpy()

def extract_image_features(image):
    """Extract combined VGG, ResNet, and HOG features."""
    vgg_features = extract_intermediate_features(image, vgg_model, 'features')
    resnet_features = extract_resnet_features(image)
    hog_features = extract_hog_features(image)

    vgg_weight = 0.5
    resnet_weight = 0.3
    hog_weight = 0.2

    combined_features = np.concatenate([
        vgg_features * vgg_weight, 
        resnet_features * resnet_weight, 
        hog_features * hog_weight
    ])

    return combined_features

def extract_hog_features(image):
    image = image.resize((128, 128))
    image_np = np.array(image)
    gray_image = rgb2gray(image_np)

    hog_features = hog(gray_image, orientations=8, pixels_per_cell=(16, 16),
                       cells_per_block=(1, 1), visualize=False, feature_vector=True)
    return hog_features

def calculate_image_similarity(features1, features2):
    if features1.size == 0 or features2.size == 0:
        return 0.0

    features1 = features1.reshape(1, -1)
    features2 = features2.reshape(1, -1)

    similarity = cosine_similarity(features1, features2)[0][0]
    return similarity

def preprocess_image(image):
    image = image.resize((300, 300))
    image_np = np.array(image)
    gray_image = rgb2gray(image_np)
    gray_image = (gray_image * 255).astype(np.uint8)
    return gray_image

def match_keypoints(image1, image2):
    sift = cv2.SIFT_create()
    image1_gray = preprocess_image(image1)
    image2_gray = preprocess_image(image2)

    keypoints1, descriptors1 = sift.detectAndCompute(image1_gray, None)
    keypoints2, descriptors2 = sift.detectAndCompute(image2_gray, None)

    if descriptors1 is None or descriptors2 is None:
        return 0

    bf = cv2.BFMatcher()
    matches = bf.knnMatch(descriptors1, descriptors2, k=2)

    good_matches = []
    for m, n in matches:
        if m.distance < 0.75 * n.distance:
            good_matches.append(m)

    return len(good_matches)

def find_top_similar_pdfs(input_image_path, folder_path, top_n=2):
    input_image = Image.open(input_image_path)

    if input_image.mode != 'RGB':
        input_image = input_image.convert('RGB')

    input_image_features = extract_image_features(input_image)

    pdf_similarity_scores = []

    for pdf_file in os.listdir(folder_path):
        if pdf_file.endswith(".pdf"):
            pdf_path = os.path.join(folder_path, pdf_file)
            extracted_images = extract_images_from_pdf(pdf_path)

            for img in extracted_images:
                if img.mode != 'RGB':
                    img = img.convert('RGB')

                img_features = extract_image_features(img)
                similarity = calculate_image_similarity(input_image_features, img_features)
                keypoint_match_count = match_keypoints(input_image, img)

                combined_score = (0.7 * similarity + 0.3 * (keypoint_match_count / 1000))

                pdf_similarity_scores.append((pdf_file, combined_score))

    pdf_similarity_scores.sort(key=lambda x: x[1], reverse=True)
    top_similar_pdfs = pdf_similarity_scores[:top_n]

    return top_similar_pdfs

# Program 2: PDF Data Extraction Functions
def extract_text_from_pdf(pdf_path):
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        text = ""
        for page_num in range(len(reader.pages)):
            page = reader.pages[page_num]
            page_text = page.extract_text()
            if page_text:
                text += page_text
    return clean_text(text)

def clean_text(text):
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\n+', ' ', text)
    text = re.sub(r'[^\x00-\x7F]+', '', text)
    return text.strip()

def extract_specifications(text):
    power_pattern = r'\b(\d{1,4})\s*W\b'
    voltage_pattern = r'(\d{2,3})\s*-\s*(\d{2,3})\s*V|\b(\d{1,3})\s*V\b'
    current_pattern = r'(\d{1,3}\.\d{1,3})\s*A'
    lumens_pattern = r'(\d{3,6}(\.\d+)?)\s*Lumens?\b'
    efficacy_pattern = r'\b(\d{2,4}\.\d+|\d{2,4})\s*lm/W\b'
    cct_pattern = r'\b(\d{4})\s*K\b'
    beam_angle_pattern = r'\b(\d{1,3})\s*[°°]\b'

    power = re.findall(power_pattern, text)
    voltage = re.findall(voltage_pattern, text)
    current = re.findall(current_pattern, text)
    lumens = re.findall(lumens_pattern, text)
    efficacy = re.findall(efficacy_pattern, text)
    cct = re.findall(cct_pattern, text)
    beam_angles = re.findall(beam_angle_pattern, text)

    voltage_ranges = []
    for volt in voltage:
        if volt[0] and volt[1]:
            voltage_ranges.append(f"{volt[0]}-{volt[1]}")
        elif volt[2]:
            voltage_ranges.append(volt[2])

    extracted_data = {
        "Power (W)": sorted(set(power), key=int),
        "Voltage (V)": sorted(set(voltage_ranges), key=lambda x: int(x.split('-')[0]) if '-' in x else int(x)),
        "Current (A)": sorted(set(current), key=float),
        "Lumens": sorted(set([lum[0] for lum in lumens]), key=float),
        "Efficacy (lm/W)": sorted(set(efficacy), key=float),
        "CCT (K)": sorted(set(cct), key=int),
        "Beam Angles (°)": sorted(set(beam_angles), key=int)
    }

    return extracted_data

def summarize_extracted_data(extracted_data):
    summary = ""
    
    if extracted_data["Power (W)"]:
        summary += f"Power: {', '.join(extracted_data['Power (W)'])} W\n"
    
    if extracted_data["Voltage (V)"]:
        summary += f"Voltage: {', '.join(extracted_data['Voltage (V)'])} V\n"
    
    if extracted_data["Current (A)"]:
        summary += f"Current: {', '.join(extracted_data['Current (A)'])} A\n"
    
    if extracted_data["Lumens"]:
        summary += f"Lumens: {', '.join(extracted_data['Lumens'])} lm\n"
    
    if extracted_data["Efficacy (lm/W)"]:
        summary += f"Efficacy: {', '.join(extracted_data['Efficacy (lm/W)'])} lm/W\n"
    
    if extracted_data["CCT (K)"]:
        summary += f"CCT: {', '.join(extracted_data['CCT (K)'])} K\n"
    
    if extracted_data["Beam Angles (°)"]:
        summary += f"Beam Angle: {', '.join(extracted_data['Beam Angles (°)'])}°\n"
    
    return summary.strip()

# Main Function
def main(input_image_path, pdf_folder_path):
    # Step 1: Find the most similar PDFs
    print("Finding the most similar PDFs based on image similarity...\n")
    top_similar_pdfs = find_top_similar_pdfs(input_image_path, pdf_folder_path)
    
    print("Top similar PDFs:")
    for pdf_file, score in top_similar_pdfs:
        print(f"{pdf_file}: Similarity Score = {score:.4f}")
    
    print("\nExtracting and summarizing data from PDFs...\n")

    # Step 2: Extract and summarize specifications from PDFs
    for pdf_file, _ in top_similar_pdfs:
        pdf_path = os.path.join(pdf_folder_path, pdf_file)
        pdf_text = extract_text_from_pdf(pdf_path)
        extracted_data = extract_specifications(pdf_text)
        summary = summarize_extracted_data(extracted_data)

        print(f"PDF: {pdf_file}\n{summary}\n{'-'*50}")

# Example usage
input_image_path = "D://Cross Search Automation//Previous Cross//Vendor Lights//test3.JPG"  # Your input image path
pdf_folder_path = "D://Cross Search Automation//Previous Cross//IKIO Lights"  # Your folder path containing PDFs
main(input_image_path, pdf_folder_path)



Finding the most similar PDFs based on image similarity...

Top similar PDFs:
Orwin_DL_10inch_TDS.pdf: Similarity Score = 0.5200
Orwin_DL_6inch_TDS.pdf: Similarity Score = 0.5200

Extracting and summarizing data from PDFs...

PDF: Orwin_DL_10inch_TDS.pdf
Power: 18, 22, 30, 38 W
Voltage: 10, 120-277 V
Current: 0.14 A
Efficacy: 89.11, 91.50, 93.83 lm/W
CCT: 2700, 3000, 3500, 4000, 5000 K
--------------------------------------------------
PDF: Orwin_DL_6inch_TDS.pdf
Power: 12, 17, 18, 22 W
Voltage: 10, 120-277 V
Current: 0.08 A
Efficacy: 84.68, 87.50, 90.24 lm/W
CCT: 2700, 3000, 3500, 4000, 5000 K
--------------------------------------------------


## UI

In [1]:
import os
import io
import fitz  # PyMuPDF
from PIL import Image
import numpy as np
import torch
import torchvision.transforms as transforms
from torchvision import models
from sklearn.metrics.pairwise import cosine_similarity
from skimage.color import rgb2gray
from skimage.feature import hog
import cv2
import PyPDF2
import re
import tkinter as tk
from tkinter import filedialog, messagebox, ttk
from threading import Thread

# Load the pre-trained VGG-19 and ResNet models
vgg_model = models.vgg19(pretrained=True)
resnet_model = models.resnet50(pretrained=True)
vgg_model.eval()
resnet_model.eval()

# Image preprocessing transformations
preprocess = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Program 1: Image Similarity Functions
def extract_images_from_pdf(pdf_path):
    images = []
    doc = fitz.open(pdf_path)

    for page_num in range(doc.page_count):
        page = doc.load_page(page_num)
        image_list = page.get_images(full=True)

        for img_index, img in enumerate(image_list):
            xref = img[0]
            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]
            image = Image.open(io.BytesIO(image_bytes))

            if image.mode != 'RGB':
                image = image.convert('RGB')
            
            images.append(image)

    doc.close()
    return images

def extract_intermediate_features(image, model, layer):
    activation = {}

    def hook_fn(module, input, output):
        activation['output'] = output

    handle = model._modules.get(layer).register_forward_hook(hook_fn)
    input_tensor = preprocess(image).unsqueeze(0)

    with torch.no_grad():
        model(input_tensor)

    intermediate_features = activation['output']
    handle.remove()

    return intermediate_features.flatten().numpy()

def extract_resnet_features(image):
    input_tensor = preprocess(image).unsqueeze(0)
    with torch.no_grad():
        resnet_features = resnet_model(input_tensor)
    return resnet_features.flatten().numpy()

def extract_image_features(image):
    """Extract combined VGG, ResNet, and HOG features."""
    vgg_features = extract_intermediate_features(image, vgg_model, 'features')
    resnet_features = extract_resnet_features(image)
    hog_features = extract_hog_features(image)

    vgg_weight = 0.5
    resnet_weight = 0.3
    hog_weight = 0.2

    combined_features = np.concatenate([
        vgg_features * vgg_weight, 
        resnet_features * resnet_weight, 
        hog_features * hog_weight
    ])

    return combined_features

def extract_hog_features(image):
    image = image.resize((128, 128))
    image_np = np.array(image)
    gray_image = rgb2gray(image_np)

    hog_features = hog(gray_image, orientations=8, pixels_per_cell=(16, 16),
                       cells_per_block=(1, 1), visualize=False, feature_vector=True)
    return hog_features

def calculate_image_similarity(features1, features2):
    if features1.size == 0 or features2.size == 0:
        return 0.0

    features1 = features1.reshape(1, -1)
    features2 = features2.reshape(1, -1)

    similarity = cosine_similarity(features1, features2)[0][0]
    return similarity

def preprocess_image(image):
    image = image.resize((300, 300))
    image_np = np.array(image)
    gray_image = rgb2gray(image_np)
    gray_image = (gray_image * 255).astype(np.uint8)
    return gray_image

def match_keypoints(image1, image2):
    sift = cv2.SIFT_create()
    image1_gray = preprocess_image(image1)
    image2_gray = preprocess_image(image2)

    keypoints1, descriptors1 = sift.detectAndCompute(image1_gray, None)
    keypoints2, descriptors2 = sift.detectAndCompute(image2_gray, None)

    if descriptors1 is None or descriptors2 is None:
        return 0

    bf = cv2.BFMatcher()
    matches = bf.knnMatch(descriptors1, descriptors2, k=2)

    good_matches = []
    for m, n in matches:
        if m.distance < 0.75 * n.distance:
            good_matches.append(m)

    return len(good_matches)

def find_top_similar_pdfs(input_image_path, folder_path, update_progress=None, top_n=2):
    input_image = Image.open(input_image_path)

    if input_image.mode != 'RGB':
        input_image = input_image.convert('RGB')

    input_image_features = extract_image_features(input_image)

    pdf_similarity_scores = []

    pdf_files = [f for f in os.listdir(folder_path) if f.endswith(".pdf")]
    total_pdfs = len(pdf_files)

    for idx, pdf_file in enumerate(pdf_files):
        pdf_path = os.path.join(folder_path, pdf_file)
        extracted_images = extract_images_from_pdf(pdf_path)

        for img in extracted_images:
            if img.mode != 'RGB':
                img = img.convert('RGB')

            img_features = extract_image_features(img)
            similarity = calculate_image_similarity(input_image_features, img_features)
            keypoint_match_count = match_keypoints(input_image, img)

            combined_score = (0.7 * similarity + 0.3 * (keypoint_match_count / 1000))

            pdf_similarity_scores.append((pdf_file, combined_score))

        if update_progress:
            update_progress(idx + 1, total_pdfs)

    pdf_similarity_scores.sort(key=lambda x: x[1], reverse=True)
    top_similar_pdfs = pdf_similarity_scores[:top_n]

    return top_similar_pdfs

# Program 2: PDF Data Extraction Functions
def extract_text_from_pdf(pdf_path):
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        text = ""
        for page_num in range(len(reader.pages)):
            page = reader.pages[page_num]
            page_text = page.extract_text()
            if page_text:
                text += page_text
    return clean_text(text)

def clean_text(text):
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\n+', ' ', text)
    text = re.sub(r'[^\x00-\x7F]+', '', text)
    return text.strip()

def extract_specifications(text):
    power_pattern = r'\b(\d{1,4})\s*W\b'
    voltage_pattern = r'(\d{2,3})\s*-\s*(\d{2,3})\s*V|\b(\d{1,3})\s*V\b'
    current_pattern = r'(\d{1,3}\.\d{1,3})\s*A'
    lumens_pattern = r'(\d{3,6}(\.\d+)?)\s*Lumens?\b'
    efficacy_pattern = r'\b(\d{2,4}\.\d+|\d{2,4})\s*lm/W\b'
    cct_pattern = r'\b(\d{4})\s*K\b'
    beam_angle_pattern = r'\b(\d{1,3})\s*[°°]\b'

    power = re.findall(power_pattern, text)
    voltage = re.findall(voltage_pattern, text)
    current = re.findall(current_pattern, text)
    lumens = re.findall(lumens_pattern, text)
    efficacy = re.findall(efficacy_pattern, text)
    cct = re.findall(cct_pattern, text)
    beam_angles = re.findall(beam_angle_pattern, text)

    voltage_ranges = []
    for volt in voltage:
        if volt[0] and volt[1]:
            voltage_ranges.append(f"{volt[0]}-{volt[1]}")
        elif volt[2]:
            voltage_ranges.append(volt[2])

    extracted_data = {
        "Power (W)": sorted(set(power), key=int),
        "Voltage (V)": sorted(set(voltage_ranges), key=lambda x: int(x.split('-')[0]) if '-' in x else int(x)),
        "Current (A)": sorted(set(current), key=float),
        "Lumens": sorted(set([lum[0] for lum in lumens]), key=float),
        "Efficacy (lm/W)": sorted(set(efficacy), key=float),
        "CCT (K)": sorted(set(cct), key=int),
        "Beam Angles (°)": sorted(set(beam_angles), key=int)
    }

    return extracted_data

def summarize_extracted_data(extracted_data):
    summary = ""
    
    if extracted_data["Power (W)"]:
        summary += f"Power: {', '.join(extracted_data['Power (W)'])} W\n"
    
    if extracted_data["Voltage (V)"]:
        summary += f"Voltage: {', '.join(extracted_data['Voltage (V)'])} V\n"
    
    if extracted_data["Current (A)"]:
        summary += f"Current: {', '.join(extracted_data['Current (A)'])} A\n"
    
    if extracted_data["Lumens"]:
        summary += f"Lumens: {', '.join(extracted_data['Lumens'])} lm\n"
    
    if extracted_data["Efficacy (lm/W)"]:
        summary += f"Efficacy: {', '.join(extracted_data['Efficacy (lm/W)'])} lm/W\n"
    
    if extracted_data["CCT (K)"]:
        summary += f"CCT: {', '.join(extracted_data['CCT (K)'])} K\n"
    
    if extracted_data["Beam Angles (°)"]:
        summary += f"Beam Angle: {', '.join(extracted_data['Beam Angles (°)'])}°\n"
    
    return summary.strip()

# UI code
class PDFComparisonApp:
    def __init__(self, root):
        self.root = root
        self.root.title("PDF Comparison Tool")

        # Create UI elements
        self.label_image_path = tk.Label(root, text="Input Image Path:")
        self.label_image_path.grid(row=0, column=0, padx=10, pady=5)
        self.entry_image_path = tk.Entry(root, width=50)
        self.entry_image_path.grid(row=0, column=1, padx=10, pady=5)
        self.button_browse_image = tk.Button(root, text="Browse", command=self.browse_image)
        self.button_browse_image.grid(row=0, column=2, padx=10, pady=5)

        self.label_pdf_folder_path = tk.Label(root, text="PDF Folder Path:")
        self.label_pdf_folder_path.grid(row=1, column=0, padx=10, pady=5)
        self.entry_pdf_folder_path = tk.Entry(root, width=50)
        self.entry_pdf_folder_path.grid(row=1, column=1, padx=10, pady=5)
        self.button_browse_pdf_folder = tk.Button(root, text="Browse", command=self.browse_pdf_folder)
        self.button_browse_pdf_folder.grid(row=1, column=2, padx=10, pady=5)

        self.label_compare_pdf_path = tk.Label(root, text="PDF to Compare:")
        self.label_compare_pdf_path.grid(row=2, column=0, padx=10, pady=5)
        self.entry_compare_pdf_path = tk.Entry(root, width=50)
        self.entry_compare_pdf_path.grid(row=2, column=1, padx=10, pady=5)
        self.button_browse_compare_pdf = tk.Button(root, text="Browse", command=self.browse_compare_pdf)
        self.button_browse_compare_pdf.grid(row=2, column=2, padx=10, pady=5)

        self.button_run = tk.Button(root, text="Run", command=self.run)
        self.button_run.grid(row=3, column=0, columnspan=3, padx=10, pady=10)

        self.progress = tk.DoubleVar()
        self.progress_bar = ttk.Progressbar(root, length=400, variable=self.progress, maximum=100)
        self.progress_bar.grid(row=4, column=0, columnspan=3, padx=10, pady=10)

        self.label_progress = tk.Label(root, text="Progress:")
        self.label_progress.grid(row=5, column=0, padx=10, pady=5)
        self.label_percentage = tk.Label(root, text="0%")
        self.label_percentage.grid(row=5, column=1, padx=10, pady=5)

        self.result_text = tk.Text(root, height=20, width=80)
        self.result_text.grid(row=6, column=0, columnspan=3, padx=10, pady=10)

    def browse_image(self):
        file_path = filedialog.askopenfilename(filetypes=[("Image Files", "*.jpg;*.jpeg;*.png")])
        if file_path:
            self.entry_image_path.delete(0, tk.END)
            self.entry_image_path.insert(0, file_path)

    def browse_pdf_folder(self):
        folder_path = filedialog.askdirectory()
        if folder_path:
            self.entry_pdf_folder_path.delete(0, tk.END)
            self.entry_pdf_folder_path.insert(0, folder_path)

    def browse_compare_pdf(self):
        file_path = filedialog.askopenfilename(filetypes=[("PDF Files", "*.pdf")])
        if file_path:
            self.entry_compare_pdf_path.delete(0, tk.END)
            self.entry_compare_pdf_path.insert(0, file_path)

    def run(self):
        image_path = self.entry_image_path.get()
        pdf_folder_path = self.entry_pdf_folder_path.get()
        compare_pdf_path = self.entry_compare_pdf_path.get()

        if not os.path.isfile(image_path) or not os.path.isdir(pdf_folder_path) or not os.path.isfile(compare_pdf_path):
            messagebox.showerror("Error", "Please provide valid paths for all inputs.")
            return

        self.result_text.delete(1.0, tk.END)
        self.progress.set(0)
        self.label_percentage.config(text="0%")

        def update_progress(current, total):
            percentage = (current / total) * 100
            self.progress.set(percentage)
            self.label_percentage.config(text=f"{int(percentage)}%")
            self.root.update_idletasks()

        def run_task():
            try:
                top_similar_pdfs = find_top_similar_pdfs(image_path, pdf_folder_path, update_progress)

                self.result_text.insert(tk.END, "Top similar PDFs:\n")
                for pdf_file, score in top_similar_pdfs:
                    self.result_text.insert(tk.END, f"{pdf_file}: Similarity Score = {score:.4f}\n")

                pdf_summaries = {}
                for pdf_file, _ in top_similar_pdfs:
                    pdf_path = os.path.join(pdf_folder_path, pdf_file)
                    pdf_text = extract_text_from_pdf(pdf_path)
                    extracted_data = extract_specifications(pdf_text)
                    summary = summarize_extracted_data(extracted_data)
                    pdf_summaries[pdf_file] = summary

                self.result_text.insert(tk.END, "\nSummaries of shortlisted PDFs:\n")
                for pdf_file, summary in pdf_summaries.items():
                    self.result_text.insert(tk.END, f"PDF: {pdf_file}\n{summary}\n{'-'*50}\n")

                user_pdf_text = extract_text_from_pdf(compare_pdf_path)
                user_extracted_data = extract_specifications(user_pdf_text)
                user_summary = summarize_extracted_data(user_extracted_data)

                self.result_text.insert(tk.END, f"\nUser-provided PDF Summary:\n{user_summary}\n{'-'*50}\n")

                self.result_text.insert(tk.END, "\nComparing user PDF with shortlisted PDFs:\n")
                for pdf_file, summary in pdf_summaries.items():
                    self.result_text.insert(tk.END, f"Comparing with PDF: {pdf_file}\n")
                    for key, value in user_extracted_data.items():
                        if key in extracted_data:
                            common_values = set(value).intersection(set(extracted_data[key]))
                            if common_values:
                                self.result_text.insert(tk.END, f"Common {key}: {', '.join(common_values)}\n")
                            else:
                                self.result_text.insert(tk.END, f"No common {key} found.\n")

                self.result_text.insert(tk.END, "\nProcessing complete.")

            except Exception as e:
                messagebox.showerror("Error", str(e))

        thread = Thread(target=run_task)
        thread.start()

if __name__ == "__main__":
    root = tk.Tk()
    app = PDFComparisonApp(root)
    root.mainloop()

