## Main

In [6]:
import os
import re
import time
import io
import tkinter as tk
from tkinter import filedialog, messagebox, ttk
from tkinter import Text
from PIL import Image, ImageTk
import numpy as np
import torch
import torchvision.transforms as transforms
from torchvision import models
from sklearn.metrics.pairwise import cosine_similarity
import fitz  # PyMuPDF
from fuzzywuzzy import fuzz  # For string matching
from fuzzywuzzy import process  # For matching
import webbrowser  # To open PDFs
import warnings
warnings.filterwarnings('ignore')

# Pre-trained backbones used purely for inference. Each one is switched to
# evaluation mode immediately after it is created.
vgg_model = models.vgg19(pretrained=True)
vgg_model.eval()
resnet_model = models.resnet50(pretrained=True)
resnet_model.eval()

# Shared input pipeline for both networks: resize to 224x224, convert to a
# tensor, then normalise each channel with the mean/std constants below.
preprocess = transforms.Compose(
    [
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]
)

# Extract text from PDF
def extract_text_from_pdf(pdf_path):
    """Return the cleaned text of every page of the PDF at *pdf_path*.

    The document is opened directly by path and used as a context manager so
    that it is closed when the function returns or raises.

    Args:
        pdf_path: Filesystem path of the PDF to read.

    Returns:
        The text of all pages, concatenated in page order and normalised by
        ``clean_text``.
    """
    with fitz.open(pdf_path) as doc:
        # Join once instead of growing a string page by page.
        raw_text = "".join(
            doc.load_page(page_num).get_text()
            for page_num in range(doc.page_count)
        )
    return clean_text(raw_text)

# Clean extracted text
def clean_text(text):
    """Normalise text extracted from a PDF for regex-based parsing.

    Steps, in order:
      1. collapse every run of whitespace (including newlines) to one space;
      2. drop non-ASCII characters, except the degree sign, which the
         beam-angle pattern in ``extract_specifications`` needs to see;
      3. collapse the double spaces that step 2 can leave behind;
      4. strip leading and trailing whitespace.

    Args:
        text: Raw text.

    Returns:
        The cleaned, single-line string.
    """
    text = re.sub(r'\s+', ' ', text)
    # Keep the degree sign: stripping it made beam angles impossible to detect.
    text = re.sub(r'[^\x00-\x7F°]+', '', text)
    # Removing a character that sat between two spaces leaves "  ".
    text = re.sub(r' {2,}', ' ', text)
    return text.strip()

# Extract specific data from cleaned text
def extract_specifications(text):
    """Extract specification values from *text* with regular expressions.

    Args:
        text: Text to scan (typically the output of ``clean_text``).

    Returns:
        A dict mapping each specification label to a list of the distinct
        matched values as strings. Numeric fields are sorted by numeric
        value, voltages by their (lower) bound, part numbers
        lexicographically. Voltage ranges are rendered as ``"low-high"``.
    """
    power_pattern = r'\b(\d{1,4})\s*W\b'
    voltage_pattern = r'(\d{2,3})\s*-\s*(\d{2,3})\s*V|\b(\d{1,3})\s*V\b'
    current_pattern = r'(\d{1,3}\.\d{1,3})\s*A'
    lumens_pattern = r'(\d{3,6}(\.\d+)?)\s*Lumens?\b'
    efficacy_pattern = r'\b(\d{2,4}\.\d+|\d{2,4})\s*lm/W\b'
    cct_pattern = r'\b(\d{4})\s*K\b'
    # The degree sign is not a word character, so a trailing \b after it only
    # matched when a letter or digit followed (e.g. "25°C") and rejected a
    # plain "120°". Match the bare sign instead and explicitly skip
    # Celsius/Fahrenheit readings.
    beam_angle_pattern = r'\b(\d{1,3})\s*°(?!\s*[CFcf]\b)'
    ordering_part_number_pattern = r'\b(?:OPN|Ordering Part Number|Part Number|Order Number|Item Number)\s*:\s*([a-zA-Z0-9\-]+)\b'

    power = re.findall(power_pattern, text)
    voltage = re.findall(voltage_pattern, text)
    current = re.findall(current_pattern, text)
    lumens = re.findall(lumens_pattern, text)
    efficacy = re.findall(efficacy_pattern, text)
    cct = re.findall(cct_pattern, text)
    beam_angles = re.findall(beam_angle_pattern, text)
    ordering_part_numbers = re.findall(ordering_part_number_pattern, text)

    # Each voltage match is a 3-tuple: (range_low, range_high, single_value).
    voltage_ranges = []
    for low, high, single in voltage:
        if low and high:
            voltage_ranges.append(f"{low}-{high}")
        elif single:
            voltage_ranges.append(single)

    extracted_data = {
        "Power (W)": sorted(set(power), key=int),
        # Works for both "230" and "100-240": sort on the first number.
        "Voltage (V)": sorted(set(voltage_ranges), key=lambda v: int(v.split('-')[0])),
        "Current (A)": sorted(set(current), key=float),
        # The lumens pattern has two groups; group 0 is the full number.
        "Lumens": sorted({lum[0] for lum in lumens}, key=float),
        "Efficacy (lm/W)": sorted(set(efficacy), key=float),
        "CCT (K)": sorted(set(cct), key=int),
        "Beam Angles (°)": sorted(set(beam_angles), key=int),
        "Ordering Part Number": sorted(set(ordering_part_numbers)),
    }

    return extracted_data

# Extract images from PDF
def extract_images_from_pdf(pdf_path):
    """Return every image embedded in the PDF at *pdf_path* as RGB PIL images.

    The document is used as a context manager so it is closed even when an
    embedded image cannot be decoded and an exception propagates.

    Args:
        pdf_path: Filesystem path of the PDF to read.

    Returns:
        A list of ``PIL.Image.Image`` objects in RGB mode, in page order.
    """
    images = []
    with fitz.open(pdf_path) as doc:
        for page_num in range(doc.page_count):
            page = doc.load_page(page_num)
            for img in page.get_images(full=True):
                xref = img[0]  # the first item of each entry is passed to extract_image
                image_bytes = doc.extract_image(xref)["image"]
                images.append(Image.open(io.BytesIO(image_bytes)).convert('RGB'))
    return images

# Extract image features
def extract_image_features(image):
    """Build one weighted feature vector for *image*.

    Concatenates, in this order, the VGG 'features' activations, the ResNet
    output and the HOG descriptor, scaled by 0.5, 0.3 and 0.2 respectively.
    """
    weighted_parts = (
        (extract_intermediate_features(image, vgg_model, 'features'), 0.5),
        (extract_resnet_features(image), 0.3),
        (extract_hog_features(image), 0.2),
    )
    return np.concatenate([features * weight for features, weight in weighted_parts])

def extract_intermediate_features(image, model, layer):
    """Return the flattened output of ``model``'s child module *layer*.

    A forward hook captures the child's output while the whole model runs on
    the preprocessed image. The hook is removed in a ``finally`` block so a
    failing forward pass cannot leave it attached to the shared model.

    Args:
        image: Image accepted by the module-level ``preprocess`` pipeline.
        model: Model whose ``_modules`` mapping contains *layer*.
        layer: Name of the direct child module to capture.

    Returns:
        The captured output, flattened and converted to a NumPy array.

    Raises:
        ValueError: If *model* has no direct child module named *layer*.
    """
    target = model._modules.get(layer)
    if target is None:
        raise ValueError(f"model has no direct child module named {layer!r}")

    captured = {}

    def hook_fn(module, inputs, output):
        captured['output'] = output

    handle = target.register_forward_hook(hook_fn)
    try:
        input_tensor = preprocess(image).unsqueeze(0)
        with torch.no_grad():
            model(input_tensor)
    finally:
        handle.remove()
    return captured['output'].flatten().numpy()

def extract_resnet_features(image):
    """Return the flattened output of ``resnet_model`` for *image* as a NumPy array."""
    batch = preprocess(image).unsqueeze(0)  # add a leading dimension of size 1
    with torch.no_grad():
        output = resnet_model(batch)
    return output.flatten().numpy()

def extract_hog_features(image):
    """Return the HOG descriptor of *image* after resizing it to 128x128.

    The image is converted to grayscale with ``rgb2gray`` and described with
    8 orientations, 16x16-pixel cells and 1x1-cell blocks.
    """
    from skimage.color import rgb2gray
    from skimage.feature import hog

    grayscale = rgb2gray(np.array(image.resize((128, 128))))
    return hog(
        grayscale,
        orientations=8,
        pixels_per_cell=(16, 16),
        cells_per_block=(1, 1),
        visualize=False,
        feature_vector=True,
    )

# Calculate similarity scores
def calculate_similarity(input_data, pdf_data):
    """Score how well *pdf_data* matches the requested *input_data*.

    For every specification field the score gains one point if any requested
    value appears verbatim among the PDF's values, plus one point for each
    requested value whose best fuzzy match (token-sort ratio) is above 80.

    Both arguments map the same eight field labels to lists of strings.
    """
    fields = (
        "Power (W)",
        "Voltage (V)",
        "Current (A)",
        "Efficacy (lm/W)",
        "CCT (K)",
        "Lumens",
        "Beam Angles (°)",
        "Ordering Part Number",
    )

    total = 0
    for field in fields:
        wanted = input_data[field]
        found = pdf_data[field]

        # Exact membership: at most one point per field.
        if any(value in found for value in wanted):
            total += 1

        # Fuzzy matching: one point per requested value that clears the threshold.
        for value in wanted:
            best = process.extractOne(value, found, scorer=fuzz.token_sort_ratio)
            if best and best[1] > 80:
                total += 1

    return total

def calculate_image_similarity(features1, features2):
    """Return the cosine similarity of two feature vectors.

    Returns 0.0 without comparing when either array is empty.
    """
    if not (features1.size and features2.size):
        return 0.0
    row1 = features1.reshape(1, -1)
    row2 = features2.reshape(1, -1)
    return cosine_similarity(row1, row2)[0][0]

def _collect_pdf_files(folder_path):
    """Return every PDF path under *folder_path* (recursive, any letter case)."""
    return [
        os.path.join(root, name)
        for root, _, files in os.walk(folder_path)
        for name in files
        if name.lower().endswith('.pdf')
    ]


def find_top_10_similar_pdfs(folder_path, input_data, input_image_path=None):
    """Score every PDF under *folder_path* and list the ten best in the GUI.

    Each PDF's score is its specification score from ``calculate_similarity``
    plus, when an input image is given, the highest image similarity (never
    below 0) between that image and any image embedded in the PDF.

    The function drives the module-level widgets ``progress_bar``,
    ``progress_label``, ``window`` and ``result_box``.

    Args:
        folder_path: Root folder; searched recursively.
        input_data: Requested specification values, keyed by field label.
        input_image_path: Optional path of a reference image; falsy to skip
            image comparison.
    """
    # Collect first so the progress maximum is known before work starts.
    pdf_files = _collect_pdf_files(folder_path)
    total_pdfs = len(pdf_files)
    progress_bar['maximum'] = total_pdfs
    progress_bar['value'] = 0

    # The reference image never changes, so embed it once rather than per PDF.
    input_image_features = None
    if input_image_path and pdf_files:
        with Image.open(input_image_path) as query_image:
            input_image_features = extract_image_features(query_image.convert('RGB'))

    pdf_scores = []
    for processed, pdf_file in enumerate(pdf_files, start=1):
        extracted_data = extract_specifications(extract_text_from_pdf(pdf_file))
        combined_score = calculate_similarity(input_data, extracted_data)

        if input_image_features is not None:
            # Only the best-matching embedded image contributes to the score.
            max_image_similarity = 0
            for img in extract_images_from_pdf(pdf_file):
                image_similarity = calculate_image_similarity(
                    input_image_features, extract_image_features(img)
                )
                max_image_similarity = max(max_image_similarity, image_similarity)
            combined_score += max_image_similarity

        pdf_scores.append((pdf_file, combined_score))

        progress_bar['value'] = processed
        progress_label.config(text=f"Progress: {processed}/{total_pdfs}")
        # Let Tk repaint so the window does not look frozen during the scan.
        window.update()

    # Sort and get top 10 results
    pdf_scores = sorted(pdf_scores, key=lambda x: x[1], reverse=True)[:10]

    # Reset progress bar and label
    progress_bar['value'] = 0
    progress_label.config(text="Progress: 0%")

    result_box.delete("1.0", tk.END)
    for rank, (pdf_path, score) in enumerate(pdf_scores, start=1):
        tag = f"pdf{rank}"
        result_box.insert(tk.END, f"Top {rank}: {os.path.basename(pdf_path)} | Similarity Score: {score:.4f}\n")
        # Render the path as a clickable link that opens the PDF.
        result_box.tag_configure(tag, foreground="blue", underline=True)
        result_box.insert(tk.END, f"{pdf_path}\n", tag)
        # Bind the path as a default argument so each link opens its own file.
        result_box.tag_bind(tag, "<Button-1>", lambda e, p=pdf_path: open_pdf(p))

# Open PDF
def open_pdf(pdf_path):
    """Open the PDF at *pdf_path* through ``webbrowser``.

    The path is made absolute and converted with ``Path.as_uri`` so that
    spaces, backslashes and drive letters yield a valid ``file:`` URL, which
    plain string concatenation did not.

    Args:
        pdf_path: Filesystem path of the PDF to open.
    """
    from pathlib import Path  # local import keeps this fix self-contained

    webbrowser.open(Path(pdf_path).resolve().as_uri())

# GUI Setup
def browse_folder():
    """Ask for a folder and copy the choice into ``folder_entry``.

    If the dialog is cancelled (falsy result) the entry keeps its current
    value instead of being wiped, matching how the image picker behaves.

    NOTE: a later ``browse_folder`` definition in this cell replaces this one
    at runtime.
    """
    folder_selected = filedialog.askdirectory()
    if not folder_selected:
        return
    folder_entry.delete(0, tk.END)
    folder_entry.insert(0, folder_selected)

def browse_image():
    """Ask for an image file and copy the choice into ``image_entry``.

    The file-type patterns are space-separated, matching the later
    ``browse_image`` definition in this cell (which replaces this one at
    runtime). A cancelled dialog leaves the entry untouched.
    """
    image_file = filedialog.askopenfilename(filetypes=[("Image files", "*.png *.jpg *.jpeg")])
    if not image_file:
        return
    image_entry.delete(0, tk.END)
    image_entry.insert(0, image_file)

def process_pdfs():
    """Validate the form and start the PDF search.

    Reads the folder, image path and specification fields from the entry
    widgets, then calls ``find_top_10_similar_pdfs``. Shows an error dialog
    and returns early if no folder is given or it does not exist.

    Values are stripped of surrounding whitespace, and a blank field becomes
    an empty list so that no empty string is sent to the matcher.
    """
    folder_path = folder_entry.get().strip()
    if not folder_path:
        messagebox.showerror("Error", "Please select a folder.")
        return
    if not os.path.isdir(folder_path):
        messagebox.showerror("Error", f"Folder not found: {folder_path}")
        return

    input_image_path = image_entry.get().strip()

    field_entries = {
        "Power (W)": power_entry,
        "Voltage (V)": voltage_entry,
        "Current (A)": current_entry,
        "Efficacy (lm/W)": efficacy_entry,
        "CCT (K)": cct_entry,
        "Lumens": lumens_entry,
        "Beam Angles (°)": beam_angle_entry,
        "Ordering Part Number": ordering_part_number_entry,
    }
    input_data = {}
    for field, entry in field_entries.items():
        value = entry.get().strip()
        input_data[field] = [value] if value else []

    find_top_10_similar_pdfs(folder_path, input_data, input_image_path)

# Create the main window
window = tk.Tk()
window.title("PDF Similarity Finder")
window.geometry("1000x600")  # Increased window width for image display
window.configure(bg="#f2f2f2")  # Light grey background

# Allow the window to be resizable: column 1 (the entry column) takes the
# largest share of extra width. Row 11 holds no widget, so extra height goes
# to that empty spacer row.
window.columnconfigure(0, weight=1)
window.columnconfigure(1, weight=3)
window.columnconfigure(2, weight=1)
window.columnconfigure(3, weight=1)
window.rowconfigure(11, weight=1)

# Global font settings
title_font = ("Helvetica", 16, "bold")
label_font = ("Helvetica", 12)
entry_font = ("Helvetica", 10)

# Styling for the buttons (passed to every tk.Button via **button_style)
button_style = {
    "bg": "#4CAF50",
    "fg": "white",
    "font": ("Helvetica", 10, "bold"),
    "activebackground": "#45a049",
    "relief": tk.RAISED
}

# Title label (displays "Product Finder")
title_label_cv = tk.Label(window, text="Product Finder", font=title_font, bg="#f2f2f2")
title_label_cv.grid(row=0, column=2, pady=20, sticky="w")  # Align to the west (left)

# Folder selection (row 1). The lambda looks up browse_folder at click time,
# so the definition further down in this cell is the one that runs.
folder_label = tk.Label(window, text="Select Folder:", font=label_font, bg="#f2f2f2")
folder_label.grid(row=1, column=0, padx=10, pady=10, sticky=tk.W)
folder_entry = tk.Entry(window, width=50, font=entry_font)
folder_entry.grid(row=1, column=1, padx=10, pady=10, sticky="ew")
folder_button = tk.Button(window, text="Browse", command=lambda: browse_folder(), **button_style)
folder_button.grid(row=1, column=2, padx=10, pady=10)

# Image selection (row 2)
image_label = tk.Label(window, text="Input Image:", font=label_font, bg="#f2f2f2")
image_label.grid(row=2, column=0, padx=10, pady=10, sticky=tk.W)
image_entry = tk.Entry(window, width=50, font=entry_font)
image_entry.grid(row=2, column=1, padx=10, pady=10, sticky="ew")
image_button = tk.Button(window, text="Browse", command=lambda: browse_image(), **button_style)
image_button.grid(row=2, column=2, padx=10, pady=10)

# Input fields for specifications: one label/entry pair per row, rows 3-10.
# The order of `labels` must match the tuple unpacking below.
labels = ["Power (W):", "Voltage (V):", "Current (A):", "Efficacy (lm/W):", "CCT (K):", "Lumens (lm):", "Beam Angle (°):", "Ordering Part Number:"]
entries = []

for i, label_text in enumerate(labels):
    label = tk.Label(window, text=label_text, font=label_font, bg="#f2f2f2")
    label.grid(row=i + 3, column=0, padx=10, pady=5, sticky=tk.W)
    entry = tk.Entry(window, width=20, font=entry_font)  # Reduced the width from 50 to 20
    entry.grid(row=i + 3, column=1, padx=10, pady=5, sticky="ew")
    entries.append(entry)

# Give each entry a descriptive name; process_pdfs reads them by these names.
power_entry, voltage_entry, current_entry, efficacy_entry, cct_entry, lumens_entry, beam_angle_entry, ordering_part_number_entry = entries

# Create a frame for the progress bar and label
progress_frame = tk.Frame(window, bg="#f2f2f2")
progress_frame.grid(row=14, column=0, columnspan=3, padx=10, pady=10, sticky="ew")  # Place it below other elements

# Progress bar inside the frame
progress_bar = ttk.Progressbar(progress_frame, orient="horizontal", length=500, mode="determinate")  # Increased length
progress_bar.pack(side=tk.LEFT, padx=10, pady=10)  # Align left

# Progress label inside the frame
progress_label = tk.Label(progress_frame, text="Progress: 0%", font=label_font, bg="#f2f2f2")
progress_label.pack(side=tk.LEFT, padx=10, pady=10)  # Align left next to the progress bar

# Result box (scrollable) below the image: column 3, rows 4-10
result_frame = tk.Frame(window, bg="#f2f2f2")
result_frame.grid(row=4, column=3, rowspan=7, padx=10, pady=10, sticky="nsew")
result_frame.grid_columnconfigure(0, weight=1)  # Allow the frame to expand with the window

result_box = Text(result_frame, height=15, width=80, font=entry_font, wrap=tk.WORD)
result_box.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)

# Wire the scrollbar and the text box to each other
scrollbar = ttk.Scrollbar(result_frame, command=result_box.yview)
scrollbar.pack(side=tk.RIGHT, fill=tk.Y)
result_box.config(yscrollcommand=scrollbar.set)

# Submit button (Find Top 10 PDFs)
submit_button = tk.Button(window, text="Find Top 10 PDF's", command=lambda: process_pdfs(), **button_style)
submit_button.config(width=20)  # Set a reduced width for the button
submit_button.grid(row=13, column=3, padx=10, pady=10, sticky="ew")

# Add "Clear Sections" button. clear_sections is defined further down; the
# lambda defers the name lookup until the button is clicked.
clear_button = tk.Button(window, text="Clear Sections", command=lambda: clear_sections(), **button_style)
clear_button.config(width=20)  # Set a reduced width for the button
clear_button.grid(row=14, column=3, padx=10, pady=10, sticky="ew")

# Image display (right side of the window). Despite its name, image_canvas is
# a tk.Label used to show the preview picture.
image_display_label = tk.Label(window, text="Image Preview", font=label_font, bg="#f2f2f2")
image_display_label.grid(row=1, column=3, padx=10, pady=(10, 5), sticky=tk.NW)  # Align at top-left of the right side
image_canvas = tk.Label(window, bg="#f2f2f2")
image_canvas.grid(row=2, column=3, padx=10, pady=(0, 20), sticky=tk.NW)  # Ensure it's right under the label

# Function to clear all input fields and reset image display
def clear_sections():
    """Reset the form: empty every entry, the image preview and the result box."""
    # Specification fields first, then the folder and image path entries.
    for field in (*entries, folder_entry, image_entry):
        field.delete(0, tk.END)

    # An empty image option removes the preview picture.
    image_canvas.config(image='')

    result_box.delete(1.0, tk.END)

# Function for folder selection
def browse_folder():
    """Ask for a folder and copy the choice into ``folder_entry``.

    If the dialog is cancelled (falsy result) the entry keeps its current
    value instead of being wiped, matching how ``browse_image`` behaves.
    """
    folder_path = filedialog.askdirectory()
    if not folder_path:
        return
    folder_entry.delete(0, tk.END)
    folder_entry.insert(0, folder_path)

# Function for image selection and display
def browse_image():
    """Ask for an image file, store its path in ``image_entry`` and preview it.

    Nothing changes when the dialog returns a falsy value.
    """
    image_path = filedialog.askopenfilename(filetypes=[("Image files", "*.jpg *.png *.jpeg")])
    if not image_path:
        return
    image_entry.delete(0, tk.END)
    image_entry.insert(0, image_path)
    display_image(image_path)

# Function to display the image on the right side
def display_image(image_path):
    """Show a thumbnail of the image at *image_path* in ``image_canvas``.

    The picture is shrunk in place to fit within 250x200 pixels.
    """
    preview_bounds = (250, 200)  # (max width, max height)

    preview = Image.open(image_path)
    preview.thumbnail(preview_bounds)

    photo = ImageTk.PhotoImage(preview)
    image_canvas.config(image=photo)
    # Keep a reference on the widget so the image is not garbage collected.
    image_canvas.image = photo

# Run the GUI loop: hand control to Tk's event loop. Everything this cell
# needs is defined above this line.
window.mainloop()



## Updated progress bar

In [1]:
import os
import re
import time
import io
import tkinter as tk
from tkinter import filedialog, messagebox, ttk
from tkinter import Text
from PIL import Image, ImageTk
import numpy as np
import torch
import torchvision.transforms as transforms
from torchvision import models
from sklearn.metrics.pairwise import cosine_similarity
import fitz  # PyMuPDF
from fuzzywuzzy import fuzz  # For string matching
from fuzzywuzzy import process  # For matching
import webbrowser  # To open PDFs
import warnings
warnings.filterwarnings('ignore')

# Pre-trained backbones used purely for inference. Each one is switched to
# evaluation mode immediately after it is created.
vgg_model = models.vgg19(pretrained=True)
vgg_model.eval()
resnet_model = models.resnet50(pretrained=True)
resnet_model.eval()

# Shared input pipeline for both networks: resize to 224x224, convert to a
# tensor, then normalise each channel with the mean/std constants below.
preprocess = transforms.Compose(
    [
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]
)

# Extract text from PDF
def extract_text_from_pdf(pdf_path):
    """Return the cleaned text of every page of the PDF at *pdf_path*.

    The document is opened directly by path and used as a context manager so
    that it is closed when the function returns or raises.

    Args:
        pdf_path: Filesystem path of the PDF to read.

    Returns:
        The text of all pages, concatenated in page order and normalised by
        ``clean_text``.
    """
    with fitz.open(pdf_path) as doc:
        # Join once instead of growing a string page by page.
        raw_text = "".join(
            doc.load_page(page_num).get_text()
            for page_num in range(doc.page_count)
        )
    return clean_text(raw_text)

# Clean extracted text
def clean_text(text):
    """Normalise text extracted from a PDF for regex-based parsing.

    Steps, in order:
      1. collapse every run of whitespace (including newlines) to one space;
      2. drop non-ASCII characters, except the degree sign, which the
         beam-angle pattern in ``extract_specifications`` needs to see;
      3. collapse the double spaces that step 2 can leave behind;
      4. strip leading and trailing whitespace.

    Args:
        text: Raw text.

    Returns:
        The cleaned, single-line string.
    """
    text = re.sub(r'\s+', ' ', text)
    # Keep the degree sign: stripping it made beam angles impossible to detect.
    text = re.sub(r'[^\x00-\x7F°]+', '', text)
    # Removing a character that sat between two spaces leaves "  ".
    text = re.sub(r' {2,}', ' ', text)
    return text.strip()

# Extract specific data from cleaned text
def extract_specifications(text):
    """Extract specification values from *text* with regular expressions.

    Args:
        text: Text to scan (typically the output of ``clean_text``).

    Returns:
        A dict mapping each specification label to a list of the distinct
        matched values as strings. Numeric fields are sorted by numeric
        value, voltages by their (lower) bound, part numbers
        lexicographically. Voltage ranges are rendered as ``"low-high"``.
    """
    power_pattern = r'\b(\d{1,4})\s*W\b'
    voltage_pattern = r'(\d{2,3})\s*-\s*(\d{2,3})\s*V|\b(\d{1,3})\s*V\b'
    current_pattern = r'(\d{1,3}\.\d{1,3})\s*A'
    lumens_pattern = r'(\d{3,6}(\.\d+)?)\s*Lumens?\b'
    efficacy_pattern = r'\b(\d{2,4}\.\d+|\d{2,4})\s*lm/W\b'
    cct_pattern = r'\b(\d{4})\s*K\b'
    # The degree sign is not a word character, so a trailing \b after it only
    # matched when a letter or digit followed (e.g. "25°C") and rejected a
    # plain "120°". Match the bare sign instead and explicitly skip
    # Celsius/Fahrenheit readings.
    beam_angle_pattern = r'\b(\d{1,3})\s*°(?!\s*[CFcf]\b)'
    ordering_part_number_pattern = r'\b(?:OPN|Ordering Part Number|Part Number|Order Number|Item Number)\s*:\s*([a-zA-Z0-9\-]+)\b'

    power = re.findall(power_pattern, text)
    voltage = re.findall(voltage_pattern, text)
    current = re.findall(current_pattern, text)
    lumens = re.findall(lumens_pattern, text)
    efficacy = re.findall(efficacy_pattern, text)
    cct = re.findall(cct_pattern, text)
    beam_angles = re.findall(beam_angle_pattern, text)
    ordering_part_numbers = re.findall(ordering_part_number_pattern, text)

    # Each voltage match is a 3-tuple: (range_low, range_high, single_value).
    voltage_ranges = []
    for low, high, single in voltage:
        if low and high:
            voltage_ranges.append(f"{low}-{high}")
        elif single:
            voltage_ranges.append(single)

    extracted_data = {
        "Power (W)": sorted(set(power), key=int),
        # Works for both "230" and "100-240": sort on the first number.
        "Voltage (V)": sorted(set(voltage_ranges), key=lambda v: int(v.split('-')[0])),
        "Current (A)": sorted(set(current), key=float),
        # The lumens pattern has two groups; group 0 is the full number.
        "Lumens": sorted({lum[0] for lum in lumens}, key=float),
        "Efficacy (lm/W)": sorted(set(efficacy), key=float),
        "CCT (K)": sorted(set(cct), key=int),
        "Beam Angles (°)": sorted(set(beam_angles), key=int),
        "Ordering Part Number": sorted(set(ordering_part_numbers)),
    }

    return extracted_data

# Extract images from PDF
def extract_images_from_pdf(pdf_path):
    """Return every image embedded in the PDF at *pdf_path* as RGB PIL images.

    The document is used as a context manager so it is closed even when an
    embedded image cannot be decoded and an exception propagates.

    Args:
        pdf_path: Filesystem path of the PDF to read.

    Returns:
        A list of ``PIL.Image.Image`` objects in RGB mode, in page order.
    """
    images = []
    with fitz.open(pdf_path) as doc:
        for page_num in range(doc.page_count):
            page = doc.load_page(page_num)
            for img in page.get_images(full=True):
                xref = img[0]  # the first item of each entry is passed to extract_image
                image_bytes = doc.extract_image(xref)["image"]
                images.append(Image.open(io.BytesIO(image_bytes)).convert('RGB'))
    return images

# Extract image features
def extract_image_features(image):
    """Build one weighted feature vector for *image*.

    Concatenates, in this order, the VGG 'features' activations, the ResNet
    output and the HOG descriptor, scaled by 0.5, 0.3 and 0.2 respectively.
    """
    weighted_parts = (
        (extract_intermediate_features(image, vgg_model, 'features'), 0.5),
        (extract_resnet_features(image), 0.3),
        (extract_hog_features(image), 0.2),
    )
    return np.concatenate([features * weight for features, weight in weighted_parts])

def extract_intermediate_features(image, model, layer):
    """Return the flattened output of ``model``'s child module *layer*.

    A forward hook captures the child's output while the whole model runs on
    the preprocessed image. The hook is removed in a ``finally`` block so a
    failing forward pass cannot leave it attached to the shared model.

    Args:
        image: Image accepted by the module-level ``preprocess`` pipeline.
        model: Model whose ``_modules`` mapping contains *layer*.
        layer: Name of the direct child module to capture.

    Returns:
        The captured output, flattened and converted to a NumPy array.

    Raises:
        ValueError: If *model* has no direct child module named *layer*.
    """
    target = model._modules.get(layer)
    if target is None:
        raise ValueError(f"model has no direct child module named {layer!r}")

    captured = {}

    def hook_fn(module, inputs, output):
        captured['output'] = output

    handle = target.register_forward_hook(hook_fn)
    try:
        input_tensor = preprocess(image).unsqueeze(0)
        with torch.no_grad():
            model(input_tensor)
    finally:
        handle.remove()
    return captured['output'].flatten().numpy()

def extract_resnet_features(image):
    """Return the flattened output of ``resnet_model`` for *image* as a NumPy array."""
    batch = preprocess(image).unsqueeze(0)  # add a leading dimension of size 1
    with torch.no_grad():
        output = resnet_model(batch)
    return output.flatten().numpy()

def extract_hog_features(image):
    """Return the HOG descriptor of *image* after resizing it to 128x128.

    The image is converted to grayscale with ``rgb2gray`` and described with
    8 orientations, 16x16-pixel cells and 1x1-cell blocks.
    """
    from skimage.color import rgb2gray
    from skimage.feature import hog

    grayscale = rgb2gray(np.array(image.resize((128, 128))))
    return hog(
        grayscale,
        orientations=8,
        pixels_per_cell=(16, 16),
        cells_per_block=(1, 1),
        visualize=False,
        feature_vector=True,
    )

# Calculate similarity scores
def calculate_similarity(input_data, pdf_data):
    """Score how well *pdf_data* matches the requested *input_data*.

    For every specification field the score gains one point if any requested
    value appears verbatim among the PDF's values, plus one point for each
    requested value whose best fuzzy match (token-sort ratio) is above 80.

    Both arguments map the same eight field labels to lists of strings.
    """
    fields = (
        "Power (W)",
        "Voltage (V)",
        "Current (A)",
        "Efficacy (lm/W)",
        "CCT (K)",
        "Lumens",
        "Beam Angles (°)",
        "Ordering Part Number",
    )

    total = 0
    for field in fields:
        wanted = input_data[field]
        found = pdf_data[field]

        # Exact membership: at most one point per field.
        if any(value in found for value in wanted):
            total += 1

        # Fuzzy matching: one point per requested value that clears the threshold.
        for value in wanted:
            best = process.extractOne(value, found, scorer=fuzz.token_sort_ratio)
            if best and best[1] > 80:
                total += 1

    return total

def calculate_image_similarity(features1, features2):
    """Return the cosine similarity of two feature vectors.

    Returns 0.0 without comparing when either array is empty.
    """
    if not (features1.size and features2.size):
        return 0.0
    row1 = features1.reshape(1, -1)
    row2 = features2.reshape(1, -1)
    return cosine_similarity(row1, row2)[0][0]

# Find Top 10 similar PDFs with improved progress bar
def _collect_pdf_files(folder_path):
    """Return every PDF path under *folder_path* (recursive, any letter case)."""
    return [
        os.path.join(root, name)
        for root, _, files in os.walk(folder_path)
        for name in files
        if name.lower().endswith('.pdf')
    ]


def find_top_10_similar_pdfs(folder_path, input_data, input_image_path=None):
    """Score every PDF under *folder_path* and list the ten best in the GUI.

    Each PDF's score is its specification score from ``calculate_similarity``
    plus, when an input image is given, the highest image similarity (never
    below 0) between that image and any image embedded in the PDF.

    While scanning, the progress label shows the processed count, percentage
    and an ETA extrapolated from the average time per PDF so far. The
    function drives the module-level widgets ``progress_bar``,
    ``progress_label``, ``window`` and ``result_box``.

    Args:
        folder_path: Root folder; searched recursively.
        input_data: Requested specification values, keyed by field label.
        input_image_path: Optional path of a reference image; falsy to skip
            image comparison.
    """
    # Collect first: the total must be fixed before the loop, otherwise the
    # percentage and ETA are computed against a still-growing maximum.
    pdf_files = _collect_pdf_files(folder_path)
    total_pdfs = len(pdf_files)
    progress_bar['maximum'] = total_pdfs
    progress_bar['value'] = 0

    # The reference image never changes, so embed it once rather than per PDF.
    input_image_features = None
    if input_image_path and pdf_files:
        with Image.open(input_image_path) as query_image:
            input_image_features = extract_image_features(query_image.convert('RGB'))

    pdf_scores = []
    start_time = time.time()
    for processed_pdfs, pdf_file in enumerate(pdf_files, start=1):
        extracted_data = extract_specifications(extract_text_from_pdf(pdf_file))
        combined_score = calculate_similarity(input_data, extracted_data)

        if input_image_features is not None:
            # Only the best-matching embedded image contributes to the score.
            max_image_similarity = 0
            for img in extract_images_from_pdf(pdf_file):
                image_similarity = calculate_image_similarity(
                    input_image_features, extract_image_features(img)
                )
                max_image_similarity = max(max_image_similarity, image_similarity)
            combined_score += max_image_similarity

        pdf_scores.append((pdf_file, combined_score))
        progress_bar['value'] = processed_pdfs

        # ETA = remaining PDFs x average seconds per processed PDF.
        elapsed_time = time.time() - start_time
        eta = (total_pdfs - processed_pdfs) * (elapsed_time / processed_pdfs)
        eta_minutes, eta_seconds = divmod(eta, 60)
        eta_formatted = f"{int(eta_minutes)} min {int(eta_seconds)} sec"

        percentage = (processed_pdfs / total_pdfs) * 100
        progress_label.config(text=f"Progress: {processed_pdfs}/{total_pdfs} ({percentage:.2f}%) | ETA: {eta_formatted}")

        # Let Tk repaint so the window does not look frozen during the scan.
        window.update()

    # Sort and get top 10 results
    pdf_scores = sorted(pdf_scores, key=lambda x: x[1], reverse=True)[:10]

    # Reset progress bar and label
    progress_bar['value'] = 0
    progress_label.config(text="Progress: 0%")

    result_box.delete("1.0", tk.END)
    for rank, (pdf_path, score) in enumerate(pdf_scores, start=1):
        tag = f"pdf{rank}"
        result_box.insert(tk.END, f"Top {rank}: {os.path.basename(pdf_path)} | Similarity Score: {score:.4f}\n")
        # Render the path as a clickable link that opens the PDF.
        result_box.tag_configure(tag, foreground="blue", underline=True)
        result_box.insert(tk.END, f"{pdf_path}\n", tag)
        # Bind the path as a default argument so each link opens its own file.
        result_box.tag_bind(tag, "<Button-1>", lambda e, p=pdf_path: open_pdf(p))

# Open PDF
def open_pdf(pdf_path):
    """Open the PDF at *pdf_path* through ``webbrowser``.

    The path is made absolute and converted with ``Path.as_uri`` so that
    spaces, backslashes and drive letters yield a valid ``file:`` URL, which
    plain string concatenation did not.

    Args:
        pdf_path: Filesystem path of the PDF to open.
    """
    from pathlib import Path  # local import keeps this fix self-contained

    webbrowser.open(Path(pdf_path).resolve().as_uri())

# GUI Setup
def browse_folder():
    """Ask for a folder and copy the choice into ``folder_entry``.

    If the dialog is cancelled (falsy result) the entry keeps its current
    value instead of being wiped, matching how the image picker behaves.

    NOTE: a later ``browse_folder`` definition in this cell replaces this one
    at runtime.
    """
    folder_selected = filedialog.askdirectory()
    if not folder_selected:
        return
    folder_entry.delete(0, tk.END)
    folder_entry.insert(0, folder_selected)

def browse_image():
    """Ask for an image file and copy the choice into ``image_entry``.

    The file-type patterns are space-separated, matching the later
    ``browse_image`` definition in this cell (which replaces this one at
    runtime). A cancelled dialog leaves the entry untouched.
    """
    image_file = filedialog.askopenfilename(filetypes=[("Image files", "*.png *.jpg *.jpeg")])
    if not image_file:
        return
    image_entry.delete(0, tk.END)
    image_entry.insert(0, image_file)

def process_pdfs():
    """Validate the form and start the PDF search.

    Reads the folder, image path and specification fields from the entry
    widgets, then calls ``find_top_10_similar_pdfs``. Shows an error dialog
    and returns early if no folder is given or it does not exist.

    Values are stripped of surrounding whitespace, and a blank field becomes
    an empty list so that no empty string is sent to the matcher.
    """
    folder_path = folder_entry.get().strip()
    if not folder_path:
        messagebox.showerror("Error", "Please select a folder.")
        return
    if not os.path.isdir(folder_path):
        messagebox.showerror("Error", f"Folder not found: {folder_path}")
        return

    input_image_path = image_entry.get().strip()

    field_entries = {
        "Power (W)": power_entry,
        "Voltage (V)": voltage_entry,
        "Current (A)": current_entry,
        "Efficacy (lm/W)": efficacy_entry,
        "CCT (K)": cct_entry,
        "Lumens": lumens_entry,
        "Beam Angles (°)": beam_angle_entry,
        "Ordering Part Number": ordering_part_number_entry,
    }
    input_data = {}
    for field, entry in field_entries.items():
        value = entry.get().strip()
        input_data[field] = [value] if value else []

    find_top_10_similar_pdfs(folder_path, input_data, input_image_path)

# Create the main window
window = tk.Tk()
window.title("PDF Similarity Finder")
window.geometry("1000x600")  # Increased window width for image display
window.configure(bg="#f2f2f2")  # Light grey background

# Allow the window to be resizable: column 1 (the entry column) takes the
# largest share of extra width. Row 11 holds no widget, so extra height goes
# to that empty spacer row.
window.columnconfigure(0, weight=1)
window.columnconfigure(1, weight=3)
window.columnconfigure(2, weight=1)
window.columnconfigure(3, weight=1)
window.rowconfigure(11, weight=1)

# Global font settings
title_font = ("Helvetica", 16, "bold")
label_font = ("Helvetica", 12)
entry_font = ("Helvetica", 10)

# Styling for the buttons (passed to every tk.Button via **button_style)
button_style = {
    "bg": "#4CAF50",
    "fg": "white",
    "font": ("Helvetica", 10, "bold"),
    "activebackground": "#45a049",
    "relief": tk.RAISED
}

# Title label (displays "Product Finder")
title_label_cv = tk.Label(window, text="Product Finder", font=title_font, bg="#f2f2f2")
title_label_cv.grid(row=0, column=2, pady=20, sticky="w")  # Align to the west (left)

# Folder selection (row 1). The lambda looks up browse_folder at click time,
# so the definition further down in this cell is the one that runs.
folder_label = tk.Label(window, text="Select Folder:", font=label_font, bg="#f2f2f2")
folder_label.grid(row=1, column=0, padx=10, pady=10, sticky=tk.W)
folder_entry = tk.Entry(window, width=50, font=entry_font)
folder_entry.grid(row=1, column=1, padx=10, pady=10, sticky="ew")
folder_button = tk.Button(window, text="Browse", command=lambda: browse_folder(), **button_style)
folder_button.grid(row=1, column=2, padx=10, pady=10)

# Image selection (row 2)
image_label = tk.Label(window, text="Input Image:", font=label_font, bg="#f2f2f2")
image_label.grid(row=2, column=0, padx=10, pady=10, sticky=tk.W)
image_entry = tk.Entry(window, width=50, font=entry_font)
image_entry.grid(row=2, column=1, padx=10, pady=10, sticky="ew")
image_button = tk.Button(window, text="Browse", command=lambda: browse_image(), **button_style)
image_button.grid(row=2, column=2, padx=10, pady=10)

# Input fields for specifications: one label/entry pair per row, rows 3-10.
# The order of `labels` must match the tuple unpacking below.
labels = ["Power (W):", "Voltage (V):", "Current (A):", "Efficacy (lm/W):", "CCT (K):", "Lumens (lm):", "Beam Angle (°):", "Ordering Part Number:"]
entries = []

for i, label_text in enumerate(labels):
    label = tk.Label(window, text=label_text, font=label_font, bg="#f2f2f2")
    label.grid(row=i + 3, column=0, padx=10, pady=5, sticky=tk.W)
    entry = tk.Entry(window, width=20, font=entry_font)  # Reduced the width from 50 to 20
    entry.grid(row=i + 3, column=1, padx=10, pady=5, sticky="ew")
    entries.append(entry)

# Give each entry a descriptive name; process_pdfs reads them by these names.
power_entry, voltage_entry, current_entry, efficacy_entry, cct_entry, lumens_entry, beam_angle_entry, ordering_part_number_entry = entries

# Create a frame for the progress bar and label
progress_frame = tk.Frame(window, bg="#f2f2f2")
progress_frame.grid(row=14, column=0, columnspan=3, padx=10, pady=10, sticky="ew")  # Place it below other elements

# Progress bar inside the frame
progress_bar = ttk.Progressbar(progress_frame, orient="horizontal", length=500, mode="determinate")  # Increased length
progress_bar.pack(side=tk.LEFT, padx=10, pady=10)  # Align left

# Progress label inside the frame
progress_label = tk.Label(progress_frame, text="Progress: 0%", font=label_font, bg="#f2f2f2")
progress_label.pack(side=tk.LEFT, padx=10, pady=10)  # Align left next to the progress bar

# Result box (scrollable) below the image: column 3, rows 4-10
result_frame = tk.Frame(window, bg="#f2f2f2")
result_frame.grid(row=4, column=3, rowspan=7, padx=10, pady=10, sticky="nsew")
result_frame.grid_columnconfigure(0, weight=1)  # Allow the frame to expand with the window

result_box = Text(result_frame, height=15, width=80, font=entry_font, wrap=tk.WORD)
result_box.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)

# Wire the scrollbar and the text box to each other
scrollbar = ttk.Scrollbar(result_frame, command=result_box.yview)
scrollbar.pack(side=tk.RIGHT, fill=tk.Y)
result_box.config(yscrollcommand=scrollbar.set)

# Submit button (Find Top 10 PDFs)
submit_button = tk.Button(window, text="Find Top 10 PDF's", command=lambda: process_pdfs(), **button_style)
submit_button.config(width=20)  # Set a reduced width for the button
submit_button.grid(row=13, column=3, padx=10, pady=10, sticky="ew")

# Add "Clear Sections" button. clear_sections is defined further down; the
# lambda defers the name lookup until the button is clicked.
clear_button = tk.Button(window, text="Clear Sections", command=lambda: clear_sections(), **button_style)
clear_button.config(width=20)  # Set a reduced width for the button
clear_button.grid(row=14, column=3, padx=10, pady=10, sticky="ew")

# Image display (right side of the window). Despite its name, image_canvas is
# a tk.Label used to show the preview picture.
image_display_label = tk.Label(window, text="Image Preview", font=label_font, bg="#f2f2f2")
image_display_label.grid(row=1, column=3, padx=10, pady=(10, 5), sticky=tk.NW)  # Align at top-left of the right side
image_canvas = tk.Label(window, bg="#f2f2f2")
image_canvas.grid(row=2, column=3, padx=10, pady=(0, 20), sticky=tk.NW)  # Ensure it's right under the label

# Function to clear all input fields and reset image display
def clear_sections():
    """Reset the form: empty every entry, drop the preview image and the results."""
    # Specification entries first, then the folder and image path entries.
    for field in (*entries, folder_entry, image_entry):
        field.delete(0, tk.END)

    # An empty image option removes the picture from the preview label.
    image_canvas.config(image='')

    # Wipe the previous search results as well.
    result_box.delete(1.0, tk.END)

# Function for folder selection
def browse_folder():
    """Ask the user for a directory and put its path into the folder entry.

    The entry is left untouched when the dialog is cancelled, so a previously
    chosen folder is not wiped by an accidental click on "Browse".
    """
    folder_path = filedialog.askdirectory()
    if not folder_path:
        # askdirectory returns an empty value when the user cancels.
        return
    folder_entry.delete(0, tk.END)
    folder_entry.insert(0, folder_path)

# Function for image selection and display
def browse_image():
    """Let the user pick an image file, store its path and show a preview.

    Nothing changes when the dialog is cancelled.
    """
    chosen = filedialog.askopenfilename(filetypes=[("Image files", "*.jpg *.png *.jpeg")])
    if not chosen:
        return
    image_entry.delete(0, tk.END)
    image_entry.insert(0, chosen)
    display_image(chosen)

# Function to display the image on the right side
def display_image(image_path):
    """Render a thumbnail of the image at ``image_path`` in the preview label.

    The picture is shrunk in place to fit a 250x200 box; ``thumbnail`` keeps
    the aspect ratio.
    """
    preview_box = (250, 200)  # (max width, max height) in pixels

    picture = Image.open(image_path)
    picture.thumbnail(preview_box)

    photo = ImageTk.PhotoImage(picture)
    # Keep a reference on the widget: Tk only holds the image name, so the
    # PhotoImage would otherwise be garbage collected and the preview go blank.
    image_canvas.image = photo
    image_canvas.config(image=photo)

# Run the GUI loop: hands control to Tk's event loop, which dispatches the
# button callbacks defined above. In a notebook cell this call does not return
# until the window is closed.
window.mainloop()

## Working on individual fields for a more powerful search

In [5]:
!pip freeze > requirements.txt

In [2]:
import os
import re
import time
import io
import tkinter as tk
from tkinter import filedialog, messagebox, ttk
from tkinter import Text
from PIL import Image, ImageTk
import numpy as np
import torch
import torchvision.transforms as transforms
from torchvision import models
from sklearn.metrics.pairwise import cosine_similarity
import fitz  # PyMuPDF
from fuzzywuzzy import fuzz  # For string matching
from fuzzywuzzy import process  # For matching
import webbrowser  # To open PDFs
import warnings
warnings.filterwarnings('ignore')

# Load the pre-trained models used for image feature extraction.
# NOTE(review): `pretrained=True` is presumably reported as deprecated by newer
# torchvision releases in favour of `weights=...` -- confirm against the
# installed version (warnings are silenced above, so nothing would be shown).
vgg_model = models.vgg19(pretrained=True)
resnet_model = models.resnet50(pretrained=True)
# eval() puts both networks in inference mode.
vgg_model.eval()
resnet_model.eval()

# Image preprocessing transformations: fixed 224x224 input, tensor conversion,
# then per-channel normalisation with the mean/std values listed below.
preprocess = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Extract text from PDF
def extract_text_from_pdf(pdf_path):
    """Return the cleaned plain text of every page of a PDF.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages concatenated in page order and passed through
        ``clean_text``.
    """
    # Open the document by path and let the context manager close it, so the
    # handle is released even when text extraction raises.
    with fitz.open(pdf_path) as document:
        page_texts = [page.get_text() for page in document]
    # Join once instead of growing a string page by page.
    return clean_text("".join(page_texts))

# Clean extracted text
def clean_text(text):
    """Normalise extracted PDF text to a single ASCII line.

    Whitespace runs become one space, then every run of non-ASCII characters
    is removed, and finally leading/trailing whitespace is stripped. Because
    the non-ASCII removal runs after the whitespace collapse, two spaces can
    remain where a non-ASCII token stood between spaces.
    """
    substitutions = (
        (r'\s+', ' '),
        (r'\n+', ' '),
        (r'[^\x00-\x7F]+', ''),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip()

# Extract specific data from cleaned text
def extract_specifications(text):
    """Extract lighting specification values from datasheet text.

    Args:
        text: The text to scan.

    Returns:
        A dict mapping each field label to a de-duplicated list of the matched
        strings. Numeric fields are sorted numerically, voltage entries are
        either ``"N"`` or ``"LOW-HIGH"`` (sorted by their first number), and
        part numbers are sorted lexicographically.

    NOTE(review): ``clean_text`` in this file strips all non-ASCII characters,
    including the degree sign, so text that went through it cannot yield any
    beam angle -- confirm what the intended pipeline is.
    """
    power_pattern = r'\b(\d{1,4})\s*W\b'
    voltage_pattern = r'(\d{2,3})\s*-\s*(\d{2,3})\s*V|\b(\d{1,3})\s*V\b'
    current_pattern = r'(\d{1,3}\.\d{1,3})\s*A'
    lumens_pattern = r'(\d{3,6}(\.\d+)?)\s*Lumens?\b'
    efficacy_pattern = r'\b(\d{2,4}\.\d+|\d{2,4})\s*lm/W\b'
    cct_pattern = r'\b(\d{4})\s*K\b'
    # The degree sign is a non-word character, so a trailing \b would only
    # match when a letter or digit follows it directly ("40°C") and would miss
    # the normal "120° " case. (?!\w) accepts a degree sign followed by space,
    # punctuation or end of text and rejects unit suffixes such as "°C".
    beam_angle_pattern = r'\b(\d{1,3})\s*[°°](?!\w)'
    ordering_part_number_pattern = r'\b(?:OPN|Ordering Part Number|Part Number|Order Number|Item Number)\s*:\s*([a-zA-Z0-9\-]+)\b'

    power = re.findall(power_pattern, text)
    voltage = re.findall(voltage_pattern, text)
    current = re.findall(current_pattern, text)
    lumens = re.findall(lumens_pattern, text)
    efficacy = re.findall(efficacy_pattern, text)
    cct = re.findall(cct_pattern, text)
    beam_angles = re.findall(beam_angle_pattern, text)
    ordering_part_numbers = re.findall(ordering_part_number_pattern, text)

    # Each voltage match is (low, high, single): either the range groups or the
    # single-value group is filled, depending on which alternative matched.
    voltage_values = set()
    for low, high, single in voltage:
        if low and high:
            voltage_values.add(f"{low}-{high}")
        elif single:
            voltage_values.add(single)

    return {
        "Power (W)": sorted(set(power), key=int),
        # split('-')[0] is the whole string for single values, the low end for ranges.
        "Voltage (V)": sorted(voltage_values, key=lambda value: int(value.split('-')[0])),
        "Current (A)": sorted(set(current), key=float),
        # findall returns (number, fraction) tuples here; keep the full number.
        "Lumens": sorted({match[0] for match in lumens}, key=float),
        "Efficacy (lm/W)": sorted(set(efficacy), key=float),
        "CCT (K)": sorted(set(cct), key=int),
        "Beam Angles (°)": sorted(set(beam_angles), key=int),
        "Ordering Part Number": sorted(set(ordering_part_numbers)),
    }

# Extract images from PDF
def extract_images_from_pdf(pdf_path):
    """Return every image embedded in a PDF as an RGB PIL image.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A list of ``PIL.Image`` objects in page order. An image referenced on
        several pages appears once per reference.
    """
    images = []
    # The context manager closes the document even if an image fails to decode.
    with fitz.open(pdf_path) as document:
        for page in document:
            for image_info in page.get_images(full=True):
                xref = image_info[0]  # first item of each entry is the image xref
                embedded = document.extract_image(xref)
                picture = Image.open(io.BytesIO(embedded["image"])).convert('RGB')
                images.append(picture)
    return images

# Extract image features
def extract_image_features(image):
    """Build one feature vector for an image from three weighted descriptors.

    The VGG 'features' activations, the ResNet output and the HOG descriptor
    are scaled by 0.5, 0.3 and 0.2 respectively and concatenated in that order.
    """
    weighted_parts = (
        (extract_intermediate_features(image, vgg_model, 'features'), 0.5),
        (extract_resnet_features(image), 0.3),
        (extract_hog_features(image), 0.2),
    )
    return np.concatenate([vector * weight for vector, weight in weighted_parts])

def extract_intermediate_features(image, model, layer):
    """Run ``image`` through ``model`` and return one submodule's output.

    Args:
        image: Image accepted by the module-level ``preprocess`` pipeline.
        model: The network to run.
        layer: Name of a direct child module of ``model`` whose output is
            captured with a forward hook.

    Returns:
        The captured output flattened to a 1-D NumPy array.
    """
    captured = {}

    def hook_fn(module, input, output):
        captured['output'] = output

    handle = model._modules.get(layer).register_forward_hook(hook_fn)
    try:
        input_tensor = preprocess(image).unsqueeze(0)  # add the batch dimension
        with torch.no_grad():
            model(input_tensor)
    finally:
        # Always detach the hook; otherwise a failed forward pass would leave
        # it registered on the shared model for all later calls.
        handle.remove()
    return captured['output'].flatten().numpy()

def extract_resnet_features(image):
    """Return the module-level ResNet's output for ``image`` as a flat NumPy array."""
    batch = preprocess(image).unsqueeze(0)  # single-image batch
    with torch.no_grad():
        output = resnet_model(batch)
    return output.flatten().numpy()

def extract_hog_features(image):
    """Return the HOG descriptor of ``image`` after resizing it to 128x128 grayscale.

    scikit-image is imported lazily inside the function, as in the rest of
    this file's usage of it.
    """
    from skimage.color import rgb2gray
    from skimage.feature import hog

    grayscale = rgb2gray(np.array(image.resize((128, 128))))
    return hog(
        grayscale,
        orientations=8,
        pixels_per_cell=(16, 16),
        cells_per_block=(1, 1),
        visualize=False,
        feature_vector=True,
    )

def calculate_similarity(input_data, pdf_data):
    """Score how well a PDF's extracted specifications match the user's query.

    Only fields for which the user supplied at least one non-empty value take
    part. Numerical fields score the best relative closeness that lies within
    the field's tolerance (0.0 when nothing is close enough); categorical
    fields score the best fuzzy string match. The result is the mean of the
    per-field scores.

    Args:
        input_data: dict mapping field label to a list of user-entered strings.
        pdf_data: dict mapping field label to a list of strings extracted from
            the PDF. Numerical entries may be single numbers or ``"LOW-HIGH"``
            ranges.

    Returns:
        The average similarity as a float, or 0.0 when no field was supplied.
    """
    # Percent tolerances per numerical field; 5% is used for unlisted fields.
    tolerances = {
        "Power (W)": 5,  # Strict tolerance for Power (W)
        "Voltage (V)": 10,  # A little leniency for Voltage ranges
        "Current (A)": 5,
        "Lumens": 10,
        "Efficacy (lm/W)": 10,
    }
    numerical_fields = ["Power (W)", "Voltage (V)", "Current (A)", "Lumens", "Efficacy (lm/W)"]
    categorical_fields = ["CCT (K)", "Beam Angles (°)", "Ordering Part Number"]

    def parse_bounds(raw_value):
        """Return [number] for "12", [low, high] for "100-277", [] if unparseable."""
        text = str(raw_value).strip()
        try:
            return [float(text)]
        except ValueError:
            pass
        low, separator, high = text.partition('-')
        if not separator:
            return []
        try:
            return [float(low), float(high)]
        except ValueError:
            return []

    def relative_similarity(target, candidate):
        """Return 1 minus the relative deviation of candidate from target."""
        if target == 0:
            # Relative deviation is undefined for 0; only an exact 0 matches.
            return 1.0 if candidate == 0 else 0.0
        # abs() in the denominator keeps the score <= 1 for negative targets.
        return 1 - abs(target - candidate) / abs(target)

    def compare_with_tolerance(input_list, pdf_list, tolerance=5):
        """Return the best in-tolerance similarity between the two lists, else 0.0."""
        if not input_list or not pdf_list:
            return 0.0
        threshold = 1 - (tolerance / 100)
        best = 0.0
        for raw_input in input_list:
            try:
                target = float(raw_input)
            except (TypeError, ValueError):
                continue  # blank or non-numeric user input
            for raw_pdf in pdf_list:
                # Parse each PDF value on its own so one unparseable entry
                # (e.g. a range) no longer aborts the remaining comparisons.
                bounds = parse_bounds(raw_pdf)
                if not bounds:
                    continue
                if len(bounds) == 2 and min(bounds) <= target <= max(bounds):
                    similarity = 1.0  # the query lies inside the advertised range
                else:
                    similarity = max(relative_similarity(target, bound) for bound in bounds)
                if similarity >= threshold:
                    best = max(best, similarity)
        return best

    total_similarity = 0.0
    num_fields = 0

    # Compare numerical fields
    for field in numerical_fields:
        if input_data[field] and any(item != '' for item in input_data[field]):
            num_fields += 1
            tolerance = tolerances.get(field, 5)
            total_similarity += compare_with_tolerance(input_data[field], pdf_data[field], tolerance=tolerance)

    # Compare categorical fields using fuzzy matching
    for field in categorical_fields:
        queries = [item for item in input_data[field] if item != '']
        if not queries:
            continue
        num_fields += 1
        match_scores = []
        for item in queries:
            best_match = process.extractOne(item, pdf_data[field], scorer=fuzz.token_sort_ratio)
            if best_match:
                match_scores.append(best_match[1] / 100.0)  # scorer returns 0-100
        if match_scores:
            total_similarity += max(match_scores)

    # Avoid division by zero when the user filled in nothing.
    if num_fields == 0:
        return 0.0
    return total_similarity / num_fields

def calculate_image_similarity(features1, features2):
    """Return the cosine similarity of two feature vectors.

    Yields 0.0 when either vector is empty; otherwise both are reshaped to a
    single row and compared with scikit-learn's ``cosine_similarity``.
    """
    if 0 in (features1.size, features2.size):
        return 0.0
    row1, row2 = features1.reshape(1, -1), features2.reshape(1, -1)
    return cosine_similarity(row1, row2)[0][0]

# Find Top 10 similar PDFs with improved progress bar
def find_top_10_similar_pdfs(folder_path, input_data, input_image_path=None):
    """Rank every PDF under ``folder_path`` against the query and show the top 10.

    Each PDF gets the specification similarity from ``calculate_similarity``;
    when a query image is given, the best cosine similarity between it and any
    image embedded in the PDF is added on top. Progress, percentage and an ETA
    are shown while processing, and the ten best results are written to the
    result box as clickable paths.

    Args:
        folder_path: Directory searched recursively for PDF files.
        input_data: Query specification dict passed to ``calculate_similarity``.
        input_image_path: Optional path of a query image; falsy to skip image
            comparison.
    """
    # Collect all PDFs first so the progress total is known before any work
    # starts (a per-directory running total makes percentage and ETA wrong).
    pdf_files = []
    for root, _, files in os.walk(folder_path):
        for name in files:
            if name.lower().endswith('.pdf'):  # also accept ".PDF"
                pdf_files.append(os.path.join(root, name))
    total_pdfs = len(pdf_files)

    progress_bar['maximum'] = max(total_pdfs, 1)
    progress_bar['value'] = 0  # start clean even if an earlier run aborted

    # The query image does not depend on the PDF being scored, so run the
    # expensive feature extraction once instead of once per PDF.
    input_image_features = None
    if input_image_path:
        input_image = Image.open(input_image_path).convert('RGB')
        input_image_features = extract_image_features(input_image)

    pdf_scores = []
    start_time = time.time()

    for processed_pdfs, pdf_file in enumerate(pdf_files, start=1):
        pdf_text = extract_text_from_pdf(pdf_file)
        extracted_data = extract_specifications(pdf_text)
        combined_score = calculate_similarity(input_data, extracted_data)

        if input_image_features is not None:
            max_image_similarity = 0
            for img in extract_images_from_pdf(pdf_file):
                image_features = extract_image_features(img)
                image_similarity = calculate_image_similarity(input_image_features, image_features)
                max_image_similarity = max(max_image_similarity, image_similarity)
            combined_score += max_image_similarity

        pdf_scores.append((pdf_file, combined_score))
        progress_bar['value'] = processed_pdfs

        # Estimate the remaining time from the average time per PDF so far.
        elapsed_time = time.time() - start_time
        eta = (total_pdfs - processed_pdfs) * (elapsed_time / processed_pdfs)
        eta_minutes, eta_seconds = divmod(eta, 60)
        eta_formatted = f"{int(eta_minutes)} min {int(eta_seconds)} sec"

        percentage = (processed_pdfs / total_pdfs) * 100
        progress_label.config(text=f"Progress: {processed_pdfs}/{total_pdfs} ({percentage:.2f}%) | ETA: {eta_formatted}")

        # Let Tk redraw; the search runs on the GUI thread.
        window.update()

    # Sort and get top 10 results
    pdf_scores = sorted(pdf_scores, key=lambda item: item[1], reverse=True)[:10]

    # Reset progress bar and label
    progress_bar['value'] = 0
    progress_label.config(text="Progress: 0%")

    result_box.delete(1.0, tk.END)
    for rank, (pdf_path, score) in enumerate(pdf_scores, start=1):
        tag = f"pdf{rank}"
        result_box.insert(tk.END, f"Top {rank}: {os.path.basename(pdf_path)} | Similarity Score: {score:.4f}\n")
        # Render the path as a hyperlink; p=pdf_path binds the current path to
        # the callback instead of the loop variable's final value.
        result_box.tag_configure(tag, foreground="blue", underline=True)
        result_box.insert(tk.END, f"{pdf_path}\n", tag)
        result_box.tag_bind(tag, "<Button-1>", lambda e, p=pdf_path: open_pdf(p))

# Open PDF
def open_pdf(pdf_path):
    """Open the PDF at ``pdf_path`` with the system's default handler.

    The path is converted to a proper ``file:`` URI so that Windows drive
    letters, spaces and other special characters are encoded correctly;
    plain string concatenation produced malformed URLs for such paths.
    """
    from pathlib import Path

    # as_uri() requires an absolute path, hence resolve().
    webbrowser.open(Path(pdf_path).resolve().as_uri())

# GUI Setup
def browse_folder():
    """Ask the user for a directory and put its path into the folder entry.

    The entry is left untouched when the dialog is cancelled, so a previously
    chosen folder is not wiped by an accidental click on "Browse".
    """
    folder_selected = filedialog.askdirectory()
    if not folder_selected:
        # askdirectory returns an empty value when the user cancels.
        return
    folder_entry.delete(0, tk.END)
    folder_entry.insert(0, folder_selected)

def browse_image():
    """Ask the user for an image file and put its path into the image entry.

    The entry is left untouched when the dialog is cancelled.
    """
    # Tk expects the patterns as separate items; a single "a;b;c" string is
    # one glob that only some native dialogs happen to understand.
    image_file = filedialog.askopenfilename(
        filetypes=[("Image files", ("*.png", "*.jpg", "*.jpeg"))]
    )
    if not image_file:
        return
    image_entry.delete(0, tk.END)
    image_entry.insert(0, image_file)

def process_pdfs():
    """Read the form and start the similarity search over the chosen folder.

    Every specification entry is wrapped in a one-element list, the shape
    ``calculate_similarity`` iterates over. An error dialog is shown instead
    of searching when no folder has been entered.
    """
    folder_path = folder_entry.get()
    input_image_path = image_entry.get()

    field_widgets = (
        ("Power (W)", power_entry),
        ("Voltage (V)", voltage_entry),
        ("Current (A)", current_entry),
        ("Efficacy (lm/W)", efficacy_entry),
        ("CCT (K)", cct_entry),
        ("Lumens", lumens_entry),
        ("Beam Angles (°)", beam_angle_entry),
        ("Ordering Part Number", ordering_part_number_entry),
    )
    input_data = {label: [widget.get()] for label, widget in field_widgets}

    if folder_path:
        find_top_10_similar_pdfs(folder_path, input_data, input_image_path)
    else:
        messagebox.showerror("Error", "Please select a folder.")

# Create the main window
window = tk.Tk()
window.title("PDF Similarity Finder")
window.geometry("1000x600")  # Increased window width for image display
window.configure(bg="#f2f2f2")  # Light grey background

# Allow the window to be resizable: extra horizontal space is shared between
# the four grid columns by these weights (the entry column gets the most),
# and extra vertical space goes to row 11.
window.columnconfigure(0, weight=1)
window.columnconfigure(1, weight=3)
window.columnconfigure(2, weight=1)
window.columnconfigure(3, weight=1)
window.rowconfigure(11, weight=1)

# Global font settings
title_font = ("Helvetica", 16, "bold")
label_font = ("Helvetica", 12)
entry_font = ("Helvetica", 10)

# Styling for the buttons; unpacked with ** into every tk.Button below
button_style = {
    "bg": "#4CAF50",
    "fg": "white",
    "font": ("Helvetica", 10, "bold"),
    "activebackground": "#45a049",
    "relief": tk.RAISED
}

# Title label shown at the top of the window ("Product Finder")
title_label_cv = tk.Label(window, text="Product Finder", font=title_font, bg="#f2f2f2")
title_label_cv.grid(row=0, column=2, pady=20, sticky="w")  # Align to the west (left)

# Folder selection: label, path entry and a Browse button on grid row 1
folder_label = tk.Label(window, text="Select Folder:", font=label_font, bg="#f2f2f2")
folder_label.grid(row=1, column=0, padx=10, pady=10, sticky=tk.W)
folder_entry = tk.Entry(window, width=50, font=entry_font)
folder_entry.grid(row=1, column=1, padx=10, pady=10, sticky="ew")
# The lambda looks browse_folder up at click time, so the last definition of
# that name in the script is the one that runs.
folder_button = tk.Button(window, text="Browse", command=lambda: browse_folder(), **button_style)
folder_button.grid(row=1, column=2, padx=10, pady=10)

# Image selection: label, path entry and a Browse button on grid row 2
image_label = tk.Label(window, text="Input Image:", font=label_font, bg="#f2f2f2")
image_label.grid(row=2, column=0, padx=10, pady=10, sticky=tk.W)
image_entry = tk.Entry(window, width=50, font=entry_font)
image_entry.grid(row=2, column=1, padx=10, pady=10, sticky="ew")
image_button = tk.Button(window, text="Browse", command=lambda: browse_image(), **button_style)
image_button.grid(row=2, column=2, padx=10, pady=10)

# Input fields for specifications.
# The order of these labels must match the unpacking order on the last line of
# this block, because the entries are assigned to names purely by position.
labels = ["Power (W):", "Voltage (V):", "Current (A):", "Efficacy (lm/W):", "CCT (K):", "Lumens (lm):", "Beam Angle (°):", "Ordering Part Number:"]
entries = []

# One label/entry pair per specification, placed on grid rows 3 onwards
for i, label_text in enumerate(labels):
    label = tk.Label(window, text=label_text, font=label_font, bg="#f2f2f2")
    label.grid(row=i + 3, column=0, padx=10, pady=5, sticky=tk.W)
    entry = tk.Entry(window, width=20, font=entry_font)  # Reduced the width from 50 to 20
    entry.grid(row=i + 3, column=1, padx=10, pady=5, sticky="ew")
    entries.append(entry)

# Named handles for the individual entries (read by process_pdfs)
power_entry, voltage_entry, current_entry, efficacy_entry, cct_entry, lumens_entry, beam_angle_entry, ordering_part_number_entry = entries

# Create a frame for the progress bar and label
progress_frame = tk.Frame(window, bg="#f2f2f2")
progress_frame.grid(row=14, column=0, columnspan=3, padx=10, pady=10, sticky="ew")  # Place it below other elements

# Progress bar inside the frame. Its 'maximum' and 'value' options are driven
# by find_top_10_similar_pdfs while a search is running.
progress_bar = ttk.Progressbar(progress_frame, orient="horizontal", length=500, mode="determinate")  # Increased length
progress_bar.pack(side=tk.LEFT, padx=10, pady=10)  # Align left

# Progress label inside the frame (text is rewritten with count, percentage and ETA)
progress_label = tk.Label(progress_frame, text="Progress: 0%", font=label_font, bg="#f2f2f2")
progress_label.pack(side=tk.LEFT, padx=10, pady=10)  # Align left next to the progress bar

# Result box (scrollable) below the image
result_frame = tk.Frame(window, bg="#f2f2f2")
result_frame.grid(row=4, column=3, rowspan=7, padx=10, pady=10, sticky="nsew")
# NOTE(review): the children of result_frame are packed, not gridded, so this
# column weight presumably has no visible effect -- confirm before relying on it.
result_frame.grid_columnconfigure(0, weight=1)  # Allow the frame to expand with the window

result_box = Text(result_frame, height=15, width=80, font=entry_font, wrap=tk.WORD)
result_box.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)

# Wire the scrollbar and the text widget to each other (both directions are required)
scrollbar = ttk.Scrollbar(result_frame, command=result_box.yview)
scrollbar.pack(side=tk.RIGHT, fill=tk.Y)
result_box.config(yscrollcommand=scrollbar.set)

# Submit button (Find Top 10 PDFs).
# The lambda defers the name lookup until the click, so the callback may be
# (re)defined anywhere in the script before the button is pressed.
submit_button = tk.Button(window, text="Find Top 10 PDF's", command=lambda: process_pdfs(), **button_style)
submit_button.config(width=20)  # Set a reduced width for the button
submit_button.grid(row=13, column=3, padx=10, pady=10, sticky="ew")

# Add "Clear Sections" button (clear_sections is defined further down; the
# lambda resolves it at click time)
clear_button = tk.Button(window, text="Clear Sections", command=lambda: clear_sections(), **button_style)
clear_button.config(width=20)  # Set a reduced width for the button
clear_button.grid(row=14, column=3, padx=10, pady=10, sticky="ew")

# Image display (right side of the window)
image_display_label = tk.Label(window, text="Image Preview", font=label_font, bg="#f2f2f2")
image_display_label.grid(row=1, column=3, padx=10, pady=(10, 5), sticky=tk.NW)  # Align at top-left of the right side
# Despite its name, image_canvas is a tk.Label; display_image sets its image option.
image_canvas = tk.Label(window, bg="#f2f2f2")
image_canvas.grid(row=2, column=3, padx=10, pady=(0, 20), sticky=tk.NW)  # Ensure it's right under the label

# Function to clear all input fields and reset image display
def clear_sections():
    """Reset the form: empty every entry, drop the preview image and the results."""
    # Specification entries first, then the folder and image path entries.
    for field in (*entries, folder_entry, image_entry):
        field.delete(0, tk.END)

    # An empty image option removes the picture from the preview label.
    image_canvas.config(image='')

    # Wipe the previous search results as well.
    result_box.delete(1.0, tk.END)

# Function for folder selection
def browse_folder():
    """Ask the user for a directory and put its path into the folder entry.

    The entry is left untouched when the dialog is cancelled, so a previously
    chosen folder is not wiped by an accidental click on "Browse".
    """
    folder_path = filedialog.askdirectory()
    if not folder_path:
        # askdirectory returns an empty value when the user cancels.
        return
    folder_entry.delete(0, tk.END)
    folder_entry.insert(0, folder_path)

# Function for image selection and display
def browse_image():
    """Let the user pick an image file, store its path and show a preview.

    Nothing changes when the dialog is cancelled.
    """
    chosen = filedialog.askopenfilename(filetypes=[("Image files", "*.jpg *.png *.jpeg")])
    if not chosen:
        return
    image_entry.delete(0, tk.END)
    image_entry.insert(0, chosen)
    display_image(chosen)

# Function to display the image on the right side
def display_image(image_path):
    """Render a thumbnail of the image at ``image_path`` in the preview label.

    The picture is shrunk in place to fit a 250x200 box; ``thumbnail`` keeps
    the aspect ratio.
    """
    preview_box = (250, 200)  # (max width, max height) in pixels

    picture = Image.open(image_path)
    picture.thumbnail(preview_box)

    photo = ImageTk.PhotoImage(picture)
    # Keep a reference on the widget: Tk only holds the image name, so the
    # PhotoImage would otherwise be garbage collected and the preview go blank.
    image_canvas.image = photo
    image_canvas.config(image=photo)

# Run the GUI loop: hands control to Tk's event loop, which dispatches the
# button callbacks defined above. In a notebook cell this call does not return
# until the window is closed.
window.mainloop()

## Drag and Drop option

In [6]:
import os
import re
import time
import io
import tkinter as tk
from tkinter import filedialog, messagebox, ttk
from tkinterdnd2 import DND_FILES, TkinterDnD  # Import the drag-and-drop library
from tkinter import Text
from PIL import Image, ImageTk
import numpy as np
import torch
import torchvision.transforms as transforms
from torchvision import models
from sklearn.metrics.pairwise import cosine_similarity
import fitz  # PyMuPDF
from fuzzywuzzy import fuzz  # For string matching
from fuzzywuzzy import process  # For matching
import webbrowser  # To open PDFs
import warnings
warnings.filterwarnings('ignore')

# Load the pre-trained models used for image feature extraction.
# NOTE(review): `pretrained=True` is presumably reported as deprecated by newer
# torchvision releases in favour of `weights=...` -- confirm against the
# installed version (warnings are silenced above, so nothing would be shown).
vgg_model = models.vgg19(pretrained=True)
resnet_model = models.resnet50(pretrained=True)
# eval() puts both networks in inference mode.
vgg_model.eval()
resnet_model.eval()

# Image preprocessing transformations: fixed 224x224 input, tensor conversion,
# then per-channel normalisation with the mean/std values listed below.
preprocess = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Extract text from PDF
def extract_text_from_pdf(pdf_path):
    """Return the cleaned plain text of every page of a PDF.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages concatenated in page order and passed through
        ``clean_text``.
    """
    # Open the document by path and let the context manager close it, so the
    # handle is released even when text extraction raises.
    with fitz.open(pdf_path) as document:
        page_texts = [page.get_text() for page in document]
    # Join once instead of growing a string page by page.
    return clean_text("".join(page_texts))

# Clean extracted text
def clean_text(text):
    """Normalise extracted PDF text to a single ASCII line.

    Whitespace runs become one space, then every run of non-ASCII characters
    is removed, and finally leading/trailing whitespace is stripped. Because
    the non-ASCII removal runs after the whitespace collapse, two spaces can
    remain where a non-ASCII token stood between spaces.
    """
    substitutions = (
        (r'\s+', ' '),
        (r'\n+', ' '),
        (r'[^\x00-\x7F]+', ''),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip()

# Extract specific data from cleaned text
def extract_specifications(text):
    """Extract lighting specification values from datasheet text.

    Args:
        text: The text to scan.

    Returns:
        A dict mapping each field label to a de-duplicated list of the matched
        strings. Numeric fields are sorted numerically, voltage entries are
        either ``"N"`` or ``"LOW-HIGH"`` (sorted by their first number), and
        part numbers are sorted lexicographically.

    NOTE(review): ``clean_text`` in this file strips all non-ASCII characters,
    including the degree sign, so text that went through it cannot yield any
    beam angle -- confirm what the intended pipeline is.
    """
    power_pattern = r'\b(\d{1,4})\s*W\b'
    voltage_pattern = r'(\d{2,3})\s*-\s*(\d{2,3})\s*V|\b(\d{1,3})\s*V\b'
    current_pattern = r'(\d{1,3}\.\d{1,3})\s*A'
    lumens_pattern = r'(\d{3,6}(\.\d+)?)\s*Lumens?\b'
    efficacy_pattern = r'\b(\d{2,4}\.\d+|\d{2,4})\s*lm/W\b'
    cct_pattern = r'\b(\d{4})\s*K\b'
    # The degree sign is a non-word character, so a trailing \b would only
    # match when a letter or digit follows it directly ("40°C") and would miss
    # the normal "120° " case. (?!\w) accepts a degree sign followed by space,
    # punctuation or end of text and rejects unit suffixes such as "°C".
    beam_angle_pattern = r'\b(\d{1,3})\s*[°°](?!\w)'
    ordering_part_number_pattern = r'\b(?:OPN|Ordering Part Number|Part Number|Order Number|Item Number)\s*:\s*([a-zA-Z0-9\-]+)\b'

    power = re.findall(power_pattern, text)
    voltage = re.findall(voltage_pattern, text)
    current = re.findall(current_pattern, text)
    lumens = re.findall(lumens_pattern, text)
    efficacy = re.findall(efficacy_pattern, text)
    cct = re.findall(cct_pattern, text)
    beam_angles = re.findall(beam_angle_pattern, text)
    ordering_part_numbers = re.findall(ordering_part_number_pattern, text)

    # Each voltage match is (low, high, single): either the range groups or the
    # single-value group is filled, depending on which alternative matched.
    voltage_values = set()
    for low, high, single in voltage:
        if low and high:
            voltage_values.add(f"{low}-{high}")
        elif single:
            voltage_values.add(single)

    return {
        "Power (W)": sorted(set(power), key=int),
        # split('-')[0] is the whole string for single values, the low end for ranges.
        "Voltage (V)": sorted(voltage_values, key=lambda value: int(value.split('-')[0])),
        "Current (A)": sorted(set(current), key=float),
        # findall returns (number, fraction) tuples here; keep the full number.
        "Lumens": sorted({match[0] for match in lumens}, key=float),
        "Efficacy (lm/W)": sorted(set(efficacy), key=float),
        "CCT (K)": sorted(set(cct), key=int),
        "Beam Angles (°)": sorted(set(beam_angles), key=int),
        "Ordering Part Number": sorted(set(ordering_part_numbers)),
    }

# Extract images from PDF
def extract_images_from_pdf(pdf_path):
    """Return every image embedded in a PDF as an RGB PIL image.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A list of ``PIL.Image`` objects in page order. An image referenced on
        several pages appears once per reference.
    """
    images = []
    # The context manager closes the document even if an image fails to decode.
    with fitz.open(pdf_path) as document:
        for page in document:
            for image_info in page.get_images(full=True):
                xref = image_info[0]  # first item of each entry is the image xref
                embedded = document.extract_image(xref)
                picture = Image.open(io.BytesIO(embedded["image"])).convert('RGB')
                images.append(picture)
    return images

# Extract image features
def extract_image_features(image):
    """Build one feature vector for an image from three weighted descriptors.

    The VGG 'features' activations, the ResNet output and the HOG descriptor
    are scaled by 0.5, 0.3 and 0.2 respectively and concatenated in that order.
    """
    weighted_parts = (
        (extract_intermediate_features(image, vgg_model, 'features'), 0.5),
        (extract_resnet_features(image), 0.3),
        (extract_hog_features(image), 0.2),
    )
    return np.concatenate([vector * weight for vector, weight in weighted_parts])

def extract_intermediate_features(image, model, layer):
    """Run ``image`` through ``model`` and return one submodule's output.

    Args:
        image: Image accepted by the module-level ``preprocess`` pipeline.
        model: The network to run.
        layer: Name of a direct child module of ``model`` whose output is
            captured with a forward hook.

    Returns:
        The captured output flattened to a 1-D NumPy array.
    """
    captured = {}

    def hook_fn(module, input, output):
        captured['output'] = output

    handle = model._modules.get(layer).register_forward_hook(hook_fn)
    try:
        input_tensor = preprocess(image).unsqueeze(0)  # add the batch dimension
        with torch.no_grad():
            model(input_tensor)
    finally:
        # Always detach the hook; otherwise a failed forward pass would leave
        # it registered on the shared model for all later calls.
        handle.remove()
    return captured['output'].flatten().numpy()

def extract_resnet_features(image):
    """Return the module-level ResNet's output for ``image`` as a flat NumPy array."""
    batch = preprocess(image).unsqueeze(0)  # single-image batch
    with torch.no_grad():
        output = resnet_model(batch)
    return output.flatten().numpy()

def extract_hog_features(image):
    """Return the HOG descriptor of ``image`` after resizing it to 128x128 grayscale.

    scikit-image is imported lazily inside the function, as in the rest of
    this file's usage of it.
    """
    from skimage.color import rgb2gray
    from skimage.feature import hog

    grayscale = rgb2gray(np.array(image.resize((128, 128))))
    return hog(
        grayscale,
        orientations=8,
        pixels_per_cell=(16, 16),
        cells_per_block=(1, 1),
        visualize=False,
        feature_vector=True,
    )

def calculate_similarity(input_data, pdf_data):
    """Score how well a PDF's extracted specifications match the user's query.

    Only fields for which the user supplied at least one non-empty value take
    part. Numerical fields score the best relative closeness that lies within
    the field's tolerance (0.0 when nothing is close enough); categorical
    fields score the best fuzzy string match. The result is the mean of the
    per-field scores.

    Args:
        input_data: dict mapping field label to a list of user-entered strings.
        pdf_data: dict mapping field label to a list of strings extracted from
            the PDF. Numerical entries may be single numbers or ``"LOW-HIGH"``
            ranges.

    Returns:
        The average similarity as a float, or 0.0 when no field was supplied.
    """
    # Percent tolerances per numerical field; 5% is used for unlisted fields.
    tolerances = {
        "Power (W)": 5,  # Strict tolerance for Power (W)
        "Voltage (V)": 10,  # A little leniency for Voltage ranges
        "Current (A)": 5,
        "Lumens": 10,
        "Efficacy (lm/W)": 10,
    }
    numerical_fields = ["Power (W)", "Voltage (V)", "Current (A)", "Lumens", "Efficacy (lm/W)"]
    categorical_fields = ["CCT (K)", "Beam Angles (°)", "Ordering Part Number"]

    def parse_bounds(raw_value):
        """Return [number] for "12", [low, high] for "100-277", [] if unparseable."""
        text = str(raw_value).strip()
        try:
            return [float(text)]
        except ValueError:
            pass
        low, separator, high = text.partition('-')
        if not separator:
            return []
        try:
            return [float(low), float(high)]
        except ValueError:
            return []

    def relative_similarity(target, candidate):
        """Return 1 minus the relative deviation of candidate from target."""
        if target == 0:
            # Relative deviation is undefined for 0; only an exact 0 matches.
            return 1.0 if candidate == 0 else 0.0
        # abs() in the denominator keeps the score <= 1 for negative targets.
        return 1 - abs(target - candidate) / abs(target)

    def compare_with_tolerance(input_list, pdf_list, tolerance=5):
        """Return the best in-tolerance similarity between the two lists, else 0.0."""
        if not input_list or not pdf_list:
            return 0.0
        threshold = 1 - (tolerance / 100)
        best = 0.0
        for raw_input in input_list:
            try:
                target = float(raw_input)
            except (TypeError, ValueError):
                continue  # blank or non-numeric user input
            for raw_pdf in pdf_list:
                # Parse each PDF value on its own so one unparseable entry
                # (e.g. a range) no longer aborts the remaining comparisons.
                bounds = parse_bounds(raw_pdf)
                if not bounds:
                    continue
                if len(bounds) == 2 and min(bounds) <= target <= max(bounds):
                    similarity = 1.0  # the query lies inside the advertised range
                else:
                    similarity = max(relative_similarity(target, bound) for bound in bounds)
                if similarity >= threshold:
                    best = max(best, similarity)
        return best

    total_similarity = 0.0
    num_fields = 0

    # Compare numerical fields
    for field in numerical_fields:
        if input_data[field] and any(item != '' for item in input_data[field]):
            num_fields += 1
            tolerance = tolerances.get(field, 5)
            total_similarity += compare_with_tolerance(input_data[field], pdf_data[field], tolerance=tolerance)

    # Compare categorical fields using fuzzy matching
    for field in categorical_fields:
        queries = [item for item in input_data[field] if item != '']
        if not queries:
            continue
        num_fields += 1
        match_scores = []
        for item in queries:
            best_match = process.extractOne(item, pdf_data[field], scorer=fuzz.token_sort_ratio)
            if best_match:
                match_scores.append(best_match[1] / 100.0)  # scorer returns 0-100
        if match_scores:
            total_similarity += max(match_scores)

    # Avoid division by zero when the user filled in nothing.
    if num_fields == 0:
        return 0.0
    return total_similarity / num_fields

def calculate_image_similarity(features1, features2):
    """Return the cosine similarity of two feature vectors.

    Yields 0.0 when either vector is empty; otherwise both are reshaped to a
    single row and compared with scikit-learn's ``cosine_similarity``.
    """
    if 0 in (features1.size, features2.size):
        return 0.0
    row1, row2 = features1.reshape(1, -1), features2.reshape(1, -1)
    return cosine_similarity(row1, row2)[0][0]

# Find Top 10 similar PDFs with improved progress bar
def find_top_10_similar_pdfs(folder_path, input_data, input_image_path=None):
    """Rank every PDF under ``folder_path`` against the query and show the top 10.

    Each PDF gets the specification similarity from ``calculate_similarity``;
    when a query image is given, the best cosine similarity between it and any
    image embedded in the PDF is added on top. Progress, percentage and an ETA
    are shown while processing, and the ten best results are written to the
    result box as clickable paths.

    Args:
        folder_path: Directory searched recursively for PDF files.
        input_data: Query specification dict passed to ``calculate_similarity``.
        input_image_path: Optional path of a query image; falsy to skip image
            comparison.
    """
    # Collect all PDFs first so the progress total is known before any work
    # starts (a per-directory running total makes percentage and ETA wrong).
    pdf_files = []
    for root, _, files in os.walk(folder_path):
        for name in files:
            if name.lower().endswith('.pdf'):  # also accept ".PDF"
                pdf_files.append(os.path.join(root, name))
    total_pdfs = len(pdf_files)

    progress_bar['maximum'] = max(total_pdfs, 1)
    progress_bar['value'] = 0  # start clean even if an earlier run aborted

    # The query image does not depend on the PDF being scored, so run the
    # expensive feature extraction once instead of once per PDF.
    input_image_features = None
    if input_image_path:
        input_image = Image.open(input_image_path).convert('RGB')
        input_image_features = extract_image_features(input_image)

    pdf_scores = []
    start_time = time.time()

    for processed_pdfs, pdf_file in enumerate(pdf_files, start=1):
        pdf_text = extract_text_from_pdf(pdf_file)
        extracted_data = extract_specifications(pdf_text)
        combined_score = calculate_similarity(input_data, extracted_data)

        if input_image_features is not None:
            max_image_similarity = 0
            for img in extract_images_from_pdf(pdf_file):
                image_features = extract_image_features(img)
                image_similarity = calculate_image_similarity(input_image_features, image_features)
                max_image_similarity = max(max_image_similarity, image_similarity)
            combined_score += max_image_similarity

        pdf_scores.append((pdf_file, combined_score))
        progress_bar['value'] = processed_pdfs

        # Estimate the remaining time from the average time per PDF so far.
        elapsed_time = time.time() - start_time
        eta = (total_pdfs - processed_pdfs) * (elapsed_time / processed_pdfs)
        eta_minutes, eta_seconds = divmod(eta, 60)
        eta_formatted = f"{int(eta_minutes)} min {int(eta_seconds)} sec"

        percentage = (processed_pdfs / total_pdfs) * 100
        progress_label.config(text=f"Progress: {processed_pdfs}/{total_pdfs} ({percentage:.2f}%) | ETA: {eta_formatted}")

        # Let Tk redraw; the search runs on the GUI thread.
        window.update()

    # Sort and get top 10 results
    pdf_scores = sorted(pdf_scores, key=lambda item: item[1], reverse=True)[:10]

    # Reset progress bar and label
    progress_bar['value'] = 0
    progress_label.config(text="Progress: 0%")

    result_box.delete(1.0, tk.END)
    for rank, (pdf_path, score) in enumerate(pdf_scores, start=1):
        tag = f"pdf{rank}"
        result_box.insert(tk.END, f"Top {rank}: {os.path.basename(pdf_path)} | Similarity Score: {score:.4f}\n")
        # Render the path as a hyperlink; p=pdf_path binds the current path to
        # the callback instead of the loop variable's final value.
        result_box.tag_configure(tag, foreground="blue", underline=True)
        result_box.insert(tk.END, f"{pdf_path}\n", tag)
        result_box.tag_bind(tag, "<Button-1>", lambda e, p=pdf_path: open_pdf(p))

# Open PDF
def open_pdf(pdf_path):
    """Open a PDF with whatever application ``webbrowser`` launches for it.

    Args:
        pdf_path: Filesystem path of the PDF to open.

    Returns:
        None. The success flag returned by ``webbrowser.open`` is discarded.
    """
    # Function-scope import keeps this block self-contained.
    from pathlib import Path

    # as_uri() requires an absolute path and produces a correctly escaped
    # file:// URI (spaces, '#', Windows drive letters and backslashes), which
    # plain string concatenation with "file://" did not.
    webbrowser.open(Path(os.path.abspath(pdf_path)).as_uri())

# GUI Setup
def browse_folder():
    """Ask the user for a directory and store it in ``folder_entry``.

    A cancelled dialog leaves the current entry untouched.

    NOTE: ``browse_folder`` is defined again further down in this file; the
    "Browse" button calls it through a lambda, so the later definition is the
    one that runs once both have been executed.
    """
    folder_selected = filedialog.askdirectory()

    # askdirectory() returns an empty value when the dialog is cancelled;
    # without this guard a cancel wiped the folder that was already entered.
    if not folder_selected:
        return

    folder_entry.delete(0, tk.END)
    folder_entry.insert(0, folder_selected)

def browse_image():
    """Ask the user for an image file and store its path in ``image_entry``.

    A cancelled dialog leaves the current entry untouched.

    NOTE: ``browse_image`` is defined again further down in this file; the
    "Browse" button calls it through a lambda, so the later definition is the
    one that runs once both have been executed.
    """
    # Tk expects the patterns of one file type as a space-separated list;
    # the previous "*.png;*.jpg;*.jpeg" form is not portable and disagreed
    # with the later definition of this function.
    image_file = filedialog.askopenfilename(
        filetypes=[("Image files", "*.png *.jpg *.jpeg")]
    )

    # An empty value means the dialog was cancelled: keep the existing path.
    if not image_file:
        return

    image_entry.delete(0, tk.END)
    image_entry.insert(0, image_file)

def process_pdfs():
    """Validate the form and start the PDF similarity search.

    Reads the folder path, the optional image path and the specification
    entries from the GUI widgets, then calls ``find_top_10_similar_pdfs``.
    An error dialog is shown (and nothing is searched) when no folder is
    given, the folder does not exist, or the image path does not point to a
    file.
    """
    folder_path = folder_entry.get().strip()
    input_image_path = image_entry.get().strip()

    # Validate before doing any work so the user gets a dialog instead of an
    # empty result list (missing folder) or a traceback (missing image).
    if not folder_path:
        messagebox.showerror("Error", "Please select a folder.")
        return
    if not os.path.isdir(folder_path):
        messagebox.showerror("Error", f"Folder not found: {folder_path}")
        return
    if input_image_path and not os.path.isfile(input_image_path):
        messagebox.showerror("Error", f"Image file not found: {input_image_path}")
        return

    # Each value is wrapped in a one-element list because the similarity code
    # iterates over lists of candidate values; stripping keeps a
    # whitespace-only entry from counting as a filled-in field.
    input_data = {
        "Power (W)": [power_entry.get().strip()],
        "Voltage (V)": [voltage_entry.get().strip()],
        "Current (A)": [current_entry.get().strip()],
        "Efficacy (lm/W)": [efficacy_entry.get().strip()],
        "CCT (K)": [cct_entry.get().strip()],
        "Lumens": [lumens_entry.get().strip()],
        "Beam Angles (°)": [beam_angle_entry.get().strip()],
        "Ordering Part Number": [ordering_part_number_entry.get().strip()],
    }

    find_top_10_similar_pdfs(folder_path, input_data, input_image_path)

# Create the main window. TkinterDnD.Tk() is used rather than tk.Tk() so the
# window can be registered as a drag-and-drop target further down.
window = TkinterDnD.Tk()
window.title("PDF Similarity Finder")
window.geometry("1000x600")  # Initial size: 1000 px wide, 600 px high
window.configure(bg="#f2f2f2")  # Light grey background

# Grid weights control how extra space is shared when the window is resized:
# column 1 (the entry fields) gets three times the share of the other columns,
# and row 11 is the only row given a weight (no widget is placed in row 11 in
# this block, so it acts as a stretchable spacer).
window.columnconfigure(0, weight=1)
window.columnconfigure(1, weight=3)
window.columnconfigure(2, weight=1)
window.columnconfigure(3, weight=1)
window.rowconfigure(11, weight=1)

# Global font settings
title_font = ("Helvetica", 16, "bold")
label_font = ("Helvetica", 12)
entry_font = ("Helvetica", 10)

# Shared keyword arguments for every tk.Button (unpacked with ** below)
button_style = {
    "bg": "#4CAF50",
    "fg": "white",
    "font": ("Helvetica", 10, "bold"),
    "activebackground": "#45a049",
    "relief": tk.RAISED
}

# Title label ("Product Finder") shown in the top row
title_label_cv = tk.Label(window, text="Product Finder", font=title_font, bg="#f2f2f2")
title_label_cv.grid(row=0, column=2, pady=20, sticky="w")  # Align to the west (left)

# Folder selection (row 1). The button calls browse_folder through a lambda so
# the name is looked up at click time, not when the button is created.
folder_label = tk.Label(window, text="Select Folder:", font=label_font, bg="#f2f2f2")
folder_label.grid(row=1, column=0, padx=10, pady=10, sticky=tk.W)
folder_entry = tk.Entry(window, width=50, font=entry_font)
folder_entry.grid(row=1, column=1, padx=10, pady=10, sticky="ew")
folder_button = tk.Button(window, text="Browse", command=lambda: browse_folder(), **button_style)
folder_button.grid(row=1, column=2, padx=10, pady=10)

# Image selection (row 2); same lambda indirection as the folder button
image_label = tk.Label(window, text="Input Image:", font=label_font, bg="#f2f2f2")
image_label.grid(row=2, column=0, padx=10, pady=10, sticky=tk.W)
image_entry = tk.Entry(window, width=50, font=entry_font)
image_entry.grid(row=2, column=1, padx=10, pady=10, sticky="ew")
image_button = tk.Button(window, text="Browse", command=lambda: browse_image(), **button_style)
image_button.grid(row=2, column=2, padx=10, pady=10)

# Input fields for specifications: one label/entry pair per item, placed in
# consecutive rows starting at row 3 (rows 3-10 for these eight labels)
labels = ["Power (W):", "Voltage (V):", "Current (A):", "Efficacy (lm/W):", "CCT (K):", "Lumens (lm):", "Beam Angle (°):", "Ordering Part Number:"]
entries = []

for i, label_text in enumerate(labels):
    label = tk.Label(window, text=label_text, font=label_font, bg="#f2f2f2")
    label.grid(row=i + 3, column=0, padx=10, pady=5, sticky=tk.W)
    entry = tk.Entry(window, width=20, font=entry_font)  # Narrower than the path entries (20 vs 50)
    entry.grid(row=i + 3, column=1, padx=10, pady=5, sticky="ew")
    entries.append(entry)

# Give each entry a name; the order must match the order of `labels` above
power_entry, voltage_entry, current_entry, efficacy_entry, cct_entry, lumens_entry, beam_angle_entry, ordering_part_number_entry = entries

# Create a frame for the progress bar and label (row 14, spanning columns 0-2)
progress_frame = tk.Frame(window, bg="#f2f2f2")
progress_frame.grid(row=14, column=0, columnspan=3, padx=10, pady=10, sticky="ew")  # Place it below other elements

# Progress bar inside the frame; determinate mode, driven by 'value'/'maximum'
progress_bar = ttk.Progressbar(progress_frame, orient="horizontal", length=500, mode="determinate")  # 500 px long
progress_bar.pack(side=tk.LEFT, padx=10, pady=10)  # Align left

# Progress label inside the frame
progress_label = tk.Label(progress_frame, text="Progress: 0%", font=label_font, bg="#f2f2f2")
progress_label.pack(side=tk.LEFT, padx=10, pady=10)  # Align left next to the progress bar

# Result box (scrollable) in column 3, rows 4-10, below the image preview
result_frame = tk.Frame(window, bg="#f2f2f2")
result_frame.grid(row=4, column=3, rowspan=7, padx=10, pady=10, sticky="nsew")
result_frame.grid_columnconfigure(0, weight=1)  # NOTE(review): the frame's children are placed with pack(), so this grid weight probably has no effect - confirm

result_box = Text(result_frame, height=15, width=80, font=entry_font, wrap=tk.WORD)
result_box.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)

# Wire the scrollbar and the text widget to each other
scrollbar = ttk.Scrollbar(result_frame, command=result_box.yview)
scrollbar.pack(side=tk.RIGHT, fill=tk.Y)
result_box.config(yscrollcommand=scrollbar.set)

# Submit button (Find Top 10 PDFs)
submit_button = tk.Button(window, text="Find Top 10 PDF's", command=lambda: process_pdfs(), **button_style)
submit_button.config(width=20)  # Button width in text units
submit_button.grid(row=13, column=3, padx=10, pady=10, sticky="ew")

# "Clear Sections" button. clear_sections is defined further down in the file;
# the lambda defers the lookup until the button is clicked.
clear_button = tk.Button(window, text="Clear Sections", command=lambda: clear_sections(), **button_style)
clear_button.config(width=20)  # Button width in text units
clear_button.grid(row=14, column=3, padx=10, pady=10, sticky="ew")

# Image display (right side of the window). Despite its name, image_canvas is
# a tk.Label; the preview picture is set through its 'image' option.
image_display_label = tk.Label(window, text="Image Preview", font=label_font, bg="#f2f2f2")
image_display_label.grid(row=1, column=3, padx=10, pady=(10, 5), sticky=tk.NW)  # Align at top-left of the right side
image_canvas = tk.Label(window, bg="#f2f2f2")
image_canvas.grid(row=2, column=3, padx=10, pady=(0, 20), sticky=tk.NW)  # Ensure it's right under the label

# Function to clear all input fields and reset image display
def clear_sections():
    """Reset the form: empty every entry, the image preview and the results."""
    # One pass over all text fields: the specification entries first, then
    # the folder and image path entries.
    for field in (*entries, folder_entry, image_entry):
        field.delete(0, tk.END)

    # An empty 'image' option removes the preview picture from the label.
    image_canvas.config(image='')

    # Discard the results of the previous search as well.
    result_box.delete(1.0, tk.END)

# Function for folder selection
def browse_folder():
    """Ask the user for a directory and store it in ``folder_entry``.

    A cancelled dialog leaves the current entry untouched, matching the
    guard that ``browse_image`` already applies.
    """
    folder_path = filedialog.askdirectory()

    # askdirectory() returns an empty value when the dialog is cancelled;
    # without this guard a cancel wiped the folder that was already entered.
    if not folder_path:
        return

    folder_entry.delete(0, tk.END)
    folder_entry.insert(0, folder_path)

# Function for image selection and display
def browse_image():
    """Pick an image file, record its path in ``image_entry`` and preview it."""
    image_patterns = [("Image files", "*.jpg *.png *.jpeg")]
    chosen = filedialog.askopenfilename(filetypes=image_patterns)

    # Nothing to do when the dialog returned a falsy value (no file chosen).
    if not chosen:
        return

    image_entry.delete(0, tk.END)
    image_entry.insert(0, chosen)
    display_image(chosen)

# Function to display the image on the right side
def display_image(image_path):
    """Show a thumbnail of ``image_path`` in the preview label ``image_canvas``.

    The picture is shrunk to fit within 250x200 pixels while keeping its
    aspect ratio. If the file cannot be opened or decoded, an error dialog is
    shown and the current preview is left unchanged.

    Args:
        image_path: Path of the image file to preview.
    """
    # Maximum preview size in pixels; adjust as needed.
    max_width = 250
    max_height = 200

    try:
        # The context manager closes the file handle. The PhotoImage is built
        # inside the block because thumbnail() can return without loading the
        # pixel data (image already small enough), and the data can no longer
        # be read once the file is closed.
        with Image.open(image_path) as img:
            img.thumbnail((max_width, max_height))
            img_tk = ImageTk.PhotoImage(img)
    except OSError as exc:
        # Covers a missing file as well as a file Pillow cannot identify
        # (e.g. a non-image dropped on the window).
        messagebox.showerror("Error", f"Could not load image:\n{exc}")
        return

    image_canvas.config(image=img_tk)
    # Tkinter does not keep its own reference to the PhotoImage; store one on
    # the widget so the picture is not garbage-collected.
    image_canvas.image = img_tk

# Function for handling the drop of files (drag-and-drop)
def drop(event):
    """Handle files dropped on the window: record and preview the first one.

    Args:
        event: The drop event; ``event.data`` holds the dropped path(s).
    """
    # The drop data is a Tcl list: a path containing spaces is wrapped in
    # braces and several files are separated by spaces. splitlist() parses
    # that format; stripping a single outer pair of braces only worked for
    # exactly one dropped file.
    dropped_paths = window.tk.splitlist(event.data)
    if not dropped_paths:
        return

    # Only one image can be previewed, so use the first dropped file.
    image_path = dropped_paths[0]
    image_entry.delete(0, tk.END)
    image_entry.insert(0, image_path)
    display_image(image_path)

# Enable drag-and-drop for the main window: accept dropped files anywhere on
# the window and route every drop to drop() defined above.
window.drop_target_register(DND_FILES)  # Register the main window for drag-and-drop of files
window.dnd_bind('<<Drop>>', drop)  # Bind the drop event to the drop function

# Run the GUI loop; this call blocks until the window is closed.
window.mainloop()

## Adding the Product Category and Subcategory to the Specifications

In [2]:
import os
import re
import time
import io
import tkinter as tk
from tkinter import filedialog, messagebox, ttk
from tkinterdnd2 import DND_FILES, TkinterDnD  # Import the drag-and-drop library
from tkinter import Text
from PIL import Image, ImageTk
import numpy as np
import torch
import torchvision.transforms as transforms
from torchvision import models
from sklearn.metrics.pairwise import cosine_similarity
import fitz  # PyMuPDF
from fuzzywuzzy import fuzz  # For string matching
from fuzzywuzzy import process  # For matching
import webbrowser  # To open PDFs
import warnings
warnings.filterwarnings('ignore')

# Load the pre-trained models once at import time. eval() switches both
# networks to inference mode, which is what the feature extractors below need.
vgg_model = models.vgg19(pretrained=True)
resnet_model = models.resnet50(pretrained=True)
vgg_model.eval()
resnet_model.eval()

# Image preprocessing transformations applied before every forward pass:
# resize to 224x224, convert to a tensor, then normalise each channel.
# NOTE(review): the mean/std values look like the usual ImageNet statistics
# used with torchvision's pretrained weights - confirm.
preprocess = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Extract text from PDF
def extract_text_from_pdf(pdf_path):
    """Return the cleaned text of every page of a PDF.

    Args:
        pdf_path: Path of the PDF file.

    Returns:
        The text of all pages concatenated in page order and passed through
        ``clean_text``.
    """
    # Open by path and use the document as a context manager so it is always
    # closed. Previously a Python file handle was opened only to be handed to
    # fitz.open(), and the fitz document itself was never closed.
    with fitz.open(pdf_path) as doc:
        # join() avoids the repeated string copies of += in a loop.
        text = "".join(
            doc.load_page(page_num).get_text()
            for page_num in range(doc.page_count)
        )
    return clean_text(text)

# Clean extracted text
def clean_text(text):
    """Normalise extracted PDF text for the specification regexes.

    Collapses every run of whitespace (including newlines) to one space,
    removes non-ASCII characters except the degree sign, and trims the ends.

    Args:
        text: Raw text extracted from a PDF.

    Returns:
        The cleaned, single-line string.
    """
    # \s already covers newlines, so one pass replaces the former separate
    # \s+ and \n+ substitutions.
    text = re.sub(r'\s+', ' ', text)

    # Keep U+00B0 (degree sign): the beam-angle pattern in
    # extract_specifications looks for it, and stripping it here meant that
    # pattern could never match cleaned text.
    text = re.sub(r'[^\x00-\x7F\u00B0]+', '', text)

    # Removing a character that stood between two spaces leaves a double
    # space behind; collapse those again.
    text = re.sub(r' {2,}', ' ', text)
    return text.strip()

# Extract specific data from cleaned text
def extract_specifications(text):
    """Extract lighting-product specifications from datasheet text.

    Args:
        text: Datasheet text, normally the output of ``clean_text``.

    Returns:
        A dict mapping each specification name to a sorted list of the
        distinct values found. ``"Category"`` holds ``(category,
        subcategory)`` tuples, ``"Voltage (V)"`` holds single values and
        ``"low-high"`` range strings, and every other entry holds strings.
        Numeric entries are sorted by numeric value.
    """
    category_pattern = r'\b(COMMERCIAL LIGHTING|INDUSTRIAL LIGHTING|HAZARDOUS LIGHTING)\s*\|?\s*(DOWNLIGHT|PANEL LIGHT|TROFFERS|LINEAR LOW BAYS|MAGNETIC STRIPS|TUBES|RETROFIT LAMPS|HIGH BAY|FLOOD LIGHTS|CANOPY LIGHTS|HIGH MAST LIGHTS|YARD LIGHTS|AREA LUMINAIRES|WALL PACKS|VAPOR TIGHT LIGHTS|AREA LIGHTS|JELLY JAR LIGHTS|DROP LIGHT)\b'
    power_pattern = r'\b(\d{1,4})\s*W\b'
    voltage_pattern = r'(\d{2,3})\s*-\s*(\d{2,3})\s*V|\b(\d{1,3})\s*V\b'
    current_pattern = r'(\d{1,3}\.\d{1,3})\s*A'
    lumens_pattern = r'(\d{3,6}(\.\d+)?)\s*Lumens?\b'
    efficacy_pattern = r'\b(\d{2,4}\.\d+|\d{2,4})\s*lm/W\b'
    cct_pattern = r'\b(\d{4})\s*K\b'
    # No trailing \b: the degree sign is a non-word character, so a \b after
    # it only matched when a letter or digit followed immediately ("60°C")
    # and missed the ordinary "60°" and "60° beam" forms.
    # NOTE(review): this still accepts temperatures such as "40°C", as the
    # previous pattern did.
    beam_angle_pattern = r'\b(\d{1,3})\s*[°°]'
    ordering_part_number_pattern = r'\b(?:OPN|Ordering Part Number|Part Number|Order Number|Item Number)\s*:\s*([a-zA-Z0-9\-]+)\b'

    category = re.findall(category_pattern, text)
    power = re.findall(power_pattern, text)
    voltage = re.findall(voltage_pattern, text)
    current = re.findall(current_pattern, text)
    lumens = re.findall(lumens_pattern, text)
    efficacy = re.findall(efficacy_pattern, text)
    cct = re.findall(cct_pattern, text)
    beam_angles = re.findall(beam_angle_pattern, text)
    ordering_part_numbers = re.findall(ordering_part_number_pattern, text)

    # The voltage pattern has three groups: (low, high) for a range, or the
    # third group for a single value. Turn each match into one string.
    voltage_ranges = []
    for low, high, single in voltage:
        if low and high:
            voltage_ranges.append(f"{low}-{high}")
        elif single:
            voltage_ranges.append(single)

    extracted_data = {
        "Category": sorted(set(category)),
        "Power (W)": sorted(set(power), key=int),
        # Ranges are ordered by their lower bound.
        "Voltage (V)": sorted(set(voltage_ranges), key=lambda x: int(x.split('-')[0]) if '-' in x else int(x)),
        "Current (A)": sorted(set(current), key=float),
        # The lumens pattern has a nested group; element 0 is the full number.
        "Lumens": sorted({lum[0] for lum in lumens}, key=float),
        "Efficacy (lm/W)": sorted(set(efficacy), key=float),
        "CCT (K)": sorted(set(cct), key=int),
        "Beam Angles (°)": sorted(set(beam_angles), key=int),
        "Ordering Part Number": sorted(set(ordering_part_numbers)),
    }

    return extracted_data

# Extract images from PDF
def extract_images_from_pdf(pdf_path):
    """Return every image embedded in a PDF as an RGB ``PIL.Image``.

    Args:
        pdf_path: Path of the PDF file.

    Returns:
        A list of RGB images in page order, then in the order
        ``page.get_images`` reports them.
    """
    images = []
    # The context manager closes the document even when decoding an image
    # raises; previously doc.close() was skipped on any exception.
    with fitz.open(pdf_path) as doc:
        for page_num in range(doc.page_count):
            page = doc.load_page(page_num)
            for img in page.get_images(full=True):
                xref = img[0]  # first item of each entry is the image's xref
                base_image = doc.extract_image(xref)
                image_bytes = base_image["image"]
                images.append(Image.open(io.BytesIO(image_bytes)).convert('RGB'))
    return images

# Extract image features
def extract_image_features(image):
    """Build one feature vector for ``image`` from three weighted descriptors.

    The VGG, ResNet and HOG descriptors are scaled by 0.5, 0.3 and 0.2
    respectively and concatenated in that order.
    """
    # (descriptor, weight) pairs listed in concatenation order.
    weighted_parts = (
        (extract_intermediate_features(image, vgg_model, 'features'), 0.5),
        (extract_resnet_features(image), 0.3),
        (extract_hog_features(image), 0.2),
    )
    return np.concatenate(
        [descriptor * weight for descriptor, weight in weighted_parts]
    )

def extract_intermediate_features(image, model, layer):
    """Return the flattened output of one named child module of ``model``.

    A forward hook captures the output of ``model``'s direct child named
    ``layer`` while ``image`` is run through the whole model.

    Args:
        image: Image accepted by the module-level ``preprocess`` transform.
        model: The network to run.
        layer: Name of a direct child module of ``model``.

    Returns:
        The captured output flattened to a 1-D NumPy array.
    """
    activation = {}

    def hook_fn(module, inputs, output):
        activation['output'] = output

    handle = model._modules.get(layer).register_forward_hook(hook_fn)
    try:
        input_tensor = preprocess(image).unsqueeze(0)  # add a batch dimension
        with torch.no_grad():
            model(input_tensor)
    finally:
        # Always detach the hook: the models are module-level objects, so a
        # hook left behind after an exception would fire on every later call.
        handle.remove()
    return activation['output'].flatten().numpy()

def extract_resnet_features(image):
    """Run ``image`` through ``resnet_model`` and return a flat NumPy vector."""
    # Gradients are not needed for feature extraction.
    with torch.no_grad():
        batch = preprocess(image).unsqueeze(0)  # add a batch dimension
        return resnet_model(batch).flatten().numpy()

def extract_hog_features(image):
    """Return the HOG descriptor of ``image`` as a 1-D feature vector.

    The image is resized to 128x128 and converted to greyscale first.
    """
    from skimage.color import rgb2gray
    from skimage.feature import hog

    grayscale = rgb2gray(np.array(image.resize((128, 128))))
    return hog(
        grayscale,
        orientations=8,
        pixels_per_cell=(16, 16),
        cells_per_block=(1, 1),
        visualize=False,
        feature_vector=True,
    )

def calculate_similarity(input_data, pdf_data):
    """Score how well a PDF's extracted specifications match the user's input.

    Only fields the user filled in are scored. Numerical fields score their
    best relative closeness if it lies within the field's tolerance, else 0.
    Categorical fields score their best fuzzy string match. The result is the
    mean over the scored fields.

    Args:
        input_data: Dict of field name -> list of user-entered strings; empty
            strings mean "not provided". Missing fields are treated as empty.
        pdf_data: Dict of field name -> list of values extracted from a PDF.
            Numerical values may be plain numbers or "low-high" ranges.

    Returns:
        The average similarity over the scored fields, or 0.0 when the user
        provided no field at all.
    """

    def parse_pdf_value(raw):
        """Return (low, high) bounds for a PDF value, or None if not numeric."""
        try:
            number = float(raw)
        except (TypeError, ValueError):
            number = None
        if number is not None:
            return number, number
        # Ranges such as "120-277" are produced for voltages.
        low, separator, high = str(raw).partition('-')
        if not separator:
            return None
        try:
            low, high = float(low), float(high)
        except ValueError:
            return None
        return min(low, high), max(low, high)

    def compare_with_tolerance(input_list, pdf_list, tolerance=5):
        """Best similarity between any input and PDF value within tolerance (%)."""
        if not input_list or not pdf_list:
            return 0.0

        # Parse each PDF value on its own so one unparsable entry is skipped
        # instead of aborting the comparison against the remaining entries.
        bounds = [b for b in map(parse_pdf_value, pdf_list) if b is not None]
        min_similarity = 1 - (tolerance / 100)
        max_similarity = 0.0

        for raw_input in input_list:
            if raw_input == '':
                continue
            try:
                input_value = float(raw_input)
            except (TypeError, ValueError):
                continue

            for low, high in bounds:
                if low <= input_value <= high:
                    # Exact match, or the input lies inside the PDF's range.
                    similarity = 1.0
                else:
                    scale = abs(input_value)
                    if scale == 0:
                        # Relative distance is undefined for a zero input and
                        # the value is not an exact match: no similarity.
                        continue
                    nearest = low if input_value < low else high
                    similarity = 1 - abs(input_value - nearest) / scale

                # Only values within the tolerance band count.
                if similarity >= min_similarity:
                    max_similarity = max(max_similarity, similarity)

        return max_similarity

    def has_input(field):
        """True when the user supplied at least one non-empty value."""
        values = input_data.get(field) or []
        return any(item != '' for item in values)

    # Per-field tolerance in percent; fields not listed default to 5 %.
    tolerances = {
        "Power (W)": 2,  # Strict tolerance for Power (W)
        "Voltage (V)": 10,  # A little leniency for Voltage ranges
        "Current (A)": 5,
        "Lumens": 10,
        "Efficacy (lm/W)": 10,
    }
    numerical_fields = ["Power (W)", "Voltage (V)", "Current (A)", "Lumens", "Efficacy (lm/W)"]
    categorical_fields = ["Category", "CCT (K)", "Beam Angles (°)", "Ordering Part Number"]

    total_similarity = 0.0
    num_fields = 0

    # Compare numerical fields
    for field in numerical_fields:
        if not has_input(field):
            continue
        num_fields += 1
        total_similarity += compare_with_tolerance(
            input_data[field],
            pdf_data.get(field) or [],
            tolerance=tolerances.get(field, 5),
        )

    # Compare categorical fields using fuzzy matching
    for field in categorical_fields:
        if not has_input(field):
            continue
        num_fields += 1
        choices = pdf_data.get(field) or []
        if not choices:
            # Nothing to match against: the field counts with a score of 0.
            continue
        match_scores = []
        for item in input_data[field]:
            if item == '':
                continue
            best_match = process.extractOne(item, choices, scorer=fuzz.token_sort_ratio)
            if best_match:
                match_scores.append(best_match[1] / 100.0)
        if match_scores:
            total_similarity += max(match_scores)

    # Avoid division by zero when the user filled in nothing
    if num_fields == 0:
        return 0.0
    return total_similarity / num_fields

def calculate_image_similarity(features1, features2):
    """Cosine similarity of two feature vectors; 0.0 if either one is empty."""
    if 0 in (features1.size, features2.size):
        return 0.0

    # cosine_similarity works on 2-D inputs, so view each vector as one row
    # and read the single entry of the resulting 1x1 matrix.
    row1, row2 = features1.reshape(1, -1), features2.reshape(1, -1)
    return cosine_similarity(row1, row2)[0][0]

# Find Top 10 similar PDFs with improved progress bar
def find_top_10_similar_pdfs(folder_path, input_data, input_image_path=None):
    """Rank the PDFs under ``folder_path`` and show the ten best in the GUI.

    Every PDF found recursively is scored with ``calculate_similarity``. When
    an input image is given, the best (non-negative) image similarity between
    it and any image embedded in the PDF is added to that score. Progress and
    an ETA are shown through the module-level ``progress_bar`` and
    ``progress_label``; results are written to ``result_box`` as clickable
    paths.

    Args:
        folder_path: Root folder searched recursively for PDF files.
        input_data: Dict of user-entered specifications.
        input_image_path: Optional path of a reference image; falsy to skip
            image comparison.
    """
    # Discover every PDF before processing so the progress bar knows the real
    # total from the first update; previously the maximum grew directory by
    # directory, which skewed the percentage and the ETA. The extension check
    # is case-insensitive so "X.PDF" is not missed.
    pdf_files = [
        os.path.join(root, name)
        for root, _, files in os.walk(folder_path)
        for name in files
        if name.lower().endswith('.pdf')
    ]
    total_pdfs = len(pdf_files)

    # Start from zero even if an earlier run ended with an exception.
    progress_bar['value'] = 0
    result_box.delete(1.0, tk.END)

    if not pdf_files:
        progress_label.config(text="Progress: 0%")
        result_box.insert(tk.END, "No PDF files found in the selected folder.\n")
        return

    progress_bar['maximum'] = total_pdfs

    # The reference image is the same for every PDF: extract its features
    # once instead of re-running the networks inside the loop.
    input_image_features = None
    if input_image_path:
        input_image = Image.open(input_image_path).convert('RGB')
        input_image_features = extract_image_features(input_image)

    pdf_scores = []
    start_time = time.time()

    for processed_pdfs, pdf_file in enumerate(pdf_files, start=1):
        pdf_text = extract_text_from_pdf(pdf_file)
        extracted_data = extract_specifications(pdf_text)
        combined_score = calculate_similarity(input_data, extracted_data)

        if input_image_features is not None:
            # Starting at 0 keeps negative similarities from lowering the score.
            max_image_similarity = 0
            for img in extract_images_from_pdf(pdf_file):
                image_similarity = calculate_image_similarity(
                    input_image_features, extract_image_features(img)
                )
                max_image_similarity = max(max_image_similarity, image_similarity)
            combined_score += max_image_similarity

        pdf_scores.append((pdf_file, combined_score))
        progress_bar['value'] = processed_pdfs

        # Estimate the remaining time from the average time per PDF so far.
        elapsed_time = time.time() - start_time
        eta = (total_pdfs - processed_pdfs) * (elapsed_time / processed_pdfs)
        eta_minutes, eta_seconds = divmod(eta, 60)
        eta_formatted = f"{int(eta_minutes)} min {int(eta_seconds)} sec"

        percentage = (processed_pdfs / total_pdfs) * 100
        progress_label.config(text=f"Progress: {processed_pdfs}/{total_pdfs} ({percentage:.2f}%) | ETA: {eta_formatted}")

        # Let Tk redraw and handle events so the window stays responsive.
        window.update()

    # Sort and get top 10 results
    pdf_scores = sorted(pdf_scores, key=lambda x: x[1], reverse=True)[:10]

    # Reset progress bar and label
    progress_bar['value'] = 0
    progress_label.config(text="Progress: 0%")

    for i, (pdf_path, score) in enumerate(pdf_scores):
        tag = f"pdf{i + 1}"
        result_box.insert(tk.END, f"Top {i + 1}: {os.path.basename(pdf_path)} | Similarity Score: {score:.4f}\n")
        # Show the path as a blue underlined link that opens the PDF on click.
        result_box.tag_configure(tag, foreground="blue", underline=True)
        result_box.insert(tk.END, f"{pdf_path}\n", tag)
        # p=pdf_path binds the current path; a plain closure would see only
        # the last loop value.
        result_box.tag_bind(tag, "<Button-1>", lambda e, p=pdf_path: open_pdf(p))

# Open PDF
def open_pdf(pdf_path):
    """Open a PDF with whatever application ``webbrowser`` launches for it.

    Args:
        pdf_path: Filesystem path of the PDF to open.

    Returns:
        None. The success flag returned by ``webbrowser.open`` is discarded.
    """
    # Function-scope import keeps this block self-contained.
    from pathlib import Path

    # as_uri() requires an absolute path and produces a correctly escaped
    # file:// URI (spaces, '#', Windows drive letters and backslashes), which
    # plain string concatenation with "file://" did not.
    webbrowser.open(Path(os.path.abspath(pdf_path)).as_uri())

# GUI Setup
def browse_folder():
    """Ask the user for a directory and store it in ``folder_entry``.

    A cancelled dialog leaves the current entry untouched.

    NOTE: ``browse_folder`` is defined again further down in this file; the
    "Browse" button calls it through a lambda, so the later definition is the
    one that runs once both have been executed.
    """
    folder_selected = filedialog.askdirectory()

    # askdirectory() returns an empty value when the dialog is cancelled;
    # without this guard a cancel wiped the folder that was already entered.
    if not folder_selected:
        return

    folder_entry.delete(0, tk.END)
    folder_entry.insert(0, folder_selected)

def browse_image():
    """Ask the user for an image file and store its path in ``image_entry``.

    A cancelled dialog leaves the current entry untouched.

    NOTE: ``browse_image`` is defined again further down in this file; the
    "Browse" button calls it through a lambda, so the later definition is the
    one that runs once both have been executed.
    """
    # Tk expects the patterns of one file type as a space-separated list;
    # the previous "*.png;*.jpg;*.jpeg" form is not portable and disagreed
    # with the later definition of this function.
    image_file = filedialog.askopenfilename(
        filetypes=[("Image files", "*.png *.jpg *.jpeg")]
    )

    # An empty value means the dialog was cancelled: keep the existing path.
    if not image_file:
        return

    image_entry.delete(0, tk.END)
    image_entry.insert(0, image_file)

def process_pdfs():
    """Validate the form and start the PDF similarity search.

    Reads the folder path, the optional image path and the specification
    entries from the GUI widgets, then calls ``find_top_10_similar_pdfs``.
    An error dialog is shown (and nothing is searched) when no folder is
    given, the folder does not exist, or the image path does not point to a
    file.
    """
    folder_path = folder_entry.get().strip()
    input_image_path = image_entry.get().strip()

    # Validate before doing any work so the user gets a dialog instead of an
    # empty result list (missing folder) or a traceback (missing image).
    if not folder_path:
        messagebox.showerror("Error", "Please select a folder.")
        return
    if not os.path.isdir(folder_path):
        messagebox.showerror("Error", f"Folder not found: {folder_path}")
        return
    if input_image_path and not os.path.isfile(input_image_path):
        messagebox.showerror("Error", f"Image file not found: {input_image_path}")
        return

    # Each value is wrapped in a one-element list because the similarity code
    # iterates over lists of candidate values; stripping keeps a
    # whitespace-only entry from counting as a filled-in field.
    input_data = {
        "Category": [category_entry.get().strip()],
        "Power (W)": [power_entry.get().strip()],
        "Voltage (V)": [voltage_entry.get().strip()],
        "Current (A)": [current_entry.get().strip()],
        "Efficacy (lm/W)": [efficacy_entry.get().strip()],
        "CCT (K)": [cct_entry.get().strip()],
        "Lumens": [lumens_entry.get().strip()],
        "Beam Angles (°)": [beam_angle_entry.get().strip()],
        "Ordering Part Number": [ordering_part_number_entry.get().strip()],
    }

    find_top_10_similar_pdfs(folder_path, input_data, input_image_path)

# Create the main window. TkinterDnD.Tk() is used rather than tk.Tk() so the
# window can be registered as a drag-and-drop target further down.
window = TkinterDnD.Tk()
window.title("Product Finder")
window.geometry("1000x600")  # Initial size: 1000 px wide, 600 px high
window.configure(bg="#f2f2f2")  # Light grey background

# Grid weights control how extra space is shared when the window is resized:
# column 1 (the entry fields) gets three times the share of the other columns,
# and row 11 (the last specification row below) is the only row given a weight.
window.columnconfigure(0, weight=1)
window.columnconfigure(1, weight=3)
window.columnconfigure(2, weight=1)
window.columnconfigure(3, weight=1)
window.rowconfigure(11, weight=1)

# Global font settings
title_font = ("Helvetica", 16, "bold")
label_font = ("Helvetica", 12)
entry_font = ("Helvetica", 10)

# Shared keyword arguments for every tk.Button (unpacked with ** below)
button_style = {
    "bg": "#4CAF50",
    "fg": "white",
    "font": ("Helvetica", 10, "bold"),
    "activebackground": "#45a049",
    "relief": tk.RAISED
}

# Title label ("Product Finder") shown in the top row
title_label_cv = tk.Label(window, text="Product Finder", font=title_font, bg="#f2f2f2")
title_label_cv.grid(row=0, column=2, pady=20, sticky="w")  # Align to the west (left)

# Folder selection (row 1). The button calls browse_folder through a lambda so
# the name is looked up at click time, not when the button is created.
folder_label = tk.Label(window, text="Select Folder:", font=label_font, bg="#f2f2f2")
folder_label.grid(row=1, column=0, padx=10, pady=10, sticky=tk.W)
folder_entry = tk.Entry(window, width=50, font=entry_font)
folder_entry.grid(row=1, column=1, padx=10, pady=10, sticky="ew")
folder_button = tk.Button(window, text="Browse", command=lambda: browse_folder(), **button_style)
folder_button.grid(row=1, column=2, padx=10, pady=10)

# Image selection (row 2); same lambda indirection as the folder button
image_label = tk.Label(window, text="Input Image:", font=label_font, bg="#f2f2f2")
image_label.grid(row=2, column=0, padx=10, pady=10, sticky=tk.W)
image_entry = tk.Entry(window, width=50, font=entry_font)
image_entry.grid(row=2, column=1, padx=10, pady=10, sticky="ew")
image_button = tk.Button(window, text="Browse", command=lambda: browse_image(), **button_style)
image_button.grid(row=2, column=2, padx=10, pady=10)

# Input fields for specifications: one label/entry pair per item, placed in
# consecutive rows starting at row 3 (rows 3-11 for these nine labels)
labels = ["Category:", "Power (W):", "Voltage (V):", "Current (A):", "Efficacy (lm/W):", "CCT (K):", "Lumens (lm):", "Beam Angle (°):", "Ordering Part Number:"]
entries = []

for i, label_text in enumerate(labels):
    label = tk.Label(window, text=label_text, font=label_font, bg="#f2f2f2")
    label.grid(row=i + 3, column=0, padx=10, pady=5, sticky=tk.W)
    entry = tk.Entry(window, width=20, font=entry_font)  # Narrower than the path entries (20 vs 50)
    entry.grid(row=i + 3, column=1, padx=10, pady=5, sticky="ew")
    entries.append(entry)

# Give each entry a name; the order must match the order of `labels` above
category_entry, power_entry, voltage_entry, current_entry, efficacy_entry, cct_entry, lumens_entry, beam_angle_entry, ordering_part_number_entry = entries

# Create a frame for the progress bar and label (row 14, spanning columns 0-2)
progress_frame = tk.Frame(window, bg="#f2f2f2")
progress_frame.grid(row=14, column=0, columnspan=3, padx=10, pady=10, sticky="ew")  # Place it below other elements

# Progress bar inside the frame; determinate mode, driven by 'value'/'maximum'
progress_bar = ttk.Progressbar(progress_frame, orient="horizontal", length=500, mode="determinate")  # 500 px long
progress_bar.pack(side=tk.LEFT, padx=10, pady=10)  # Align left

# Progress label inside the frame
progress_label = tk.Label(progress_frame, text="Progress: 0%", font=label_font, bg="#f2f2f2")
progress_label.pack(side=tk.LEFT, padx=10, pady=10)  # Align left next to the progress bar

# Result box (scrollable) in column 3, rows 4-10, below the image preview
result_frame = tk.Frame(window, bg="#f2f2f2")
result_frame.grid(row=4, column=3, rowspan=7, padx=10, pady=10, sticky="nsew")
result_frame.grid_columnconfigure(0, weight=1)  # NOTE(review): the frame's children are placed with pack(), so this grid weight probably has no effect - confirm

result_box = Text(result_frame, height=15, width=80, font=entry_font, wrap=tk.WORD)
result_box.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)

# Wire the scrollbar and the text widget to each other
scrollbar = ttk.Scrollbar(result_frame, command=result_box.yview)
scrollbar.pack(side=tk.RIGHT, fill=tk.Y)
result_box.config(yscrollcommand=scrollbar.set)

# Submit button (Find Top 10 PDFs)
submit_button = tk.Button(window, text="Find Top 10 PDF's", command=lambda: process_pdfs(), **button_style)
submit_button.config(width=20)  # Button width in text units
submit_button.grid(row=13, column=3, padx=10, pady=10, sticky="ew")

# "Clear Sections" button. clear_sections is defined further down in the file;
# the lambda defers the lookup until the button is clicked.
clear_button = tk.Button(window, text="Clear Sections", command=lambda: clear_sections(), **button_style)
clear_button.config(width=20)  # Button width in text units
clear_button.grid(row=14, column=3, padx=10, pady=10, sticky="ew")

# Image display (right side of the window). Despite its name, image_canvas is
# a tk.Label; the preview picture is set through its 'image' option.
image_display_label = tk.Label(window, text="Image Preview", font=label_font, bg="#f2f2f2")
image_display_label.grid(row=1, column=3, padx=10, pady=(10, 5), sticky=tk.NW)  # Align at top-left of the right side
image_canvas = tk.Label(window, bg="#f2f2f2")
image_canvas.grid(row=2, column=3, padx=10, pady=(0, 20), sticky=tk.NW)  # Ensure it's right under the label

# Function to clear all input fields and reset image display
def clear_sections():
    """Reset the form: empty every entry, the image preview and the results."""
    # One pass over all text fields: the specification entries first, then
    # the folder and image path entries.
    for field in (*entries, folder_entry, image_entry):
        field.delete(0, tk.END)

    # An empty 'image' option removes the preview picture from the label.
    image_canvas.config(image='')

    # Discard the results of the previous search as well.
    result_box.delete(1.0, tk.END)

# Function for folder selection
def browse_folder():
    """Ask the user for a directory and store it in ``folder_entry``.

    A cancelled dialog leaves the current entry untouched, matching the
    guard that ``browse_image`` already applies.
    """
    folder_path = filedialog.askdirectory()

    # askdirectory() returns an empty value when the dialog is cancelled;
    # without this guard a cancel wiped the folder that was already entered.
    if not folder_path:
        return

    folder_entry.delete(0, tk.END)
    folder_entry.insert(0, folder_path)

# Function for image selection and display
def browse_image():
    """Pick an image file, record its path in ``image_entry`` and preview it."""
    image_patterns = [("Image files", "*.jpg *.png *.jpeg")]
    chosen = filedialog.askopenfilename(filetypes=image_patterns)

    # Nothing to do when the dialog returned a falsy value (no file chosen).
    if not chosen:
        return

    image_entry.delete(0, tk.END)
    image_entry.insert(0, chosen)
    display_image(chosen)

# Function to display the image on the right side
def display_image(image_path):
    """Show a thumbnail of ``image_path`` in the preview label ``image_canvas``.

    The picture is shrunk to fit within 250x200 pixels while keeping its
    aspect ratio. If the file cannot be opened or decoded, an error dialog is
    shown and the current preview is left unchanged.

    Args:
        image_path: Path of the image file to preview.
    """
    # Maximum preview size in pixels; adjust as needed.
    max_width = 250
    max_height = 200

    try:
        # The context manager closes the file handle. The PhotoImage is built
        # inside the block because thumbnail() can return without loading the
        # pixel data (image already small enough), and the data can no longer
        # be read once the file is closed.
        with Image.open(image_path) as img:
            img.thumbnail((max_width, max_height))
            img_tk = ImageTk.PhotoImage(img)
    except OSError as exc:
        # Covers a missing file as well as a file Pillow cannot identify
        # (e.g. a non-image dropped on the window).
        messagebox.showerror("Error", f"Could not load image:\n{exc}")
        return

    image_canvas.config(image=img_tk)
    # Tkinter does not keep its own reference to the PhotoImage; store one on
    # the widget so the picture is not garbage-collected.
    image_canvas.image = img_tk

# Function for handling the drop of files (drag-and-drop)
def drop(event):
    """Handle files dropped on the window: record and preview the first one.

    Args:
        event: The drop event; ``event.data`` holds the dropped path(s).
    """
    # The drop data is a Tcl list: a path containing spaces is wrapped in
    # braces and several files are separated by spaces. splitlist() parses
    # that format; stripping a single outer pair of braces only worked for
    # exactly one dropped file.
    dropped_paths = window.tk.splitlist(event.data)
    if not dropped_paths:
        return

    # Only one image can be previewed, so use the first dropped file.
    image_path = dropped_paths[0]
    image_entry.delete(0, tk.END)
    image_entry.insert(0, image_path)
    display_image(image_path)

# Enable drag-and-drop for the main window: accept dropped files anywhere on
# the window and route every drop to drop() defined above.
window.drop_target_register(DND_FILES)  # Register the main window for drag-and-drop of files
window.dnd_bind('<<Drop>>', drop)  # Bind the drop event to the drop function

# Run the GUI loop; this call blocks until the window is closed.
window.mainloop()