## Fetching SKU

In [3]:
import os
import re
import time
import io
import tkinter as tk
from tkinter import filedialog, messagebox, ttk
from tkinterdnd2 import DND_FILES, TkinterDnD  # Import the drag-and-drop library
from tkinter import Text
from PIL import Image, ImageTk
import numpy as np
import torch
import torchvision.transforms as transforms
from torchvision import models
from sklearn.metrics.pairwise import cosine_similarity
import fitz  # PyMuPDF
from fuzzywuzzy import fuzz  # For string matching
from fuzzywuzzy import process  # For matching
import webbrowser  # To open PDFs
import warnings
warnings.filterwarnings('ignore')

# Load the pre-trained models
vgg_model = models.vgg19(pretrained=True)
resnet_model = models.resnet50(pretrained=True)
vgg_model.eval()
resnet_model.eval()

# Image preprocessing transformations
preprocess = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Extract text from PDF
def extract_text_from_pdf(pdf_path):
    with open(pdf_path, 'rb') as file:
        reader = fitz.open(file)
        text = ""
        for page_num in range(reader.page_count):
            page = reader.load_page(page_num)
            text += page.get_text()
    return clean_text(text)

# Clean extracted text
def clean_text(text):
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\n+', ' ', text)
    text = re.sub(r'[^\x00-\x7F]+', '', text)
    return text.strip()

# Extract specific data from cleaned text
def extract_specifications(text):
    power_pattern = r'\b(\d{1,4})\s*W\b'
    lumens_pattern = r'(\d{3,6}(\.\d+)?)\s*Lumens?\b'
    cct_pattern = r'\b(\d{4})\s*K\b'
    size_pattern = r'\b(?:\d+(?:\.\d+)?(?:\s*x\s*\d+(?:\.\d+)?)?|NA)\b'
    name_pattern = r'(?i)(T\d+\s*Tube\s*Type\s*[A-C](\/B)?\s*(\(\d+[′′]?\))?)|(\bSigma\b\s*T\d+\s*Emergency\s*Tubes\s*Type\s*[A-B])|(\bT\d+\s*Glass\s*Tube\s*Type\s*[A-B](\(\d+[′′]?\))?)|(\bSimplex\s*Linear\s*Low\s*Bay\b)|(\bDelphi\b\s*(CCT\s*&\s*Power\s*Selectable\s*Back\s*-?\s*Lit\s*Panel\s*Light\s*\d+[’′x\d+’′]*)|(\bLocus\b\s*Area\s*Luminaire\b)|(\bFrizo\b\s*Refrigeration\s*Light\b)|(\bDynamo\s*\d+\s*Stadium\s*Flood\s*Light\b)|(\bVIGOR\b\s*Explosion\s*Proof\s*Round\s*High\s*Bay\b)|(\bHexacule\b\s*High\s*Mast\s*Light\b)|(\bRaydent\b\s*Stadium\s*Flood\s*Light\b)|(\bTrilevel\b\s*Dimming\s*Sensor\b)|(\bBilevel\b\s*Dimming\s*Sensor\b))'
    productcode_pattern = r'IK(?:-?[A-Z]+\d{1,4}(?:-\d{1,2}[A-Z]?(?:&[A-Z]?)?|-?\(\w+\)|-\w+)?(?:-\w+)?(?:-\d{1,2}[A-Z]?(?:\s*\(\w+\))?)?(\[\w+(?:,\s*\w+)?\])?(?:[-/]\w+)?(\s*\(\w+\))?|\(\w+\)|[-/]\w+)?$'
    sku_pattern = r'\b\d{12}\b'

    power = re.findall(power_pattern, text)
    lumens = re.findall(lumens_pattern, text)
    cct = re.findall(cct_pattern, text)
    size = re.findall(size_pattern, text)
    name = re.findall(name_pattern, text)
    product_code = re.findall(productcode_pattern, text)
    sku = re.findall(sku_pattern, text)

    extracted_data = {
        "Power (W)": sorted(set(power), key=int),
        "Lumens": sorted(set([lum[0] for lum in lumens]), key=float),
        "CCT (K)": sorted(set(cct), key=int),
        "Size": sorted(set(size), key=lambda x: (len(x.split('x')), x)),  # Sort by dimensions
        "Name": sorted(set(name), key=str),
        "Product Code": sorted(set(product_code), key=str),
        "SKU": sorted(set(sku), key=str)
    }

    return extracted_data

# Extract images from PDF
def extract_images_from_pdf(pdf_path):
    images = []
    doc = fitz.open(pdf_path)
    for page_num in range(doc.page_count):
        page = doc.load_page(page_num)
        image_list = page.get_images(full=True)
        for img_index, img in enumerate(image_list):
            xref = img[0]
            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]
            image = Image.open(io.BytesIO(image_bytes)).convert('RGB')
            images.append(image)
    doc.close()
    return images

# Extract image features
def extract_image_features(image):
    vgg_features = extract_intermediate_features(image, vgg_model, 'features')
    resnet_features = extract_resnet_features(image)
    hog_features = extract_hog_features(image)

    vgg_weight = 0.5
    resnet_weight = 0.3
    hog_weight = 0.2

    combined_features = np.concatenate([
        vgg_features * vgg_weight, 
        resnet_features * resnet_weight, 
        hog_features * hog_weight
    ])

    return combined_features

def extract_intermediate_features(image, model, layer):
    activation = {}
    def hook_fn(module, input, output):
        activation['output'] = output
    handle = model._modules.get(layer).register_forward_hook(hook_fn)
    input_tensor = preprocess(image).unsqueeze(0)
    with torch.no_grad():
        model(input_tensor)
    intermediate_features = activation['output']
    handle.remove()
    return intermediate_features.flatten().numpy()

def extract_resnet_features(image):
    input_tensor = preprocess(image).unsqueeze(0)
    with torch.no_grad():
        resnet_features = resnet_model(input_tensor)
    return resnet_features.flatten().numpy()

def extract_hog_features(image):
    from skimage.feature import hog
    from skimage.color import rgb2gray
    image = image.resize((128, 128))
    image_np = np.array(image)
    gray_image = rgb2gray(image_np)
    hog_features = hog(gray_image, orientations=8, pixels_per_cell=(16, 16),
                       cells_per_block=(1, 1), visualize=False, feature_vector=True)
    return hog_features

def calculate_similarity(input_data, pdf_data):
    total_similarity = 0.0
    num_fields = 0

    # Function to compare numerical values with tolerance
    def compare_with_tolerance(input_list, pdf_list, tolerance=5):
        if not input_list or not pdf_list:
            return 0.0
        max_similarity = 0.0
        threshold_met = False  # To track if any values match the tolerance criteria

        # Compare each input entry with every pdf entry
        for input_value in input_list:
            if input_value == '':
                continue
            try:
                input_value = float(input_value)
                for pdf_value in pdf_list:
                    pdf_value = float(pdf_value)
                    similarity = 1 - abs(input_value - pdf_value) / input_value

                    # Only consider values that are close enough based on tolerance
                    if similarity >= 1 - (tolerance / 100):
                        max_similarity = max(max_similarity, similarity)
                        threshold_met = True  # Mark if any value is within the threshold
            except ValueError:
                continue

        # If no values matched the tolerance, return 0 similarity
        if not threshold_met:
            return 0.0
        return max_similarity

    # Define specific tolerances for different fields
    tolerances = {
        "Power (W)": 2,  # Strict tolerance for Power (W)
        "Lumens": 5,
        "CCT (K)": 7,
        "Size": 6,
        "Name": 4,
        "Product Code": 10,
        "SKU": 10,
    }

    # Compare numerical fields
    numerical_fields = ["Power (W)", "Lumens", "Size", "SKU"]
    for field in numerical_fields:
        if input_data[field] and any(item != '' for item in input_data[field]):
            num_fields += 1
            # Use the field-specific tolerance, default to 5% if not specified
            tolerance = tolerances.get(field, 5)
            similarity = compare_with_tolerance(input_data[field], pdf_data[field], tolerance=tolerance)
            total_similarity += similarity

    # Compare categorical fields using fuzzy matching
    categorical_fields = ["CCT (K)", "Name", "Product Code"]
    for field in categorical_fields:
        if input_data[field] and any(item != '' for item in input_data[field]):
            num_fields += 1
            match_scores = []
            for item in input_data[field]:
                if item == '':
                    continue
                best_match = process.extractOne(item, pdf_data[field], scorer=fuzz.token_sort_ratio)
                if best_match:
                    similarity = best_match[1] / 100.0
                    match_scores.append(similarity)
            if match_scores:
                total_similarity += max(match_scores)
            else:
                total_similarity += 0.0

    # Avoid division by zero
    if num_fields > 0:
        average_similarity = total_similarity / num_fields
    else:
        average_similarity = 0.0

    return average_similarity

def calculate_image_similarity(features1, features2):
    if features1.size == 0 or features2.size == 0:
        return 0.0
    features1 = features1.reshape(1, -1)
    features2 = features2.reshape(1, -1)
    similarity = cosine_similarity(features1, features2)[0][0]
    return similarity

# Find Top 10 similar PDFs with improved progress bar
def find_top_10_similar_pdfs(folder_path, input_data, input_image_path=None):
    pdf_scores = []
    progress_bar['maximum'] = 0  # Reset progress bar maximum

    start_time = time.time()

    # Use os.walk to traverse through all subdirectories
    for root, _, files in os.walk(folder_path):
        pdf_files = [os.path.join(root, f) for f in files if f.endswith('.pdf')]
        progress_bar['maximum'] += len(pdf_files)  # Update progress bar maximum

        for pdf_file in pdf_files:
            pdf_text = extract_text_from_pdf(pdf_file)
            extracted_data = extract_specifications(pdf_text)
            score = calculate_similarity(input_data, extracted_data)

            if input_image_path:
                images = extract_images_from_pdf(pdf_file)
                input_image = Image.open(input_image_path).convert('RGB')
                input_image_features = extract_image_features(input_image)

                max_image_similarity = 0
                for img in images:
                    image_features = extract_image_features(img)
                    image_similarity = calculate_image_similarity(input_image_features, image_features)
                    max_image_similarity = max(max_image_similarity, image_similarity)

                combined_score = score + max_image_similarity
            else:
                combined_score = score

            pdf_scores.append((pdf_file, combined_score))
            progress_bar['value'] += 1

            # Calculate elapsed time and ETA
            elapsed_time = time.time() - start_time
            processed_pdfs = progress_bar['value']
            total_pdfs = progress_bar['maximum']
            
            # Avoid division by zero
            if processed_pdfs > 0:  
                estimated_time_per_pdf = elapsed_time / processed_pdfs
                remaining_pdfs = total_pdfs - processed_pdfs
                eta = remaining_pdfs * estimated_time_per_pdf

                # Format time to show minutes and seconds
                eta_minutes, eta_seconds = divmod(eta, 60)
                eta_formatted = f"{int(eta_minutes)} min {int(eta_seconds)} sec"

                # Update progress label with percentage and ETA
                percentage = (processed_pdfs / total_pdfs) * 100
                progress_label.config(text=f"Progress: {processed_pdfs}/{total_pdfs} ({percentage:.2f}%) | ETA: {eta_formatted}")
            
            window.update()

    # Sort and get top 10 results
    pdf_scores = sorted(pdf_scores, key=lambda x: x[1], reverse=True)[:10]

    # Reset progress bar and label
    progress_bar['value'] = 0
    progress_label.config(text="Progress: 0%")

    result_box.delete(1.0, tk.END)
    for i, (pdf_path, score) in enumerate(pdf_scores):
        result_box.insert(tk.END, f"Top {i + 1}: {os.path.basename(pdf_path)} | Similarity Score: {score:.4f}\n")
        # Add hyperlink functionality for each PDF path
        result_box.tag_configure(f"pdf{i + 1}", foreground="blue", underline=True)
        result_box.insert(tk.END, f"{pdf_path}\n", f"pdf{i + 1}")
        result_box.tag_bind(f"pdf{i + 1}", "<Button-1>", lambda e, p=pdf_path: open_pdf(p))

# Open PDF
def open_pdf(pdf_path):
    webbrowser.open(f"file://{pdf_path}")

# GUI Setup
def browse_folder():
    folder_selected = filedialog.askdirectory()
    folder_entry.delete(0, tk.END)
    folder_entry.insert(0, folder_selected)

def browse_image():
    image_file = filedialog.askopenfilename(filetypes=[("Image files", "*.png;*.jpg;*.jpeg")])
    image_entry.delete(0, tk.END)
    image_entry.insert(0, image_file)

def process_pdfs():
    folder_path = folder_entry.get()
    input_image_path = image_entry.get()

    input_data = {
        "Power (W)": [power_entry.get()],
        "Lumens": [lumens_entry.get()],
        "CCT (K)": [cct_entry.get()],
        "Size": [size_entry.get()],
        "Name": [name_entry.get()],
        "Product Code": [product_code_entry.get()],
        "SKU": [sku_entry.get()],
    }

    if not folder_path:
        messagebox.showerror("Error", "Please select a folder.")
        return

    find_top_10_similar_pdfs(folder_path, input_data, input_image_path)

# Create the main window
window = TkinterDnD.Tk()
window.title("Product Finder")
window.geometry("1000x600")  # Increased window width for image display
window.configure(bg="#f2f2f2")  # Light grey background

# Allow the window to be resizable
window.columnconfigure(0, weight=1)
window.columnconfigure(1, weight=3)
window.columnconfigure(2, weight=1)
window.columnconfigure(3, weight=1)
window.rowconfigure(11, weight=1)

# Global font settings
title_font = ("Helvetica", 16, "bold")
label_font = ("Helvetica", 12)
entry_font = ("Helvetica", 10)

# Styling for the buttons
button_style = {
    "bg": "#4CAF50",
    "fg": "white",
    "font": ("Helvetica", 10, "bold"),
    "activebackground": "#45a049",
    "relief": tk.RAISED
}

# Title Label for "Cross Validation Tool"
title_label_cv = tk.Label(window, text="Product Finder", font=title_font, bg="#f2f2f2")
title_label_cv.grid(row=0, column=2, pady=20, sticky="w")  # Align to the west (left)

# Folder selection
folder_label = tk.Label(window, text="Select Folder:", font=label_font, bg="#f2f2f2")
folder_label.grid(row=1, column=0, padx=10, pady=10, sticky=tk.W)
folder_entry = tk.Entry(window, width=50, font=entry_font)
folder_entry.grid(row=1, column=1, padx=10, pady=10, sticky="ew")
folder_button = tk.Button(window, text="Browse", command=lambda: browse_folder(), **button_style)
folder_button.grid(row=1, column=2, padx=10, pady=10)

# Image selection
image_label = tk.Label(window, text="Input Image:", font=label_font, bg="#f2f2f2")
image_label.grid(row=2, column=0, padx=10, pady=10, sticky=tk.W)
image_entry = tk.Entry(window, width=50, font=entry_font)
image_entry.grid(row=2, column=1, padx=10, pady=10, sticky="ew")
image_button = tk.Button(window, text="Browse", command=lambda: browse_image(), **button_style)
image_button.grid(row=2, column=2, padx=10, pady=10)

# Input fields for specifications
labels = ["Power (W):", "Lumens (lm):", "CCT (K):", "Size:", "Name:", "Product Code:", "SKU:"]
entries = []

for i, label_text in enumerate(labels):
    label = tk.Label(window, text=label_text, font=label_font, bg="#f2f2f2")
    label.grid(row=i + 3, column=0, padx=10, pady=5, sticky=tk.W)
    entry = tk.Entry(window, width=20, font=entry_font)  # Reduced the width from 50 to 20
    entry.grid(row=i + 3, column=1, padx=10, pady=5, sticky="ew")
    entries.append(entry)

power_entry, lumens_entry, cct_entry, size_entry, name_entry, product_code_entry, sku_entry = entries

# Create a frame for the progress bar and label
progress_frame = tk.Frame(window, bg="#f2f2f2")
progress_frame.grid(row=14, column=0, columnspan=3, padx=10, pady=10, sticky="ew")  # Place it below other elements

# Progress bar inside the frame
progress_bar = ttk.Progressbar(progress_frame, orient="horizontal", length=500, mode="determinate")  # Increased length
progress_bar.pack(side=tk.LEFT, padx=10, pady=10)  # Align left

# Progress label inside the frame
progress_label = tk.Label(progress_frame, text="Progress: 0%", font=label_font, bg="#f2f2f2")
progress_label.pack(side=tk.LEFT, padx=10, pady=10)  # Align left next to the progress bar

# Result box (scrollable) below the image
result_frame = tk.Frame(window, bg="#f2f2f2")
result_frame.grid(row=4, column=3, rowspan=7, padx=10, pady=10, sticky="nsew")
result_frame.grid_columnconfigure(0, weight=1)  # Allow the frame to expand with the window

result_box = Text(result_frame, height=15, width=80, font=entry_font, wrap=tk.WORD)
result_box.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)

scrollbar = ttk.Scrollbar(result_frame, command=result_box.yview)
scrollbar.pack(side=tk.RIGHT, fill=tk.Y)
result_box.config(yscrollcommand=scrollbar.set)

# Submit button (Find Top 10 PDFs)
submit_button = tk.Button(window, text="Find Top 10 PDF's", command=lambda: process_pdfs(), **button_style)
submit_button.config(width=20)  # Set a reduced width for the button
submit_button.grid(row=13, column=3, padx=10, pady=10, sticky="ew")

# Add "Clear Sections" button
clear_button = tk.Button(window, text="Clear Sections", command=lambda: clear_sections(), **button_style)
clear_button.config(width=20)  # Set a reduced width for the button
clear_button.grid(row=14, column=3, padx=10, pady=10, sticky="ew")

# Image display (right side of the window)
image_display_label = tk.Label(window, text="Image Preview", font=label_font, bg="#f2f2f2")
image_display_label.grid(row=1, column=3, padx=10, pady=(10, 5), sticky=tk.NW)  # Align at top-left of the right side
image_canvas = tk.Label(window, bg="#f2f2f2")
image_canvas.grid(row=2, column=3, padx=10, pady=(0, 20), sticky=tk.NW)  # Ensure it's right under the label

# Function to clear all input fields and reset image display
def clear_sections():
    # Clear all input entries
    for entry in entries:
        entry.delete(0, tk.END)
    
    # Clear the folder and image entries
    folder_entry.delete(0, tk.END)
    image_entry.delete(0, tk.END)
    
    # Reset the image display
    image_canvas.config(image='')  # Clear the image on the right side
    
    # Optionally clear the result box as well
    result_box.delete(1.0, tk.END)

# Function for folder selection
def browse_folder():
    folder_path = filedialog.askdirectory()
    folder_entry.delete(0, tk.END)
    folder_entry.insert(0, folder_path)

# Function for image selection and display
def browse_image():
    image_path = filedialog.askopenfilename(filetypes=[("Image files", "*.jpg *.png *.jpeg")])
    if image_path:
        image_entry.delete(0, tk.END)
        image_entry.insert(0, image_path)
        display_image(image_path)

# Function to display the image on the right side
def display_image(image_path):
    # Open the image file
    img = Image.open(image_path)
    
    # Define the maximum width and height for the image
    max_width = 250  # You can adjust this value as needed
    max_height = 200  # You can adjust this value as needed
    
    # Resize the image while maintaining the aspect ratio
    img.thumbnail((max_width, max_height))  # Resize within the specified dimensions
    
    # Convert the image to a format Tkinter can handle
    img_tk = ImageTk.PhotoImage(img)
    
    # Update the image on the canvas
    image_canvas.config(image=img_tk)
    image_canvas.image = img_tk  # Store a reference to avoid garbage collection

# Function for handling the drop of files (drag-and-drop)
def drop(event):
    image_path = event.data  # Get the dropped file path
    
    # Clean up the file path
    if image_path.startswith('{') and image_path.endswith('}'):
        image_path = image_path[1:-1]  # Remove the curly braces
    
    if image_path:
        image_entry.delete(0, tk.END)
        image_entry.insert(0, image_path)
        display_image(image_path)

# Enable drag-and-drop for the main window
window.drop_target_register(DND_FILES)  # Register the main window for drag-and-drop of files
window.dnd_bind('<<Drop>>', drop)  # Bind the drop event to the drop function

# Run the GUI loop
window.mainloop()

## Fetching Name

In [44]:
import os
import re
import time
import io
import tkinter as tk
from tkinter import filedialog, messagebox, ttk
from tkinterdnd2 import DND_FILES, TkinterDnD  # Import the drag-and-drop library
from tkinter import Text
from PIL import Image, ImageTk
import numpy as np
import torch
import torchvision.transforms as transforms
from torchvision import models
from sklearn.metrics.pairwise import cosine_similarity
import fitz  # PyMuPDF
from fuzzywuzzy import fuzz  # For string matching
from fuzzywuzzy import process  # For matching
import webbrowser  # To open PDFs
import warnings
warnings.filterwarnings('ignore')

# Load the pre-trained models
vgg_model = models.vgg19(pretrained=True)
resnet_model = models.resnet50(pretrained=True)
vgg_model.eval()
resnet_model.eval()

# Image preprocessing transformations
preprocess = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Extract text from PDF
def extract_text_from_pdf(pdf_path):
    with open(pdf_path, 'rb') as file:
        reader = fitz.open(file)
        text = ""
        for page_num in range(reader.page_count):
            page = reader.load_page(page_num)
            text += page.get_text()
    return clean_text(text)

# Clean extracted text
def clean_text(text):
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\n+', ' ', text)
    text = re.sub(r'[^\x00-\x7F]+', '', text)
    return text.strip()

# Extract specific data from cleaned text
def extract_specifications(text):
    power_pattern = r'\b(\d{1,4})\s*W\b'
    lumens_pattern = r'(\d{3,6}(\.\d+)?)\s*Lumens?\b'
    cct_pattern = r'\b(\d{4})\s*K\b'
    size_pattern = r'\b(?:\d+(?:\.\d+)?(?:\s*x\s*\d+(?:\.\d+)?)?|NA)\b'
    name_pattern = r'\b[A-Z][a-zA-Z0-9\s\-()]*[a-zA-Z)]\b'
    productcode_pattern = r'IK(?:-?[A-Z]+\d{1,4}(?:-\d{1,2}[A-Z]?(?:&[A-Z]?)?|-?\(\w+\)|-\w+)?(?:-\w+)?(?:-\d{1,2}[A-Z]?(?:\s*\(\w+\))?)?(\[\w+(?:,\s*\w+)?\])?(?:[-/]\w+)?(\s*\(\w+\))?|\(\w+\)|[-/]\w+)?$'
    sku_pattern = r'\b\d{12}\b'

    power = re.findall(power_pattern, text)
    lumens = re.findall(lumens_pattern, text)
    cct = re.findall(cct_pattern, text)
    size = re.findall(size_pattern, text)
    name = re.findall(name_pattern, text)
    product_code = re.findall(productcode_pattern, text)
    sku = re.findall(sku_pattern, text)

    extracted_data = {
        "Power (W)": sorted(set(power), key=int),
        "Lumens": sorted(set([lum[0] for lum in lumens]), key=float),
        "CCT (K)": sorted(set(cct), key=int),
        "Size": sorted(set(size), key=lambda x: (len(x.split('x')), x)),  # Sort by dimensions
        "Name": sorted(set(name), key=str),
        "Product Code": sorted(set(product_code), key=str),
        "SKU": sorted(set(sku), key=str)
    }

    return extracted_data

# Extract images from PDF
def extract_images_from_pdf(pdf_path):
    images = []
    doc = fitz.open(pdf_path)
    for page_num in range(doc.page_count):
        page = doc.load_page(page_num)
        image_list = page.get_images(full=True)
        for img_index, img in enumerate(image_list):
            xref = img[0]
            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]
            image = Image.open(io.BytesIO(image_bytes)).convert('RGB')
            images.append(image)
    doc.close()
    return images

# Extract image features
def extract_image_features(image):
    vgg_features = extract_intermediate_features(image, vgg_model, 'features')
    resnet_features = extract_resnet_features(image)
    hog_features = extract_hog_features(image)

    vgg_weight = 0.5
    resnet_weight = 0.3
    hog_weight = 0.2

    combined_features = np.concatenate([
        vgg_features * vgg_weight, 
        resnet_features * resnet_weight, 
        hog_features * hog_weight
    ])

    return combined_features

def extract_intermediate_features(image, model, layer):
    activation = {}
    def hook_fn(module, input, output):
        activation['output'] = output
    handle = model._modules.get(layer).register_forward_hook(hook_fn)
    input_tensor = preprocess(image).unsqueeze(0)
    with torch.no_grad():
        model(input_tensor)
    intermediate_features = activation['output']
    handle.remove()
    return intermediate_features.flatten().numpy()

def extract_resnet_features(image):
    input_tensor = preprocess(image).unsqueeze(0)
    with torch.no_grad():
        resnet_features = resnet_model(input_tensor)
    return resnet_features.flatten().numpy()

def extract_hog_features(image):
    from skimage.feature import hog
    from skimage.color import rgb2gray
    image = image.resize((128, 128))
    image_np = np.array(image)
    gray_image = rgb2gray(image_np)
    hog_features = hog(gray_image, orientations=8, pixels_per_cell=(16, 16),
                       cells_per_block=(1, 1), visualize=False, feature_vector=True)
    return hog_features

def calculate_similarity(input_data, pdf_data):
    total_similarity = 0.0
    num_fields = 0

    # Function to compare numerical values with tolerance
    def compare_with_tolerance(input_list, pdf_list, tolerance=5):
        if not input_list or not pdf_list:
            return 0.0
        max_similarity = 0.0
        threshold_met = False  # To track if any values match the tolerance criteria

        # Compare each input entry with every pdf entry
        for input_value in input_list:
            if input_value == '':
                continue
            try:
                input_value = float(input_value)
                for pdf_value in pdf_list:
                    pdf_value = float(pdf_value)
                    similarity = 1 - abs(input_value - pdf_value) / input_value

                    # Only consider values that are close enough based on tolerance
                    if similarity >= 1 - (tolerance / 100):
                        max_similarity = max(max_similarity, similarity)
                        threshold_met = True  # Mark if any value is within the threshold
            except ValueError:
                continue

        # If no values matched the tolerance, return 0 similarity
        if not threshold_met:
            return 0.0
        return max_similarity

    # Define specific tolerances for different fields
    tolerances = {
        "Power (W)": 2,  # Strict tolerance for Power (W)
        "Lumens": 5,
        "CCT (K)": 7,
        "Size": 6,
        "Name": 4,
        "Product Code": 10,
        "SKU": 10,
    }

    # Compare numerical fields
    numerical_fields = ["Power (W)", "Lumens", "Size", "SKU"]
    for field in numerical_fields:
        if input_data[field] and any(item != '' for item in input_data[field]):
            num_fields += 1
            # Use the field-specific tolerance, default to 5% if not specified
            tolerance = tolerances.get(field, 5)
            similarity = compare_with_tolerance(input_data[field], pdf_data[field], tolerance=tolerance)
            total_similarity += similarity

    # Compare categorical fields using fuzzy matching
    categorical_fields = ["CCT (K)", "Name", "Product Code"]
    for field in categorical_fields:
        if input_data[field] and any(item != '' for item in input_data[field]):
            num_fields += 1
            match_scores = []
            for item in input_data[field]:
                if item == '':
                    continue
                best_match = process.extractOne(item, pdf_data[field], scorer=fuzz.token_sort_ratio)
                if best_match:
                    similarity = best_match[1] / 100.0
                    match_scores.append(similarity)
            if match_scores:
                total_similarity += max(match_scores)
            else:
                total_similarity += 0.0

    # Avoid division by zero
    if num_fields > 0:
        average_similarity = total_similarity / num_fields
    else:
        average_similarity = 0.0

    return average_similarity

def calculate_image_similarity(features1, features2):
    if features1.size == 0 or features2.size == 0:
        return 0.0
    features1 = features1.reshape(1, -1)
    features2 = features2.reshape(1, -1)
    similarity = cosine_similarity(features1, features2)[0][0]
    return similarity

# Find Top 10 similar PDFs with improved progress bar
def find_top_10_similar_pdfs(folder_path, input_data, input_image_path=None):
    pdf_scores = []
    progress_bar['maximum'] = 0  # Reset progress bar maximum

    start_time = time.time()

    # Use os.walk to traverse through all subdirectories
    for root, _, files in os.walk(folder_path):
        pdf_files = [os.path.join(root, f) for f in files if f.endswith('.pdf')]
        progress_bar['maximum'] += len(pdf_files)  # Update progress bar maximum

        for pdf_file in pdf_files:
            pdf_text = extract_text_from_pdf(pdf_file)
            extracted_data = extract_specifications(pdf_text)
            score = calculate_similarity(input_data, extracted_data)

            if input_image_path:
                images = extract_images_from_pdf(pdf_file)
                input_image = Image.open(input_image_path).convert('RGB')
                input_image_features = extract_image_features(input_image)

                max_image_similarity = 0
                for img in images:
                    image_features = extract_image_features(img)
                    image_similarity = calculate_image_similarity(input_image_features, image_features)
                    max_image_similarity = max(max_image_similarity, image_similarity)

                combined_score = score + max_image_similarity
            else:
                combined_score = score

            pdf_scores.append((pdf_file, combined_score))
            progress_bar['value'] += 1

            # Calculate elapsed time and ETA
            elapsed_time = time.time() - start_time
            processed_pdfs = progress_bar['value']
            total_pdfs = progress_bar['maximum']
            
            # Avoid division by zero
            if processed_pdfs > 0:  
                estimated_time_per_pdf = elapsed_time / processed_pdfs
                remaining_pdfs = total_pdfs - processed_pdfs
                eta = remaining_pdfs * estimated_time_per_pdf

                # Format time to show minutes and seconds
                eta_minutes, eta_seconds = divmod(eta, 60)
                eta_formatted = f"{int(eta_minutes)} min {int(eta_seconds)} sec"

                # Update progress label with percentage and ETA
                percentage = (processed_pdfs / total_pdfs) * 100
                progress_label.config(text=f"Progress: {processed_pdfs}/{total_pdfs} ({percentage:.2f}%) | ETA: {eta_formatted}")
            
            window.update()

    # Sort and get top 10 results
    pdf_scores = sorted(pdf_scores, key=lambda x: x[1], reverse=True)[:10]

    # Reset progress bar and label
    progress_bar['value'] = 0
    progress_label.config(text="Progress: 0%")

    result_box.delete(1.0, tk.END)
    for i, (pdf_path, score) in enumerate(pdf_scores):
        result_box.insert(tk.END, f"Top {i + 1}: {os.path.basename(pdf_path)} | Similarity Score: {score:.4f}\n")
        # Add hyperlink functionality for each PDF path
        result_box.tag_configure(f"pdf{i + 1}", foreground="blue", underline=True)
        result_box.insert(tk.END, f"{pdf_path}\n", f"pdf{i + 1}")
        result_box.tag_bind(f"pdf{i + 1}", "<Button-1>", lambda e, p=pdf_path: open_pdf(p))

# Open PDF
def open_pdf(pdf_path):
    webbrowser.open(f"file://{pdf_path}")

# GUI Setup
def browse_folder():
    folder_selected = filedialog.askdirectory()
    folder_entry.delete(0, tk.END)
    folder_entry.insert(0, folder_selected)

def browse_image():
    image_file = filedialog.askopenfilename(filetypes=[("Image files", "*.png;*.jpg;*.jpeg")])
    image_entry.delete(0, tk.END)
    image_entry.insert(0, image_file)

def process_pdfs():
    folder_path = folder_entry.get()
    input_image_path = image_entry.get()

    input_data = {
        "Power (W)": [power_entry.get()],
        "Lumens": [lumens_entry.get()],
        "CCT (K)": [cct_entry.get()],
        "Size": [size_entry.get()],
        "Name": [name_entry.get()],
        "Product Code": [product_code_entry.get()],
        "SKU": [sku_entry.get()],
    }

    if not folder_path:
        messagebox.showerror("Error", "Please select a folder.")
        return

    find_top_10_similar_pdfs(folder_path, input_data, input_image_path)

# Create the main window
window = TkinterDnD.Tk()
window.title("Product Finder")
window.geometry("1000x600")  # Increased window width for image display
window.configure(bg="#f2f2f2")  # Light grey background

# Allow the window to be resizable
window.columnconfigure(0, weight=1)
window.columnconfigure(1, weight=3)
window.columnconfigure(2, weight=1)
window.columnconfigure(3, weight=1)
window.rowconfigure(11, weight=1)

# Global font settings
title_font = ("Helvetica", 16, "bold")
label_font = ("Helvetica", 12)
entry_font = ("Helvetica", 10)

# Styling for the buttons
button_style = {
    "bg": "#4CAF50",
    "fg": "white",
    "font": ("Helvetica", 10, "bold"),
    "activebackground": "#45a049",
    "relief": tk.RAISED
}

# Title Label for "Cross Validation Tool"
title_label_cv = tk.Label(window, text="Product Finder", font=title_font, bg="#f2f2f2")
title_label_cv.grid(row=0, column=2, pady=20, sticky="w")  # Align to the west (left)

# Folder selection
folder_label = tk.Label(window, text="Select Folder:", font=label_font, bg="#f2f2f2")
folder_label.grid(row=1, column=0, padx=10, pady=10, sticky=tk.W)
folder_entry = tk.Entry(window, width=50, font=entry_font)
folder_entry.grid(row=1, column=1, padx=10, pady=10, sticky="ew")
folder_button = tk.Button(window, text="Browse", command=lambda: browse_folder(), **button_style)
folder_button.grid(row=1, column=2, padx=10, pady=10)

# Image selection
image_label = tk.Label(window, text="Input Image:", font=label_font, bg="#f2f2f2")
image_label.grid(row=2, column=0, padx=10, pady=10, sticky=tk.W)
image_entry = tk.Entry(window, width=50, font=entry_font)
image_entry.grid(row=2, column=1, padx=10, pady=10, sticky="ew")
image_button = tk.Button(window, text="Browse", command=lambda: browse_image(), **button_style)
image_button.grid(row=2, column=2, padx=10, pady=10)

# Input fields for specifications
labels = ["Power (W):", "Lumens (lm):", "CCT (K):", "Size:", "Name:", "Product Code:", "SKU:"]
entries = []

for i, label_text in enumerate(labels):
    label = tk.Label(window, text=label_text, font=label_font, bg="#f2f2f2")
    label.grid(row=i + 3, column=0, padx=10, pady=5, sticky=tk.W)
    entry = tk.Entry(window, width=20, font=entry_font)  # Reduced the width from 50 to 20
    entry.grid(row=i + 3, column=1, padx=10, pady=5, sticky="ew")
    entries.append(entry)

power_entry, lumens_entry, cct_entry, size_entry, name_entry, product_code_entry, sku_entry = entries

# Create a frame for the progress bar and label
progress_frame = tk.Frame(window, bg="#f2f2f2")
progress_frame.grid(row=14, column=0, columnspan=3, padx=10, pady=10, sticky="ew")  # Place it below other elements

# Progress bar inside the frame
progress_bar = ttk.Progressbar(progress_frame, orient="horizontal", length=500, mode="determinate")  # Increased length
progress_bar.pack(side=tk.LEFT, padx=10, pady=10)  # Align left

# Progress label inside the frame
progress_label = tk.Label(progress_frame, text="Progress: 0%", font=label_font, bg="#f2f2f2")
progress_label.pack(side=tk.LEFT, padx=10, pady=10)  # Align left next to the progress bar

# Result box (scrollable) below the image
result_frame = tk.Frame(window, bg="#f2f2f2")
result_frame.grid(row=4, column=3, rowspan=7, padx=10, pady=10, sticky="nsew")
result_frame.grid_columnconfigure(0, weight=1)  # Allow the frame to expand with the window

result_box = Text(result_frame, height=15, width=80, font=entry_font, wrap=tk.WORD)
result_box.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)

scrollbar = ttk.Scrollbar(result_frame, command=result_box.yview)
scrollbar.pack(side=tk.RIGHT, fill=tk.Y)
result_box.config(yscrollcommand=scrollbar.set)

# Submit button (Find Top 10 PDFs)
submit_button = tk.Button(window, text="Find Top 10 PDF's", command=lambda: process_pdfs(), **button_style)
submit_button.config(width=20)  # Set a reduced width for the button
submit_button.grid(row=13, column=3, padx=10, pady=10, sticky="ew")

# Add "Clear Sections" button
clear_button = tk.Button(window, text="Clear Sections", command=lambda: clear_sections(), **button_style)
clear_button.config(width=20)  # Set a reduced width for the button
clear_button.grid(row=14, column=3, padx=10, pady=10, sticky="ew")

# Image display (right side of the window)
image_display_label = tk.Label(window, text="Image Preview", font=label_font, bg="#f2f2f2")
image_display_label.grid(row=1, column=3, padx=10, pady=(10, 5), sticky=tk.NW)  # Align at top-left of the right side
image_canvas = tk.Label(window, bg="#f2f2f2")
image_canvas.grid(row=2, column=3, padx=10, pady=(0, 20), sticky=tk.NW)  # Ensure it's right under the label

# Function to clear all input fields and reset image display
def clear_sections():
    # Clear all input entries
    for entry in entries:
        entry.delete(0, tk.END)
    
    # Clear the folder and image entries
    folder_entry.delete(0, tk.END)
    image_entry.delete(0, tk.END)
    
    # Reset the image display
    image_canvas.config(image='')  # Clear the image on the right side
    
    # Optionally clear the result box as well
    result_box.delete(1.0, tk.END)

# Function for folder selection
def browse_folder():
    folder_path = filedialog.askdirectory()
    folder_entry.delete(0, tk.END)
    folder_entry.insert(0, folder_path)

# Function for image selection and display
def browse_image():
    image_path = filedialog.askopenfilename(filetypes=[("Image files", "*.jpg *.png *.jpeg")])
    if image_path:
        image_entry.delete(0, tk.END)
        image_entry.insert(0, image_path)
        display_image(image_path)

# Function to display the image on the right side
def display_image(image_path):
    # Open the image file
    img = Image.open(image_path)
    
    # Define the maximum width and height for the image
    max_width = 250  # You can adjust this value as needed
    max_height = 200  # You can adjust this value as needed
    
    # Resize the image while maintaining the aspect ratio
    img.thumbnail((max_width, max_height))  # Resize within the specified dimensions
    
    # Convert the image to a format Tkinter can handle
    img_tk = ImageTk.PhotoImage(img)
    
    # Update the image on the canvas
    image_canvas.config(image=img_tk)
    image_canvas.image = img_tk  # Store a reference to avoid garbage collection

# Function for handling the drop of files (drag-and-drop)
def drop(event):
    image_path = event.data  # Get the dropped file path
    
    # Clean up the file path
    if image_path.startswith('{') and image_path.endswith('}'):
        image_path = image_path[1:-1]  # Remove the curly braces
    
    if image_path:
        image_entry.delete(0, tk.END)
        image_entry.insert(0, image_path)
        display_image(image_path)

# Enable drag-and-drop for the main window
window.drop_target_register(DND_FILES)  # Register the main window for drag-and-drop of files
window.dnd_bind('<<Drop>>', drop)  # Bind the drop event to the drop function

# Run the GUI loop
window.mainloop()

## Fetching Product Code

In [5]:
import os
import re
import time
import io
import tkinter as tk
from tkinter import filedialog, messagebox, ttk
from tkinterdnd2 import DND_FILES, TkinterDnD  # Import the drag-and-drop library
from tkinter import Text
from PIL import Image, ImageTk
import numpy as np
import torch
import torchvision.transforms as transforms
from torchvision import models
from sklearn.metrics.pairwise import cosine_similarity
import fitz  # PyMuPDF
from fuzzywuzzy import fuzz  # For string matching
from fuzzywuzzy import process  # For matching
import webbrowser  # To open PDFs
import warnings
warnings.filterwarnings('ignore')

# Load the pre-trained models
vgg_model = models.vgg19(pretrained=True)
resnet_model = models.resnet50(pretrained=True)
vgg_model.eval()
resnet_model.eval()

# Image preprocessing transformations
preprocess = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Extract text from PDF
def extract_text_from_pdf(pdf_path):
    with open(pdf_path, 'rb') as file:
        reader = fitz.open(file)
        text = ""
        for page_num in range(reader.page_count):
            page = reader.load_page(page_num)
            text += page.get_text()
    return clean_text(text)

# Clean extracted text
def clean_text(text):
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\n+', ' ', text)
    text = re.sub(r'[^\x00-\x7F]+', '', text)
    return text.strip()

# Extract specific data from cleaned text
def extract_specifications(text):
    power_pattern = r'\b(\d{1,4})\s*W\b'
    lumens_pattern = r'(\d{3,6}(\.\d+)?)\s*Lumens?\b'
    cct_pattern = r'\b(\d{4})\s*K\b'
    size_pattern = r'\b(?:\d+(?:\.\d+)?(?:\s*x\s*\d+(?:\.\d+)?)?|NA)\b'
    name_pattern = r'\b[A-Z][a-zA-Z0-9\s\-()]*[a-zA-Z)]\b'
    productcode_pattern = r'\b[A-Za-z0-9\-]+(?:/[A-Za-z0-9]+)*(?:\s*\([A-Za-z0-9/\s\-]+\))?\b'
    sku_pattern = r'\b\d{12}\b'

    power = re.findall(power_pattern, text)
    lumens = re.findall(lumens_pattern, text)
    cct = re.findall(cct_pattern, text)
    size = re.findall(size_pattern, text)
    name = re.findall(name_pattern, text)
    product_code = re.findall(productcode_pattern, text)
    sku = re.findall(sku_pattern, text)

    extracted_data = {
        "Power (W)": sorted(set(power), key=int),
        "Lumens": sorted(set([lum[0] for lum in lumens]), key=float),
        "CCT (K)": sorted(set(cct), key=int),
        "Size": sorted(set(size), key=lambda x: (len(x.split('x')), x)),  # Sort by dimensions
        "Name": sorted(set(name), key=str),
        "Product Code": sorted(set(product_code), key=str),
        "SKU": sorted(set(sku), key=str)
    }

    return extracted_data

# Extract images from PDF
def extract_images_from_pdf(pdf_path):
    images = []
    doc = fitz.open(pdf_path)
    for page_num in range(doc.page_count):
        page = doc.load_page(page_num)
        image_list = page.get_images(full=True)
        for img_index, img in enumerate(image_list):
            xref = img[0]
            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]
            image = Image.open(io.BytesIO(image_bytes)).convert('RGB')
            images.append(image)
    doc.close()
    return images

# Extract image features
def extract_image_features(image):
    vgg_features = extract_intermediate_features(image, vgg_model, 'features')
    resnet_features = extract_resnet_features(image)
    hog_features = extract_hog_features(image)

    vgg_weight = 0.5
    resnet_weight = 0.3
    hog_weight = 0.2

    combined_features = np.concatenate([
        vgg_features * vgg_weight, 
        resnet_features * resnet_weight, 
        hog_features * hog_weight
    ])

    return combined_features

def extract_intermediate_features(image, model, layer):
    activation = {}
    def hook_fn(module, input, output):
        activation['output'] = output
    handle = model._modules.get(layer).register_forward_hook(hook_fn)
    input_tensor = preprocess(image).unsqueeze(0)
    with torch.no_grad():
        model(input_tensor)
    intermediate_features = activation['output']
    handle.remove()
    return intermediate_features.flatten().numpy()

def extract_resnet_features(image):
    input_tensor = preprocess(image).unsqueeze(0)
    with torch.no_grad():
        resnet_features = resnet_model(input_tensor)
    return resnet_features.flatten().numpy()

def extract_hog_features(image):
    from skimage.feature import hog
    from skimage.color import rgb2gray
    image = image.resize((128, 128))
    image_np = np.array(image)
    gray_image = rgb2gray(image_np)
    hog_features = hog(gray_image, orientations=8, pixels_per_cell=(16, 16),
                       cells_per_block=(1, 1), visualize=False, feature_vector=True)
    return hog_features

def calculate_similarity(input_data, pdf_data):
    total_similarity = 0.0
    num_fields = 0

    # Function to compare numerical values with tolerance
    def compare_with_tolerance(input_list, pdf_list, tolerance=5):
        if not input_list or not pdf_list:
            return 0.0
        max_similarity = 0.0
        threshold_met = False  # To track if any values match the tolerance criteria

        # Compare each input entry with every pdf entry
        for input_value in input_list:
            if input_value == '':
                continue
            try:
                input_value = float(input_value)
                for pdf_value in pdf_list:
                    pdf_value = float(pdf_value)
                    similarity = 1 - abs(input_value - pdf_value) / input_value

                    # Only consider values that are close enough based on tolerance
                    if similarity >= 1 - (tolerance / 100):
                        max_similarity = max(max_similarity, similarity)
                        threshold_met = True  # Mark if any value is within the threshold
            except ValueError:
                continue

        # If no values matched the tolerance, return 0 similarity
        if not threshold_met:
            return 0.0
        return max_similarity

    # Define specific tolerances for different fields
    tolerances = {
        "Power (W)": 2,  # Strict tolerance for Power (W)
        "Lumens": 5,
        "CCT (K)": 4,
        "Size": 6,
        "Name": 3,
        "Product Code": 1,
        "SKU": 7,
    }

    # Compare numerical fields
    numerical_fields = ["Power (W)", "Lumens", "Size", "SKU"]
    for field in numerical_fields:
        if input_data[field] and any(item != '' for item in input_data[field]):
            num_fields += 1
            # Use the field-specific tolerance, default to 5% if not specified
            tolerance = tolerances.get(field, 5)
            similarity = compare_with_tolerance(input_data[field], pdf_data[field], tolerance=tolerance)
            total_similarity += similarity

    # Compare categorical fields using fuzzy matching
    categorical_fields = ["CCT (K)", "Name", "Product Code"]
    for field in categorical_fields:
        if input_data[field] and any(item != '' for item in input_data[field]):
            num_fields += 1
            match_scores = []
            for item in input_data[field]:
                if item == '':
                    continue
                best_match = process.extractOne(item, pdf_data[field], scorer=fuzz.token_sort_ratio)
                if best_match:
                    similarity = best_match[1] / 100.0
                    match_scores.append(similarity)
            if match_scores:
                total_similarity += max(match_scores)
            else:
                total_similarity += 0.0

    # Avoid division by zero
    if num_fields > 0:
        average_similarity = total_similarity / num_fields
    else:
        average_similarity = 0.0

    return average_similarity

def calculate_image_similarity(features1, features2):
    if features1.size == 0 or features2.size == 0:
        return 0.0
    features1 = features1.reshape(1, -1)
    features2 = features2.reshape(1, -1)
    similarity = cosine_similarity(features1, features2)[0][0]
    return similarity

# Find Top 10 similar PDFs with improved progress bar
def find_top_10_similar_pdfs(folder_path, input_data, input_image_path=None):
    pdf_scores = []
    progress_bar['maximum'] = 0  # Reset progress bar maximum

    start_time = time.time()

    # Use os.walk to traverse through all subdirectories
    for root, _, files in os.walk(folder_path):
        pdf_files = [os.path.join(root, f) for f in files if f.endswith('.pdf')]
        progress_bar['maximum'] += len(pdf_files)  # Update progress bar maximum

        for pdf_file in pdf_files:
            pdf_text = extract_text_from_pdf(pdf_file)
            extracted_data = extract_specifications(pdf_text)
            score = calculate_similarity(input_data, extracted_data)

            if input_image_path:
                images = extract_images_from_pdf(pdf_file)
                input_image = Image.open(input_image_path).convert('RGB')
                input_image_features = extract_image_features(input_image)

                max_image_similarity = 0
                for img in images:
                    image_features = extract_image_features(img)
                    image_similarity = calculate_image_similarity(input_image_features, image_features)
                    max_image_similarity = max(max_image_similarity, image_similarity)

                combined_score = score + max_image_similarity
            else:
                combined_score = score

            pdf_scores.append((pdf_file, combined_score))
            progress_bar['value'] += 1

            # Calculate elapsed time and ETA
            elapsed_time = time.time() - start_time
            processed_pdfs = progress_bar['value']
            total_pdfs = progress_bar['maximum']
            
            # Avoid division by zero
            if processed_pdfs > 0:  
                estimated_time_per_pdf = elapsed_time / processed_pdfs
                remaining_pdfs = total_pdfs - processed_pdfs
                eta = remaining_pdfs * estimated_time_per_pdf

                # Format time to show minutes and seconds
                eta_minutes, eta_seconds = divmod(eta, 60)
                eta_formatted = f"{int(eta_minutes)} min {int(eta_seconds)} sec"

                # Update progress label with percentage and ETA
                percentage = (processed_pdfs / total_pdfs) * 100
                progress_label.config(text=f"Progress: {processed_pdfs}/{total_pdfs} ({percentage:.2f}%) | ETA: {eta_formatted}")
            
            window.update()

    # Sort and get top 10 results
    pdf_scores = sorted(pdf_scores, key=lambda x: x[1], reverse=True)[:10]

    # Reset progress bar and label
    progress_bar['value'] = 0
    progress_label.config(text="Progress: 0%")

    result_box.delete(1.0, tk.END)
    for i, (pdf_path, score) in enumerate(pdf_scores):
        result_box.insert(tk.END, f"Top {i + 1}: {os.path.basename(pdf_path)} | Similarity Score: {score:.4f}\n")
        # Add hyperlink functionality for each PDF path
        result_box.tag_configure(f"pdf{i + 1}", foreground="blue", underline=True)
        result_box.insert(tk.END, f"{pdf_path}\n", f"pdf{i + 1}")
        result_box.tag_bind(f"pdf{i + 1}", "<Button-1>", lambda e, p=pdf_path: open_pdf(p))

# Open PDF
def open_pdf(pdf_path):
    webbrowser.open(f"file://{pdf_path}")

# GUI Setup
def browse_folder():
    folder_selected = filedialog.askdirectory()
    folder_entry.delete(0, tk.END)
    folder_entry.insert(0, folder_selected)

def browse_image():
    image_file = filedialog.askopenfilename(filetypes=[("Image files", "*.png;*.jpg;*.jpeg")])
    image_entry.delete(0, tk.END)
    image_entry.insert(0, image_file)

def process_pdfs():
    folder_path = folder_entry.get()
    input_image_path = image_entry.get()

    input_data = {
        "Power (W)": [power_entry.get()],
        "Lumens": [lumens_entry.get()],
        "CCT (K)": [cct_entry.get()],
        "Size": [size_entry.get()],
        "Name": [name_entry.get()],
        "Product Code": [product_code_entry.get()],
        "SKU": [sku_entry.get()],
    }

    if not folder_path:
        messagebox.showerror("Error", "Please select a folder.")
        return

    find_top_10_similar_pdfs(folder_path, input_data, input_image_path)

# Create the main window
window = TkinterDnD.Tk()
window.title("Product Finder")
window.geometry("1000x600")  # Increased window width for image display
window.configure(bg="#f2f2f2")  # Light grey background

# Allow the window to be resizable
window.columnconfigure(0, weight=1)
window.columnconfigure(1, weight=3)
window.columnconfigure(2, weight=1)
window.columnconfigure(3, weight=1)
window.rowconfigure(11, weight=1)

# Global font settings
title_font = ("Helvetica", 16, "bold")
label_font = ("Helvetica", 12)
entry_font = ("Helvetica", 10)

# Styling for the buttons
button_style = {
    "bg": "#4CAF50",
    "fg": "white",
    "font": ("Helvetica", 10, "bold"),
    "activebackground": "#45a049",
    "relief": tk.RAISED
}

# Title Label for "Cross Validation Tool"
title_label_cv = tk.Label(window, text="Product Finder", font=title_font, bg="#f2f2f2")
title_label_cv.grid(row=0, column=2, pady=20, sticky="w")  # Align to the west (left)

# Folder selection
folder_label = tk.Label(window, text="Select Folder:", font=label_font, bg="#f2f2f2")
folder_label.grid(row=1, column=0, padx=10, pady=10, sticky=tk.W)
folder_entry = tk.Entry(window, width=50, font=entry_font)
folder_entry.grid(row=1, column=1, padx=10, pady=10, sticky="ew")
folder_button = tk.Button(window, text="Browse", command=lambda: browse_folder(), **button_style)
folder_button.grid(row=1, column=2, padx=10, pady=10)

# Image selection
image_label = tk.Label(window, text="Input Image:", font=label_font, bg="#f2f2f2")
image_label.grid(row=2, column=0, padx=10, pady=10, sticky=tk.W)
image_entry = tk.Entry(window, width=50, font=entry_font)
image_entry.grid(row=2, column=1, padx=10, pady=10, sticky="ew")
image_button = tk.Button(window, text="Browse", command=lambda: browse_image(), **button_style)
image_button.grid(row=2, column=2, padx=10, pady=10)

# Input fields for specifications
labels = ["Power (W):", "Lumens (lm):", "CCT (K):", "Size:", "Name:", "Product Code:", "SKU:"]
entries = []

for i, label_text in enumerate(labels):
    label = tk.Label(window, text=label_text, font=label_font, bg="#f2f2f2")
    label.grid(row=i + 3, column=0, padx=10, pady=5, sticky=tk.W)
    entry = tk.Entry(window, width=20, font=entry_font)  # Reduced the width from 50 to 20
    entry.grid(row=i + 3, column=1, padx=10, pady=5, sticky="ew")
    entries.append(entry)

power_entry, lumens_entry, cct_entry, size_entry, name_entry, product_code_entry, sku_entry = entries

# Create a frame for the progress bar and label
progress_frame = tk.Frame(window, bg="#f2f2f2")
progress_frame.grid(row=14, column=0, columnspan=3, padx=10, pady=10, sticky="ew")  # Place it below other elements

# Progress bar inside the frame
progress_bar = ttk.Progressbar(progress_frame, orient="horizontal", length=500, mode="determinate")  # Increased length
progress_bar.pack(side=tk.LEFT, padx=10, pady=10)  # Align left

# Progress label inside the frame
progress_label = tk.Label(progress_frame, text="Progress: 0%", font=label_font, bg="#f2f2f2")
progress_label.pack(side=tk.LEFT, padx=10, pady=10)  # Align left next to the progress bar

# Result box (scrollable) below the image
result_frame = tk.Frame(window, bg="#f2f2f2")
result_frame.grid(row=4, column=3, rowspan=7, padx=10, pady=10, sticky="nsew")
result_frame.grid_columnconfigure(0, weight=1)  # Allow the frame to expand with the window

result_box = Text(result_frame, height=15, width=80, font=entry_font, wrap=tk.WORD)
result_box.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)

scrollbar = ttk.Scrollbar(result_frame, command=result_box.yview)
scrollbar.pack(side=tk.RIGHT, fill=tk.Y)
result_box.config(yscrollcommand=scrollbar.set)

# Submit button (Find Top 10 PDFs)
submit_button = tk.Button(window, text="Find Top 10 PDF's", command=lambda: process_pdfs(), **button_style)
submit_button.config(width=20)  # Set a reduced width for the button
submit_button.grid(row=13, column=3, padx=10, pady=10, sticky="ew")

# Add "Clear Sections" button
clear_button = tk.Button(window, text="Clear Sections", command=lambda: clear_sections(), **button_style)
clear_button.config(width=20)  # Set a reduced width for the button
clear_button.grid(row=14, column=3, padx=10, pady=10, sticky="ew")

# Image display (right side of the window)
image_display_label = tk.Label(window, text="Image Preview", font=label_font, bg="#f2f2f2")
image_display_label.grid(row=1, column=3, padx=10, pady=(10, 5), sticky=tk.NW)  # Align at top-left of the right side
image_canvas = tk.Label(window, bg="#f2f2f2")
image_canvas.grid(row=2, column=3, padx=10, pady=(0, 20), sticky=tk.NW)  # Ensure it's right under the label

# Function to clear all input fields and reset image display
def clear_sections():
    # Clear all input entries
    for entry in entries:
        entry.delete(0, tk.END)
    
    # Clear the folder and image entries
    folder_entry.delete(0, tk.END)
    image_entry.delete(0, tk.END)
    
    # Reset the image display
    image_canvas.config(image='')  # Clear the image on the right side
    
    # Optionally clear the result box as well
    result_box.delete(1.0, tk.END)

# Function for folder selection
def browse_folder():
    folder_path = filedialog.askdirectory()
    folder_entry.delete(0, tk.END)
    folder_entry.insert(0, folder_path)

# Function for image selection and display
def browse_image():
    image_path = filedialog.askopenfilename(filetypes=[("Image files", "*.jpg *.png *.jpeg")])
    if image_path:
        image_entry.delete(0, tk.END)
        image_entry.insert(0, image_path)
        display_image(image_path)

# Function to display the image on the right side
def display_image(image_path):
    # Open the image file
    img = Image.open(image_path)
    
    # Define the maximum width and height for the image
    max_width = 250  # You can adjust this value as needed
    max_height = 200  # You can adjust this value as needed
    
    # Resize the image while maintaining the aspect ratio
    img.thumbnail((max_width, max_height))  # Resize within the specified dimensions
    
    # Convert the image to a format Tkinter can handle
    img_tk = ImageTk.PhotoImage(img)
    
    # Update the image on the canvas
    image_canvas.config(image=img_tk)
    image_canvas.image = img_tk  # Store a reference to avoid garbage collection

# Function for handling the drop of files (drag-and-drop)
def drop(event):
    image_path = event.data  # Get the dropped file path
    
    # Clean up the file path
    if image_path.startswith('{') and image_path.endswith('}'):
        image_path = image_path[1:-1]  # Remove the curly braces
    
    if image_path:
        image_entry.delete(0, tk.END)
        image_entry.insert(0, image_path)
        display_image(image_path)

# Enable drag-and-drop for the main window
window.drop_target_register(DND_FILES)  # Register the main window for drag-and-drop of files
window.dnd_bind('<<Drop>>', drop)  # Bind the drop event to the drop function

# Run the GUI loop
window.mainloop()

## Stop Execution button and reset of progress bar on clearing sections.

In [11]:
import os
import re
import time
import io
import tkinter as tk
from tkinter import filedialog, messagebox, ttk
from tkinterdnd2 import DND_FILES, TkinterDnD  # Import the drag-and-drop library
from tkinter import Text
from PIL import Image, ImageTk
import numpy as np
import torch
import torchvision.transforms as transforms
from torchvision import models
from sklearn.metrics.pairwise import cosine_similarity
import fitz  # PyMuPDF
from fuzzywuzzy import fuzz  # For string matching
from fuzzywuzzy import process  # For matching
import webbrowser  # To open PDFs
import warnings
warnings.filterwarnings('ignore')

# Load the pre-trained models
vgg_model = models.vgg19(pretrained=True)
resnet_model = models.resnet50(pretrained=True)
vgg_model.eval()
resnet_model.eval()

# Image preprocessing transformations
preprocess = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Extract text from PDF
def extract_text_from_pdf(pdf_path):
    with open(pdf_path, 'rb') as file:
        reader = fitz.open(file)
        text = ""
        for page_num in range(reader.page_count):
            page = reader.load_page(page_num)
            text += page.get_text()
    return clean_text(text)

# Clean extracted text
def clean_text(text):
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\n+', ' ', text)
    text = re.sub(r'[^\x00-\x7F]+', '', text)
    return text.strip()

# Extract specific data from cleaned text
def extract_specifications(text):
    power_pattern = r'\b(\d{1,4})\s*W\b'
    lumens_pattern = r'(\d{3,6}(\.\d+)?)\s*Lumens?\b'
    cct_pattern = r'\b(\d{4})\s*K\b'
    size_pattern = r'\b(?:\d+(?:\.\d+)?(?:\s*x\s*\d+(?:\.\d+)?)?|NA)\b'
    name_pattern = r'\b[A-Z][a-zA-Z0-9\s\-()]*[a-zA-Z)]\b'
    productcode_pattern = r'\b[A-Za-z0-9\-]+(?:/[A-Za-z0-9]+)*(?:\s*\([A-Za-z0-9/\s\-]+\))?\b'
    sku_pattern = r'\b\d{12}\b'

    power = re.findall(power_pattern, text)
    lumens = re.findall(lumens_pattern, text)
    cct = re.findall(cct_pattern, text)
    size = re.findall(size_pattern, text)
    name = re.findall(name_pattern, text)
    product_code = re.findall(productcode_pattern, text)
    sku = re.findall(sku_pattern, text)

    extracted_data = {
        "Power (W)": sorted(set(power), key=int),
        "Lumens": sorted(set([lum[0] for lum in lumens]), key=float),
        "CCT (K)": sorted(set(cct), key=int),
        "Size": sorted(set(size), key=lambda x: (len(x.split('x')), x)),  # Sort by dimensions
        "Name": sorted(set(name), key=str),
        "Product Code": sorted(set(product_code), key=str),
        "SKU": sorted(set(sku), key=str)
    }

    return extracted_data

# Extract images from PDF
def extract_images_from_pdf(pdf_path):
    images = []
    doc = fitz.open(pdf_path)
    for page_num in range(doc.page_count):
        page = doc.load_page(page_num)
        image_list = page.get_images(full=True)
        for img_index, img in enumerate(image_list):
            xref = img[0]
            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]
            image = Image.open(io.BytesIO(image_bytes)).convert('RGB')
            images.append(image)
    doc.close()
    return images

# Extract image features
def extract_image_features(image):
    vgg_features = extract_intermediate_features(image, vgg_model, 'features')
    resnet_features = extract_resnet_features(image)
    hog_features = extract_hog_features(image)

    vgg_weight = 0.5
    resnet_weight = 0.3
    hog_weight = 0.2

    combined_features = np.concatenate([
        vgg_features * vgg_weight, 
        resnet_features * resnet_weight, 
        hog_features * hog_weight
    ])

    return combined_features

def extract_intermediate_features(image, model, layer):
    activation = {}
    def hook_fn(module, input, output):
        activation['output'] = output
    handle = model._modules.get(layer).register_forward_hook(hook_fn)
    input_tensor = preprocess(image).unsqueeze(0)
    with torch.no_grad():
        model(input_tensor)
    intermediate_features = activation['output']
    handle.remove()
    return intermediate_features.flatten().numpy()

def extract_resnet_features(image):
    input_tensor = preprocess(image).unsqueeze(0)
    with torch.no_grad():
        resnet_features = resnet_model(input_tensor)
    return resnet_features.flatten().numpy()

def extract_hog_features(image):
    from skimage.feature import hog
    from skimage.color import rgb2gray
    image = image.resize((128, 128))
    image_np = np.array(image)
    gray_image = rgb2gray(image_np)
    hog_features = hog(gray_image, orientations=8, pixels_per_cell=(16, 16),
                       cells_per_block=(1, 1), visualize=False, feature_vector=True)
    return hog_features

def calculate_similarity(input_data, pdf_data):
    total_similarity = 0.0
    num_fields = 0

    # Function to compare numerical values with tolerance
    def compare_with_tolerance(input_list, pdf_list, tolerance=5):
        if not input_list or not pdf_list:
            return 0.0
        max_similarity = 0.0
        threshold_met = False  # To track if any values match the tolerance criteria

        # Compare each input entry with every pdf entry
        for input_value in input_list:
            if input_value == '':
                continue
            try:
                input_value = float(input_value)
                for pdf_value in pdf_list:
                    pdf_value = float(pdf_value)
                    similarity = 1 - abs(input_value - pdf_value) / input_value

                    # Only consider values that are close enough based on tolerance
                    if similarity >= 1 - (tolerance / 100):
                        max_similarity = max(max_similarity, similarity)
                        threshold_met = True  # Mark if any value is within the threshold
            except ValueError:
                continue

        # If no values matched the tolerance, return 0 similarity
        if not threshold_met:
            return 0.0
        return max_similarity

    # Define specific tolerances for different fields
    tolerances = {
        "Power (W)": 2,  # Strict tolerance for Power (W)
        "Lumens": 5,
        "CCT (K)": 4,
        "Size": 6,
        "Name": 3,
        "Product Code": 1,
        "SKU": 7,
    }

    # Compare numerical fields
    numerical_fields = ["Power (W)", "Lumens", "Size", "SKU"]
    for field in numerical_fields:
        if input_data[field] and any(item != '' for item in input_data[field]):
            num_fields += 1
            # Use the field-specific tolerance, default to 5% if not specified
            tolerance = tolerances.get(field, 5)
            similarity = compare_with_tolerance(input_data[field], pdf_data[field], tolerance=tolerance)
            total_similarity += similarity

    # Compare categorical fields using fuzzy matching
    categorical_fields = ["CCT (K)", "Name", "Product Code"]
    for field in categorical_fields:
        if input_data[field] and any(item != '' for item in input_data[field]):
            num_fields += 1
            match_scores = []
            for item in input_data[field]:
                if item == '':
                    continue
                best_match = process.extractOne(item, pdf_data[field], scorer=fuzz.token_sort_ratio)
                if best_match:
                    similarity = best_match[1] / 100.0
                    match_scores.append(similarity)
            if match_scores:
                total_similarity += max(match_scores)
            else:
                total_similarity += 0.0

    # Avoid division by zero
    if num_fields > 0:
        average_similarity = total_similarity / num_fields
    else:
        average_similarity = 0.0

    return average_similarity

def calculate_image_similarity(features1, features2):
    if features1.size == 0 or features2.size == 0:
        return 0.0
    features1 = features1.reshape(1, -1)
    features2 = features2.reshape(1, -1)
    similarity = cosine_similarity(features1, features2)[0][0]
    return similarity

# Global variable to control the stopping of the process
stop_flag = False

# Function to stop the execution
def stop_execution():
    global stop_flag
    stop_flag = True

# Modify the function to check for the stop flag
def find_top_10_similar_pdfs(folder_path, input_data, input_image_path=None):
    global stop_flag
    stop_flag = False  # Reset the stop flag each time this function starts
    pdf_scores = []
    progress_bar['maximum'] = 0  # Reset progress bar maximum

    start_time = time.time()

    # Use os.walk to traverse through all subdirectories
    for root, _, files in os.walk(folder_path):
        if stop_flag:  # Check if the stop flag is set
            break

        pdf_files = [os.path.join(root, f) for f in files if f.endswith('.pdf')]
        progress_bar['maximum'] += len(pdf_files)  # Update progress bar maximum

        for pdf_file in pdf_files:
            if stop_flag:  # Check if the stop flag is set
                break

            pdf_text = extract_text_from_pdf(pdf_file)
            extracted_data = extract_specifications(pdf_text)
            score = calculate_similarity(input_data, extracted_data)

            if input_image_path:
                images = extract_images_from_pdf(pdf_file)
                input_image = Image.open(input_image_path).convert('RGB')
                input_image_features = extract_image_features(input_image)

                max_image_similarity = 0
                for img in images:
                    if stop_flag:  # Check if the stop flag is set
                        break

                    image_features = extract_image_features(img)
                    image_similarity = calculate_image_similarity(input_image_features, image_features)
                    max_image_similarity = max(max_image_similarity, image_similarity)

                combined_score = score + max_image_similarity
            else:
                combined_score = score

            pdf_scores.append((pdf_file, combined_score))
            progress_bar['value'] += 1

            # Calculate elapsed time and ETA
            elapsed_time = time.time() - start_time
            processed_pdfs = progress_bar['value']
            total_pdfs = progress_bar['maximum']

            if processed_pdfs > 0:
                estimated_time_per_pdf = elapsed_time / processed_pdfs
                remaining_pdfs = total_pdfs - processed_pdfs
                eta = remaining_pdfs * estimated_time_per_pdf

                eta_minutes, eta_seconds = divmod(eta, 60)
                eta_formatted = f"{int(eta_minutes)} min {int(eta_seconds)} sec"

                percentage = (processed_pdfs / total_pdfs) * 100
                progress_label.config(text=f"Progress: {processed_pdfs}/{total_pdfs} ({percentage:.2f}%) | ETA: {eta_formatted}")

            window.update()

    pdf_scores = sorted(pdf_scores, key=lambda x: x[1], reverse=True)[:10]

    progress_bar['value'] = 0
    progress_label.config(text="Progress: 0%")

    result_box.delete(1.0, tk.END)
    for i, (pdf_path, score) in enumerate(pdf_scores):
        result_box.insert(tk.END, f"Top {i + 1}: {os.path.basename(pdf_path)} | Similarity Score: {score:.4f}\n")
        result_box.tag_configure(f"pdf{i + 1}", foreground="blue", underline=True)
        result_box.insert(tk.END, f"{pdf_path}\n", f"pdf{i + 1}")
        result_box.tag_bind(f"pdf{i + 1}", "<Button-1>", lambda e, p=pdf_path: open_pdf(p))

# Open PDF
def open_pdf(pdf_path):
    webbrowser.open(f"file://{pdf_path}")

# GUI Setup
def browse_folder():
    folder_selected = filedialog.askdirectory()
    folder_entry.delete(0, tk.END)
    folder_entry.insert(0, folder_selected)

def browse_image():
    image_file = filedialog.askopenfilename(filetypes=[("Image files", "*.png;*.jpg;*.jpeg")])
    image_entry.delete(0, tk.END)
    image_entry.insert(0, image_file)

def process_pdfs():
    folder_path = folder_entry.get()
    input_image_path = image_entry.get()

    input_data = {
        "Power (W)": [power_entry.get()],
        "Lumens": [lumens_entry.get()],
        "CCT (K)": [cct_entry.get()],
        "Size": [size_entry.get()],
        "Name": [name_entry.get()],
        "Product Code": [product_code_entry.get()],
        "SKU": [sku_entry.get()],
    }

    if not folder_path:
        messagebox.showerror("Error", "Please select a folder.")
        return

    find_top_10_similar_pdfs(folder_path, input_data, input_image_path)

# Create the main window
window = TkinterDnD.Tk()
window.title("Product Finder")
window.geometry("1000x600")  # Increased window width for image display
window.configure(bg="#f2f2f2")  # Light grey background

# Allow the window to be resizable
window.columnconfigure(0, weight=1)
window.columnconfigure(1, weight=3)
window.columnconfigure(2, weight=1)
window.columnconfigure(3, weight=1)
window.rowconfigure(11, weight=1)

# Global font settings
title_font = ("Helvetica", 16, "bold")
label_font = ("Helvetica", 12)
entry_font = ("Helvetica", 10)

# Styling for the buttons
button_style = {
    "bg": "#4CAF50",
    "fg": "white",
    "font": ("Helvetica", 10, "bold"),
    "activebackground": "#45a049",
    "relief": tk.RAISED
}

# Title Label for "Cross Validation Tool"
title_label_cv = tk.Label(window, text="Product Finder", font=title_font, bg="#f2f2f2")
title_label_cv.grid(row=0, column=2, pady=20, sticky="w")  # Align to the west (left)

# Folder selection
folder_label = tk.Label(window, text="Select Folder:", font=label_font, bg="#f2f2f2")
folder_label.grid(row=1, column=0, padx=10, pady=10, sticky=tk.W)
folder_entry = tk.Entry(window, width=50, font=entry_font)
folder_entry.grid(row=1, column=1, padx=10, pady=10, sticky="ew")
folder_button = tk.Button(window, text="Browse", command=lambda: browse_folder(), **button_style)
folder_button.grid(row=1, column=2, padx=10, pady=10)

# Image selection
image_label = tk.Label(window, text="Input Image:", font=label_font, bg="#f2f2f2")
image_label.grid(row=2, column=0, padx=10, pady=10, sticky=tk.W)
image_entry = tk.Entry(window, width=50, font=entry_font)
image_entry.grid(row=2, column=1, padx=10, pady=10, sticky="ew")
image_button = tk.Button(window, text="Browse", command=lambda: browse_image(), **button_style)
image_button.grid(row=2, column=2, padx=10, pady=10)

# Input fields for specifications
labels = ["Power (W):", "Lumens (lm):", "CCT (K):", "Size:", "Name:", "Product Code:", "SKU:"]
entries = []

for i, label_text in enumerate(labels):
    label = tk.Label(window, text=label_text, font=label_font, bg="#f2f2f2")
    label.grid(row=i + 3, column=0, padx=10, pady=5, sticky=tk.W)
    entry = tk.Entry(window, width=20, font=entry_font)  # Reduced the width from 50 to 20
    entry.grid(row=i + 3, column=1, padx=10, pady=5, sticky="ew")
    entries.append(entry)

power_entry, lumens_entry, cct_entry, size_entry, name_entry, product_code_entry, sku_entry = entries

# Create a frame for the progress bar and label
progress_frame = tk.Frame(window, bg="#f2f2f2")
progress_frame.grid(row=14, column=0, columnspan=3, padx=10, pady=10, sticky="ew")  # Place it below other elements

# Progress bar inside the frame
progress_bar = ttk.Progressbar(progress_frame, orient="horizontal", length=500, mode="determinate")  # Increased length
progress_bar.pack(side=tk.LEFT, padx=10, pady=10)  # Align left

# Progress label inside the frame
progress_label = tk.Label(progress_frame, text="Progress: 0%", font=label_font, bg="#f2f2f2")
progress_label.pack(side=tk.LEFT, padx=10, pady=10)  # Align left next to the progress bar

# Result box (scrollable) below the image
result_frame = tk.Frame(window, bg="#f2f2f2")
result_frame.grid(row=4, column=3, rowspan=7, padx=10, pady=10, sticky="nsew")
result_frame.grid_columnconfigure(0, weight=1)  # Allow the frame to expand with the window

result_box = Text(result_frame, height=15, width=80, font=entry_font, wrap=tk.WORD)
result_box.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)

scrollbar = ttk.Scrollbar(result_frame, command=result_box.yview)
scrollbar.pack(side=tk.RIGHT, fill=tk.Y)
result_box.config(yscrollcommand=scrollbar.set)

# Submit button (Find Top 10 PDFs)
submit_button = tk.Button(window, text="Find Top 10 PDF's", command=lambda: process_pdfs(), **button_style)
submit_button.config(width=20)  # Set a reduced width for the button
submit_button.grid(row=13, column=3, padx=10, pady=10, sticky="ew")

# Add "Clear Sections" button
clear_button = tk.Button(window, text="Clear Sections", command=clear_sections, **button_style)
clear_button.config(width=20)
clear_button.grid(row=14, column=3, padx=10, pady=10, sticky="ew")

# Add a stop button to the GUI
stop_button = tk.Button(window, text="Stop Execution", command=stop_execution, **button_style)
stop_button.config(width=20)
stop_button.grid(row=15, column=3, padx=10, pady=10, sticky="ew")

# Image display (right side of the window)
image_display_label = tk.Label(window, text="Image Preview", font=label_font, bg="#f2f2f2")
image_display_label.grid(row=1, column=3, padx=10, pady=(10, 5), sticky=tk.NW)  # Align at top-left of the right side
image_canvas = tk.Label(window, bg="#f2f2f2")
image_canvas.grid(row=2, column=3, padx=10, pady=(0, 20), sticky=tk.NW)  # Ensure it's right under the label

# Function to clear all input fields and reset image display
def clear_sections():
    # Clear all input entries
    for entry in entries:
        entry.delete(0, tk.END)
    
    # Clear the folder and image entries
    folder_entry.delete(0, tk.END)
    image_entry.delete(0, tk.END)
    
    # Reset the image display
    image_canvas.config(image='')  # Clear the image on the right side
    
    # Optionally clear the result box as well
    result_box.delete(1.0, tk.END)

    # Clear all the text boxes or any sections you want to clear
    result_box.delete(1.0, tk.END)
    progress_label.config(text="Progress: 0%")  # Reset progress label
    progress_bar['value'] = 0  # Reset the progress bar
    progress_bar['maximum'] = 0  # Reset progress bar's maximum value
    

# Function for folder selection
def browse_folder():
    folder_path = filedialog.askdirectory()
    folder_entry.delete(0, tk.END)
    folder_entry.insert(0, folder_path)

# Function for image selection and display
def browse_image():
    image_path = filedialog.askopenfilename(filetypes=[("Image files", "*.jpg *.png *.jpeg")])
    if image_path:
        image_entry.delete(0, tk.END)
        image_entry.insert(0, image_path)
        display_image(image_path)

# Function to display the image on the right side
def display_image(image_path):
    # Open the image file
    img = Image.open(image_path)
    
    # Define the maximum width and height for the image
    max_width = 250  # You can adjust this value as needed
    max_height = 200  # You can adjust this value as needed
    
    # Resize the image while maintaining the aspect ratio
    img.thumbnail((max_width, max_height))  # Resize within the specified dimensions
    
    # Convert the image to a format Tkinter can handle
    img_tk = ImageTk.PhotoImage(img)
    
    # Update the image on the canvas
    image_canvas.config(image=img_tk)
    image_canvas.image = img_tk  # Store a reference to avoid garbage collection

# Function for handling the drop of files (drag-and-drop)
def drop(event):
    image_path = event.data  # Get the dropped file path
    
    # Clean up the file path
    if image_path.startswith('{') and image_path.endswith('}'):
        image_path = image_path[1:-1]  # Remove the curly braces
    
    if image_path:
        image_entry.delete(0, tk.END)
        image_entry.insert(0, image_path)
        display_image(image_path)

# Enable drag-and-drop for the main window
window.drop_target_register(DND_FILES)  # Register the main window for drag-and-drop of files
window.dnd_bind('<<Drop>>', drop)  # Bind the drop event to the drop function

# Run the GUI loop
window.mainloop()

## Working on the GUI

In [204]:
import os
import re
import time
import io
import tkinter as tk
from tkinter import filedialog, messagebox, ttk
from tkinterdnd2 import DND_FILES, TkinterDnD  # Import the drag-and-drop library
from tkinter import Text
from PIL import Image, ImageTk
import numpy as np
import torch
import torchvision.transforms as transforms
from torchvision import models
from sklearn.metrics.pairwise import cosine_similarity
import fitz  # PyMuPDF
from fuzzywuzzy import fuzz  # For string matching
from fuzzywuzzy import process  # For matching
import webbrowser  # To open PDFs
import warnings
warnings.filterwarnings('ignore')

# Load the pre-trained models
vgg_model = models.vgg19(pretrained=True)
resnet_model = models.resnet50(pretrained=True)
vgg_model.eval()
resnet_model.eval()

# Image preprocessing transformations
preprocess = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Extract text from PDF
def extract_text_from_pdf(pdf_path):
    with open(pdf_path, 'rb') as file:
        reader = fitz.open(file)
        text = ""
        for page_num in range(reader.page_count):
            page = reader.load_page(page_num)
            text += page.get_text()
    return clean_text(text)

# Clean extracted text
def clean_text(text):
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\n+', ' ', text)
    text = re.sub(r'[^\x00-\x7F]+', '', text)
    return text.strip()

# Extract specific data from cleaned text
def extract_specifications(text):
    power_pattern = r'\b(\d{1,4})\s*W\b'
    lumens_pattern = r'(\d{3,6}(\.\d+)?)\s*Lumens?\b'
    cct_pattern = r'\b(\d{4})\s*K\b'
    size_pattern = r'\b(?:\d+(?:\.\d+)?(?:\s*x\s*\d+(?:\.\d+)?)?|NA)\b'
    name_pattern = r'\b[A-Z][a-zA-Z0-9\s\-()]*[a-zA-Z)]\b'
    productcode_pattern = r'\b[A-Za-z0-9\-]+(?:/[A-Za-z0-9]+)*(?:\s*\([A-Za-z0-9/\s\-]+\))?\b'
    sku_pattern = r'\b\d{12}\b'

    power = re.findall(power_pattern, text)
    lumens = re.findall(lumens_pattern, text)
    cct = re.findall(cct_pattern, text)
    size = re.findall(size_pattern, text)
    name = re.findall(name_pattern, text)
    product_code = re.findall(productcode_pattern, text)
    sku = re.findall(sku_pattern, text)

    extracted_data = {
        "Power (W)": sorted(set(power), key=int),
        "Lumens": sorted(set([lum[0] for lum in lumens]), key=float),
        "CCT (K)": sorted(set(cct), key=int),
        "Size": sorted(set(size), key=lambda x: (len(x.split('x')), x)),  # Sort by dimensions
        "Name": sorted(set(name), key=str),
        "Product Code": sorted(set(product_code), key=str),
        "SKU": sorted(set(sku), key=str)
    }

    return extracted_data

# Extract images from PDF
def extract_images_from_pdf(pdf_path):
    images = []
    doc = fitz.open(pdf_path)
    for page_num in range(doc.page_count):
        page = doc.load_page(page_num)
        image_list = page.get_images(full=True)
        for img_index, img in enumerate(image_list):
            xref = img[0]
            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]
            image = Image.open(io.BytesIO(image_bytes)).convert('RGB')
            images.append(image)
    doc.close()
    return images

# Extract image features
def extract_image_features(image):
    vgg_features = extract_intermediate_features(image, vgg_model, 'features')
    resnet_features = extract_resnet_features(image)
    hog_features = extract_hog_features(image)

    vgg_weight = 0.5
    resnet_weight = 0.3
    hog_weight = 0.2

    combined_features = np.concatenate([
        vgg_features * vgg_weight, 
        resnet_features * resnet_weight, 
        hog_features * hog_weight
    ])

    return combined_features

def extract_intermediate_features(image, model, layer):
    activation = {}
    def hook_fn(module, input, output):
        activation['output'] = output
    handle = model._modules.get(layer).register_forward_hook(hook_fn)
    input_tensor = preprocess(image).unsqueeze(0)
    with torch.no_grad():
        model(input_tensor)
    intermediate_features = activation['output']
    handle.remove()
    return intermediate_features.flatten().numpy()

def extract_resnet_features(image):
    input_tensor = preprocess(image).unsqueeze(0)
    with torch.no_grad():
        resnet_features = resnet_model(input_tensor)
    return resnet_features.flatten().numpy()

def extract_hog_features(image):
    from skimage.feature import hog
    from skimage.color import rgb2gray
    image = image.resize((128, 128))
    image_np = np.array(image)
    gray_image = rgb2gray(image_np)
    hog_features = hog(gray_image, orientations=8, pixels_per_cell=(16, 16),
                       cells_per_block=(1, 1), visualize=False, feature_vector=True)
    return hog_features

def calculate_similarity(input_data, pdf_data):
    total_similarity = 0.0
    num_fields = 0

    # Function to compare numerical values with tolerance
    def compare_with_tolerance(input_list, pdf_list, tolerance=5):
        if not input_list or not pdf_list:
            return 0.0
        max_similarity = 0.0
        threshold_met = False  # To track if any values match the tolerance criteria

        # Compare each input entry with every pdf entry
        for input_value in input_list:
            if input_value == '':
                continue
            try:
                input_value = float(input_value)
                for pdf_value in pdf_list:
                    pdf_value = float(pdf_value)
                    similarity = 1 - abs(input_value - pdf_value) / input_value

                    # Only consider values that are close enough based on tolerance
                    if similarity >= 1 - (tolerance / 100):
                        max_similarity = max(max_similarity, similarity)
                        threshold_met = True  # Mark if any value is within the threshold
            except ValueError:
                continue

        # If no values matched the tolerance, return 0 similarity
        if not threshold_met:
            return 0.0
        return max_similarity

    # Define specific tolerances for different fields
    tolerances = {
        "Power (W)": 2,  # Strict tolerance for Power (W)
        "Lumens": 5,
        "CCT (K)": 4,
        "Size": 6,
        "Name": 3,
        "Product Code": 1,
        "SKU": 7,
    }

    # Compare numerical fields
    numerical_fields = ["Power (W)", "Lumens", "Size", "SKU"]
    for field in numerical_fields:
        if input_data[field] and any(item != '' for item in input_data[field]):
            num_fields += 1
            # Use the field-specific tolerance, default to 5% if not specified
            tolerance = tolerances.get(field, 5)
            similarity = compare_with_tolerance(input_data[field], pdf_data[field], tolerance=tolerance)
            total_similarity += similarity

    # Compare categorical fields using fuzzy matching
    categorical_fields = ["CCT (K)", "Name", "Product Code"]
    for field in categorical_fields:
        if input_data[field] and any(item != '' for item in input_data[field]):
            num_fields += 1
            match_scores = []
            for item in input_data[field]:
                if item == '':
                    continue
                best_match = process.extractOne(item, pdf_data[field], scorer=fuzz.token_sort_ratio)
                if best_match:
                    similarity = best_match[1] / 100.0
                    match_scores.append(similarity)
            if match_scores:
                total_similarity += max(match_scores)
            else:
                total_similarity += 0.0

    # Avoid division by zero
    if num_fields > 0:
        average_similarity = total_similarity / num_fields
    else:
        average_similarity = 0.0

    return average_similarity

def calculate_image_similarity(features1, features2):
    if features1.size == 0 or features2.size == 0:
        return 0.0
    features1 = features1.reshape(1, -1)
    features2 = features2.reshape(1, -1)
    similarity = cosine_similarity(features1, features2)[0][0]
    return similarity

# Global variable to control the stopping of the process
stop_flag = False

# Function to stop the execution
def stop_execution():
    global stop_flag
    stop_flag = True

# Modify the function to check for the stop flag
def find_top_10_similar_pdfs(folder_path, input_data, input_image_path=None):
    global stop_flag
    stop_flag = False  # Reset the stop flag each time this function starts
    pdf_scores = []
    progress_bar['maximum'] = 0  # Reset progress bar maximum

    start_time = time.time()

    # Use os.walk to traverse through all subdirectories
    for root, _, files in os.walk(folder_path):
        if stop_flag:  # Check if the stop flag is set
            break

        pdf_files = [os.path.join(root, f) for f in files if f.endswith('.pdf')]
        progress_bar['maximum'] += len(pdf_files)  # Update progress bar maximum

        for pdf_file in pdf_files:
            if stop_flag:  # Check if the stop flag is set
                break

            pdf_text = extract_text_from_pdf(pdf_file)
            extracted_data = extract_specifications(pdf_text)
            score = calculate_similarity(input_data, extracted_data)

            if input_image_path:
                images = extract_images_from_pdf(pdf_file)
                input_image = Image.open(input_image_path).convert('RGB')
                input_image_features = extract_image_features(input_image)

                max_image_similarity = 0
                for img in images:
                    if stop_flag:  # Check if the stop flag is set
                        break

                    image_features = extract_image_features(img)
                    image_similarity = calculate_image_similarity(input_image_features, image_features)
                    max_image_similarity = max(max_image_similarity, image_similarity)

                combined_score = score + max_image_similarity
            else:
                combined_score = score

            pdf_scores.append((pdf_file, combined_score))
            progress_bar['value'] += 1

            # Calculate elapsed time and ETA
            elapsed_time = time.time() - start_time
            processed_pdfs = progress_bar['value']
            total_pdfs = progress_bar['maximum']

            if processed_pdfs > 0:
                estimated_time_per_pdf = elapsed_time / processed_pdfs
                remaining_pdfs = total_pdfs - processed_pdfs
                eta = remaining_pdfs * estimated_time_per_pdf

                eta_minutes, eta_seconds = divmod(eta, 60)
                eta_formatted = f"{int(eta_minutes)} min {int(eta_seconds)} sec"

                percentage = (processed_pdfs / total_pdfs) * 100
                progress_label.config(text=f"Progress: {processed_pdfs}/{total_pdfs} ({percentage:.2f}%) | ETA: {eta_formatted}")

            window.update()

    pdf_scores = sorted(pdf_scores, key=lambda x: x[1], reverse=True)[:10]

    progress_bar['value'] = 0
    progress_label.config(text="Progress: 0%")

    result_box.delete(1.0, tk.END)
    for i, (pdf_path, score) in enumerate(pdf_scores):
        result_box.insert(tk.END, f"Top {i + 1}: {os.path.basename(pdf_path)} | Similarity Score: {score:.4f}\n")
        result_box.tag_configure(f"pdf{i + 1}", foreground="blue", underline=True)
        result_box.insert(tk.END, f"{pdf_path}\n", f"pdf{i + 1}")
        result_box.tag_bind(f"pdf{i + 1}", "<Button-1>", lambda e, p=pdf_path: open_pdf(p))

# Open PDF
def open_pdf(pdf_path):
    webbrowser.open(f"file://{pdf_path}")

# GUI Setup
def browse_folder():
    folder_selected = filedialog.askdirectory()
    folder_entry.delete(0, tk.END)
    folder_entry.insert(0, folder_selected)

def browse_image():
    image_file = filedialog.askopenfilename(filetypes=[("Image files", "*.png;*.jpg;*.jpeg")])
    image_entry.delete(0, tk.END)
    image_entry.insert(0, image_file)

def process_pdfs():
    folder_path = folder_entry.get()
    input_image_path = image_entry.get()

    input_data = {
        "Power (W)": [power_entry.get()],
        "Lumens": [lumens_entry.get()],
        "CCT (K)": [cct_entry.get()],
        "Size": [size_entry.get()],
        "Name": [name_entry.get()],
        "Product Code": [product_code_entry.get()],
        "SKU": [sku_entry.get()],
    }

    if not folder_path:
        messagebox.showerror("Error", "Please select a folder.")
        return

    find_top_10_similar_pdfs(folder_path, input_data, input_image_path)

# Create the main window
window = TkinterDnD.Tk()
window.title("Product Finder")
window.geometry("1000x600")  # Increased window width for image display
window.configure(bg="#f2f2f2")  # Light grey background

# Allow the window to be resizable
window.columnconfigure(0, weight=1)
window.columnconfigure(1, weight=3)
window.columnconfigure(2, weight=1)
window.columnconfigure(3, weight=1)
window.rowconfigure(11, weight=1)

# Global font settings
title_font = ("Helvetica", 16, "bold")
label_font = ("Helvetica", 12)
entry_font = ("Helvetica", 10)

# Styling for the buttons
button_style = {
    "bg": "#4CAF50",
    "fg": "white",
    "font": ("Helvetica", 10, "bold"),
    "activebackground": "#45a049",
    "relief": tk.RAISED
}

# Frame for title labels
title_frame = tk.Frame(window, bg="#f2f2f2")
title_frame.grid(row=0, column=1, columnspan=2, pady=20, sticky="ew")

# Title Label for "Product Finder"
title_label_cv = tk.Label(title_frame, text="Product Finder", font=title_font, bg="#f2f2f2")
title_label_cv.grid(row=0, column=1, sticky="w")  # Align to the west (left)

# Frame for folder and image selection
selection_frame = tk.Frame(window, bg="#f2f2f2")
selection_frame.grid(row=1, column=0, columnspan=4, padx=455, pady=10, sticky="ew")

# Folder selection
folder_label = tk.Label(selection_frame, text="Select Folder:", font=label_font, bg="#f2f2f2")
folder_label.grid(row=0, column=0, padx=(50, 10), pady=10, sticky=tk.W)  # Keep the padx as is
folder_entry = tk.Entry(selection_frame, width=50, font=entry_font)
folder_entry.grid(row=0, column=1, padx=10, pady=10, sticky="ew")
folder_button = tk.Button(selection_frame, text="Browse", command=lambda: browse_folder(), **button_style)
folder_button.grid(row=0, column=2, padx=(0, 10), pady=10)  # Remove padx to keep it right next to the entry

# Image selection
image_label = tk.Label(selection_frame, text="Input Image:", font=label_font, bg="#f2f2f2")
image_label.grid(row=1, column=0, padx=(50, 10), pady=10, sticky=tk.W)  # Keep the padx as is
image_entry = tk.Entry(selection_frame, width=50, font=entry_font)
image_entry.grid(row=1, column=1, padx=10, pady=10, sticky="ew")
image_button = tk.Button(selection_frame, text="Browse", command=lambda: browse_image(), **button_style)
image_button.grid(row=1, column=2, padx=(0, 10), pady=10)  # Remove padx to keep it right next to the entry

# Create a frame for the centered labels and entry fields
centered_input_frame = tk.Frame(window, bg="#f2f2f2")
centered_input_frame.grid(row=2, column=1, padx=10, pady=10, sticky="nsew")  # Position it in the grid

# Centered input fields for specifications
centered_labels = ["Power (W):", "Lumens (lm):", "CCT (K):", "Size:"]
centered_entries = []

for i, label_text in enumerate(centered_labels):
    label = tk.Label(centered_input_frame, text=label_text, font=label_font, bg="#f2f2f2")
    label.grid(row=i, column=0, padx=(50, 10), pady=5, sticky="e")  # Increased left padx to 50 for more right shift
    entry = tk.Entry(centered_input_frame, width=20, font=entry_font)
    entry.grid(row=i, column=1, padx=10, pady=5, sticky="ew")  # Align entries to expand in the center
    centered_entries.append(entry)

# Create a frame for the right-aligned labels and entry fields
right_input_frame = tk.Frame(window, bg="#f2f2f2")
right_input_frame.grid(row=2, column=2, padx=10, pady=10, sticky="nsew")  # Position it in the grid

# Right-aligned input fields for specifications
right_labels = ["Name:", "Product Code:", "SKU:"]
right_entries = []

for i, label_text in enumerate(right_labels):
    label = tk.Label(right_input_frame, text=label_text, font=label_font, bg="#f2f2f2")
    label.grid(row=i, column=0, padx=10, pady=5, sticky="e")  # Align labels to the east (right)
    entry = tk.Entry(right_input_frame, width=20, font=entry_font)
    entry.grid(row=i, column=1, padx=10, pady=5, sticky="ew")  # Align entries to expand in the center
    right_entries.append(entry)

# Accessing entries if needed
power_entry, lumens_entry, cct_entry, size_entry = centered_entries
name_entry, product_code_entry, sku_entry = right_entries

# Frame for image display (Fixed size to avoid layout shifts)
image_frame = tk.Frame(window, bg="#f2f2f2")
image_frame.grid(row=1, column=0, padx=10, pady=(0, 10), sticky=tk.NW)
image_display_label = tk.Label(image_frame, text="Image Preview", font=label_font, bg="#f2f2f2")
image_display_label.grid(row=0, column=0, padx=10, pady=(10, 5), sticky=tk.NW)  # Align at top-left of the left side
image_canvas = tk.Label(image_frame, bg="#f2f2f2")
image_canvas.grid(row=1, column=0, padx=10, pady=(0, 20), sticky=tk.NW)  # Ensure it's right under the label, width=250, height=200)  # Set the width and height

# Create a frame for the progress bar
progress_frame = tk.Frame(window, bg="#f2f2f2")
progress_frame.grid(row=12, column=0, columnspan=4, padx=10, pady=(20, 10), sticky="ew")  # Adjusted pady
progress_bar = ttk.Progressbar(progress_frame, orient="horizontal", length=1100, mode="determinate")
progress_bar.pack(side=tk.LEFT, padx=10, pady=10)
progress_label = tk.Label(progress_frame, text="Progress: 0%", font=label_font, bg="#f2f2f2")
progress_label.pack(side=tk.LEFT, padx=10, pady=10)

# Create a frame for the result box
result_frame = tk.Frame(window, bg="#f2f2f2")
result_frame.grid(row=13, column=0, columnspan=4, padx=10, pady=(10, 10), sticky="nsew")  # Adjusted pady
result_box = Text(result_frame, height=15, width=80, font=entry_font, wrap=tk.WORD)
result_box.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)
scrollbar = ttk.Scrollbar(result_frame, command=result_box.yview)
scrollbar.pack(side=tk.RIGHT, fill=tk.Y)
result_box.config(yscrollcommand=scrollbar.set)

# Create a frame for the buttons to keep them in a single row
button_frame = tk.Frame(window, bg="#f2f2f2")
button_frame.grid(row=14, column=0, columnspan=4, padx=10, pady=(10, 20), sticky="ew")  # Adjusted pady
submit_button = tk.Button(button_frame, text="Find Top 10 PDF's", command=lambda: process_pdfs(), **button_style)
submit_button.config(width=57)  # Set a reduced width for the button
submit_button.pack(side=tk.LEFT, padx=5)  # Pack the button to the left side of the frame

# Add "Clear Sections" button
clear_button = tk.Button(button_frame, text="Clear Sections", command=clear_sections, **button_style)
clear_button.config(width=57)
clear_button.pack(side=tk.LEFT, padx=5)  # Pack the button next to the previous one

# Add a stop button to stop execution
stop_button = tk.Button(button_frame, text="Stop Execution", command=stop_execution, **button_style)
stop_button.config(width=57)
stop_button.pack(side=tk.LEFT, padx=5)  # Pack the button next to the previous one

# Make sure to define a minimum size for the window
window.minsize(1000, 600)  # Set a minimum size for the main window

# Function to clear all input fields and reset image display
def clear_sections():
    # Clear all input entries
    for entry in centered_entries + right_entries:
        entry.delete(0, tk.END)
    
    # Clear the folder and image entries
    folder_entry.delete(0, tk.END)
    image_entry.delete(0, tk.END)
    
    # Reset the image display
    image_canvas.config(image='')  # Clear the image on the right side
    
    # Optionally clear the result box as well
    result_box.delete(1.0, tk.END)

    # Reset progress label and bar
    progress_label.config(text="Progress: 0%")  # Reset progress label
    progress_bar['value'] = 0  # Reset the progress bar
    progress_bar['maximum'] = 0  # Reset progress bar's maximum value

# Function for folder selection
def browse_folder():
    folder_path = filedialog.askdirectory()
    folder_entry.delete(0, tk.END)
    folder_entry.insert(0, folder_path)

# Function for image selection and display
def browse_image():
    image_path = filedialog.askopenfilename(filetypes=[("Image files", "*.jpg *.png *.jpeg")])
    if image_path:
        image_entry.delete(0, tk.END)
        image_entry.insert(0, image_path)
        display_image(image_path)

def display_image(image_path):
    try:
        # Open the image file
        img = Image.open(image_path)
        
        # Define the maximum width and height for the image
        max_width = 150
        max_height = 150
        
        # Resize the image while maintaining the aspect ratio
        img.thumbnail((max_width, max_height))
        
        # Convert the image to a format Tkinter can handle
        img_tk = ImageTk.PhotoImage(img)
        
        # Update the image on the canvas
        image_canvas.config(width=200, height=200)  # Ensure fixed dimensions
        image_canvas.config(image=img_tk)
        image_canvas.image = img_tk  # Store a reference to avoid garbage collection
    except Exception as e:
        print(f"Error opening image: {e}")

# Function for handling the drop of files (drag-and-drop)
def drop(event):
    image_path = event.data  # Get the dropped file path
    
    # Clean up the file path
    if image_path.startswith('{') and image_path.endswith('}'):
        image_path = image_path[1:-1]  # Remove the curly braces
    
    if image_path:
        image_entry.delete(0, tk.END)
        image_entry.insert(0, image_path)
        display_image(image_path)

# Enable drag-and-drop for the main window
window.drop_target_register(DND_FILES)  # Register the main window for drag-and-drop of files
window.dnd_bind('<<Drop>>', drop)  # Bind the drop event to the drop function

# Run the GUI loop
window.mainloop()

## The image resize issue handling:

In [18]:
import os
import re
import time
import io
import tkinter as tk
from tkinter import filedialog, messagebox, ttk
from tkinterdnd2 import DND_FILES, TkinterDnD  # Import the drag-and-drop library
from tkinter import Text
from PIL import Image, ImageTk
import numpy as np
import torch
import torchvision.transforms as transforms
from torchvision import models
from sklearn.metrics.pairwise import cosine_similarity
import fitz  # PyMuPDF
from fuzzywuzzy import fuzz  # For string matching
from fuzzywuzzy import process  # For matching
import webbrowser  # To open PDFs
import warnings
warnings.filterwarnings('ignore')

# Load the pre-trained models
vgg_model = models.vgg19(pretrained=True)
resnet_model = models.resnet50(pretrained=True)
vgg_model.eval()
resnet_model.eval()

# Image preprocessing transformations
preprocess = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Extract text from PDF
def extract_text_from_pdf(pdf_path):
    with open(pdf_path, 'rb') as file:
        reader = fitz.open(file)
        text = ""
        for page_num in range(reader.page_count):
            page = reader.load_page(page_num)
            text += page.get_text()
    return clean_text(text)

# Clean extracted text
def clean_text(text):
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\n+', ' ', text)
    text = re.sub(r'[^\x00-\x7F]+', '', text)
    return text.strip()

# Extract specific data from cleaned text
def extract_specifications(text):
    power_pattern = r'\b(\d{1,4})\s*W\b'
    lumens_pattern = r'(\d{3,6}(\.\d+)?)\s*Lumens?\b'
    cct_pattern = r'\b(\d{4})\s*K\b'
    size_pattern = r'\b(?:\d+(?:\.\d+)?(?:\s*x\s*\d+(?:\.\d+)?)?|NA)\b'
    name_pattern = r'\b[A-Z][a-zA-Z0-9\s\-()]*[a-zA-Z)]\b'
    productcode_pattern = r'\b[A-Za-z0-9\-]+(?:/[A-Za-z0-9]+)*(?:\s*\([A-Za-z0-9/\s\-]+\))?\b'
    sku_pattern = r'\b\d{12}\b'

    power = re.findall(power_pattern, text)
    lumens = re.findall(lumens_pattern, text)
    cct = re.findall(cct_pattern, text)
    size = re.findall(size_pattern, text)
    name = re.findall(name_pattern, text)
    product_code = re.findall(productcode_pattern, text)
    sku = re.findall(sku_pattern, text)

    extracted_data = {
        "Power (W)": sorted(set(power), key=int),
        "Lumens": sorted(set([lum[0] for lum in lumens]), key=float),
        "CCT (K)": sorted(set(cct), key=int),
        "Size": sorted(set(size), key=lambda x: (len(x.split('x')), x)),  # Sort by dimensions
        "Name": sorted(set(name), key=str),
        "Product Code": sorted(set(product_code), key=str),
        "SKU": sorted(set(sku), key=str)
    }

    return extracted_data

# Extract images from PDF
def extract_images_from_pdf(pdf_path):
    images = []
    doc = fitz.open(pdf_path)
    for page_num in range(doc.page_count):
        page = doc.load_page(page_num)
        image_list = page.get_images(full=True)
        for img_index, img in enumerate(image_list):
            xref = img[0]
            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]
            image = Image.open(io.BytesIO(image_bytes)).convert('RGB')
            images.append(image)
    doc.close()
    return images

# Extract image features
def extract_image_features(image):
    vgg_features = extract_intermediate_features(image, vgg_model, 'features')
    resnet_features = extract_resnet_features(image)
    hog_features = extract_hog_features(image)

    vgg_weight = 0.5
    resnet_weight = 0.3
    hog_weight = 0.2

    combined_features = np.concatenate([
        vgg_features * vgg_weight, 
        resnet_features * resnet_weight, 
        hog_features * hog_weight
    ])

    return combined_features

def extract_intermediate_features(image, model, layer):
    activation = {}
    def hook_fn(module, input, output):
        activation['output'] = output
    handle = model._modules.get(layer).register_forward_hook(hook_fn)
    input_tensor = preprocess(image).unsqueeze(0)
    with torch.no_grad():
        model(input_tensor)
    intermediate_features = activation['output']
    handle.remove()
    return intermediate_features.flatten().numpy()

def extract_resnet_features(image):
    input_tensor = preprocess(image).unsqueeze(0)
    with torch.no_grad():
        resnet_features = resnet_model(input_tensor)
    return resnet_features.flatten().numpy()

def extract_hog_features(image):
    from skimage.feature import hog
    from skimage.color import rgb2gray
    image = image.resize((128, 128))
    image_np = np.array(image)
    gray_image = rgb2gray(image_np)
    hog_features = hog(gray_image, orientations=8, pixels_per_cell=(16, 16),
                       cells_per_block=(1, 1), visualize=False, feature_vector=True)
    return hog_features

def calculate_similarity(input_data, pdf_data):
    total_similarity = 0.0
    num_fields = 0

    # Function to compare numerical values with tolerance
    def compare_with_tolerance(input_list, pdf_list, tolerance=5):
        if not input_list or not pdf_list:
            return 0.0
        max_similarity = 0.0
        threshold_met = False  # To track if any values match the tolerance criteria

        # Compare each input entry with every pdf entry
        for input_value in input_list:
            if input_value == '':
                continue
            try:
                input_value = float(input_value)
                for pdf_value in pdf_list:
                    pdf_value = float(pdf_value)
                    similarity = 1 - abs(input_value - pdf_value) / input_value

                    # Only consider values that are close enough based on tolerance
                    if similarity >= 1 - (tolerance / 100):
                        max_similarity = max(max_similarity, similarity)
                        threshold_met = True  # Mark if any value is within the threshold
            except ValueError:
                continue

        # If no values matched the tolerance, return 0 similarity
        if not threshold_met:
            return 0.0
        return max_similarity

    # Define specific tolerances for different fields
    tolerances = {
        "Power (W)": 2,  # Strict tolerance for Power (W)
        "Lumens": 5,
        "CCT (K)": 4,
        "Size": 6,
        "Name": 3,
        "Product Code": 1,
        "SKU": 7,
    }

    # Compare numerical fields
    numerical_fields = ["Power (W)", "Lumens", "Size", "SKU"]
    for field in numerical_fields:
        if input_data[field] and any(item != '' for item in input_data[field]):
            num_fields += 1
            # Use the field-specific tolerance, default to 5% if not specified
            tolerance = tolerances.get(field, 5)
            similarity = compare_with_tolerance(input_data[field], pdf_data[field], tolerance=tolerance)
            total_similarity += similarity

    # Compare categorical fields using fuzzy matching
    categorical_fields = ["CCT (K)", "Name", "Product Code"]
    for field in categorical_fields:
        if input_data[field] and any(item != '' for item in input_data[field]):
            num_fields += 1
            match_scores = []
            for item in input_data[field]:
                if item == '':
                    continue
                best_match = process.extractOne(item, pdf_data[field], scorer=fuzz.token_sort_ratio)
                if best_match:
                    similarity = best_match[1] / 100.0
                    match_scores.append(similarity)
            if match_scores:
                total_similarity += max(match_scores)
            else:
                total_similarity += 0.0

    # Avoid division by zero
    if num_fields > 0:
        average_similarity = total_similarity / num_fields
    else:
        average_similarity = 0.0

    return average_similarity

def calculate_image_similarity(features1, features2):
    if features1.size == 0 or features2.size == 0:
        return 0.0
    features1 = features1.reshape(1, -1)
    features2 = features2.reshape(1, -1)
    similarity = cosine_similarity(features1, features2)[0][0]
    return similarity

# Global variable to control the stopping of the process
stop_flag = False

# Function to stop the execution
def stop_execution():
    global stop_flag
    stop_flag = True

# Modify the function to check for the stop flag
def find_top_10_similar_pdfs(folder_path, input_data, input_image_path=None):
    global stop_flag
    stop_flag = False  # Reset the stop flag each time this function starts
    pdf_scores = []
    progress_bar['maximum'] = 0  # Reset progress bar maximum

    start_time = time.time()

    # Use os.walk to traverse through all subdirectories
    for root, _, files in os.walk(folder_path):
        if stop_flag:  # Check if the stop flag is set
            break

        pdf_files = [os.path.join(root, f) for f in files if f.endswith('.pdf')]
        progress_bar['maximum'] += len(pdf_files)  # Update progress bar maximum

        for pdf_file in pdf_files:
            if stop_flag:  # Check if the stop flag is set
                break

            pdf_text = extract_text_from_pdf(pdf_file)
            extracted_data = extract_specifications(pdf_text)
            score = calculate_similarity(input_data, extracted_data)

            if input_image_path:
                images = extract_images_from_pdf(pdf_file)
                input_image = Image.open(input_image_path).convert('RGB')
                input_image_features = extract_image_features(input_image)

                max_image_similarity = 0
                for img in images:
                    if stop_flag:  # Check if the stop flag is set
                        break

                    image_features = extract_image_features(img)
                    image_similarity = calculate_image_similarity(input_image_features, image_features)
                    max_image_similarity = max(max_image_similarity, image_similarity)

                combined_score = score + max_image_similarity
            else:
                combined_score = score

            pdf_scores.append((pdf_file, combined_score))
            progress_bar['value'] += 1

            # Calculate elapsed time and ETA
            elapsed_time = time.time() - start_time
            processed_pdfs = progress_bar['value']
            total_pdfs = progress_bar['maximum']

            if processed_pdfs > 0:
                estimated_time_per_pdf = elapsed_time / processed_pdfs
                remaining_pdfs = total_pdfs - processed_pdfs
                eta = remaining_pdfs * estimated_time_per_pdf

                eta_minutes, eta_seconds = divmod(eta, 60)
                eta_formatted = f"{int(eta_minutes)} min {int(eta_seconds)} sec"

                percentage = (processed_pdfs / total_pdfs) * 100
                progress_label.config(text=f"Progress: {processed_pdfs}/{total_pdfs} ({percentage:.2f}%) | ETA: {eta_formatted}")

            window.update()

    pdf_scores = sorted(pdf_scores, key=lambda x: x[1], reverse=True)[:10]

    progress_bar['value'] = 0
    progress_label.config(text="Progress: 0%")

    result_box.delete(1.0, tk.END)
    for i, (pdf_path, score) in enumerate(pdf_scores):
        result_box.insert(tk.END, f"Top {i + 1}: {os.path.basename(pdf_path)} | Similarity Score: {score:.4f}\n")
        result_box.tag_configure(f"pdf{i + 1}", foreground="blue", underline=True)
        result_box.insert(tk.END, f"{pdf_path}\n", f"pdf{i + 1}")
        result_box.tag_bind(f"pdf{i + 1}", "<Button-1>", lambda e, p=pdf_path: open_pdf(p))

# Open PDF
def open_pdf(pdf_path):
    webbrowser.open(f"file://{pdf_path}")

# GUI Setup
def browse_folder():
    folder_selected = filedialog.askdirectory()
    folder_entry.delete(0, tk.END)
    folder_entry.insert(0, folder_selected)

def browse_image():
    image_file = filedialog.askopenfilename(filetypes=[("Image files", "*.png;*.jpg;*.jpeg")])
    image_entry.delete(0, tk.END)
    image_entry.insert(0, image_file)

def process_pdfs():
    folder_path = folder_entry.get()
    input_image_path = image_entry.get()

    input_data = {
        "Power (W)": [power_entry.get()],
        "Lumens": [lumens_entry.get()],
        "CCT (K)": [cct_entry.get()],
        "Size": [size_entry.get()],
        "Name": [name_entry.get()],
        "Product Code": [product_code_entry.get()],
        "SKU": [sku_entry.get()],
    }

    if not folder_path:
        messagebox.showerror("Error", "Please select a folder.")
        return

    find_top_10_similar_pdfs(folder_path, input_data, input_image_path)

# Create the main window
window = TkinterDnD.Tk()
window.title("Product Finder")
window.geometry("1000x600")  # Increased window width for image display
window.configure(bg="#f2f2f2")  # Light grey background

# Allow the window to be resizable
window.columnconfigure(0, weight=1)
window.columnconfigure(1, weight=3)
window.columnconfigure(2, weight=1)
window.columnconfigure(3, weight=1)
window.rowconfigure(11, weight=1)

# Global font settings
title_font = ("Helvetica", 16, "bold")
label_font = ("Helvetica", 12)
entry_font = ("Helvetica", 10)

# Styling for the buttons
button_style = {
    "bg": "#4CAF50",
    "fg": "white",
    "font": ("Helvetica", 10, "bold"),
    "activebackground": "#45a049",
    "relief": tk.RAISED
}

# Frame for title labels
title_frame = tk.Frame(window, bg="#f2f2f2")
title_frame.grid(row=0, column=1, columnspan=2, pady=20, sticky="ew")

# Title Label for "Product Finder"
title_label_cv = tk.Label(title_frame, text="Product Finder", font=title_font, bg="#f2f2f2")
title_label_cv.grid(row=0, column=1, sticky="w")  # Align to the west (left)

# Frame for folder and image selection
selection_frame = tk.Frame(window, bg="#f2f2f2")
selection_frame.grid(row=1, column=0, columnspan=4, padx=455, pady=10, sticky="ew")

# Folder selection
folder_label = tk.Label(selection_frame, text="Select Folder:", font=label_font, bg="#f2f2f2")
folder_label.grid(row=0, column=0, padx=(50, 10), pady=10, sticky=tk.W)  # Keep the padx as is
folder_entry = tk.Entry(selection_frame, width=50, font=entry_font)
folder_entry.grid(row=0, column=1, padx=10, pady=10, sticky="ew")
folder_button = tk.Button(selection_frame, text="Browse", command=lambda: browse_folder(), **button_style)
folder_button.grid(row=0, column=2, padx=(0, 10), pady=10)  # Remove padx to keep it right next to the entry

# Image selection
image_label = tk.Label(selection_frame, text="Input Image:", font=label_font, bg="#f2f2f2")
image_label.grid(row=1, column=0, padx=(50, 10), pady=10, sticky=tk.W)  # Keep the padx as is
image_entry = tk.Entry(selection_frame, width=50, font=entry_font)
image_entry.grid(row=1, column=1, padx=10, pady=10, sticky="ew")
image_button = tk.Button(selection_frame, text="Browse", command=lambda: browse_image(), **button_style)
image_button.grid(row=1, column=2, padx=(0, 10), pady=10)  # Remove padx to keep it right next to the entry

# Create a frame for the centered labels and entry fields
centered_input_frame = tk.Frame(window, bg="#f2f2f2")
centered_input_frame.grid(row=2, column=1, padx=10, pady=10, sticky="nsew")  # Position it in the grid

# Centered input fields for specifications
centered_labels = ["Power (W):", "Lumens (lm):", "CCT (K):", "Size:"]
centered_entries = []

for i, label_text in enumerate(centered_labels):
    label = tk.Label(centered_input_frame, text=label_text, font=label_font, bg="#f2f2f2")
    label.grid(row=i, column=0, padx=(50, 10), pady=5, sticky="e")  # Increased left padx to 50 for more right shift
    entry = tk.Entry(centered_input_frame, width=20, font=entry_font)
    entry.grid(row=i, column=1, padx=10, pady=5, sticky="ew")  # Align entries to expand in the center
    centered_entries.append(entry)

# Create a frame for the right-aligned labels and entry fields
right_input_frame = tk.Frame(window, bg="#f2f2f2")
right_input_frame.grid(row=2, column=2, padx=10, pady=10, sticky="nsew")  # Position it in the grid

# Right-aligned input fields for specifications
right_labels = ["Name:", "Product Code:", "SKU:"]
right_entries = []

for i, label_text in enumerate(right_labels):
    label = tk.Label(right_input_frame, text=label_text, font=label_font, bg="#f2f2f2")
    label.grid(row=i, column=0, padx=10, pady=5, sticky="e")  # Align labels to the east (right)
    entry = tk.Entry(right_input_frame, width=20, font=entry_font)
    entry.grid(row=i, column=1, padx=10, pady=5, sticky="ew")  # Align entries to expand in the center
    right_entries.append(entry)

# Accessing entries if needed
power_entry, lumens_entry, cct_entry, size_entry = centered_entries
name_entry, product_code_entry, sku_entry = right_entries

# Frame for image display (Fixed size to avoid layout shifts)
image_frame = tk.Frame(window, bg="#f2f2f2", width=180, height=180)  # Set fixed width and height for the frame
image_frame.grid(row=1, column=0, padx=10, pady=(0, 10), sticky=tk.NW)
image_frame.grid_propagate(False)  # Prevent frame from resizing based on content

# Image display label
image_display_label = tk.Label(image_frame, text="Image Preview", font=label_font, bg="#f2f2f2")
image_display_label.grid(row=0, column=0, padx=10, pady=(10, 5), sticky=tk.NW)  # Align at top-left of the left side

# Fixed-size image canvas
image_canvas = tk.Label(image_frame, bg="#f2f2f2", width=180, height=180)  # Set fixed size for the canvas
image_canvas.grid(row=1, column=0, padx=10, pady=(0, 20), sticky=tk.NW)

# Create a frame for the progress bar
progress_frame = tk.Frame(window, bg="#f2f2f2")
progress_frame.grid(row=12, column=0, columnspan=4, padx=10, pady=(20, 10), sticky="ew")  # Adjusted pady
progress_bar = ttk.Progressbar(progress_frame, orient="horizontal", length=1100, mode="determinate")
progress_bar.pack(side=tk.LEFT, padx=10, pady=10)
progress_label = tk.Label(progress_frame, text="Progress: 0%", font=label_font, bg="#f2f2f2")
progress_label.pack(side=tk.LEFT, padx=10, pady=10)

# Create a frame for the result box
result_frame = tk.Frame(window, bg="#f2f2f2")
result_frame.grid(row=13, column=0, columnspan=4, padx=10, pady=(10, 10), sticky="nsew")  # Adjusted pady
result_box = Text(result_frame, height=15, width=80, font=entry_font, wrap=tk.WORD)
result_box.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)
scrollbar = ttk.Scrollbar(result_frame, command=result_box.yview)
scrollbar.pack(side=tk.RIGHT, fill=tk.Y)
result_box.config(yscrollcommand=scrollbar.set)

# Create a frame for the buttons to keep them in a single row
button_frame = tk.Frame(window, bg="#f2f2f2")
button_frame.grid(row=14, column=0, columnspan=4, padx=10, pady=(10, 20), sticky="ew")  # Adjusted pady
submit_button = tk.Button(button_frame, text="Find Top 10 PDF's", command=lambda: process_pdfs(), **button_style)
submit_button.config(width=57)  # Set a reduced width for the button
submit_button.pack(side=tk.LEFT, padx=5)  # Pack the button to the left side of the frame

# Function to clear all input fields and reset image display
def clear_sections():
    # Clear all input entries
    for entry in centered_entries + right_entries:
        entry.delete(0, tk.END)
    
    # Clear the folder and image entries
    folder_entry.delete(0, tk.END)
    image_entry.delete(0, tk.END)
    
    # Reset the image display
    image_canvas.config(image='')  # Clear the image on the right side
    
    # Optionally clear the result box as well
    result_box.delete(1.0, tk.END)

    # Reset progress label and bar
    progress_label.config(text="Progress: 0%")  # Reset progress label
    progress_bar['value'] = 0  # Reset the progress bar
    progress_bar['maximum'] = 0  # Reset progress bar's maximum value

# Add "Clear Sections" button
clear_button = tk.Button(button_frame, text="Clear Sections", command=clear_sections, **button_style)
clear_button.config(width=57)
clear_button.pack(side=tk.LEFT, padx=5)  # Pack the button next to the previous one

# Add a stop button to stop execution
stop_button = tk.Button(button_frame, text="Stop Execution", command=stop_execution, **button_style)
stop_button.config(width=57)
stop_button.pack(side=tk.LEFT, padx=5)  # Pack the button next to the previous one

# Function for folder selection
def browse_folder():
    folder_path = filedialog.askdirectory()
    folder_entry.delete(0, tk.END)
    folder_entry.insert(0, folder_path)

# Function for image selection and display
def browse_image():
    image_path = filedialog.askopenfilename(filetypes=[("Image files", "*.jpg *.png *.jpeg")])
    if image_path:
        image_entry.delete(0, tk.END)
        image_entry.insert(0, image_path)
        display_image(image_path)

def display_image(image_path):
    try:
        # Open the image file
        img = Image.open(image_path)
        
        # Define the fixed width and height for the image display area
        fixed_width = 150
        fixed_height = 150
        
        # Resize the image to exactly match the fixed dimensions (ignoring aspect ratio)
        img_resized = img.resize((fixed_width, fixed_height))
        
        # Convert the image to a format Tkinter can handle
        img_tk = ImageTk.PhotoImage(img_resized)
        
        # Display the resized image in the fixed canvas without changing its size
        image_canvas.config(image=img_tk, width=fixed_width, height=fixed_height)
        image_canvas.image = img_tk  # Store a reference to avoid garbage collection
    except Exception as e:
        print(f"Error opening image: {e}")

# Function for handling the drop of files (drag-and-drop)
def drop(event):
    image_path = event.data  # Get the dropped file path
    
    # Clean up the file path
    if image_path.startswith('{') and image_path.endswith('}'):
        image_path = image_path[1:-1]  # Remove the curly braces
    
    if image_path:
        image_entry.delete(0, tk.END)
        image_entry.insert(0, image_path)
        display_image(image_path)

# Enable drag-and-drop for the main window
window.drop_target_register(DND_FILES)  # Register the main window for drag-and-drop of files
window.dnd_bind('<<Drop>>', drop)  # Bind the drop event to the drop function

# Run the GUI loop
window.mainloop()

## Showcasing only the PDF's that have similarity > 60% and showcasing just 1 when there is a 100% match.

In [14]:
import os
import re
import time
import io
import tkinter as tk
from tkinter import filedialog, messagebox, ttk
from tkinterdnd2 import DND_FILES, TkinterDnD  # Import the drag-and-drop library
from tkinter import Text
from PIL import Image, ImageTk
import numpy as np
import torch
import torchvision.transforms as transforms
from torchvision import models
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import fitz  # PyMuPDF
from fuzzywuzzy import fuzz  # For string matching
from fuzzywuzzy import process  # For matching
import webbrowser  # To open PDFs
import warnings
warnings.filterwarnings('ignore')

# Load the pre-trained models
vgg_model = models.vgg19(pretrained=True)
resnet_model = models.resnet50(pretrained=True)
vgg_model.eval()
resnet_model.eval()

# Image preprocessing transformations
preprocess = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Extract text from PDF
def extract_text_from_pdf(pdf_path):
    with open(pdf_path, 'rb') as file:
        reader = fitz.open(file)
        text = ""
        for page_num in range(reader.page_count):
            page = reader.load_page(page_num)
            text += page.get_text()
    return clean_text(text)

# Clean extracted text
def clean_text(text):
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\n+', ' ', text)
    text = re.sub(r'[^\x00-\x7F]+', '', text)
    return text.strip()

# Extract specific data from cleaned text
def extract_specifications(text):
    power_pattern = r'\b(\d{1,4})\s*W\b'
    lumens_pattern = r'(\d{3,6}(\.\d+)?)\s*Lumens?\b'
    cct_pattern = r'\b(\d{4})\s*K\b'
    size_pattern = r'\b(?:\d+(?:\.\d+)?(?:\s*x\s*\d+(?:\.\d+)?)?|NA)\b'
    name_pattern = r'\b[A-Z][a-zA-Z0-9\s\-()]*[a-zA-Z)]\b'
    productcode_pattern = r'\b[A-Za-z0-9\-]+(?:/[A-Za-z0-9]+)*(?:\s*\([A-Za-z0-9/\s\-]+\))?\b'
    sku_pattern = r'\b\d{12}\b'

    power = re.findall(power_pattern, text)
    lumens = re.findall(lumens_pattern, text)
    cct = re.findall(cct_pattern, text)
    size = re.findall(size_pattern, text)
    name = re.findall(name_pattern, text)
    product_code = re.findall(productcode_pattern, text)
    sku = re.findall(sku_pattern, text)

    extracted_data = {
        "Power (W)": sorted(set(power), key=int),
        "Lumens": sorted(set([lum[0] for lum in lumens]), key=float),
        "CCT (K)": sorted(set(cct), key=int),
        "Size": sorted(set(size), key=lambda x: (len(x.split('x')), x)),  # Sort by dimensions
        "Name": sorted(set(name), key=str),
        "Product Code": sorted(set(product_code), key=str),
        "SKU": sorted(set(sku), key=str)
    }

    return extracted_data

# Extract images from PDF
def extract_images_from_pdf(pdf_path):
    images = []
    doc = fitz.open(pdf_path)
    for page_num in range(doc.page_count):
        page = doc.load_page(page_num)
        image_list = page.get_images(full=True)
        for img_index, img in enumerate(image_list):
            xref = img[0]
            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]
            image = Image.open(io.BytesIO(image_bytes)).convert('RGB')
            images.append(image)
    doc.close()
    return images

# Extract image features
def extract_image_features(image):
    vgg_features = extract_intermediate_features(image, vgg_model, 'features')
    resnet_features = extract_resnet_features(image)
    hog_features = extract_hog_features(image)

    vgg_weight = 0.5
    resnet_weight = 0.3
    hog_weight = 0.2

    combined_features = np.concatenate([
        vgg_features * vgg_weight, 
        resnet_features * resnet_weight, 
        hog_features * hog_weight
    ])

    return combined_features

def extract_intermediate_features(image, model, layer):
    activation = {}
    def hook_fn(module, input, output):
        activation['output'] = output
    handle = model._modules.get(layer).register_forward_hook(hook_fn)
    input_tensor = preprocess(image).unsqueeze(0)
    with torch.no_grad():
        model(input_tensor)
    intermediate_features = activation['output']
    handle.remove()
    return intermediate_features.flatten().numpy()

def extract_resnet_features(image):
    input_tensor = preprocess(image).unsqueeze(0)
    with torch.no_grad():
        resnet_features = resnet_model(input_tensor)
    return resnet_features.flatten().numpy()

def extract_hog_features(image):
    from skimage.feature import hog
    from skimage.color import rgb2gray
    image = image.resize((128, 128))
    image_np = np.array(image)
    gray_image = rgb2gray(image_np)
    hog_features = hog(gray_image, orientations=8, pixels_per_cell=(16, 16),
                       cells_per_block=(1, 1), visualize=False, feature_vector=True)
    return hog_features

def calculate_similarity(input_data, pdf_data):
    total_similarity = 0.0
    num_fields = 0

    # Function to compare numerical values with tolerance
    def compare_with_tolerance(input_list, pdf_list, tolerance=5):
        if not input_list or not pdf_list:
            return 0.0
        max_similarity = 0.0
        threshold_met = False  # To track if any values match the tolerance criteria

        # Compare each input entry with every pdf entry
        for input_value in input_list:
            if input_value == '':
                continue
            try:
                input_value = float(input_value)
                for pdf_value in pdf_list:
                    pdf_value = float(pdf_value)
                    similarity = 1 - abs(input_value - pdf_value) / input_value

                    # Only consider values that are close enough based on tolerance
                    if similarity >= 1 - (tolerance / 100):
                        max_similarity = max(max_similarity, similarity)
                        threshold_met = True  # Mark if any value is within the threshold
            except ValueError:
                continue

        # If no values matched the tolerance, return 0 similarity
        if not threshold_met:
            return 0.0
        return max_similarity

    # Define specific tolerances for different fields
    tolerances = {
        "Power (W)": 2,  # Strict tolerance for Power (W)
        "Lumens": 5,
        "CCT (K)": 4,
        "Size": 6,
        "Name": 3,
        "Product Code": 1,
        "SKU": 7,
    }

    # Compare numerical fields
    numerical_fields = ["Power (W)", "Lumens", "Size", "SKU"]
    for field in numerical_fields:
        if input_data[field] and any(item != '' for item in input_data[field]):
            num_fields += 1
            # Use the field-specific tolerance, default to 5% if not specified
            tolerance = tolerances.get(field, 5)
            similarity = compare_with_tolerance(input_data[field], pdf_data[field], tolerance=tolerance)
            total_similarity += similarity

    # Compare categorical fields using fuzzy matching
    categorical_fields = ["CCT (K)", "Name", "Product Code"]
    for field in categorical_fields:
        if input_data[field] and any(item != '' for item in input_data[field]):
            num_fields += 1
            match_scores = []
            for item in input_data[field]:
                if item == '':
                    continue
                best_match = process.extractOne(item, pdf_data[field], scorer=fuzz.token_sort_ratio)
                if best_match:
                    similarity = best_match[1] / 100.0
                    match_scores.append(similarity)
            if match_scores:
                total_similarity += max(match_scores)
            else:
                total_similarity += 0.0

    # Avoid division by zero
    if num_fields > 0:
        average_similarity = total_similarity / num_fields
    else:
        average_similarity = 0.0

    return average_similarity

def calculate_image_similarity(features1, features2):
    if features1.size == 0 or features2.size == 0:
        return 0.0
    features1 = features1.reshape(1, -1)
    features2 = features2.reshape(1, -1)
    similarity = cosine_similarity(features1, features2)[0][0]
    return similarity

# Global variable to control the stopping of the process
stop_flag = False

# Function to stop the execution
def stop_execution():
    global stop_flag
    stop_flag = True

def find_top_10_similar_pdfs(folder_path, input_data, input_image_path=None):
    global stop_flag
    stop_flag = False  # Reset the stop flag each time this function starts
    pdf_scores = []
    exact_match_found = False  # Flag to track if a 100% match is found
    progress_bar['maximum'] = 0  # Reset progress bar maximum

    start_time = time.time()

    # Use os.walk to traverse through all subdirectories
    for root, _, files in os.walk(folder_path):
        if stop_flag or exact_match_found:  # Stop processing if an exact match is found
            break

        pdf_files = [os.path.join(root, f) for f in files if f.endswith('.pdf')]
        progress_bar['maximum'] += len(pdf_files)  # Update progress bar maximum

        for pdf_file in pdf_files:
            if stop_flag or exact_match_found:  # Stop processing if an exact match is found
                break

            pdf_text = extract_text_from_pdf(pdf_file)
            extracted_data = extract_specifications(pdf_text)
            score = calculate_similarity(input_data, extracted_data)

            if input_image_path:
                images = extract_images_from_pdf(pdf_file)
                input_image = Image.open(input_image_path).convert('RGB')
                input_image_features = extract_image_features(input_image)

                max_image_similarity = 0
                for img in images:
                    if stop_flag or exact_match_found:  # Stop processing if an exact match is found
                        break

                    image_features = extract_image_features(img)
                    image_similarity = calculate_image_similarity(input_image_features, image_features)
                    max_image_similarity = max(max_image_similarity, image_similarity)

                combined_score = score + max_image_similarity
            else:
                combined_score = score

            # Check for an exact match (100% similarity)
            if combined_score >= 1.0:  # Exact match found
                exact_match_found = True
                pdf_scores = [(pdf_file, combined_score)]  # Clear any previous results and store only this match
                break  # Stop further processing of PDFs

            # Only append the result if the combined score is greater than 0.6 (60%)
            elif combined_score > 0.6:
                pdf_scores.append((pdf_file, combined_score))

            progress_bar['value'] += 1

            # Calculate elapsed time and ETA
            elapsed_time = time.time() - start_time
            processed_pdfs = progress_bar['value']
            total_pdfs = progress_bar['maximum']

            if processed_pdfs > 0:
                estimated_time_per_pdf = elapsed_time / processed_pdfs
                remaining_pdfs = total_pdfs - processed_pdfs
                eta = remaining_pdfs * estimated_time_per_pdf

                eta_minutes, eta_seconds = divmod(eta, 60)
                eta_formatted = f"{int(eta_minutes)} min {int(eta_seconds)} sec"

                percentage = (processed_pdfs / total_pdfs) * 100
                progress_label.config(text=f"Progress: {processed_pdfs}/{total_pdfs} ({percentage:.2f}%) | ETA: {eta_formatted}")

            window.update()

    # Sort the results by score in descending order and take the top 10 (if no exact match is found)
    if not exact_match_found:
        pdf_scores = sorted(pdf_scores, key=lambda x: x[1], reverse=True)[:10]

    progress_bar['value'] = 0
    progress_label.config(text="Progress: 0%")

    # Clear the result box and display only the results with a score higher than 60%, or the exact match if found
    result_box.delete(1.0, tk.END)
    if exact_match_found:
        pdf_path, score = pdf_scores[0]
        result_box.insert(tk.END, f"Exact Match Found: {os.path.basename(pdf_path)} | Similarity Score: {score:.4f}\n")
        result_box.tag_configure("pdf_exact", foreground="green", underline=True)
        result_box.insert(tk.END, f"{pdf_path}\n", "pdf_exact")
        result_box.tag_bind("pdf_exact", "<Button-1>", lambda e, p=pdf_path: open_pdf(p))
    else:
        for i, (pdf_path, score) in enumerate(pdf_scores):
            result_box.insert(tk.END, f"Top {i + 1}: {os.path.basename(pdf_path)} | Similarity Score: {score:.4f}\n")
            result_box.tag_configure(f"pdf{i + 1}", foreground="blue", underline=True)
            result_box.insert(tk.END, f"{pdf_path}\n", f"pdf{i + 1}")
            result_box.tag_bind(f"pdf{i + 1}", "<Button-1>", lambda e, p=pdf_path: open_pdf(p))

# Open PDF
def open_pdf(pdf_path):
    webbrowser.open(f"file://{pdf_path}")

# GUI Setup
def browse_folder():
    folder_selected = filedialog.askdirectory()
    folder_entry.delete(0, tk.END)
    folder_entry.insert(0, folder_selected)

def browse_image():
    image_file = filedialog.askopenfilename(filetypes=[("Image files", "*.png;*.jpg;*.jpeg")])
    image_entry.delete(0, tk.END)
    image_entry.insert(0, image_file)

def process_pdfs():
    folder_path = folder_entry.get()
    input_image_path = image_entry.get()

    input_data = {
        "Power (W)": [power_entry.get()],
        "Lumens": [lumens_entry.get()],
        "CCT (K)": [cct_entry.get()],
        "Size": [size_entry.get()],
        "Name": [name_entry.get()],
        "Product Code": [product_code_entry.get()],
        "SKU": [sku_entry.get()],
    }

    if not folder_path:
        messagebox.showerror("Error", "Please select a folder.")
        return

    find_top_10_similar_pdfs(folder_path, input_data, input_image_path)

# Create the main window
window = TkinterDnD.Tk()
window.title("Product Finder")
window.geometry("1000x600")  # Increased window width for image display
window.configure(bg="#f2f2f2")  # Light grey background

# Allow the window to be resizable
window.columnconfigure(0, weight=1)
window.columnconfigure(1, weight=3)
window.columnconfigure(2, weight=1)
window.columnconfigure(3, weight=1)
window.rowconfigure(11, weight=1)

# Global font settings
title_font = ("Helvetica", 16, "bold")
label_font = ("Helvetica", 12)
entry_font = ("Helvetica", 10)

# Styling for the buttons
button_style = {
    "bg": "#4CAF50",
    "fg": "white",
    "font": ("Helvetica", 10, "bold"),
    "activebackground": "#45a049",
    "relief": tk.RAISED
}

# Frame for title labels
title_frame = tk.Frame(window, bg="#f2f2f2")
title_frame.grid(row=0, column=1, columnspan=2, pady=20, sticky="ew")

# Title Label for "Product Finder"
title_label_cv = tk.Label(title_frame, text="Product Finder", font=title_font, bg="#f2f2f2")
title_label_cv.grid(row=0, column=1, sticky="w")  # Align to the west (left)

# Frame for folder and image selection
selection_frame = tk.Frame(window, bg="#f2f2f2")
selection_frame.grid(row=1, column=0, columnspan=4, padx=455, pady=10, sticky="ew")

# Folder selection
folder_label = tk.Label(selection_frame, text="Select Folder:", font=label_font, bg="#f2f2f2")
folder_label.grid(row=0, column=0, padx=(50, 10), pady=10, sticky=tk.W)  # Keep the padx as is
folder_entry = tk.Entry(selection_frame, width=50, font=entry_font)
folder_entry.grid(row=0, column=1, padx=10, pady=10, sticky="ew")
folder_button = tk.Button(selection_frame, text="Browse", command=lambda: browse_folder(), **button_style)
folder_button.grid(row=0, column=2, padx=(0, 10), pady=10)  # Remove padx to keep it right next to the entry

# Image selection
image_label = tk.Label(selection_frame, text="Input Image:", font=label_font, bg="#f2f2f2")
image_label.grid(row=1, column=0, padx=(50, 10), pady=10, sticky=tk.W)  # Keep the padx as is
image_entry = tk.Entry(selection_frame, width=50, font=entry_font)
image_entry.grid(row=1, column=1, padx=10, pady=10, sticky="ew")
image_button = tk.Button(selection_frame, text="Browse", command=lambda: browse_image(), **button_style)
image_button.grid(row=1, column=2, padx=(0, 10), pady=10)  # Remove padx to keep it right next to the entry

# Create a frame for the centered labels and entry fields
centered_input_frame = tk.Frame(window, bg="#f2f2f2")
centered_input_frame.grid(row=2, column=1, padx=10, pady=10, sticky="nsew")  # Position it in the grid

# Centered input fields for specifications
centered_labels = ["Power (W):", "Lumens (lm):", "CCT (K):", "Size:"]
centered_entries = []

for i, label_text in enumerate(centered_labels):
    label = tk.Label(centered_input_frame, text=label_text, font=label_font, bg="#f2f2f2")
    label.grid(row=i, column=0, padx=(50, 10), pady=5, sticky="e")  # Increased left padx to 50 for more right shift
    entry = tk.Entry(centered_input_frame, width=20, font=entry_font)
    entry.grid(row=i, column=1, padx=10, pady=5, sticky="ew")  # Align entries to expand in the center
    centered_entries.append(entry)

# Create a frame for the right-aligned labels and entry fields
right_input_frame = tk.Frame(window, bg="#f2f2f2")
right_input_frame.grid(row=2, column=2, padx=10, pady=10, sticky="nsew")  # Position it in the grid

# Right-aligned input fields for specifications
right_labels = ["Name:", "Product Code:", "SKU:"]
right_entries = []

for i, label_text in enumerate(right_labels):
    label = tk.Label(right_input_frame, text=label_text, font=label_font, bg="#f2f2f2")
    label.grid(row=i, column=0, padx=10, pady=5, sticky="e")  # Align labels to the east (right)
    entry = tk.Entry(right_input_frame, width=20, font=entry_font)
    entry.grid(row=i, column=1, padx=10, pady=5, sticky="ew")  # Align entries to expand in the center
    right_entries.append(entry)

# Accessing entries if needed
power_entry, lumens_entry, cct_entry, size_entry = centered_entries
name_entry, product_code_entry, sku_entry = right_entries

# Frame for image display (Fixed size to avoid layout shifts)
image_frame = tk.Frame(window, bg="#f2f2f2", width=180, height=180)  # Set fixed width and height for the frame
image_frame.grid(row=1, column=0, padx=10, pady=(0, 10), sticky=tk.NW)
image_frame.grid_propagate(False)  # Prevent frame from resizing based on content

# Image display label
image_display_label = tk.Label(image_frame, text="Image Preview", font=label_font, bg="#f2f2f2")
image_display_label.grid(row=0, column=0, padx=10, pady=(10, 5), sticky=tk.NW)  # Align at top-left of the left side

# Fixed-size image canvas
image_canvas = tk.Label(image_frame, bg="#f2f2f2", width=180, height=180)  # Set fixed size for the canvas
image_canvas.grid(row=1, column=0, padx=10, pady=(0, 20), sticky=tk.NW)

# Create a frame for the progress bar
progress_frame = tk.Frame(window, bg="#f2f2f2")
progress_frame.grid(row=12, column=0, columnspan=4, padx=10, pady=(20, 10), sticky="ew")  # Adjusted pady
progress_bar = ttk.Progressbar(progress_frame, orient="horizontal", length=1100, mode="determinate")
progress_bar.pack(side=tk.LEFT, padx=10, pady=10)
progress_label = tk.Label(progress_frame, text="Progress: 0%", font=label_font, bg="#f2f2f2")
progress_label.pack(side=tk.LEFT, padx=10, pady=10)

# Create a frame for the result box
result_frame = tk.Frame(window, bg="#f2f2f2")
result_frame.grid(row=13, column=0, columnspan=4, padx=10, pady=(10, 10), sticky="nsew")  # Adjusted pady
result_box = Text(result_frame, height=15, width=80, font=entry_font, wrap=tk.WORD)
result_box.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)
scrollbar = ttk.Scrollbar(result_frame, command=result_box.yview)
scrollbar.pack(side=tk.RIGHT, fill=tk.Y)
result_box.config(yscrollcommand=scrollbar.set)

# Create a frame for the buttons to keep them in a single row
button_frame = tk.Frame(window, bg="#f2f2f2")
button_frame.grid(row=14, column=0, columnspan=4, padx=10, pady=(10, 20), sticky="ew")  # Adjusted pady
submit_button = tk.Button(button_frame, text="Find Top PDF's", command=lambda: process_pdfs(), **button_style)
submit_button.config(width=57)  # Set a reduced width for the button
submit_button.pack(side=tk.LEFT, padx=5)  # Pack the button to the left side of the frame

# Function to clear all input fields and reset image display
def clear_sections():
    # Clear all input entries
    for entry in centered_entries + right_entries:
        entry.delete(0, tk.END)
    
    # Clear the folder and image entries
    folder_entry.delete(0, tk.END)
    image_entry.delete(0, tk.END)
    
    # Reset the image display
    image_canvas.config(image='')  # Clear the image on the right side
    
    # Optionally clear the result box as well
    result_box.delete(1.0, tk.END)

    # Reset progress label and bar
    progress_label.config(text="Progress: 0%")  # Reset progress label
    progress_bar['value'] = 0  # Reset the progress bar
    progress_bar['maximum'] = 0  # Reset progress bar's maximum value

# Add "Clear Sections" button
clear_button = tk.Button(button_frame, text="Clear Sections", command=clear_sections, **button_style)
clear_button.config(width=57)
clear_button.pack(side=tk.LEFT, padx=5)  # Pack the button next to the previous one

# Add a stop button to stop execution
stop_button = tk.Button(button_frame, text="Stop Execution", command=stop_execution, **button_style)
stop_button.config(width=57)
stop_button.pack(side=tk.LEFT, padx=5)  # Pack the button next to the previous one

# Function for folder selection
def browse_folder():
    folder_path = filedialog.askdirectory()
    folder_entry.delete(0, tk.END)
    folder_entry.insert(0, folder_path)

# Function for image selection and display
def browse_image():
    image_path = filedialog.askopenfilename(filetypes=[("Image files", "*.jpg *.png *.jpeg")])
    if image_path:
        image_entry.delete(0, tk.END)
        image_entry.insert(0, image_path)
        display_image(image_path)

def display_image(image_path):
    try:
        # Open the image file
        img = Image.open(image_path)
        
        # Define the fixed width and height for the image display area
        fixed_width = 150
        fixed_height = 150
        
        # Resize the image to exactly match the fixed dimensions (ignoring aspect ratio)
        img_resized = img.resize((fixed_width, fixed_height))
        
        # Convert the image to a format Tkinter can handle
        img_tk = ImageTk.PhotoImage(img_resized)
        
        # Display the resized image in the fixed canvas without changing its size
        image_canvas.config(image=img_tk, width=fixed_width, height=fixed_height)
        image_canvas.image = img_tk  # Store a reference to avoid garbage collection
    except Exception as e:
        print(f"Error opening image: {e}")

# Function for handling the drop of files (drag-and-drop)
def drop(event):
    image_path = event.data  # Get the dropped file path
    
    # Clean up the file path
    if image_path.startswith('{') and image_path.endswith('}'):
        image_path = image_path[1:-1]  # Remove the curly braces
    
    if image_path:
        image_entry.delete(0, tk.END)
        image_entry.insert(0, image_path)
        display_image(image_path)

# Enable drag-and-drop for the main window
window.drop_target_register(DND_FILES)  # Register the main window for drag-and-drop of files
window.dnd_bind('<<Drop>>', drop)  # Bind the drop event to the drop function

# Run the GUI loop
window.mainloop()

## Working on the Result Box

In [4]:
import os
import re
import time
import io
import concurrent.futures
import tkinter as tk
from tkinter import filedialog, messagebox, ttk
from tkinterdnd2 import DND_FILES, TkinterDnD  # Import the drag-and-drop library
from tkinter import Text
from PIL import Image, ImageTk
import numpy as np
import torch
import torchvision.transforms as transforms
from torchvision import models
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import fitz  # PyMuPDF
from fuzzywuzzy import fuzz  # For string matching
from fuzzywuzzy import process  # For matching
import webbrowser  # To open PDFs
import warnings
warnings.filterwarnings('ignore')

# Load the pre-trained models
vgg_model = models.vgg19(pretrained=True)
resnet_model = models.resnet50(pretrained=True)
vgg_model.eval()
resnet_model.eval()

# Image preprocessing transformations
preprocess = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Extract text from PDF
def extract_text_from_pdf(pdf_path):
    with open(pdf_path, 'rb') as file:
        reader = fitz.open(file)
        text = ""
        for page_num in range(reader.page_count):
            page = reader.load_page(page_num)
            text += page.get_text()
    return clean_text(text)

# Clean extracted text
def clean_text(text):
    text = re.sub(r'\s+', ' ', text)  # Normalize whitespace
    text = re.sub(r'\n+', ' ', text)  # Normalize newlines
    text = re.sub(r'[^\x00-\x7F]+', '', text)  # Remove non-ASCII characters
    return text.strip()

# Extract specific data from cleaned text
def extract_specifications(text):
    power_pattern = r'\b(\d{1,3}(?:\.\d+)?)\s*W\b'
    lumens_pattern = r'(\d{1,6}(?:\.\d+)?)\s*lm(?!\/W)'
    cct_pattern = r'\b(\d{4})\s*K\b'
    voltage_pattern = r'\b(\d{1,3}(?:-\d{1,3})?)\s*V\b'  # Match voltage values like 220V or 120 V
    size_pattern = r'\b(?:\d+(?:\.\d+)?(?:\s*x\s*\d+(?:\.\d+)?)?|NA)\b'
    name_pattern = r'\b[A-Z][a-zA-Z0-9\s\-()]*[a-zA-Z)]\b'
    productcode_pattern = r'\b[A-Za-z0-9\-]+(?:/[A-Za-z0-9]+)*(?:\s*\([A-Za-z0-9/\s\-]+\))?\b'
    sku_pattern = r'\b\d{12}\b'

    power = re.findall(power_pattern, text)
    lumens = re.findall(lumens_pattern, text)
    cct = re.findall(cct_pattern, text)
    voltage = re.findall(voltage_pattern, text)
    size = re.findall(size_pattern, text)
    name = re.findall(name_pattern, text)
    product_code = re.findall(productcode_pattern, text)
    sku = re.findall(sku_pattern, text)

    # Convert power values to floats and sort them
    power_values = [float(p) for p in power]  # Extract the full power value

    # Handle voltage extraction and sorting
    def safe_float_conversion(voltage_string):
        try:
            # Check if it contains a range (like '0-10')
            if '-' in voltage_string:
                return voltage_string  # Keep as a string for ranges
            return float(voltage_string)
        except ValueError:
            return None  # Handle conversion errors by returning None

    voltage_values = [safe_float_conversion(v) for v in voltage]
    voltage_values = [v for v in voltage_values if v is not None]  # Filter out None values

    extracted_data = {
        "Power (W)": sorted(set(power_values), key=float),
        "Lumens": sorted(set(lumens), key=lambda x: float(x.replace(',', ''))),
        "CCT (K)": sorted(set(cct), key=int),
        "Voltage (V)": sorted(set(voltage_values), key=lambda x: float(x) if isinstance(x, (float, int)) else 0),  # Sort only numeric values
        "Size": sorted(set(size), key=lambda x: (len(x.split('x')), x)),
        "Name": sorted(set(name), key=str),
        "Product Code": sorted(set(product_code), key=str),
        "SKU": sorted(set(sku), key=str)
    }

    return extracted_data

# Extract images from PDF
def extract_images_from_pdf(pdf_path):
    images = []
    doc = fitz.open(pdf_path)
    for page_num in range(doc.page_count):
        page = doc.load_page(page_num)
        image_list = page.get_images(full=True)
        for img_index, img in enumerate(image_list):
            xref = img[0]
            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]
            image = Image.open(io.BytesIO(image_bytes)).convert('RGB')
            images.append(image)
    doc.close()
    return images

# Extract image features
def extract_image_features(image):
    vgg_features = extract_intermediate_features(image, vgg_model, 'features')
    resnet_features = extract_resnet_features(image)
    hog_features = extract_hog_features(image)

    vgg_weight = 0.5
    resnet_weight = 0.3
    hog_weight = 0.2

    combined_features = np.concatenate([
        vgg_features * vgg_weight, 
        resnet_features * resnet_weight, 
        hog_features * hog_weight
    ])

    return combined_features

def extract_intermediate_features(image, model, layer):
    activation = {}
    def hook_fn(module, input, output):
        activation['output'] = output
    handle = model._modules.get(layer).register_forward_hook(hook_fn)
    input_tensor = preprocess(image).unsqueeze(0)
    with torch.no_grad():
        model(input_tensor)
    intermediate_features = activation['output']
    handle.remove()
    return intermediate_features.flatten().numpy()

def extract_resnet_features(image):
    input_tensor = preprocess(image).unsqueeze(0)
    with torch.no_grad():
        resnet_features = resnet_model(input_tensor)
    return resnet_features.flatten().numpy()

def extract_hog_features(image):
    from skimage.feature import hog
    from skimage.color import rgb2gray
    image = image.resize((128, 128))
    image_np = np.array(image)
    gray_image = rgb2gray(image_np)
    hog_features = hog(gray_image, orientations=8, pixels_per_cell=(16, 16),
                       cells_per_block=(1, 1), visualize=False, feature_vector=True)
    return hog_features

def calculate_similarity(input_data, pdf_data):
    total_similarity = 0.0
    num_fields = 0

    # Function to compare numerical values with tolerance
    def compare_with_tolerance(input_list, pdf_list, tolerance=5):
        if not input_list or not pdf_list:
            return 0.0
        max_similarity = 0.0
        threshold_met = False  # To track if any values match the tolerance criteria

        # Compare each input entry with every pdf entry
        for input_value in input_list:
            if input_value == '':
                continue
            try:
                input_value = float(input_value)
                for pdf_value in pdf_list:
                    pdf_value = float(pdf_value)
                    similarity = 1 - abs(input_value - pdf_value) / input_value

                    # Only consider values that are close enough based on tolerance
                    if similarity >= 1 - (tolerance / 100):
                        max_similarity = max(max_similarity, similarity)
                        threshold_met = True  # Mark if any value is within the threshold
            except ValueError:
                continue

        # If no values matched the tolerance, return 0 similarity
        if not threshold_met:
            return 0.0
        return max_similarity

    # Define specific tolerances for different fields
    tolerances = {
        "Power (W)": 2,  # Strict tolerance for Power (W)
        "Lumens": 5,
        "CCT (K)": 4,
        "Size": 6,
        "Name": 3,
        "Product Code": 1,
        "SKU": 7,
    }

    # Compare numerical fields
    numerical_fields = ["Power (W)", "Lumens", "Size", "SKU"]
    for field in numerical_fields:
        if input_data[field] and any(item != '' for item in input_data[field]):
            num_fields += 1
            # Use the field-specific tolerance, default to 5% if not specified
            tolerance = tolerances.get(field, 5)
            similarity = compare_with_tolerance(input_data[field], pdf_data[field], tolerance=tolerance)
            total_similarity += similarity

    # Compare categorical fields using fuzzy matching
    categorical_fields = ["CCT (K)", "Name", "Product Code"]
    for field in categorical_fields:
        if input_data[field] and any(item != '' for item in input_data[field]):
            num_fields += 1
            match_scores = []
            for item in input_data[field]:
                if item == '':
                    continue
                best_match = process.extractOne(item, pdf_data[field], scorer=fuzz.token_sort_ratio)
                if best_match:
                    similarity = best_match[1] / 100.0
                    match_scores.append(similarity)
            if match_scores:
                total_similarity += max(match_scores)
            else:
                total_similarity += 0.0

    # Avoid division by zero
    if num_fields > 0:
        average_similarity = total_similarity / num_fields
    else:
        average_similarity = 0.0

    return average_similarity

def calculate_image_similarity(features1, features2):
    if features1.size == 0 or features2.size == 0:
        return 0.0
    features1 = features1.reshape(1, -1)
    features2 = features2.reshape(1, -1)
    similarity = cosine_similarity(features1, features2)[0][0]
    return similarity

# Global variable to control the stopping of the process
stop_flag = False

# Function to stop the execution
def stop_execution():
    global stop_flag
    stop_flag = True

def find_top_10_similar_pdfs(folder_path, input_data, input_image_path=None):
    pdf_scores = []
    progress_bar['maximum'] = 0  # Reset progress bar maximum
    start_time = time.time()

    # Traverse through PDF files
    for root, _, files in os.walk(folder_path):
        pdf_files = [os.path.join(root, f) for f in files if f.endswith('.pdf')]
        progress_bar['maximum'] += len(pdf_files)

        for pdf_file in pdf_files:
            pdf_text = extract_text_from_pdf(pdf_file)
            extracted_data = extract_specifications(pdf_text)
            score = calculate_similarity(input_data, extracted_data)

            # Extract additional specifications directly for the result box
            lumens = ', '.join(extracted_data.get("Lumens", []))  # Join directly without sorting or set
            power = ', '.join(str(p) for p in sorted(set(extracted_data.get("Power (W)", [])), key=float))
            cct = ', '.join(sorted(set(extracted_data.get("CCT (K)", [])), key=int))
            # Convert voltage items to string before joining
            voltage = ', '.join(str(v) for v in extracted_data.get("Voltage (V)", []))  # Convert to string

            # Extract images from the PDF regardless of whether input_image_path is provided
            images = extract_images_from_pdf(pdf_file)

            if input_image_path:
                input_image = Image.open(input_image_path).convert('RGB')
                input_image_features = extract_image_features(input_image)

                max_image_similarity = 0
                for img in images:
                    image_features = extract_image_features(img)
                    image_similarity = calculate_image_similarity(input_image_features, image_features)
                    max_image_similarity = max(max_image_similarity, image_similarity)

                combined_score = score + max_image_similarity
            else:
                combined_score = score

            # Append the first image from the PDF (if available) along with the score
            pdf_scores.append((pdf_file, combined_score, images[0] if images else None, lumens, power, cct, voltage))
            progress_bar['value'] += 1

            # Update progress and ETA
            elapsed_time = time.time() - start_time
            processed_pdfs = progress_bar['value']
            total_pdfs = progress_bar['maximum']

            if processed_pdfs > 0:
                estimated_time_per_pdf = elapsed_time / processed_pdfs
                remaining_pdfs = total_pdfs - processed_pdfs
                eta = remaining_pdfs * estimated_time_per_pdf
                eta_minutes, eta_seconds = divmod(eta, 60)
                eta_formatted = f"{int(eta_minutes)} min {int(eta_seconds)} sec"
                percentage = (processed_pdfs / total_pdfs) * 100
                progress_label.config(text=f"Progress: {processed_pdfs}/{total_pdfs} ({percentage:.2f}%) | ETA: {eta_formatted}")

            window.update()

    # Sort and select top 10 results
    pdf_scores = sorted(pdf_scores, key=lambda x: x[1], reverse=True)[:10]
    progress_bar['value'] = 0
    progress_label.config(text="Progress: 0%")

    result_box.delete(1.0, tk.END)

    # List to keep references to the images
    image_references = []

    for i, (pdf_path, score, image, lumens, power, cct, voltage) in enumerate(pdf_scores):
        result_box.insert(tk.END, f"Top {i + 1}: {os.path.basename(pdf_path)} = {score:.2f}\n")

        result_box.tag_configure(f"pdf{i + 1}", foreground="blue", underline=True)
        result_box.insert(tk.END, f"{pdf_path}\n", f"pdf{i + 1}")

        # Always display the image from the resultant PDF (if available)
        if image is not None:
            # Resize the image before displaying
            desired_size = (100, 100)  # Set the desired dimensions (width, height)
            resized_image = image.resize(desired_size, Image.LANCZOS)  # Resize the image
            img_display = ImageTk.PhotoImage(resized_image)  # Create PhotoImage
            image_references.append(img_display)  # Keep a reference

            # Create a frame to hold both image and text
            frame = tk.Frame(result_box)

            # Create a label for the image
            img_label = tk.Label(frame, image=img_display)
            img_label.image = img_display  # Keep a reference to avoid garbage collection
            img_label.grid(row=0, column=0, padx=(10, 10))  # Place image on the left

            # Create a label for the Lumens, Power, CCT, and Voltage values
            text_label = tk.Label(frame, text=f"Lumens: {lumens if lumens else 'N/A'}\n"
                                              f"Power: {power if power else 'N/A'}\n"
                                              f"CCT: {cct if cct else 'N/A'}\n"
                                              f"Voltage: {voltage if voltage else 'N/A'}",
                                  justify=tk.LEFT, anchor='w')
            text_label.grid(row=0, column=1, padx=(10, 10))  # Place text on the right

            result_box.window_create(tk.END, window=frame)  # Add the frame with image and text to the result box
            result_box.insert(tk.END, "\n")  # Add a new line after the image and text

        # Bind click event to open the PDF
        result_box.tag_bind(f"pdf{i + 1}", "<Button-1>", lambda e, p=pdf_path: open_pdf(p))

# Open PDF
def open_pdf(pdf_path):
    webbrowser.open_new(pdf_path)

# GUI Setup
def browse_folder():
    folder_selected = filedialog.askdirectory()
    folder_entry.delete(0, tk.END)
    folder_entry.insert(0, folder_selected)

def browse_image():
    image_file = filedialog.askopenfilename(filetypes=[("Image files", "*.png;*.jpg;*.jpeg")])
    image_entry.delete(0, tk.END)
    image_entry.insert(0, image_file)

def process_pdfs():
    folder_path = folder_entry.get()
    input_image_path = image_entry.get()

    input_data = {
        "Power (W)": [power_entry.get()],
        "Lumens": [lumens_entry.get()],
        "CCT (K)": [cct_entry.get()],
        "Size": [size_entry.get()],
        "Name": [name_entry.get()],
        "Product Code": [product_code_entry.get()],
        "SKU": [sku_entry.get()],
    }

    if not folder_path:
        messagebox.showerror("Error", "Please select a folder.")
        return

    find_top_10_similar_pdfs(folder_path, input_data, input_image_path)

# Create the main window
window = TkinterDnD.Tk()
window.title("Product Finder")
window.geometry("1000x600")  # Increased window width for image display
window.configure(bg="#f2f2f2")  # Light grey background

# Allow the window to be resizable
window.columnconfigure(0, weight=1)
window.columnconfigure(1, weight=3)
window.columnconfigure(2, weight=1)
window.columnconfigure(3, weight=1)
window.rowconfigure(11, weight=1)

# Global font settings
title_font = ("Helvetica", 16, "bold")
label_font = ("Helvetica", 12)
entry_font = ("Helvetica", 10)

# Styling for the buttons
button_style = {
    "bg": "#4CAF50",
    "fg": "white",
    "font": ("Helvetica", 10, "bold"),
    "activebackground": "#45a049",
    "relief": tk.RAISED
}

# Frame for title labels
title_frame = tk.Frame(window, bg="#f2f2f2")
title_frame.grid(row=0, column=1, columnspan=2, pady=20, sticky="ew")

# Title Label for "Product Finder"
title_label_cv = tk.Label(title_frame, text="Product Finder", font=title_font, bg="#f2f2f2")
title_label_cv.grid(row=0, column=1, sticky="w")  # Align to the west (left)

# Frame for folder and image selection
selection_frame = tk.Frame(window, bg="#f2f2f2")
selection_frame.grid(row=1, column=0, columnspan=4, padx=455, pady=10, sticky="ew")

# Folder selection
folder_label = tk.Label(selection_frame, text="Select Folder:", font=label_font, bg="#f2f2f2")
folder_label.grid(row=0, column=0, padx=(50, 10), pady=10, sticky=tk.W)  # Keep the padx as is
folder_entry = tk.Entry(selection_frame, width=50, font=entry_font)
folder_entry.grid(row=0, column=1, padx=10, pady=10, sticky="ew")
folder_button = tk.Button(selection_frame, text="Browse", command=lambda: browse_folder(), **button_style)
folder_button.grid(row=0, column=2, padx=(0, 10), pady=10)  # Remove padx to keep it right next to the entry

# Image selection
image_label = tk.Label(selection_frame, text="Input Image:", font=label_font, bg="#f2f2f2")
image_label.grid(row=1, column=0, padx=(50, 10), pady=10, sticky=tk.W)  # Keep the padx as is
image_entry = tk.Entry(selection_frame, width=50, font=entry_font)
image_entry.grid(row=1, column=1, padx=10, pady=10, sticky="ew")
image_button = tk.Button(selection_frame, text="Browse", command=lambda: browse_image(), **button_style)
image_button.grid(row=1, column=2, padx=(0, 10), pady=10)  # Remove padx to keep it right next to the entry

# Create a frame for the centered labels and entry fields
centered_input_frame = tk.Frame(window, bg="#f2f2f2")
centered_input_frame.grid(row=2, column=1, padx=10, pady=10, sticky="nsew")  # Position it in the grid

# Centered input fields for specifications
centered_labels = ["Power (W):", "Lumens (lm):", "CCT (K):", "Size:"]
centered_entries = []

for i, label_text in enumerate(centered_labels):
    label = tk.Label(centered_input_frame, text=label_text, font=label_font, bg="#f2f2f2")
    label.grid(row=i, column=0, padx=(50, 10), pady=5, sticky="e")  # Increased left padx to 50 for more right shift
    entry = tk.Entry(centered_input_frame, width=20, font=entry_font)
    entry.grid(row=i, column=1, padx=10, pady=5, sticky="ew")  # Align entries to expand in the center
    centered_entries.append(entry)

# Create a frame for the right-aligned labels and entry fields
right_input_frame = tk.Frame(window, bg="#f2f2f2")
right_input_frame.grid(row=2, column=2, padx=10, pady=10, sticky="nsew")  # Position it in the grid

# Right-aligned input fields for specifications
right_labels = ["Name:", "Product Code:", "SKU:"]
right_entries = []

for i, label_text in enumerate(right_labels):
    label = tk.Label(right_input_frame, text=label_text, font=label_font, bg="#f2f2f2")
    label.grid(row=i, column=0, padx=10, pady=5, sticky="e")  # Align labels to the east (right)
    entry = tk.Entry(right_input_frame, width=20, font=entry_font)
    entry.grid(row=i, column=1, padx=10, pady=5, sticky="ew")  # Align entries to expand in the center
    right_entries.append(entry)

# Accessing entries if needed
power_entry, lumens_entry, cct_entry, size_entry = centered_entries
name_entry, product_code_entry, sku_entry = right_entries

# Frame for image display (Fixed size to avoid layout shifts)
image_frame = tk.Frame(window, bg="#f2f2f2", width=180, height=180)  # Set fixed width and height for the frame
image_frame.grid(row=1, column=0, padx=10, pady=(0, 10), sticky=tk.NW)
image_frame.grid_propagate(False)  # Prevent frame from resizing based on content

# Image display label
image_display_label = tk.Label(image_frame, text="Image Preview", font=label_font, bg="#f2f2f2")
image_display_label.grid(row=0, column=0, padx=10, pady=(10, 5), sticky=tk.NW)  # Align at top-left of the left side

# Fixed-size image canvas
image_canvas = tk.Label(image_frame, bg="#f2f2f2", width=180, height=180)  # Set fixed size for the canvas
image_canvas.grid(row=1, column=0, padx=10, pady=(0, 20), sticky=tk.NW)

# Create a frame for the progress bar
progress_frame = tk.Frame(window, bg="#f2f2f2")
progress_frame.grid(row=12, column=0, columnspan=4, padx=10, pady=(20, 10), sticky="ew")  # Adjusted pady
progress_bar = ttk.Progressbar(progress_frame, orient="horizontal", length=1100, mode="determinate")
progress_bar.pack(side=tk.LEFT, padx=10, pady=10)
progress_label = tk.Label(progress_frame, text="Progress: 0%", font=label_font, bg="#f2f2f2")
progress_label.pack(side=tk.LEFT, padx=10, pady=10)

# Create a frame for the result box and image display
result_frame = tk.Frame(window, bg="#f2f2f2")
result_frame.grid(row=13, column=0, columnspan=4, padx=10, pady=10, sticky="ew")

# Create a text widget for displaying PDF results and image previews
result_box = tk.Text(result_frame, height=10, width=100, wrap=tk.WORD)
result_box.pack(side=tk.LEFT, padx=10, pady=10, fill=tk.BOTH, expand=True)

# Scrollbar for the result box
scrollbar = ttk.Scrollbar(result_frame, orient=tk.VERTICAL, command=result_box.yview)
scrollbar.pack(side=tk.RIGHT, fill=tk.Y)
result_box.config(yscrollcommand=scrollbar.set)

# Create a frame for the buttons to keep them in a single row
button_frame = tk.Frame(window, bg="#f2f2f2")
button_frame.grid(row=14, column=0, columnspan=4, padx=10, pady=(10, 20), sticky="ew")  # Adjusted pady
submit_button = tk.Button(button_frame, text="Find Top PDF's", command=lambda: process_pdfs(), **button_style)
submit_button.config(width=57)  # Set a reduced width for the button
submit_button.pack(side=tk.LEFT, padx=5)  # Pack the button to the left side of the frame

def display_image_in_result_box(image, text_widget):
    # Resize image to fit the result box
    image = image.resize((120, 120))  # You can adjust the size based on your preference
    img = ImageTk.PhotoImage(image)
    
    # Add the image to the text widget
    text_widget.image_create(tk.END, image=img)
    text_widget.insert(tk.END, "\n")  # Move cursor after the image
    text_widget.image_ref = img  # Keep a reference to avoid garbage collection

# Function to clear all input fields and reset image display
def clear_sections():
    # Clear all input entries
    for entry in centered_entries + right_entries:
        entry.delete(0, tk.END)
    
    # Clear the folder and image entries
    folder_entry.delete(0, tk.END)
    image_entry.delete(0, tk.END)
    
    # Reset the image display
    image_canvas.config(image='')  # Clear the image on the right side
    
    # Optionally clear the result box as well
    result_box.delete(1.0, tk.END)

    # Reset progress label and bar
    progress_label.config(text="Progress: 0%")  # Reset progress label
    progress_bar['value'] = 0  # Reset the progress bar
    progress_bar['maximum'] = 0  # Reset progress bar's maximum value

# Add "Clear Sections" button
clear_button = tk.Button(button_frame, text="Clear Sections", command=clear_sections, **button_style)
clear_button.config(width=57)
clear_button.pack(side=tk.LEFT, padx=5)  # Pack the button next to the previous one

# Add a stop button to stop execution
stop_button = tk.Button(button_frame, text="Stop Execution", command=stop_execution, **button_style)
stop_button.config(width=57)
stop_button.pack(side=tk.LEFT, padx=5)  # Pack the button next to the previous one

# Function for folder selection
def browse_folder():
    folder_path = filedialog.askdirectory()
    folder_entry.delete(0, tk.END)
    folder_entry.insert(0, folder_path)

# Function for image selection and display
def browse_image():
    image_path = filedialog.askopenfilename(filetypes=[("Image files", "*.jpg *.png *.jpeg")])
    if image_path:
        image_entry.delete(0, tk.END)
        image_entry.insert(0, image_path)
        display_image(image_path)

def display_image(image_path):
    try:
        # Open the image file
        img = Image.open(image_path)
        
        # Define the fixed width and height for the image display area
        fixed_width = 150
        fixed_height = 150
        
        # Resize the image to exactly match the fixed dimensions (ignoring aspect ratio)
        img_resized = img.resize((fixed_width, fixed_height))
        
        # Convert the image to a format Tkinter can handle
        img_tk = ImageTk.PhotoImage(img_resized)
        
        # Display the resized image in the fixed canvas without changing its size
        image_canvas.config(image=img_tk, width=fixed_width, height=fixed_height)
        image_canvas.image = img_tk  # Store a reference to avoid garbage collection
    except Exception as e:
        print(f"Error opening image: {e}")

# Function for handling the drop of files (drag-and-drop)
def drop(event):
    image_path = event.data  # Get the dropped file path
    
    # Clean up the file path
    if image_path.startswith('{') and image_path.endswith('}'):
        image_path = image_path[1:-1]  # Remove the curly braces
    
    if image_path:
        image_entry.delete(0, tk.END)
        image_entry.insert(0, image_path)
        display_image(image_path)

# Enable drag-and-drop for the main window
window.drop_target_register(DND_FILES)  # Register the main window for drag-and-drop of files
window.dnd_bind('<<Drop>>', drop)  # Bind the drop event to the drop function

# Run the GUI loop
window.mainloop()

## Adding the Export to Excel part

In [5]:
import os
import re
import time
import pandas as pd
import io
import concurrent.futures
import tkinter as tk
from tkinter import filedialog, messagebox, ttk
from tkinterdnd2 import DND_FILES, TkinterDnD  # Import the drag-and-drop library
from tkinter import Text
from PIL import Image, ImageTk
import numpy as np
import torch
import torchvision.transforms as transforms
from torchvision import models
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import fitz  # PyMuPDF
from fuzzywuzzy import fuzz  # For string matching
from fuzzywuzzy import process  # For matching
import webbrowser  # To open PDFs
import warnings
import openpyxl
warnings.filterwarnings('ignore')

# Load the pre-trained models
vgg_model = models.vgg19(pretrained=True)
resnet_model = models.resnet50(pretrained=True)
vgg_model.eval()
resnet_model.eval()

# Image preprocessing transformations
preprocess = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Extract text from PDF
def extract_text_from_pdf(pdf_path):
    with open(pdf_path, 'rb') as file:
        reader = fitz.open(file)
        text = ""
        for page_num in range(reader.page_count):
            page = reader.load_page(page_num)
            text += page.get_text()
    return clean_text(text)

# Clean extracted text
def clean_text(text):
    text = re.sub(r'\s+', ' ', text)  # Normalize whitespace
    text = re.sub(r'\n+', ' ', text)  # Normalize newlines
    text = re.sub(r'[^\x00-\x7F]+', '', text)  # Remove non-ASCII characters
    return text.strip()

# Extract specific data from cleaned text
def extract_specifications(text):
    power_pattern = r'\b(\d{1,3}(?:\.\d+)?)\s*W\b'
    lumens_pattern = r'(\d{1,6}(?:\.\d+)?)\s*lm(?!\/W)'
    cct_pattern = r'\b(\d{4})\s*K\b'
    voltage_pattern = r'\b(\d{1,3}(?:-\d{1,3})?)\s*V\b'  # Match voltage values like 220V or 120 V
    size_pattern = r'\b(?:\d+(?:\.\d+)?(?:\s*x\s*\d+(?:\.\d+)?)?|NA)\b'
    name_pattern = r'\b[A-Z][a-zA-Z0-9\s\-()]*[a-zA-Z)]\b'
    productcode_pattern = r'\b[A-Za-z0-9\-]+(?:/[A-Za-z0-9]+)*(?:\s*\([A-Za-z0-9/\s\-]+\))?\b'
    sku_pattern = r'\b\d{12}\b'

    power = re.findall(power_pattern, text)
    lumens = re.findall(lumens_pattern, text)
    cct = re.findall(cct_pattern, text)
    voltage = re.findall(voltage_pattern, text)
    size = re.findall(size_pattern, text)
    name = re.findall(name_pattern, text)
    product_code = re.findall(productcode_pattern, text)
    sku = re.findall(sku_pattern, text)

    # Convert power values to floats and sort them
    power_values = [float(p) for p in power]  # Extract the full power value

    # Handle voltage extraction and sorting
    def safe_float_conversion(voltage_string):
        try:
            # Check if it contains a range (like '0-10')
            if '-' in voltage_string:
                return voltage_string  # Keep as a string for ranges
            return float(voltage_string)
        except ValueError:
            return None  # Handle conversion errors by returning None

    voltage_values = [safe_float_conversion(v) for v in voltage]
    voltage_values = [v for v in voltage_values if v is not None]  # Filter out None values

    extracted_data = {
        "Power (W)": sorted(set(power_values), key=float),
        "Lumens": sorted(set(lumens), key=lambda x: float(x.replace(',', ''))),
        "CCT (K)": sorted(set(cct), key=int),
        "Voltage (V)": sorted(set(voltage_values), key=lambda x: float(x) if isinstance(x, (float, int)) else 0),  # Sort only numeric values
        "Size": sorted(set(size), key=lambda x: (len(x.split('x')), x)),
        "Name": sorted(set(name), key=str),
        "Product Code": sorted(set(product_code), key=str),
        "SKU": sorted(set(sku), key=str)
    }

    return extracted_data

# Extract images from PDF
def extract_images_from_pdf(pdf_path):
    images = []
    doc = fitz.open(pdf_path)
    for page_num in range(doc.page_count):
        page = doc.load_page(page_num)
        image_list = page.get_images(full=True)
        for img_index, img in enumerate(image_list):
            xref = img[0]
            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]
            image = Image.open(io.BytesIO(image_bytes)).convert('RGB')
            images.append(image)
    doc.close()
    return images

# Extract image features
def extract_image_features(image):
    vgg_features = extract_intermediate_features(image, vgg_model, 'features')
    resnet_features = extract_resnet_features(image)
    hog_features = extract_hog_features(image)

    vgg_weight = 0.5
    resnet_weight = 0.3
    hog_weight = 0.2

    combined_features = np.concatenate([
        vgg_features * vgg_weight, 
        resnet_features * resnet_weight, 
        hog_features * hog_weight
    ])

    return combined_features

def extract_intermediate_features(image, model, layer):
    activation = {}
    def hook_fn(module, input, output):
        activation['output'] = output
    handle = model._modules.get(layer).register_forward_hook(hook_fn)
    input_tensor = preprocess(image).unsqueeze(0)
    with torch.no_grad():
        model(input_tensor)
    intermediate_features = activation['output']
    handle.remove()
    return intermediate_features.flatten().numpy()

def extract_resnet_features(image):
    input_tensor = preprocess(image).unsqueeze(0)
    with torch.no_grad():
        resnet_features = resnet_model(input_tensor)
    return resnet_features.flatten().numpy()

def extract_hog_features(image):
    from skimage.feature import hog
    from skimage.color import rgb2gray
    image = image.resize((128, 128))
    image_np = np.array(image)
    gray_image = rgb2gray(image_np)
    hog_features = hog(gray_image, orientations=8, pixels_per_cell=(16, 16),
                       cells_per_block=(1, 1), visualize=False, feature_vector=True)
    return hog_features

def calculate_similarity(input_data, pdf_data):
    total_similarity = 0.0
    num_fields = 0

    # Function to compare numerical values with tolerance
    def compare_with_tolerance(input_list, pdf_list, tolerance=5):
        if not input_list or not pdf_list:
            return 0.0
        max_similarity = 0.0
        threshold_met = False  # To track if any values match the tolerance criteria

        # Compare each input entry with every pdf entry
        for input_value in input_list:
            if input_value == '':
                continue
            try:
                input_value = float(input_value)
                for pdf_value in pdf_list:
                    pdf_value = float(pdf_value)
                    similarity = 1 - abs(input_value - pdf_value) / input_value

                    # Only consider values that are close enough based on tolerance
                    if similarity >= 1 - (tolerance / 100):
                        max_similarity = max(max_similarity, similarity)
                        threshold_met = True  # Mark if any value is within the threshold
            except ValueError:
                continue

        # If no values matched the tolerance, return 0 similarity
        if not threshold_met:
            return 0.0
        return max_similarity

    # Define specific tolerances for different fields
    tolerances = {
        "Power (W)": 2,  # Strict tolerance for Power (W)
        "Lumens": 5,
        "CCT (K)": 4,
        "Size": 6,
        "Name": 3,
        "Product Code": 1,
        "SKU": 7,
    }

    # Compare numerical fields
    numerical_fields = ["Power (W)", "Lumens", "Size", "SKU"]
    for field in numerical_fields:
        if input_data[field] and any(item != '' for item in input_data[field]):
            num_fields += 1
            # Use the field-specific tolerance, default to 5% if not specified
            tolerance = tolerances.get(field, 5)
            similarity = compare_with_tolerance(input_data[field], pdf_data[field], tolerance=tolerance)
            total_similarity += similarity

    # Compare categorical fields using fuzzy matching
    categorical_fields = ["CCT (K)", "Name", "Product Code"]
    for field in categorical_fields:
        if input_data[field] and any(item != '' for item in input_data[field]):
            num_fields += 1
            match_scores = []
            for item in input_data[field]:
                if item == '':
                    continue
                best_match = process.extractOne(item, pdf_data[field], scorer=fuzz.token_sort_ratio)
                if best_match:
                    similarity = best_match[1] / 100.0
                    match_scores.append(similarity)
            if match_scores:
                total_similarity += max(match_scores)
            else:
                total_similarity += 0.0

    # Avoid division by zero
    if num_fields > 0:
        average_similarity = total_similarity / num_fields
    else:
        average_similarity = 0.0

    return average_similarity

def calculate_image_similarity(features1, features2):
    if features1.size == 0 or features2.size == 0:
        return 0.0
    features1 = features1.reshape(1, -1)
    features2 = features2.reshape(1, -1)
    similarity = cosine_similarity(features1, features2)[0][0]
    return similarity

# Global variable to control the stopping of the process
stop_flag = False

# Function to stop the execution
def stop_execution():
    global stop_flag
    stop_flag = True

# Define a global list to store the result data
results_for_export = []

def find_top_10_similar_pdfs(folder_path, input_data, input_image_path=None):
    global results_for_export
    results_for_export = []  # Clear the previous results

    pdf_scores = []
    progress_bar['maximum'] = 0
    start_time = time.time()

    for root, _, files in os.walk(folder_path):
        pdf_files = [os.path.join(root, f) for f in files if f.endswith('.pdf')]
        progress_bar['maximum'] += len(pdf_files)

        for pdf_file in pdf_files:
            pdf_text = extract_text_from_pdf(pdf_file)
            extracted_data = extract_specifications(pdf_text)
            score = calculate_similarity(input_data, extracted_data)

            lumens = ', '.join(extracted_data.get("Lumens", []))
            power = ', '.join(str(p) for p in sorted(set(extracted_data.get("Power (W)", [])), key=float))
            cct = ', '.join(sorted(set(extracted_data.get("CCT (K)", [])), key=int))
            voltage = ', '.join(str(v) for v in extracted_data.get("Voltage (V)", []))

            images = extract_images_from_pdf(pdf_file)

            if input_image_path:
                input_image = Image.open(input_image_path).convert('RGB')
                input_image_features = extract_image_features(input_image)

                max_image_similarity = 0
                for img in images:
                    image_features = extract_image_features(img)
                    image_similarity = calculate_image_similarity(input_image_features, image_features)
                    max_image_similarity = max(max_image_similarity, image_similarity)

                combined_score = score + max_image_similarity
            else:
                combined_score = score

            pdf_scores.append((pdf_file, combined_score, images[0] if images else None, lumens, power, cct, voltage))
            progress_bar['value'] += 1

            elapsed_time = time.time() - start_time
            processed_pdfs = progress_bar['value']
            total_pdfs = progress_bar['maximum']

            if processed_pdfs > 0:
                estimated_time_per_pdf = elapsed_time / processed_pdfs
                remaining_pdfs = total_pdfs - processed_pdfs
                eta = remaining_pdfs * estimated_time_per_pdf
                eta_minutes, eta_seconds = divmod(eta, 60)
                eta_formatted = f"{int(eta_minutes)} min {int(eta_seconds)} sec"
                percentage = (processed_pdfs / total_pdfs) * 100
                progress_label.config(text=f"Progress: {processed_pdfs}/{total_pdfs} ({percentage:.2f}%) | ETA: {eta_formatted}")

            window.update()

    pdf_scores = sorted(pdf_scores, key=lambda x: x[1], reverse=True)[:10]
    progress_bar['value'] = 0
    progress_label.config(text="Progress: 0%")

    result_box.delete(1.0, tk.END)
    image_references = []

    for i, (pdf_path, score, image, lumens, power, cct, voltage) in enumerate(pdf_scores):
        result_box.insert(tk.END, f"Top {i + 1}: {os.path.basename(pdf_path)} = {score:.2f}\n")

        result_box.tag_configure(f"pdf{i + 1}", foreground="blue", underline=True)
        result_box.insert(tk.END, f"{pdf_path}\n", f"pdf{i + 1}")

        if image is not None:
            desired_size = (100, 100)
            resized_image = image.resize(desired_size, Image.LANCZOS)
            img_display = ImageTk.PhotoImage(resized_image)
            image_references.append(img_display)

            frame = tk.Frame(result_box)
            img_label = tk.Label(frame, image=img_display)
            img_label.image = img_display
            img_label.grid(row=0, column=0, padx=(10, 10))

            text_label = tk.Label(frame, text=f"Lumens: {lumens if lumens else 'N/A'}\n"
                                              f"Power: {power if power else 'N/A'}\n"
                                              f"CCT: {cct if cct else 'N/A'}\n"
                                              f"Voltage: {voltage if voltage else 'N/A'}",
                                  justify=tk.LEFT, anchor='w')
            text_label.grid(row=0, column=1, padx=(10, 10))

            result_box.window_create(tk.END, window=frame)
            result_box.insert(tk.END, "\n")

            # Store the result data for export
            results_for_export.append({
                "Light Name": os.path.basename(pdf_path),
                "Lumens": lumens,
                "Power": power,
                "CCT": cct,
                "Voltage": voltage,
                "PDF Link": pdf_path
            })

        result_box.tag_bind(f"pdf{i + 1}", "<Button-1>", lambda e, p=pdf_path: open_pdf(p))

# Open PDF
def open_pdf(pdf_path):
    webbrowser.open_new(pdf_path)

# GUI Setup
def browse_folder():
    folder_selected = filedialog.askdirectory()
    folder_entry.delete(0, tk.END)
    folder_entry.insert(0, folder_selected)

# Function to browse and select the image
def browse_image():
    image_file = filedialog.askopenfilename(filetypes=[("Image files", "*.png;*.jpg;*.jpeg")])
    if image_file:
        image_entry.delete(0, tk.END)
        image_entry.insert(0, image_file)
        display_image(image_file)  # Call function to display image

def process_pdfs():
    folder_path = folder_entry.get()
    input_image_path = image_entry.get()

    input_data = {
        "Power (W)": [power_entry.get()],
        "Lumens": [lumens_entry.get()],
        "CCT (K)": [cct_entry.get()],
        "Size": [size_entry.get()],
        "Name": [name_entry.get()],
        "Product Code": [product_code_entry.get()],
        "SKU": [sku_entry.get()],
    }

    if not folder_path:
        messagebox.showerror("Error", "Please select a folder.")
        return

    find_top_10_similar_pdfs(folder_path, input_data, input_image_path)

# Create the main window
window = TkinterDnD.Tk()
window.title("Product Finder")
window.geometry("1000x600")  # Increased window width for image display
window.configure(bg="#f2f2f2")  # Light grey background

# Allow the window to be resizable
window.columnconfigure(0, weight=1)
window.columnconfigure(1, weight=3)
window.columnconfigure(2, weight=1)
window.columnconfigure(3, weight=1)
window.rowconfigure(11, weight=1)

# Global font settings
title_font = ("Helvetica", 16, "bold")
label_font = ("Helvetica", 12)
entry_font = ("Helvetica", 10)

# Styling for the buttons
button_style = {
    "bg": "#4CAF50",
    "fg": "white",
    "font": ("Helvetica", 10, "bold"),
    "activebackground": "#45a049",
    "relief": tk.RAISED
}

# Frame for title labels
title_frame = tk.Frame(window, bg="#f2f2f2")
title_frame.grid(row=0, column=1, columnspan=2, pady=20, sticky="ew")

# Title Label for "Product Finder"
title_label_cv = tk.Label(title_frame, text="Product Finder", font=title_font, bg="#f2f2f2")
title_label_cv.grid(row=0, column=1, sticky="w")  # Align to the west (left)

# Frame for folder and image selection
selection_frame = tk.Frame(window, bg="#f2f2f2")
selection_frame.grid(row=1, column=0, columnspan=4, padx=455, pady=10, sticky="ew")

# Folder selection
folder_label = tk.Label(selection_frame, text="Select Folder:", font=label_font, bg="#f2f2f2")
folder_label.grid(row=0, column=0, padx=(50, 10), pady=10, sticky=tk.W)  # Keep the padx as is
folder_entry = tk.Entry(selection_frame, width=50, font=entry_font)
folder_entry.grid(row=0, column=1, padx=10, pady=10, sticky="ew")
folder_button = tk.Button(selection_frame, text="Browse", command=lambda: browse_folder(), **button_style)
folder_button.grid(row=0, column=2, padx=(0, 10), pady=10)  # Remove padx to keep it right next to the entry

# Image selection
image_label = tk.Label(selection_frame, text="Input Image:", font=label_font, bg="#f2f2f2")
image_label.grid(row=1, column=0, padx=(50, 10), pady=10, sticky=tk.W)  # Keep the padx as is
image_entry = tk.Entry(selection_frame, width=50, font=entry_font)
image_entry.grid(row=1, column=1, padx=10, pady=10, sticky="ew")
image_button = tk.Button(selection_frame, text="Browse", command=lambda: browse_image(), **button_style)
image_button.grid(row=1, column=2, padx=(0, 10), pady=10)  # Remove padx to keep it right next to the entry

# Create a frame for the centered labels and entry fields
centered_input_frame = tk.Frame(window, bg="#f2f2f2")
centered_input_frame.grid(row=2, column=1, padx=10, pady=10, sticky="nsew")  # Position it in the grid

# Centered input fields for specifications
centered_labels = ["Power (W):", "Lumens (lm):", "CCT (K):", "Size:"]
centered_entries = []

for i, label_text in enumerate(centered_labels):
    label = tk.Label(centered_input_frame, text=label_text, font=label_font, bg="#f2f2f2")
    label.grid(row=i, column=0, padx=(50, 10), pady=5, sticky="e")  # Increased left padx to 50 for more right shift
    entry = tk.Entry(centered_input_frame, width=20, font=entry_font)
    entry.grid(row=i, column=1, padx=10, pady=5, sticky="ew")  # Align entries to expand in the center
    centered_entries.append(entry)

# Create a frame for the right-aligned labels and entry fields
right_input_frame = tk.Frame(window, bg="#f2f2f2")
right_input_frame.grid(row=2, column=2, padx=10, pady=10, sticky="nsew")  # Position it in the grid

# Right-aligned input fields for specifications
right_labels = ["Name:", "Product Code:", "SKU:"]
right_entries = []

for i, label_text in enumerate(right_labels):
    label = tk.Label(right_input_frame, text=label_text, font=label_font, bg="#f2f2f2")
    label.grid(row=i, column=0, padx=10, pady=5, sticky="e")  # Align labels to the east (right)
    entry = tk.Entry(right_input_frame, width=20, font=entry_font)
    entry.grid(row=i, column=1, padx=10, pady=5, sticky="ew")  # Align entries to expand in the center
    right_entries.append(entry)

# Accessing entries if needed
power_entry, lumens_entry, cct_entry, size_entry = centered_entries
name_entry, product_code_entry, sku_entry = right_entries

# Frame for image display (Fixed size to avoid layout shifts)
image_frame = tk.Frame(window, bg="#f2f2f2", width=180, height=180)  # Set fixed width and height for the frame
image_frame.grid(row=1, column=0, padx=10, pady=(0, 10), sticky=tk.NW)
image_frame.grid_propagate(False)  # Prevent frame from resizing based on content

# Image display label
image_display_label = tk.Label(image_frame, text="Image Preview", font=label_font, bg="#f2f2f2")
image_display_label.grid(row=0, column=0, padx=10, pady=(10, 5), sticky=tk.NW)  # Align at top-left of the left side

# Fixed-size image canvas
image_canvas = tk.Label(image_frame, bg="#f2f2f2", width=180, height=180)  # Set fixed size for the canvas
image_canvas.grid(row=1, column=0, padx=10, pady=(0, 20), sticky=tk.NW)

# Create a frame for the progress bar
progress_frame = tk.Frame(window, bg="#f2f2f2")
progress_frame.grid(row=12, column=0, columnspan=4, padx=10, pady=(20, 10), sticky="ew")  # Adjusted pady
progress_bar = ttk.Progressbar(progress_frame, orient="horizontal", length=1100, mode="determinate")
progress_bar.pack(side=tk.LEFT, padx=10, pady=10)
progress_label = tk.Label(progress_frame, text="Progress: 0%", font=label_font, bg="#f2f2f2")
progress_label.pack(side=tk.LEFT, padx=10, pady=10)

# Create a frame for the result box and image display
result_frame = tk.Frame(window, bg="#f2f2f2")
result_frame.grid(row=13, column=0, columnspan=4, padx=10, pady=10, sticky="ew")

# Create a text widget for displaying PDF results and image previews
result_box = tk.Text(result_frame, height=10, width=100, wrap=tk.WORD)
result_box.pack(side=tk.LEFT, padx=10, pady=10, fill=tk.BOTH, expand=True)

# Scrollbar for the result box
scrollbar = ttk.Scrollbar(result_frame, orient=tk.VERTICAL, command=result_box.yview)
scrollbar.pack(side=tk.RIGHT, fill=tk.Y)
result_box.config(yscrollcommand=scrollbar.set)

# Create a frame for the buttons to keep them in a single row
button_frame = tk.Frame(window, bg="#f2f2f2")
button_frame.grid(row=14, column=0, columnspan=4, padx=10, pady=(10, 20), sticky="ew")  # Adjusted pady
submit_button = tk.Button(button_frame, text="Find Top PDF's", command=lambda: process_pdfs(), **button_style)
submit_button.config(width=25)  # Set a reduced width for the button
submit_button.pack(side=tk.LEFT, padx=5)  # Pack the button to the left side of the frame

# Function to clear all input fields and reset image display
def clear_sections():
    # Clear all input entries
    for entry in centered_entries + right_entries:
        entry.delete(0, tk.END)
    # Clear result box and image display
    result_box.delete(1.0, tk.END)
    image_canvas.config(image='')

# Function to export results to Excel
def export_to_excel():
    if not results_for_export:
        messagebox.showerror("Error", "No valid results found to export.")
        return

    # Prompt the user to save the Excel file
    file_path = filedialog.asksaveasfilename(defaultextension=".xlsx",
                                             filetypes=[("Excel files", "*.xlsx"), ("All files", "*.*")])

    if file_path:
        # Create a new Excel workbook and add a worksheet
        workbook = openpyxl.Workbook()
        sheet = workbook.active
        sheet.title = "Top 10 Results"

        # Write the headers (including new "PDF Links" column)
        headers = ["Results", "Light Name", "Lumens", "Power", "CCT", "Voltage", "PDF Links"]
        sheet.append(headers)

        # Write the results to the Excel sheet
        for i, result in enumerate(results_for_export):
            print(f"Writing to Excel: {result}")
            sheet.append([
                f"Top {i + 1}",  # "Results"
                result["Light Name"],  # "Light Name"
                result["Lumens"],  # "Lumens"
                result["Power"],  # "Power"
                result["CCT"],  # "CCT"
                result["Voltage"],  # "Voltage"
                result["PDF Link"]  # "PDF Links" (new column for PDF link)
            ])

        # Save the workbook
        workbook.save(file_path)
        messagebox.showinfo("Success", f"Results exported to {file_path}")

# Function to extract values from the result box
def extract_values_from_result_box():
    lines = result_box.get("1.0", tk.END).splitlines()
    result_data = []

    i = 0
    while i < len(lines):
        if lines[i].startswith("Top"):
            light_name = lines[i + 1] if i + 1 < len(lines) else 'N/A'  # Extract the light name
            pdf_link = lines[i + 2] if i + 2 < len(lines) else 'N/A'    # Extract the PDF link

            # Extract lumens, power, CCT, and voltage if they exist, otherwise assign 'N/A'
            lumens = lines[i + 4].split(": ")[1] if i + 4 < len(lines) and "Lumens:" in lines[i + 4] else 'N/A'
            power = lines[i + 5].split(": ")[1] if i + 5 < len(lines) and "Power:" in lines[i + 5] else 'N/A'
            cct = lines[i + 6].split(": ")[1] if i + 6 < len(lines) and "CCT:" in lines[i + 6] else 'N/A'
            voltage = lines[i + 7].split(": ")[1] if i + 7 < len(lines) and "Voltage:" in lines[i + 7] else 'N/A'

            result_data.append((light_name, lumens, power, cct, voltage, pdf_link))
            i += 8  # Move to the next block of results
        else:
            i += 1

    return result_data

def display_image(image_path):
    try:
        # Open the image file
        img = Image.open(image_path)
        
        # Define the fixed width and height for the image display area
        fixed_width = 150
        fixed_height = 150
        
        # Resize the image to exactly match the fixed dimensions (ignoring aspect ratio)
        img_resized = img.resize((fixed_width, fixed_height))
        
        # Convert the image to a format Tkinter can handle
        img_tk = ImageTk.PhotoImage(img_resized)
        
        # Display the resized image in the fixed canvas without changing its size
        image_canvas.config(image=img_tk, width=fixed_width, height=fixed_height)
        image_canvas.image = img_tk  # Store a reference to avoid garbage collection
    except Exception as e:
        print(f"Error opening image: {e}")

# Function for handling the drop of files (drag-and-drop)
def drop(event):
    image_path = event.data  # Get the dropped file path
    
    # Clean up the file path
    if image_path.startswith('{') and image_path.endswith('}'):
        image_path = image_path[1:-1]  # Remove the curly braces
    
    if image_path:
        image_entry.delete(0, tk.END)
        image_entry.insert(0, image_path)
        display_image(image_path)

# Enable drag-and-drop for the main window
window.drop_target_register(DND_FILES)  # Register the main window for drag-and-drop of files
window.dnd_bind('<<Drop>>', drop)  # Bind the drop event to the drop function

clear_button = tk.Button(button_frame, text="Clear Sections", command=clear_sections, **button_style)
clear_button.config(width=15)  # Set a width for the button
clear_button.pack(side=tk.LEFT, padx=10)  # Added padding to separate the buttons

# Stop execution button
stop_button = tk.Button(button_frame, text="Stop Execution", command=lambda: stop_execution(), **button_style)
stop_button.config(width=15)  # Set a width for the button
stop_button.pack(side=tk.LEFT, padx=10)

# Add the "Export" button in the existing button_frame
export_button = tk.Button(button_frame, text="Export", command=export_to_excel, **button_style)
export_button.config(width=25)
export_button.pack(side=tk.LEFT, padx=5)  # Pack the button next to the stop button

# Main loop
window.mainloop()

## Adding Deviation

In [5]:
import os
import re
import time
import pandas as pd
import io
import concurrent.futures
import tkinter as tk
from tkinter import filedialog, messagebox, ttk
from tkinterdnd2 import DND_FILES, TkinterDnD  # Import the drag-and-drop library
from tkinter import Text
from PIL import Image, ImageTk
import numpy as np
import torch
import torchvision.transforms as transforms
from torchvision import models
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import fitz  # PyMuPDF
from fuzzywuzzy import fuzz  # For string matching
from fuzzywuzzy import process  # For matching
import webbrowser  # To open PDFs
import warnings
import openpyxl
warnings.filterwarnings('ignore')

# Load the pre-trained models
vgg_model = models.vgg19(pretrained=True)
resnet_model = models.resnet50(pretrained=True)
vgg_model.eval()
resnet_model.eval()

# Image preprocessing transformations
preprocess = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Extract text from PDF
def extract_text_from_pdf(pdf_path):
    with open(pdf_path, 'rb') as file:
        reader = fitz.open(file)
        text = ""
        for page_num in range(reader.page_count):
            page = reader.load_page(page_num)
            text += page.get_text()
    return clean_text(text)

# Clean extracted text
def clean_text(text):
    text = re.sub(r'\s+', ' ', text)  # Normalize whitespace
    text = re.sub(r'\n+', ' ', text)  # Normalize newlines
    text = re.sub(r'[^\x00-\x7F]+', '', text)  # Remove non-ASCII characters
    return text.strip()

# Extract specific data from cleaned text
def extract_specifications(text):
    power_pattern = r'\b(\d{1,3}(?:\.\d+)?)\s*W\b'
    lumens_pattern = r'(\d{1,6}(?:\.\d+)?)\s*lm(?!\/W)'
    cct_pattern = r'\b(\d{4})\s*K\b'
    voltage_pattern = r'\b(\d{1,3}(?:-\d{1,3})?)\s*V\b'  # Match voltage values like 220V or 120 V
    size_pattern = r'\b(?:\d+(?:\.\d+)?(?:\s*x\s*\d+(?:\.\d+)?)?|NA)\b'
    name_pattern = r'\b[A-Z][a-zA-Z0-9\s\-()]*[a-zA-Z)]\b'
    productcode_pattern = r'\b[A-Za-z0-9\-]+(?:/[A-Za-z0-9]+)*(?:\s*\([A-Za-z0-9/\s\-]+\))?\b'
    sku_pattern = r'\b\d{12}\b'

    power = re.findall(power_pattern, text)
    lumens = re.findall(lumens_pattern, text)
    cct = re.findall(cct_pattern, text)
    voltage = re.findall(voltage_pattern, text)
    size = re.findall(size_pattern, text)
    name = re.findall(name_pattern, text)
    product_code = re.findall(productcode_pattern, text)
    sku = re.findall(sku_pattern, text)

    # Convert power values to floats and sort them
    power_values = [float(p) for p in power]  # Extract the full power value

    # Handle voltage extraction and sorting
    def safe_float_conversion(voltage_string):
        try:
            # Check if it contains a range (like '0-10')
            if '-' in voltage_string:
                return voltage_string  # Keep as a string for ranges
            return float(voltage_string)
        except ValueError:
            return None  # Handle conversion errors by returning None

    voltage_values = [safe_float_conversion(v) for v in voltage]
    voltage_values = [v for v in voltage_values if v is not None]  # Filter out None values

    extracted_data = {
        "Power (W)": sorted(set(power_values), key=float),
        "Lumens": sorted(set(lumens), key=lambda x: float(x.replace(',', ''))),
        "CCT (K)": sorted(set(cct), key=int),
        "Voltage (V)": sorted(set(voltage_values), key=lambda x: float(x) if isinstance(x, (float, int)) else 0),  # Sort only numeric values
        "Size": sorted(set(size), key=lambda x: (len(x.split('x')), x)),
        "Name": sorted(set(name), key=str),
        "Product Code": sorted(set(product_code), key=str),
        "SKU": sorted(set(sku), key=str)
    }

    return extracted_data

# Extract images from PDF
def extract_images_from_pdf(pdf_path):
    images = []
    doc = fitz.open(pdf_path)
    for page_num in range(doc.page_count):
        page = doc.load_page(page_num)
        image_list = page.get_images(full=True)
        for img_index, img in enumerate(image_list):
            xref = img[0]
            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]
            image = Image.open(io.BytesIO(image_bytes)).convert('RGB')
            images.append(image)
    doc.close()
    return images

# Extract image features
def extract_image_features(image):
    vgg_features = extract_intermediate_features(image, vgg_model, 'features')
    resnet_features = extract_resnet_features(image)
    hog_features = extract_hog_features(image)

    vgg_weight = 0.5
    resnet_weight = 0.3
    hog_weight = 0.2

    combined_features = np.concatenate([
        vgg_features * vgg_weight, 
        resnet_features * resnet_weight, 
        hog_features * hog_weight
    ])

    return combined_features

def extract_intermediate_features(image, model, layer):
    activation = {}
    def hook_fn(module, input, output):
        activation['output'] = output
    handle = model._modules.get(layer).register_forward_hook(hook_fn)
    input_tensor = preprocess(image).unsqueeze(0)
    with torch.no_grad():
        model(input_tensor)
    intermediate_features = activation['output']
    handle.remove()
    return intermediate_features.flatten().numpy()

def extract_resnet_features(image):
    input_tensor = preprocess(image).unsqueeze(0)
    with torch.no_grad():
        resnet_features = resnet_model(input_tensor)
    return resnet_features.flatten().numpy()

def extract_hog_features(image):
    from skimage.feature import hog
    from skimage.color import rgb2gray
    image = image.resize((128, 128))
    image_np = np.array(image)
    gray_image = rgb2gray(image_np)
    hog_features = hog(gray_image, orientations=8, pixels_per_cell=(16, 16),
                       cells_per_block=(1, 1), visualize=False, feature_vector=True)
    return hog_features

def calculate_similarity(input_data, pdf_data):
    total_similarity = 0.0
    num_fields = 0

    # Function to compare numerical values with tolerance
    def compare_with_tolerance(input_list, pdf_list, tolerance=5):
        if not input_list or not pdf_list:
            return 0.0
        max_similarity = 0.0
        threshold_met = False  # To track if any values match the tolerance criteria

        # Compare each input entry with every pdf entry
        for input_value in input_list:
            if input_value == '':
                continue
            try:
                input_value = float(input_value)
                for pdf_value in pdf_list:
                    pdf_value = float(pdf_value)
                    similarity = 1 - abs(input_value - pdf_value) / input_value

                    # Only consider values that are close enough based on tolerance
                    if similarity >= 1 - (tolerance / 100):
                        max_similarity = max(max_similarity, similarity)
                        threshold_met = True  # Mark if any value is within the threshold
            except ValueError:
                continue

        # If no values matched the tolerance, return 0 similarity
        if not threshold_met:
            return 0.0
        return max_similarity

    # Define specific tolerances for different fields
    tolerances = {
        "Power (W)": 2,  # Strict tolerance for Power (W)
        "Lumens": 5,
        "CCT (K)": 4,
        "Size": 6,
        "Name": 3,
        "Product Code": 1,
        "SKU": 7,
    }

    # Compare numerical fields
    numerical_fields = ["Power (W)", "Lumens", "Size", "SKU"]
    for field in numerical_fields:
        if input_data[field] and any(item != '' for item in input_data[field]):
            num_fields += 1
            # Use the field-specific tolerance, default to 5% if not specified
            tolerance = tolerances.get(field, 5)
            similarity = compare_with_tolerance(input_data[field], pdf_data[field], tolerance=tolerance)
            total_similarity += similarity

    # Compare categorical fields using fuzzy matching
    categorical_fields = ["CCT (K)", "Name", "Product Code"]
    for field in categorical_fields:
        if input_data[field] and any(item != '' for item in input_data[field]):
            num_fields += 1
            match_scores = []
            for item in input_data[field]:
                if item == '':
                    continue
                best_match = process.extractOne(item, pdf_data[field], scorer=fuzz.token_sort_ratio)
                if best_match:
                    similarity = best_match[1] / 100.0
                    match_scores.append(similarity)
            if match_scores:
                total_similarity += max(match_scores)
            else:
                total_similarity += 0.0

    # Avoid division by zero
    if num_fields > 0:
        average_similarity = total_similarity / num_fields
    else:
        average_similarity = 0.0

    return average_similarity

def calculate_image_similarity(features1, features2):
    if features1.size == 0 or features2.size == 0:
        return 0.0
    features1 = features1.reshape(1, -1)
    features2 = features2.reshape(1, -1)
    similarity = cosine_similarity(features1, features2)[0][0]
    return similarity

# Global variable to control the stopping of the process
stop_flag = False

# Function to stop the execution
def stop_execution():
    global stop_flag
    stop_flag = True

# Define a global list to store the result data
results_for_export = []

def calculate_deviations(input_data, extracted_data):
    deviations = {}

    # Calculate deviation for numerical fields
    def calculate_numerical_deviation(input_value, extracted_value):
        if not input_value or not extracted_value:
            return None, None  # If one of the values is missing, skip it
        
        try:
            input_value = float(input_value)
            extracted_value = float(extracted_value)
            absolute_deviation = abs(input_value - extracted_value)
            percentage_deviation = (absolute_deviation / input_value) * 100
            return absolute_deviation, percentage_deviation
        except ValueError:
            return None, None

    # Calculate deviation for each numerical field
    for field in ["Power (W)", "Lumens", "CCT (K)", "Size"]:
        if input_data.get(field) and extracted_data.get(field):
            input_value = input_data[field][0]
            extracted_value = extracted_data[field][0] if extracted_data[field] else None

            absolute_dev, percentage_dev = calculate_numerical_deviation(input_value, extracted_value)
            deviations[field] = {
                "absolute_deviation": absolute_dev,
                "percentage_deviation": percentage_dev
            }
        else:
            deviations[field] = {
                "absolute_deviation": None,
                "percentage_deviation": None
            }

    # Calculate similarity for categorical fields
    for field in ["Name", "Product Code", "SKU"]:
        if input_data.get(field) and extracted_data.get(field):
            input_value = input_data[field][0]
            best_match = process.extractOne(input_value, extracted_data[field], scorer=fuzz.token_sort_ratio)
            if best_match:
                similarity = best_match[1] / 100.0  # Normalize the score to [0, 1]
                deviations[field] = {"similarity_score": similarity}
            else:
                deviations[field] = {"similarity_score": None}
        else:
            deviations[field] = {"similarity_score": None}

    return deviations

def find_top_10_similar_pdfs_with_deviation(folder_path, input_data, input_image_path=None):
    global results_for_export
    results_for_export = []  # Clear the previous results

    pdf_scores = []
    progress_bar['maximum'] = 0
    start_time = time.time()

    for root, _, files in os.walk(folder_path):
        pdf_files = [os.path.join(root, f) for f in files if f.endswith('.pdf')]
        progress_bar['maximum'] += len(pdf_files)

        for pdf_file in pdf_files:
            pdf_text = extract_text_from_pdf(pdf_file)
            extracted_data = extract_specifications(pdf_text)
            score = calculate_similarity(input_data, extracted_data)

            lumens = ', '.join(extracted_data.get("Lumens", []))
            power = ', '.join(str(p) for p in sorted(set(extracted_data.get("Power (W)", [])), key=float))
            cct = ', '.join(sorted(set(extracted_data.get("CCT (K)", [])), key=int))
            voltage = ', '.join(str(v) for v in extracted_data.get("Voltage (V)", []))

            # Extract images and compute image similarity
            images = extract_images_from_pdf(pdf_file)

            if input_image_path:
                input_image = Image.open(input_image_path).convert('RGB')
                input_image_features = extract_image_features(input_image)

                max_image_similarity = 0
                for img in images:
                    image_features = extract_image_features(img)
                    image_similarity = calculate_image_similarity(input_image_features, image_features)
                    max_image_similarity = max(max_image_similarity, image_similarity)

                combined_score = score + max_image_similarity
            else:
                combined_score = score

            # Calculate deviations
            deviations = calculate_deviations(input_data, extracted_data)

            pdf_scores.append((pdf_file, combined_score, deviations, lumens, power, cct, voltage, images[0] if images else None))
            progress_bar['value'] += 1

            elapsed_time = time.time() - start_time
            processed_pdfs = progress_bar['value']
            total_pdfs = progress_bar['maximum']

            if processed_pdfs > 0:
                estimated_time_per_pdf = elapsed_time / processed_pdfs
                remaining_pdfs = total_pdfs - processed_pdfs
                eta = remaining_pdfs * estimated_time_per_pdf
                eta_minutes, eta_seconds = divmod(eta, 60)
                eta_formatted = f"{int(eta_minutes)} min {int(eta_seconds)} sec"
                percentage = (processed_pdfs / total_pdfs) * 100
                progress_label.config(text=f"Progress: {processed_pdfs}/{total_pdfs} ({percentage:.2f}%) | ETA: {eta_formatted}")

            window.update()

    pdf_scores = sorted(pdf_scores, key=lambda x: x[1], reverse=True)[:10]
    progress_bar['value'] = 0
    progress_label.config(text="Progress: 0%")

    result_box.delete(1.0, tk.END)
    image_references = []

    for i, (pdf_path, score, deviations, lumens, power, cct, voltage, image) in enumerate(pdf_scores):
        result_box.insert(tk.END, f"Top {i + 1}: {os.path.basename(pdf_path)} = {score:.2f}\n")

        result_box.tag_configure(f"pdf{i + 1}", foreground="blue", underline=True)
        result_box.insert(tk.END, f"{pdf_path}\n", f"pdf{i + 1}")

        if image is not None:
            desired_size = (100, 100)
            resized_image = image.resize(desired_size, Image.LANCZOS)
            img_display = ImageTk.PhotoImage(resized_image)
            image_references.append(img_display)

            frame = tk.Frame(result_box)
            img_label = tk.Label(frame, image=img_display)
            img_label.image = img_display
            img_label.grid(row=0, column=0, padx=(10, 10))

            text_label = tk.Label(frame, text=f"Lumens: {lumens if lumens else 'N/A'}\n"
                                              f"Power: {power if power else 'N/A'}\n"
                                              f"CCT: {cct if cct else 'N/A'}\n"
                                              f"Voltage: {voltage if voltage else 'N/A'}",
                                    justify=tk.LEFT, anchor='w')
            text_label.grid(row=0, column=1, padx=(10, 10))

            result_box.window_create(tk.END, window=frame)
            result_box.insert(tk.END, "\n")

        # Display Deviations
        for field, dev_data in deviations.items():
            if "absolute_deviation" in dev_data:
                abs_dev = dev_data["absolute_deviation"]
                perc_dev = dev_data["percentage_deviation"]
                result_box.insert(tk.END, f"{field} Deviation: Abs: {abs_dev:.2f}, %: {perc_dev:.2f}%\n")
            elif "similarity_score" in dev_data:
                sim_score = dev_data["similarity_score"]
                result_box.insert(tk.END, f"{field} Similarity: {sim_score * 100:.2f}%\n")

        result_box.insert(tk.END, "\n")

        # Store the result data for export
        results_for_export.append({
            "Light Name": os.path.basename(pdf_path),
            "Lumens": lumens,
            "Power": power,
            "CCT": cct,
            "Voltage": voltage,
            "PDF Link": pdf_path,
            "Deviations": deviations  # Add deviations here for export
        })

    result_box.tag_bind(f"pdf{i + 1}", "<Button-1>", lambda e, p=pdf_path: open_pdf(p))

# Open PDF
def open_pdf(pdf_path):
    webbrowser.open_new(pdf_path)

# GUI Setup
def browse_folder():
    folder_selected = filedialog.askdirectory()
    folder_entry.delete(0, tk.END)
    folder_entry.insert(0, folder_selected)

# Function to browse and select the image
def browse_image():
    image_file = filedialog.askopenfilename(filetypes=[("Image files", "*.png;*.jpg;*.jpeg")])
    if image_file:
        image_entry.delete(0, tk.END)
        image_entry.insert(0, image_file)
        display_image(image_file)  # Call function to display image

def process_pdfs():
    folder_path = folder_entry.get()
    input_image_path = image_entry.get()

    input_data = {
        "Power (W)": [power_entry.get()],
        "Lumens": [lumens_entry.get()],
        "CCT (K)": [cct_entry.get()],
        "Size": [size_entry.get()],
        "Name": [name_entry.get()],
        "Product Code": [product_code_entry.get()],
        "SKU": [sku_entry.get()],
    }

    if not folder_path:
        messagebox.showerror("Error", "Please select a folder.")
        return

    find_top_10_similar_pdfs(folder_path, input_data, input_image_path)

# Create the main window
window = TkinterDnD.Tk()
window.title("Product Finder")
window.geometry("1000x600")  # Increased window width for image display
window.configure(bg="#f2f2f2")  # Light grey background

# Allow the window to be resizable
window.columnconfigure(0, weight=1)
window.columnconfigure(1, weight=3)
window.columnconfigure(2, weight=1)
window.columnconfigure(3, weight=1)
window.rowconfigure(11, weight=1)

# Global font settings
title_font = ("Helvetica", 16, "bold")
label_font = ("Helvetica", 12)
entry_font = ("Helvetica", 10)

# Styling for the buttons
button_style = {
    "bg": "#4CAF50",
    "fg": "white",
    "font": ("Helvetica", 10, "bold"),
    "activebackground": "#45a049",
    "relief": tk.RAISED
}

# Frame for title labels
title_frame = tk.Frame(window, bg="#f2f2f2")
title_frame.grid(row=0, column=1, columnspan=2, pady=20, sticky="ew")

# Title Label for "Product Finder"
title_label_cv = tk.Label(title_frame, text="Product Finder", font=title_font, bg="#f2f2f2")
title_label_cv.grid(row=0, column=1, sticky="w")  # Align to the west (left)

# Frame for folder and image selection
selection_frame = tk.Frame(window, bg="#f2f2f2")
selection_frame.grid(row=1, column=0, columnspan=4, padx=455, pady=10, sticky="ew")

# Folder selection
folder_label = tk.Label(selection_frame, text="Select Folder:", font=label_font, bg="#f2f2f2")
folder_label.grid(row=0, column=0, padx=(50, 10), pady=10, sticky=tk.W)  # Keep the padx as is
folder_entry = tk.Entry(selection_frame, width=50, font=entry_font)
folder_entry.grid(row=0, column=1, padx=10, pady=10, sticky="ew")
folder_button = tk.Button(selection_frame, text="Browse", command=lambda: browse_folder(), **button_style)
folder_button.grid(row=0, column=2, padx=(0, 10), pady=10)  # Remove padx to keep it right next to the entry

# Image selection
image_label = tk.Label(selection_frame, text="Input Image:", font=label_font, bg="#f2f2f2")
image_label.grid(row=1, column=0, padx=(50, 10), pady=10, sticky=tk.W)  # Keep the padx as is
image_entry = tk.Entry(selection_frame, width=50, font=entry_font)
image_entry.grid(row=1, column=1, padx=10, pady=10, sticky="ew")
image_button = tk.Button(selection_frame, text="Browse", command=lambda: browse_image(), **button_style)
image_button.grid(row=1, column=2, padx=(0, 10), pady=10)  # Remove padx to keep it right next to the entry

# Create a frame for the centered labels and entry fields
centered_input_frame = tk.Frame(window, bg="#f2f2f2")
centered_input_frame.grid(row=2, column=1, padx=10, pady=10, sticky="nsew")  # Position it in the grid

# Centered input fields for specifications
centered_labels = ["Power (W):", "Lumens (lm):", "CCT (K):", "Size:"]
centered_entries = []

for i, label_text in enumerate(centered_labels):
    label = tk.Label(centered_input_frame, text=label_text, font=label_font, bg="#f2f2f2")
    label.grid(row=i, column=0, padx=(50, 10), pady=5, sticky="e")  # Increased left padx to 50 for more right shift
    entry = tk.Entry(centered_input_frame, width=20, font=entry_font)
    entry.grid(row=i, column=1, padx=10, pady=5, sticky="ew")  # Align entries to expand in the center
    centered_entries.append(entry)

# Create a frame for the right-aligned labels and entry fields
right_input_frame = tk.Frame(window, bg="#f2f2f2")
right_input_frame.grid(row=2, column=2, padx=10, pady=10, sticky="nsew")  # Position it in the grid

# Right-aligned input fields for specifications
right_labels = ["Name:", "Product Code:", "SKU:"]
right_entries = []

for i, label_text in enumerate(right_labels):
    label = tk.Label(right_input_frame, text=label_text, font=label_font, bg="#f2f2f2")
    label.grid(row=i, column=0, padx=10, pady=5, sticky="e")  # Align labels to the east (right)
    entry = tk.Entry(right_input_frame, width=20, font=entry_font)
    entry.grid(row=i, column=1, padx=10, pady=5, sticky="ew")  # Align entries to expand in the center
    right_entries.append(entry)

# Accessing entries if needed
power_entry, lumens_entry, cct_entry, size_entry = centered_entries
name_entry, product_code_entry, sku_entry = right_entries

# Frame for image display (Fixed size to avoid layout shifts)
image_frame = tk.Frame(window, bg="#f2f2f2", width=180, height=180)  # Set fixed width and height for the frame
image_frame.grid(row=1, column=0, padx=10, pady=(0, 10), sticky=tk.NW)
image_frame.grid_propagate(False)  # Prevent frame from resizing based on content

# Image display label
image_display_label = tk.Label(image_frame, text="Image Preview", font=label_font, bg="#f2f2f2")
image_display_label.grid(row=0, column=0, padx=10, pady=(10, 5), sticky=tk.NW)  # Align at top-left of the left side

# Fixed-size image canvas
image_canvas = tk.Label(image_frame, bg="#f2f2f2", width=180, height=180)  # Set fixed size for the canvas
image_canvas.grid(row=1, column=0, padx=10, pady=(0, 20), sticky=tk.NW)

# Create a frame for the progress bar
progress_frame = tk.Frame(window, bg="#f2f2f2")
progress_frame.grid(row=12, column=0, columnspan=4, padx=10, pady=(20, 10), sticky="ew")  # Adjusted pady
progress_bar = ttk.Progressbar(progress_frame, orient="horizontal", length=1100, mode="determinate")
progress_bar.pack(side=tk.LEFT, padx=10, pady=10)
progress_label = tk.Label(progress_frame, text="Progress: 0%", font=label_font, bg="#f2f2f2")
progress_label.pack(side=tk.LEFT, padx=10, pady=10)

# Create a frame for the result box and image display
result_frame = tk.Frame(window, bg="#f2f2f2")
result_frame.grid(row=13, column=0, columnspan=4, padx=10, pady=10, sticky="ew")

# Create a text widget for displaying PDF results and image previews
result_box = tk.Text(result_frame, height=10, width=100, wrap=tk.WORD)
result_box.pack(side=tk.LEFT, padx=10, pady=10, fill=tk.BOTH, expand=True)

# Scrollbar for the result box
scrollbar = ttk.Scrollbar(result_frame, orient=tk.VERTICAL, command=result_box.yview)
scrollbar.pack(side=tk.RIGHT, fill=tk.Y)
result_box.config(yscrollcommand=scrollbar.set)

# Create a frame for the buttons to keep them in a single row
button_frame = tk.Frame(window, bg="#f2f2f2")
button_frame.grid(row=14, column=0, columnspan=4, padx=10, pady=(10, 20), sticky="ew")  # Adjusted pady
submit_button = tk.Button(button_frame, text="Find Top PDF's", command=lambda: process_pdfs(), **button_style)
submit_button.config(width=25)  # Set a reduced width for the button
submit_button.pack(side=tk.LEFT, padx=5)  # Pack the button to the left side of the frame

# Function to clear all input fields and reset image display
def clear_sections():
    # Clear all input entries
    for entry in centered_entries + right_entries:
        entry.delete(0, tk.END)
    # Clear result box and image display
    result_box.delete(1.0, tk.END)
    image_canvas.config(image='')

# Function to export results to Excel
def export_to_excel():
    if not results_for_export:
        messagebox.showerror("Error", "No valid results found to export.")
        return

    # Prompt the user to save the Excel file
    file_path = filedialog.asksaveasfilename(defaultextension=".xlsx",
                                             filetypes=[("Excel files", "*.xlsx"), ("All files", "*.*")])

    if file_path:
        # Create a new Excel workbook and add a worksheet
        workbook = openpyxl.Workbook()
        sheet = workbook.active
        sheet.title = "Top 10 Results"

        # Write the headers (including new "PDF Links" column)
        headers = ["Results", "Light Name", "Lumens", "Power", "CCT", "Voltage", "PDF Links"]
        sheet.append(headers)

        # Write the results to the Excel sheet
        for i, result in enumerate(results_for_export):
            print(f"Writing to Excel: {result}")
            sheet.append([
                f"Top {i + 1}",  # "Results"
                result["Light Name"],  # "Light Name"
                result["Lumens"],  # "Lumens"
                result["Power"],  # "Power"
                result["CCT"],  # "CCT"
                result["Voltage"],  # "Voltage"
                result["PDF Link"]  # "PDF Links" (new column for PDF link)
            ])

        # Save the workbook
        workbook.save(file_path)
        messagebox.showinfo("Success", f"Results exported to {file_path}")

# Function to extract values from the result box
def extract_values_from_result_box():
    lines = result_box.get("1.0", tk.END).splitlines()
    result_data = []

    i = 0
    while i < len(lines):
        if lines[i].startswith("Top"):
            light_name = lines[i + 1] if i + 1 < len(lines) else 'N/A'  # Extract the light name
            pdf_link = lines[i + 2] if i + 2 < len(lines) else 'N/A'    # Extract the PDF link

            # Extract lumens, power, CCT, and voltage if they exist, otherwise assign 'N/A'
            lumens = lines[i + 4].split(": ")[1] if i + 4 < len(lines) and "Lumens:" in lines[i + 4] else 'N/A'
            power = lines[i + 5].split(": ")[1] if i + 5 < len(lines) and "Power:" in lines[i + 5] else 'N/A'
            cct = lines[i + 6].split(": ")[1] if i + 6 < len(lines) and "CCT:" in lines[i + 6] else 'N/A'
            voltage = lines[i + 7].split(": ")[1] if i + 7 < len(lines) and "Voltage:" in lines[i + 7] else 'N/A'

            result_data.append((light_name, lumens, power, cct, voltage, pdf_link))
            i += 8  # Move to the next block of results
        else:
            i += 1

    return result_data

def display_image(image_path):
    try:
        # Open the image file
        img = Image.open(image_path)
        
        # Define the fixed width and height for the image display area
        fixed_width = 150
        fixed_height = 150
        
        # Resize the image to exactly match the fixed dimensions (ignoring aspect ratio)
        img_resized = img.resize((fixed_width, fixed_height))
        
        # Convert the image to a format Tkinter can handle
        img_tk = ImageTk.PhotoImage(img_resized)
        
        # Display the resized image in the fixed canvas without changing its size
        image_canvas.config(image=img_tk, width=fixed_width, height=fixed_height)
        image_canvas.image = img_tk  # Store a reference to avoid garbage collection
    except Exception as e:
        print(f"Error opening image: {e}")

# Function for handling the drop of files (drag-and-drop)
def drop(event):
    image_path = event.data  # Get the dropped file path
    
    # Clean up the file path
    if image_path.startswith('{') and image_path.endswith('}'):
        image_path = image_path[1:-1]  # Remove the curly braces
    
    if image_path:
        image_entry.delete(0, tk.END)
        image_entry.insert(0, image_path)
        display_image(image_path)

# Enable drag-and-drop for the main window
window.drop_target_register(DND_FILES)  # Register the main window for drag-and-drop of files
window.dnd_bind('<<Drop>>', drop)  # Bind the drop event to the drop function

clear_button = tk.Button(button_frame, text="Clear Sections", command=clear_sections, **button_style)
clear_button.config(width=15)  # Set a width for the button
clear_button.pack(side=tk.LEFT, padx=10)  # Added padding to separate the buttons

# Stop execution button
stop_button = tk.Button(button_frame, text="Stop Execution", command=lambda: stop_execution(), **button_style)
stop_button.config(width=15)  # Set a width for the button
stop_button.pack(side=tk.LEFT, padx=10)

# Add the "Export" button in the existing button_frame
export_button = tk.Button(button_frame, text="Export", command=export_to_excel, **button_style)
export_button.config(width=25)
export_button.pack(side=tk.LEFT, padx=5)  # Pack the button next to the stop button

# Main loop
window.mainloop()

Writing to Excel: {'Light Name': 'Delphi_PL_1x4FT_403020W_504035K_DLC_TDS.pdf', 'Lumens': '2500, 3750, 5000', 'Power': '18.0, 20.0, 30.0, 40.0', 'CCT': '3500, 4000, 5000', 'Voltage': '120-277', 'PDF Link': 'D:/Cross Search Automation/Previous Cross/IKIO Lights\\Delphi_PL_1x4FT_403020W_504035K_DLC_TDS.pdf'}
Writing to Excel: {'Light Name': 'Delphi_PL_2x2FT_403020W_504035K_DLC_TDS.pdf', 'Lumens': '2769, 4012.2, 5275', 'Power': '18.0, 20.0, 30.0, 40.0', 'CCT': '3500, 4000, 5000', 'Voltage': '120-277', 'PDF Link': 'D:/Cross Search Automation/Previous Cross/IKIO Lights\\Delphi_PL_2x2FT_403020W_504035K_DLC_TDS.pdf'}
Writing to Excel: {'Light Name': 'Delphi_PL_2x4FT_504030W_504035K_DLC_TDS.pdf', 'Lumens': '4122, 5215.2, 6443', 'Power': '18.0, 30.0, 40.0, 50.0', 'CCT': '3500, 4000, 5000', 'Voltage': '120-277', 'PDF Link': 'D:/Cross Search Automation/Previous Cross/IKIO Lights\\Delphi_PL_2x4FT_504030W_504035K_DLC_TDS.pdf'}
Writing to Excel: {'Light Name': 'Novus_FL_TDS.pdf', 'Lumens': '1875, 37