## Specific Text based Search

In [None]:
import PyPDF2
import re
import os

# Step 1: Extract Text from PDF
def extract_text_from_pdf(pdf_path):
    """Read every page of a PDF and return its cleaned text.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages, passed through ``clean_text``.
    """
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        page_texts = [page.extract_text() for page in reader.pages]
    # Pages for which extract_text() returned nothing are skipped. The pages
    # are joined with a newline (collapsed to a space by clean_text) so the
    # last token of one page cannot fuse with the first token of the next,
    # e.g. "12" + "0 W" turning into "120 W".
    text = "\n".join(chunk for chunk in page_texts if chunk)
    return clean_text(text)

# Step 2: Clean the extracted text by removing unnecessary formatting
def clean_text(text):
    """Normalise extracted PDF text for regex matching.

    Collapses every run of whitespace (spaces, tabs, newlines) into a single
    space and drops non-ASCII characters, except the degree sign, which the
    beam-angle pattern in ``extract_specifications`` looks for.

    Args:
        text: Raw text extracted from a PDF.

    Returns:
        The cleaned text with leading and trailing whitespace removed.
    """
    # \s already covers newlines, so one substitution handles all whitespace.
    text = re.sub(r'\s+', ' ', text)
    # Keep U+00B0 (degree sign); stripping it made beam angles unmatchable.
    text = re.sub(r'[^\x00-\x7F\u00B0]+', '', text)
    return text.strip()

# Step 3: Extract specific data (Power, Voltage, Current, Lumens, Efficacy, CCT, Beam Angle) using regex
def extract_specifications(text):
    """Pull lighting specification values out of datasheet text with regexes.

    Args:
        text: Text to scan, normally the output of ``clean_text``.

    Returns:
        A dict keyed by specification label ("Power (W)", "Voltage (V)",
        "Current (A)", "Lumens", "Efficacy (lm/W)", "CCT (K)",
        "Beam Angles (°)"). Each value is a list of the distinct matched
        strings sorted numerically; voltage entries are either a single
        value ("277") or a range ("120-277") ordered by the lower bound.
    """
    power_pattern = r'\b(\d{1,4})\s*W\b'  # '50 W'
    voltage_pattern = r'(\d{2,3})\s*-\s*(\d{2,3})\s*V|\b(\d{1,3})\s*V\b'  # '120-277 V' or '277 V'
    current_pattern = r'(\d{1,3}\.\d{1,3})\s*A'  # '0.42 A' (decimal values only)
    lumens_pattern = r'(\d{3,6}(\.\d+)?)\s*Lumens?\b'  # '5000 Lumens'
    efficacy_pattern = r'\b(\d{2,4}\.\d+|\d{2,4})\s*lm/W\b'  # '130 lm/W'
    cct_pattern = r'\b(\d{4})\s*K\b'  # '4000 K'
    # No trailing \b: '°' is a non-word character, so a \b right after it only
    # matches when a letter or digit follows, which rejected ordinary text
    # such as '60° beam'. The lookahead skips temperatures like '25°C'.
    beam_angle_pattern = r'\b(\d{1,3})\s*°(?!\s*[CFcf]\b)'

    power = re.findall(power_pattern, text)
    voltage = re.findall(voltage_pattern, text)
    current = re.findall(current_pattern, text)
    lumens = re.findall(lumens_pattern, text)
    efficacy = re.findall(efficacy_pattern, text)
    cct = re.findall(cct_pattern, text)
    beam_angles = re.findall(beam_angle_pattern, text)

    # findall yields (low, high, single) tuples for the voltage pattern: the
    # first two groups are filled for a range, the third for a lone value.
    voltage_ranges = [
        f"{low}-{high}" if low and high else single
        for low, high, single in voltage
        if (low and high) or single
    ]

    # Sets drop duplicates before the numeric sort.
    return {
        "Power (W)": sorted(set(power), key=int),
        "Voltage (V)": sorted(set(voltage_ranges), key=lambda v: int(v.split('-')[0])),
        "Current (A)": sorted(set(current), key=float),
        # The lumens pattern has two groups; the first holds the full number.
        "Lumens": sorted({match[0] for match in lumens}, key=float),
        "Efficacy (lm/W)": sorted(set(efficacy), key=float),
        "CCT (K)": sorted(set(cct), key=int),
        "Beam Angles (°)": sorted(set(beam_angles), key=int),
    }

# Step 4: Compare specifications for similarity
def calculate_similarity(input_data, pdf_data):
    """Score how many specification categories of a PDF match the query.

    Args:
        input_data: Dict mapping a specification label to the list of values
            the user asked for.
        pdf_data: Dict of the same shape, as returned by
            ``extract_specifications``.

    Returns:
        The number of categories (0-5) in which at least one requested value
        appears among the PDF's extracted values.
    """
    categories = (
        "Power (W)",
        "Voltage (V)",
        "Current (A)",
        "Efficacy (lm/W)",
        "CCT (K)",
    )
    score = 0
    for category in categories:
        available = set(pdf_data.get(category, ()))
        # Values typed as "50, 60" arrive as "50" and " 60"; compare them
        # without the surrounding whitespace so the second one can match.
        requested = (str(value).strip() for value in input_data.get(category, ()))
        if any(value in available for value in requested):
            score += 1
    return score

# Step 5: Process a single PDF
def process_single_pdf(pdf_path):
    """Return the specification dict extracted from the PDF at ``pdf_path``.

    The file's text is read with ``extract_text_from_pdf`` and then parsed
    with ``extract_specifications``.
    """
    return extract_specifications(extract_text_from_pdf(pdf_path))

# Step 6: Process all PDFs in folder and find the most similar
def find_most_similar_pdf(folder_path, input_data):
    """Find the PDF in a folder whose specifications best match the query.

    Args:
        folder_path: Directory to scan (not recursive).
        input_data: Requested specification values, as expected by
            ``calculate_similarity``.

    Returns:
        A ``(best_match, highest_score)`` tuple. ``best_match`` is the path of
        the highest-scoring PDF, or ``None`` (with a score of -1) when the
        folder contains no PDF files.
    """
    # Match the extension case-insensitively so 'DATASHEET.PDF' is not
    # skipped, and sort the listing so ties are resolved the same way on
    # every run (os.listdir returns entries in arbitrary order).
    pdf_files = [
        os.path.join(folder_path, name)
        for name in sorted(os.listdir(folder_path))
        if name.lower().endswith('.pdf')
    ]

    best_match = None
    highest_score = -1

    for pdf_file in pdf_files:
        score = calculate_similarity(input_data, process_single_pdf(pdf_file))
        # Strict '>' keeps the first file among equally scored ones.
        if score > highest_score:
            highest_score = score
            best_match = pdf_file

    return best_match, highest_score

# Step 7: User input and execution
def get_multiple_inputs(prompt):
    """Prompt for a comma-separated list and return its individual values.

    Args:
        prompt: Text shown to the user.

    Returns:
        A list of the entered values with surrounding whitespace removed;
        empty items (including a completely blank answer) are dropped.
    """
    # Stripping each item lets "50, 60" yield "60" rather than " 60", which
    # could never equal a value extracted from a PDF.
    return [item.strip() for item in input(prompt).split(',') if item.strip()]

# Ask for the requested values of each specification, in this order.
input_data = {
    label: get_multiple_inputs(prompt)
    for label, prompt in (
        ("Power (W)", "Enter Power (W) values: "),
        ("Voltage (V)", "Enter Voltage (V) values: "),
        ("Current (A)", "Enter Current (A) values: "),
        ("Efficacy (lm/W)", "Enter Efficacy (lm/W) values: "),
        ("CCT (K)", "Enter CCT (K) values: "),
    )
}

folder_path = input("Enter folder path: ").strip()

best_match, score = find_most_similar_pdf(folder_path, input_data)

# Report the winning file, or the fallback message when nothing was returned.
print(
    f"Most similar PDF: {best_match} with a score of {score}"
    if best_match
    else "No similar PDFs found."
)

In [None]:
import PyPDF2
import re
import os

# Step 1: Extract Text from PDF
def extract_text_from_pdf(pdf_path):
    """Read every page of a PDF and return its cleaned text.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages, passed through ``clean_text``.
    """
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        page_texts = [page.extract_text() for page in reader.pages]
    # Pages for which extract_text() returned nothing are skipped. The pages
    # are joined with a newline (collapsed to a space by clean_text) so the
    # last token of one page cannot fuse with the first token of the next,
    # e.g. "12" + "0 W" turning into "120 W".
    text = "\n".join(chunk for chunk in page_texts if chunk)
    return clean_text(text)

# Step 2: Clean the extracted text by removing unnecessary formatting
def clean_text(text):
    """Normalise extracted PDF text for regex matching.

    Collapses every run of whitespace (spaces, tabs, newlines) into a single
    space and drops non-ASCII characters, except the degree sign, which the
    beam-angle pattern in ``extract_specifications`` looks for.

    Args:
        text: Raw text extracted from a PDF.

    Returns:
        The cleaned text with leading and trailing whitespace removed.
    """
    # \s already covers newlines, so one substitution handles all whitespace.
    text = re.sub(r'\s+', ' ', text)
    # Keep U+00B0 (degree sign); stripping it made beam angles unmatchable.
    text = re.sub(r'[^\x00-\x7F\u00B0]+', '', text)
    return text.strip()

# Step 3: Extract specific data (Power, Voltage, Current, Lumens, Efficacy, CCT, Beam Angle) using regex
def extract_specifications(text):
    """Pull lighting specification values out of datasheet text with regexes.

    Args:
        text: Text to scan, normally the output of ``clean_text``.

    Returns:
        A dict keyed by specification label ("Power (W)", "Voltage (V)",
        "Current (A)", "Lumens", "Efficacy (lm/W)", "CCT (K)",
        "Beam Angles (°)"). Each value is a list of the distinct matched
        strings sorted numerically; voltage entries are either a single
        value ("277") or a range ("120-277") ordered by the lower bound.
    """
    power_pattern = r'\b(\d{1,4})\s*W\b'  # '50 W'
    voltage_pattern = r'(\d{2,3})\s*-\s*(\d{2,3})\s*V|\b(\d{1,3})\s*V\b'  # '120-277 V' or '277 V'
    current_pattern = r'(\d{1,3}\.\d{1,3})\s*A'  # '0.42 A' (decimal values only)
    lumens_pattern = r'(\d{3,6}(\.\d+)?)\s*Lumens?\b'  # '5000 Lumens'
    efficacy_pattern = r'\b(\d{2,4}\.\d+|\d{2,4})\s*lm/W\b'  # '130 lm/W'
    cct_pattern = r'\b(\d{4})\s*K\b'  # '4000 K'
    # No trailing \b: '°' is a non-word character, so a \b right after it only
    # matches when a letter or digit follows, which rejected ordinary text
    # such as '60° beam'. The lookahead skips temperatures like '25°C'.
    beam_angle_pattern = r'\b(\d{1,3})\s*°(?!\s*[CFcf]\b)'

    power = re.findall(power_pattern, text)
    voltage = re.findall(voltage_pattern, text)
    current = re.findall(current_pattern, text)
    lumens = re.findall(lumens_pattern, text)
    efficacy = re.findall(efficacy_pattern, text)
    cct = re.findall(cct_pattern, text)
    beam_angles = re.findall(beam_angle_pattern, text)

    # findall yields (low, high, single) tuples for the voltage pattern: the
    # first two groups are filled for a range, the third for a lone value.
    voltage_ranges = [
        f"{low}-{high}" if low and high else single
        for low, high, single in voltage
        if (low and high) or single
    ]

    # Sets drop duplicates before the numeric sort.
    return {
        "Power (W)": sorted(set(power), key=int),
        "Voltage (V)": sorted(set(voltage_ranges), key=lambda v: int(v.split('-')[0])),
        "Current (A)": sorted(set(current), key=float),
        # The lumens pattern has two groups; the first holds the full number.
        "Lumens": sorted({match[0] for match in lumens}, key=float),
        "Efficacy (lm/W)": sorted(set(efficacy), key=float),
        "CCT (K)": sorted(set(cct), key=int),
        "Beam Angles (°)": sorted(set(beam_angles), key=int),
    }

# Step 4: Compare specifications for similarity
def calculate_similarity(input_data, pdf_data):
    """Score how many specification categories of a PDF match the query.

    Args:
        input_data: Dict mapping a specification label to the list of values
            the user asked for.
        pdf_data: Dict of the same shape, as returned by
            ``extract_specifications``.

    Returns:
        The number of categories (0-7) in which at least one requested value
        appears among the PDF's extracted values.
    """
    categories = (
        "Power (W)",
        "Voltage (V)",
        "Current (A)",
        "Efficacy (lm/W)",
        "CCT (K)",
        "Lumens",
        "Beam Angles (°)",
    )
    score = 0
    for category in categories:
        available = set(pdf_data.get(category, ()))
        # Values typed as "50, 60" arrive as "50" and " 60"; compare them
        # without the surrounding whitespace so the second one can match.
        requested = (str(value).strip() for value in input_data.get(category, ()))
        if any(value in available for value in requested):
            score += 1
    return score

# Step 5: Process a single PDF
def process_single_pdf(pdf_path):
    """Return the specification dict extracted from the PDF at ``pdf_path``.

    The file's text is read with ``extract_text_from_pdf`` and then parsed
    with ``extract_specifications``.
    """
    return extract_specifications(extract_text_from_pdf(pdf_path))

# Step 6: Process all PDFs in folder and find the most similar
def find_most_similar_pdf(folder_path, input_data):
    """Find the PDF in a folder whose specifications best match the query.

    Args:
        folder_path: Directory to scan (not recursive).
        input_data: Requested specification values, as expected by
            ``calculate_similarity``.

    Returns:
        A ``(best_match, highest_score)`` tuple. ``best_match`` is the path of
        the highest-scoring PDF, or ``None`` (with a score of -1) when the
        folder contains no PDF files.
    """
    # Match the extension case-insensitively so 'DATASHEET.PDF' is not
    # skipped, and sort the listing so ties are resolved the same way on
    # every run (os.listdir returns entries in arbitrary order).
    pdf_files = [
        os.path.join(folder_path, name)
        for name in sorted(os.listdir(folder_path))
        if name.lower().endswith('.pdf')
    ]

    best_match = None
    highest_score = -1

    for pdf_file in pdf_files:
        score = calculate_similarity(input_data, process_single_pdf(pdf_file))
        # Strict '>' keeps the first file among equally scored ones.
        if score > highest_score:
            highest_score = score
            best_match = pdf_file

    return best_match, highest_score

# Step 7: User input and execution
def get_multiple_inputs(prompt):
    """Prompt for a comma-separated list and return its individual values.

    Args:
        prompt: Text shown to the user.

    Returns:
        A list of the entered values with surrounding whitespace removed;
        empty items (including a completely blank answer) are dropped.
    """
    # Stripping each item lets "50, 60" yield "60" rather than " 60", which
    # could never equal a value extracted from a PDF.
    return [item.strip() for item in input(prompt).split(',') if item.strip()]

# Ask for the requested values of each specification, in this order.
input_data = {
    label: get_multiple_inputs(prompt)
    for label, prompt in (
        ("Power (W)", "Enter Power (W) values: "),
        ("Voltage (V)", "Enter Voltage (V) values: "),
        ("Current (A)", "Enter Current (A) values: "),
        ("Efficacy (lm/W)", "Enter Efficacy (lm/W) values: "),
        ("CCT (K)", "Enter CCT (K) values: "),
        ("Lumens", "Enter Lumens values: "),
        ("Beam Angles (°)", "Enter Beam Angle values: "),
    )
}

folder_path = input("Enter folder path: ").strip()

best_match, score = find_most_similar_pdf(folder_path, input_data)

# Report the winning file, or the fallback message when nothing was returned.
print(
    f"Most similar PDF: {best_match} with a score of {score}"
    if best_match
    else "No similar PDFs found."
)

In [None]:
import PyPDF2
import re
import os
from tqdm import tqdm  # Import tqdm for the progress bar

# Step 1: Extract Text from PDF
def extract_text_from_pdf(pdf_path):
    """Read every page of a PDF and return its cleaned text.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages, passed through ``clean_text``.
    """
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        page_texts = [page.extract_text() for page in reader.pages]
    # Pages for which extract_text() returned nothing are skipped. The pages
    # are joined with a newline (collapsed to a space by clean_text) so the
    # last token of one page cannot fuse with the first token of the next,
    # e.g. "12" + "0 W" turning into "120 W".
    text = "\n".join(chunk for chunk in page_texts if chunk)
    return clean_text(text)

# Step 2: Clean the extracted text by removing unnecessary formatting
def clean_text(text):
    """Normalise extracted PDF text for regex matching.

    Collapses every run of whitespace (spaces, tabs, newlines) into a single
    space and drops non-ASCII characters, except the degree sign, which the
    beam-angle pattern in ``extract_specifications`` looks for.

    Args:
        text: Raw text extracted from a PDF.

    Returns:
        The cleaned text with leading and trailing whitespace removed.
    """
    # \s already covers newlines, so one substitution handles all whitespace.
    text = re.sub(r'\s+', ' ', text)
    # Keep U+00B0 (degree sign); stripping it made beam angles unmatchable.
    text = re.sub(r'[^\x00-\x7F\u00B0]+', '', text)
    return text.strip()

# Step 3: Extract specific data (Power, Voltage, Current, Lumens, Efficacy, CCT, Beam Angle) using regex
def extract_specifications(text):
    """Pull lighting specification values out of datasheet text with regexes.

    Args:
        text: Text to scan, normally the output of ``clean_text``.

    Returns:
        A dict keyed by specification label ("Power (W)", "Voltage (V)",
        "Current (A)", "Lumens", "Efficacy (lm/W)", "CCT (K)",
        "Beam Angles (°)"). Each value is a list of the distinct matched
        strings sorted numerically; voltage entries are either a single
        value ("277") or a range ("120-277") ordered by the lower bound.
    """
    power_pattern = r'\b(\d{1,4})\s*W\b'  # '50 W'
    voltage_pattern = r'(\d{2,3})\s*-\s*(\d{2,3})\s*V|\b(\d{1,3})\s*V\b'  # '120-277 V' or '277 V'
    current_pattern = r'(\d{1,3}\.\d{1,3})\s*A'  # '0.42 A' (decimal values only)
    lumens_pattern = r'(\d{3,6}(\.\d+)?)\s*Lumens?\b'  # '5000 Lumens'
    efficacy_pattern = r'\b(\d{2,4}\.\d+|\d{2,4})\s*lm/W\b'  # '130 lm/W'
    cct_pattern = r'\b(\d{4})\s*K\b'  # '4000 K'
    # No trailing \b: '°' is a non-word character, so a \b right after it only
    # matches when a letter or digit follows, which rejected ordinary text
    # such as '60° beam'. The lookahead skips temperatures like '25°C'.
    beam_angle_pattern = r'\b(\d{1,3})\s*°(?!\s*[CFcf]\b)'

    power = re.findall(power_pattern, text)
    voltage = re.findall(voltage_pattern, text)
    current = re.findall(current_pattern, text)
    lumens = re.findall(lumens_pattern, text)
    efficacy = re.findall(efficacy_pattern, text)
    cct = re.findall(cct_pattern, text)
    beam_angles = re.findall(beam_angle_pattern, text)

    # findall yields (low, high, single) tuples for the voltage pattern: the
    # first two groups are filled for a range, the third for a lone value.
    voltage_ranges = [
        f"{low}-{high}" if low and high else single
        for low, high, single in voltage
        if (low and high) or single
    ]

    # Sets drop duplicates before the numeric sort.
    return {
        "Power (W)": sorted(set(power), key=int),
        "Voltage (V)": sorted(set(voltage_ranges), key=lambda v: int(v.split('-')[0])),
        "Current (A)": sorted(set(current), key=float),
        # The lumens pattern has two groups; the first holds the full number.
        "Lumens": sorted({match[0] for match in lumens}, key=float),
        "Efficacy (lm/W)": sorted(set(efficacy), key=float),
        "CCT (K)": sorted(set(cct), key=int),
        "Beam Angles (°)": sorted(set(beam_angles), key=int),
    }

# Step 4: Compare specifications for similarity
def calculate_similarity(input_data, pdf_data):
    """Score how many specification categories of a PDF match the query.

    Args:
        input_data: Dict mapping a specification label to the list of values
            the user asked for.
        pdf_data: Dict of the same shape, as returned by
            ``extract_specifications``.

    Returns:
        The number of categories (0-7) in which at least one requested value
        appears among the PDF's extracted values.
    """
    categories = (
        "Power (W)",
        "Voltage (V)",
        "Current (A)",
        "Efficacy (lm/W)",
        "CCT (K)",
        "Lumens",
        "Beam Angles (°)",
    )
    score = 0
    for category in categories:
        available = set(pdf_data.get(category, ()))
        # Values typed as "50, 60" arrive as "50" and " 60"; compare them
        # without the surrounding whitespace so the second one can match.
        requested = (str(value).strip() for value in input_data.get(category, ()))
        if any(value in available for value in requested):
            score += 1
    return score

# Step 5: Process a single PDF
def process_single_pdf(pdf_path):
    """Return the specification dict extracted from the PDF at ``pdf_path``.

    The file's text is read with ``extract_text_from_pdf`` and then parsed
    with ``extract_specifications``.
    """
    return extract_specifications(extract_text_from_pdf(pdf_path))

# Step 6: Process all PDFs in folder and find the most similar
def find_most_similar_pdf(folder_path, input_data):
    """Find the PDF in a folder whose specifications best match the query.

    A tqdm progress bar is shown while the files are processed.

    Args:
        folder_path: Directory to scan (not recursive).
        input_data: Requested specification values, as expected by
            ``calculate_similarity``.

    Returns:
        A ``(best_match, highest_score)`` tuple. ``best_match`` is the path of
        the highest-scoring PDF, or ``None`` (with a score of -1) when the
        folder contains no PDF files.
    """
    # Match the extension case-insensitively so 'DATASHEET.PDF' is not
    # skipped, and sort the listing so ties are resolved the same way on
    # every run (os.listdir returns entries in arbitrary order).
    pdf_files = [
        os.path.join(folder_path, name)
        for name in sorted(os.listdir(folder_path))
        if name.lower().endswith('.pdf')
    ]

    best_match = None
    highest_score = -1

    for pdf_file in tqdm(pdf_files, desc="Processing PDFs", unit="file"):
        score = calculate_similarity(input_data, process_single_pdf(pdf_file))
        # Strict '>' keeps the first file among equally scored ones.
        if score > highest_score:
            highest_score = score
            best_match = pdf_file

    return best_match, highest_score

# Step 7: User input and execution
def get_multiple_inputs(prompt):
    """Prompt for a comma-separated list and return its individual values.

    Args:
        prompt: Text shown to the user.

    Returns:
        A list of the entered values with surrounding whitespace removed;
        empty items (including a completely blank answer) are dropped.
    """
    # Stripping each item lets "50, 60" yield "60" rather than " 60", which
    # could never equal a value extracted from a PDF.
    return [item.strip() for item in input(prompt).split(',') if item.strip()]

# Ask for the requested values of each specification, in this order.
input_data = {
    label: get_multiple_inputs(prompt)
    for label, prompt in (
        ("Power (W)", "Enter Power (W) values: "),
        ("Voltage (V)", "Enter Voltage (V) values: "),
        ("Current (A)", "Enter Current (A) values: "),
        ("Efficacy (lm/W)", "Enter Efficacy (lm/W) values: "),
        ("CCT (K)", "Enter CCT (K) values: "),
        ("Lumens", "Enter Lumens values: "),
        ("Beam Angles (°)", "Enter Beam Angle values: "),
    )
}

folder_path = input("Enter folder path: ").strip()

best_match, score = find_most_similar_pdf(folder_path, input_data)

# Report the winning file, or the fallback message when nothing was returned.
print(
    f"Most similar PDF: {best_match}"
    if best_match
    else "No similar PDFs found."
)

## Top 10

In [None]:
import PyPDF2
import re
import os
from tqdm import tqdm  # Import tqdm for the progress bar

# Step 1: Extract Text from PDF
def extract_text_from_pdf(pdf_path):
    """Read every page of a PDF and return its cleaned text.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages, passed through ``clean_text``.
    """
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        page_texts = [page.extract_text() for page in reader.pages]
    # Pages for which extract_text() returned nothing are skipped. The pages
    # are joined with a newline (collapsed to a space by clean_text) so the
    # last token of one page cannot fuse with the first token of the next,
    # e.g. "12" + "0 W" turning into "120 W".
    text = "\n".join(chunk for chunk in page_texts if chunk)
    return clean_text(text)

# Step 2: Clean the extracted text by removing unnecessary formatting
def clean_text(text):
    """Normalise extracted PDF text for regex matching.

    Collapses every run of whitespace (spaces, tabs, newlines) into a single
    space and drops non-ASCII characters, except the degree sign, which the
    beam-angle pattern in ``extract_specifications`` looks for.

    Args:
        text: Raw text extracted from a PDF.

    Returns:
        The cleaned text with leading and trailing whitespace removed.
    """
    # \s already covers newlines, so one substitution handles all whitespace.
    text = re.sub(r'\s+', ' ', text)
    # Keep U+00B0 (degree sign); stripping it made beam angles unmatchable.
    text = re.sub(r'[^\x00-\x7F\u00B0]+', '', text)
    return text.strip()

# Step 3: Extract specific data (Power, Voltage, Current, Lumens, Efficacy, CCT, Beam Angle) using regex
def extract_specifications(text):
    """Pull lighting specification values out of datasheet text with regexes.

    Args:
        text: Text to scan, normally the output of ``clean_text``.

    Returns:
        A dict keyed by specification label ("Power (W)", "Voltage (V)",
        "Current (A)", "Lumens", "Efficacy (lm/W)", "CCT (K)",
        "Beam Angles (°)"). Each value is a list of the distinct matched
        strings sorted numerically; voltage entries are either a single
        value ("277") or a range ("120-277") ordered by the lower bound.
    """
    power_pattern = r'\b(\d{1,4})\s*W\b'  # '50 W'
    voltage_pattern = r'(\d{2,3})\s*-\s*(\d{2,3})\s*V|\b(\d{1,3})\s*V\b'  # '120-277 V' or '277 V'
    current_pattern = r'(\d{1,3}\.\d{1,3})\s*A'  # '0.42 A' (decimal values only)
    lumens_pattern = r'(\d{3,6}(\.\d+)?)\s*Lumens?\b'  # '5000 Lumens'
    efficacy_pattern = r'\b(\d{2,4}\.\d+|\d{2,4})\s*lm/W\b'  # '130 lm/W'
    cct_pattern = r'\b(\d{4})\s*K\b'  # '4000 K'
    # No trailing \b: '°' is a non-word character, so a \b right after it only
    # matches when a letter or digit follows, which rejected ordinary text
    # such as '60° beam'. The lookahead skips temperatures like '25°C'.
    beam_angle_pattern = r'\b(\d{1,3})\s*°(?!\s*[CFcf]\b)'

    power = re.findall(power_pattern, text)
    voltage = re.findall(voltage_pattern, text)
    current = re.findall(current_pattern, text)
    lumens = re.findall(lumens_pattern, text)
    efficacy = re.findall(efficacy_pattern, text)
    cct = re.findall(cct_pattern, text)
    beam_angles = re.findall(beam_angle_pattern, text)

    # findall yields (low, high, single) tuples for the voltage pattern: the
    # first two groups are filled for a range, the third for a lone value.
    voltage_ranges = [
        f"{low}-{high}" if low and high else single
        for low, high, single in voltage
        if (low and high) or single
    ]

    # Sets drop duplicates before the numeric sort.
    return {
        "Power (W)": sorted(set(power), key=int),
        "Voltage (V)": sorted(set(voltage_ranges), key=lambda v: int(v.split('-')[0])),
        "Current (A)": sorted(set(current), key=float),
        # The lumens pattern has two groups; the first holds the full number.
        "Lumens": sorted({match[0] for match in lumens}, key=float),
        "Efficacy (lm/W)": sorted(set(efficacy), key=float),
        "CCT (K)": sorted(set(cct), key=int),
        "Beam Angles (°)": sorted(set(beam_angles), key=int),
    }

# Step 4: Compare specifications for similarity
def calculate_similarity(input_data, pdf_data):
    """Score how many specification categories of a PDF match the query.

    Args:
        input_data: Dict mapping a specification label to the list of values
            the user asked for.
        pdf_data: Dict of the same shape, as returned by
            ``extract_specifications``.

    Returns:
        The number of categories (0-7) in which at least one requested value
        appears among the PDF's extracted values.
    """
    categories = (
        "Power (W)",
        "Voltage (V)",
        "Current (A)",
        "Efficacy (lm/W)",
        "CCT (K)",
        "Lumens",
        "Beam Angles (°)",
    )
    score = 0
    for category in categories:
        available = set(pdf_data.get(category, ()))
        # Values typed as "50, 60" arrive as "50" and " 60"; compare them
        # without the surrounding whitespace so the second one can match.
        requested = (str(value).strip() for value in input_data.get(category, ()))
        if any(value in available for value in requested):
            score += 1
    return score

# Step 5: Process a single PDF
def process_single_pdf(pdf_path):
    """Return the specification dict extracted from the PDF at ``pdf_path``.

    The file's text is read with ``extract_text_from_pdf`` and then parsed
    with ``extract_specifications``.
    """
    return extract_specifications(extract_text_from_pdf(pdf_path))

# Step 6: Process all PDFs in folder and find the top 10 most similar
def find_top_10_similar_pdfs(folder_path, input_data):
    """Rank the PDFs in a folder by similarity to the requested specifications.

    A tqdm progress bar is shown while the files are processed.

    Args:
        folder_path: Directory to scan (not recursive).
        input_data: Requested specification values, as expected by
            ``calculate_similarity``.

    Returns:
        A list of up to ten ``(pdf_path, score)`` tuples, highest score
        first. The list is empty when the folder contains no PDF files.
    """
    # Match the extension case-insensitively so 'DATASHEET.PDF' is not
    # skipped, and sort the listing so equally scored files always come out
    # in the same order (os.listdir returns entries in arbitrary order).
    pdf_files = [
        os.path.join(folder_path, name)
        for name in sorted(os.listdir(folder_path))
        if name.lower().endswith('.pdf')
    ]

    pdf_scores = [
        (pdf_file, calculate_similarity(input_data, process_single_pdf(pdf_file)))
        for pdf_file in tqdm(pdf_files, desc="Processing PDFs", unit="file")
    ]

    # list.sort is stable, so files with equal scores keep their name order.
    pdf_scores.sort(key=lambda entry: entry[1], reverse=True)
    return pdf_scores[:10]

# Step 7: User input and execution
def get_multiple_inputs(prompt):
    """Prompt for a comma-separated list and return its individual values.

    Args:
        prompt: Text shown to the user.

    Returns:
        A list of the entered values with surrounding whitespace removed;
        empty items (including a completely blank answer) are dropped.
    """
    # Stripping each item lets "50, 60" yield "60" rather than " 60", which
    # could never equal a value extracted from a PDF.
    return [item.strip() for item in input(prompt).split(',') if item.strip()]

# Ask for the requested values of each specification, in this order.
input_data = {
    label: get_multiple_inputs(prompt)
    for label, prompt in (
        ("Power (W)", "Enter Power (W) values: "),
        ("Voltage (V)", "Enter Voltage (V) values: "),
        ("Current (A)", "Enter Current (A) values: "),
        ("Efficacy (lm/W)", "Enter Efficacy (lm/W) values: "),
        ("CCT (K)", "Enter CCT (K) values: "),
        ("Lumens", "Enter Lumens values: "),
        ("Beam Angles (°)", "Enter Beam Angle values: "),
    )
}

folder_path = input("Enter folder path: ").strip()

# Rank the folder's PDFs against the requested values.
top_10_pdfs = find_top_10_similar_pdfs(folder_path, input_data)

if not top_10_pdfs:
    print("No similar PDFs found.")
else:
    print("Top 10 most similar PDFs:")
    for pdf, score in top_10_pdfs:
        print(f"PDF: {pdf} | Similarity Score: {score}")

## GUI

In [1]:
import PyPDF2
import re
import os
import tkinter as tk
from tkinter import filedialog, messagebox, ttk
import time  # Import time module for ETA calculation

# Step 1: Extract Text from PDF
def extract_text_from_pdf(pdf_path):
    """Read every page of a PDF and return its cleaned text.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages, passed through ``clean_text``.
    """
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        page_texts = [page.extract_text() for page in reader.pages]
    # Pages for which extract_text() returned nothing are skipped. The pages
    # are joined with a newline (collapsed to a space by clean_text) so the
    # last token of one page cannot fuse with the first token of the next,
    # e.g. "12" + "0 W" turning into "120 W".
    text = "\n".join(chunk for chunk in page_texts if chunk)
    return clean_text(text)

# Step 2: Clean the extracted text by removing unnecessary formatting
def clean_text(text):
    """Normalise extracted PDF text for regex matching.

    Collapses every run of whitespace (spaces, tabs, newlines) into a single
    space and drops non-ASCII characters, except the degree sign, which the
    beam-angle pattern in ``extract_specifications`` looks for.

    Args:
        text: Raw text extracted from a PDF.

    Returns:
        The cleaned text with leading and trailing whitespace removed.
    """
    # \s already covers newlines, so one substitution handles all whitespace.
    text = re.sub(r'\s+', ' ', text)
    # Keep U+00B0 (degree sign); stripping it made beam angles unmatchable.
    text = re.sub(r'[^\x00-\x7F\u00B0]+', '', text)
    return text.strip()

# Step 3: Extract specific data using regex
def extract_specifications(text):
    """Pull lighting specification values out of datasheet text with regexes.

    Args:
        text: Text to scan, normally the output of ``clean_text``.

    Returns:
        A dict keyed by specification label ("Power (W)", "Voltage (V)",
        "Current (A)", "Lumens", "Efficacy (lm/W)", "CCT (K)",
        "Beam Angles (°)"). Each value is a list of the distinct matched
        strings sorted numerically; voltage entries are either a single
        value ("277") or a range ("120-277") ordered by the lower bound.
    """
    power_pattern = r'\b(\d{1,4})\s*W\b'  # '50 W'
    voltage_pattern = r'(\d{2,3})\s*-\s*(\d{2,3})\s*V|\b(\d{1,3})\s*V\b'  # '120-277 V' or '277 V'
    current_pattern = r'(\d{1,3}\.\d{1,3})\s*A'  # '0.42 A' (decimal values only)
    lumens_pattern = r'(\d{3,6}(\.\d+)?)\s*Lumens?\b'  # '5000 Lumens'
    efficacy_pattern = r'\b(\d{2,4}\.\d+|\d{2,4})\s*lm/W\b'  # '130 lm/W'
    cct_pattern = r'\b(\d{4})\s*K\b'  # '4000 K'
    # No trailing \b: '°' is a non-word character, so a \b right after it only
    # matches when a letter or digit follows, which rejected ordinary text
    # such as '60° beam'. The lookahead skips temperatures like '25°C'.
    beam_angle_pattern = r'\b(\d{1,3})\s*°(?!\s*[CFcf]\b)'

    power = re.findall(power_pattern, text)
    voltage = re.findall(voltage_pattern, text)
    current = re.findall(current_pattern, text)
    lumens = re.findall(lumens_pattern, text)
    efficacy = re.findall(efficacy_pattern, text)
    cct = re.findall(cct_pattern, text)
    beam_angles = re.findall(beam_angle_pattern, text)

    # findall yields (low, high, single) tuples for the voltage pattern: the
    # first two groups are filled for a range, the third for a lone value.
    voltage_ranges = [
        f"{low}-{high}" if low and high else single
        for low, high, single in voltage
        if (low and high) or single
    ]

    # Sets drop duplicates before the numeric sort.
    return {
        "Power (W)": sorted(set(power), key=int),
        "Voltage (V)": sorted(set(voltage_ranges), key=lambda v: int(v.split('-')[0])),
        "Current (A)": sorted(set(current), key=float),
        # The lumens pattern has two groups; the first holds the full number.
        "Lumens": sorted({match[0] for match in lumens}, key=float),
        "Efficacy (lm/W)": sorted(set(efficacy), key=float),
        "CCT (K)": sorted(set(cct), key=int),
        "Beam Angles (°)": sorted(set(beam_angles), key=int),
    }

# Step 4: Compare specifications for similarity
def calculate_similarity(input_data, pdf_data):
    """Score how many specification categories of a PDF match the query.

    Args:
        input_data: Dict mapping a specification label to the list of values
            the user asked for.
        pdf_data: Dict of the same shape, as returned by
            ``extract_specifications``.

    Returns:
        The number of categories (0-7) in which at least one requested value
        appears among the PDF's extracted values.
    """
    categories = (
        "Power (W)",
        "Voltage (V)",
        "Current (A)",
        "Efficacy (lm/W)",
        "CCT (K)",
        "Lumens",
        "Beam Angles (°)",
    )
    score = 0
    for category in categories:
        available = set(pdf_data.get(category, ()))
        # Values typed as "50, 60" arrive as "50" and " 60"; compare them
        # without the surrounding whitespace so the second one can match.
        requested = (str(value).strip() for value in input_data.get(category, ()))
        if any(value in available for value in requested):
            score += 1
    return score

# Step 5: Process a single PDF
def process_single_pdf(pdf_path):
    """Return the specification dict extracted from the PDF at ``pdf_path``.

    The file's text is read with ``extract_text_from_pdf`` and then parsed
    with ``extract_specifications``.
    """
    return extract_specifications(extract_text_from_pdf(pdf_path))

# Step 6: Process all PDFs in folder and find the top 10 most similar
def find_top_10_similar_pdfs(folder_path, input_data):
    """Rank the PDFs in a folder by similarity, updating the GUI as it goes.

    Uses the module-level widgets ``progress_bar``, ``eta_label``,
    ``percentage_label`` and ``root`` to report progress.

    Args:
        folder_path: Directory to scan (not recursive).
        input_data: Requested specification values, as expected by
            ``calculate_similarity``.

    Returns:
        A list of up to ten ``(pdf_path, score)`` tuples, highest score
        first. The list is empty when the folder contains no PDF files.
    """
    # Match the extension case-insensitively so 'DATASHEET.PDF' is not
    # skipped, and sort the listing so equally scored files always come out
    # in the same order (os.listdir returns entries in arbitrary order).
    pdf_files = [
        os.path.join(folder_path, name)
        for name in sorted(os.listdir(folder_path))
        if name.lower().endswith('.pdf')
    ]
    total = len(pdf_files)
    pdf_scores = []

    # Reset the indicators so a run over an empty folder does not keep
    # showing the figures left over from a previous run.
    progress_bar['maximum'] = total
    progress_bar['value'] = 0
    eta_label.config(text="ETA: 0s")
    percentage_label.config(text="0%")

    start_time = time.time()  # Reference point for the ETA estimate

    for done, pdf_file in enumerate(pdf_files, start=1):
        extracted_data = process_single_pdf(pdf_file)
        pdf_scores.append((pdf_file, calculate_similarity(input_data, extracted_data)))

        # ETA = average time per processed file * number of files left.
        progress_bar['value'] = done
        elapsed_time = time.time() - start_time
        remaining_time = (elapsed_time / done) * (total - done)
        eta_label.config(text=f"ETA: {int(remaining_time)}s")
        percentage_label.config(text=f"{int(done / total * 100)}%")

        # Redraw the widgets; the loop runs on the GUI thread.
        root.update_idletasks()

    # list.sort is stable, so files with equal scores keep their name order.
    pdf_scores.sort(key=lambda entry: entry[1], reverse=True)
    return pdf_scores[:10]

# GUI Functionality
def browse_folder():
    """Let the user pick a folder and put its path into the folder entry."""
    folder = filedialog.askdirectory()
    # askdirectory() returns an empty string when the dialog is cancelled;
    # keep whatever path is already in the entry in that case.
    if not folder:
        return
    folder_path_entry.delete(0, tk.END)
    folder_path_entry.insert(0, folder)

def process_pdfs():
    """Run the similarity search with the values currently in the GUI.

    Reads the specification entries and the folder path, shows an error
    dialog if the path is not an existing directory, and otherwise writes
    the ranked results into the result text box.
    """
    def split_values(raw):
        # Trim each comma-separated item so "50, 60" yields "60", not " 60",
        # and drop empty items left by blank fields or stray commas.
        return [item.strip() for item in raw.split(',') if item.strip()]

    input_data = {
        "Power (W)": split_values(power_entry.get()),
        "Voltage (V)": split_values(voltage_entry.get()),
        "Current (A)": split_values(current_entry.get()),
        "Efficacy (lm/W)": split_values(efficacy_entry.get()),
        "CCT (K)": split_values(cct_entry.get()),
        "Lumens": split_values(lumens_entry.get()),
        "Beam Angles (°)": split_values(beam_angle_entry.get()),
    }

    folder_path = folder_path_entry.get().strip()

    # isdir rather than exists: the path of a regular file exists but cannot
    # be listed by find_top_10_similar_pdfs.
    if not os.path.isdir(folder_path):
        messagebox.showerror("Error", "Please select a valid folder path.")
        return

    result_text.delete(1.0, tk.END)
    top_10_pdfs = find_top_10_similar_pdfs(folder_path, input_data)

    if top_10_pdfs:
        result_text.insert(tk.END, "Top 10 most similar PDFs:\n")
        for pdf, score in top_10_pdfs:
            result_text.insert(tk.END, f"PDF: {pdf} | Similarity Score: {score}\n")
    else:
        result_text.insert(tk.END, "No similar PDFs found.")

# Create GUI window
# Main window
root = tk.Tk()
root.title("PDF Similarity Finder")

# Specification inputs: one caption/entry pair per grid row
input_frame = tk.Frame(root)
input_frame.pack(padx=10, pady=10, fill=tk.X)


def _add_labelled_entry(parent, row, caption):
    """Grid a caption label and an entry on ``row`` of ``parent``; return the entry."""
    tk.Label(parent, text=caption).grid(row=row, column=0, sticky=tk.W)
    entry = tk.Entry(parent)
    entry.grid(row=row, column=1, padx=5, pady=5)
    return entry


(power_entry, voltage_entry, current_entry, efficacy_entry,
 cct_entry, lumens_entry, beam_angle_entry) = (
    _add_labelled_entry(input_frame, row, caption)
    for row, caption in enumerate((
        "Power (W):",
        "Voltage (V):",
        "Current (A):",
        "Efficacy (lm/W):",
        "CCT (K):",
        "Lumens:",
        "Beam Angles (°):",
    ))
)

# Folder selection row: caption, path entry and a Browse button
folder_frame = tk.Frame(root)
folder_frame.pack(padx=10, pady=10, fill=tk.X)

folder_path_entry = _add_labelled_entry(folder_frame, 0, "Folder Path:")

browse_button = tk.Button(folder_frame, text="Browse", command=browse_folder)
browse_button.grid(row=0, column=2, padx=5, pady=5)

# Progress bar with ETA and percentage read-outs
progress_bar = ttk.Progressbar(root, orient='horizontal', mode='determinate')
progress_bar.pack(padx=10, pady=10, fill=tk.X)

eta_label = tk.Label(root, text="ETA: 0s")
eta_label.pack(pady=5)

percentage_label = tk.Label(root, text="0%")
percentage_label.pack(pady=5)

# Button that starts the search
process_button = tk.Button(root, text="Process PDFs", command=process_pdfs)
process_button.pack(pady=10)

# Text box that receives the ranked results
result_text = tk.Text(root, height=15)
result_text.pack(padx=10, pady=10, fill=tk.BOTH, expand=True)

# Hand control to Tk's event loop
root.mainloop()

## The Pre-Final Image and Text based Product Finder

This code is a Python script that implements a GUI-based application to find and rank PDFs based on their similarity to a given set of specifications and an optional input image. Here’s a breakdown of the main components and functionalities:

1. Imports and Model Initialization
The script imports various libraries, including:
os, re, time, io: For file handling, regular expressions, timing, and in-memory byte streams.
Tkinter: For creating the GUI.
PIL (Pillow): For image processing.
NumPy, PyTorch, and torchvision: For numerical operations and loading pre-trained models.
sklearn.metrics: For calculating similarity.
fitz (PyMuPDF): For working with PDF files.
Two pre-trained deep learning models (VGG19 and ResNet50) are loaded and set to evaluation mode.
2. Image Preprocessing
The preprocess variable defines a series of transformations to be applied to images before feeding them into the models. This includes resizing, converting to tensor, and normalizing.
3. PDF Text and Image Extraction
extract_text_from_pdf: Reads a PDF and extracts its text, cleaning it with clean_text.
clean_text: Cleans the extracted text by removing unnecessary whitespace and non-ASCII characters.
extract_specifications: Uses regex patterns to extract specific specifications (Power, Voltage, Current, etc.) from the cleaned text.
extract_images_from_pdf: Extracts images from each page of a PDF.
4. Feature Extraction from Images
extract_image_features: Combines features from VGG19, ResNet50, and HOG (Histogram of Oriented Gradients) for a given image.
extract_intermediate_features: Uses VGG19 to extract features from a specific layer.
extract_resnet_features: Uses ResNet50 to extract features.
extract_hog_features: Computes HOG features from an image.
5. Similarity Calculation
calculate_similarity: Compares input specifications with extracted PDF specifications and assigns a score based on matches.
calculate_image_similarity: Computes the cosine similarity between features of the input image and features of extracted images from PDFs.
6. Processing PDFs
find_top_10_similar_pdfs: Scans through all PDFs in a specified folder, extracts text and images, calculates similarity scores, and ranks the top 10 PDFs based on those scores. It also updates a progress bar and estimates the remaining time.
7. GUI Components
Browse Functions: Allows users to select a folder containing PDFs and an optional image file.
process_pdfs: Gathers input from the GUI, processes the PDFs using find_top_10_similar_pdfs, and displays the results in a text area.
The GUI is built using Tkinter, with labels, entry fields, buttons, a progress bar, and a text area for results.
Summary
Overall, this script is designed to help users find the most similar PDFs based on specified technical specifications and an optional input image, utilizing advanced image processing and machine learning techniques in a user-friendly GUI format.

In [1]:
import os
import re
import time
import io
import tkinter as tk
from tkinter import filedialog, messagebox, ttk
from PIL import Image
import numpy as np
import torch
import torchvision.transforms as transforms
from torchvision import models
from sklearn.metrics.pairwise import cosine_similarity
import fitz  # PyMuPDF

# Load the pre-trained models.
# NOTE(review): newer torchvision releases deprecate ``pretrained=True`` in
# favour of the ``weights=`` argument - confirm against the installed version.
vgg_model = models.vgg19(pretrained=True)
resnet_model = models.resnet50(pretrained=True)
# Put both networks in evaluation mode; the functions below only run forward
# passes under ``torch.no_grad()``.
vgg_model.eval()
resnet_model.eval()

# Image preprocessing transformations applied before an image is fed to either
# network: resize to 224x224, convert to a tensor, normalise each channel.
# NOTE(review): the mean/std values look like the usual ImageNet statistics -
# confirm against the torchvision model documentation.
preprocess = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Extract text from PDF
def extract_text_from_pdf(pdf_path):
    """Return the cleaned text of every page of a PDF.

    The document is opened straight from its path inside a ``with`` block so
    the PyMuPDF handle is released as soon as the text has been read, even if
    text extraction fails part-way through.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages, concatenated in page order and passed through
        ``clean_text``.
    """
    with fitz.open(pdf_path) as doc:
        # Join once instead of growing a string page by page.
        text = "".join(page.get_text() for page in doc)
    return clean_text(text)

# Clean extracted text
def clean_text(text):
    """Normalise extracted PDF text so the specification regexes can run on it.

    Every run of whitespace (spaces, tabs, newlines) becomes a single space,
    then non-ASCII characters are removed. The degree sign is deliberately
    kept: ``extract_specifications`` looks for it to find beam angles, and
    stripping it made that field impossible to extract.

    Args:
        text: Raw text as returned by the PDF reader.

    Returns:
        The normalised text without leading or trailing whitespace.
    """
    # r'\s+' already covers newlines, so no separate newline pass is needed.
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^\x00-\x7F°]+', '', text)
    return text.strip()

# Extract specific data from cleaned text
def extract_specifications(text):
    """Pull lighting specifications out of datasheet text with regexes.

    Args:
        text: Plain text to scan (normally the output of ``clean_text``).

    Returns:
        dict mapping each specification label to a sorted list of the distinct
        values found, kept as strings: ``"Power (W)"``, ``"Voltage (V)"``
        (single values or ``"low-high"`` ranges, ordered by the lower bound),
        ``"Current (A)"``, ``"Lumens"``, ``"Efficacy (lm/W)"``, ``"CCT (K)"``
        and ``"Beam Angles (°)"``.
    """
    power_pattern = r'\b(\d{1,4})\s*W\b'
    voltage_pattern = r'(\d{2,3})\s*-\s*(\d{2,3})\s*V|\b(\d{1,3})\s*V\b'
    current_pattern = r'(\d{1,3}\.\d{1,3})\s*A'
    lumens_pattern = r'(\d{3,6}(\.\d+)?)\s*Lumens?\b'
    efficacy_pattern = r'\b(\d{2,4}\.\d+|\d{2,4})\s*lm/W\b'
    cct_pattern = r'\b(\d{4})\s*K\b'
    # No trailing \b: '°' is not a word character, so a word boundary after it
    # exists only when a letter or digit follows directly ("120°C"). With the
    # boundary in place, "120°" and "120° beam" could never match.
    beam_angle_pattern = r'\b(\d{1,3})\s*°'

    # The voltage pattern has three groups: (low, high) for a range, or a
    # single value in the third group.
    voltage_values = set()
    for low, high, single in re.findall(voltage_pattern, text):
        if low and high:
            voltage_values.add(f"{low}-{high}")
        elif single:
            voltage_values.add(single)

    # The lumens pattern has a nested group; only the full number is wanted.
    lumen_values = {match[0] for match in re.findall(lumens_pattern, text)}

    return {
        "Power (W)": sorted(set(re.findall(power_pattern, text)), key=int),
        # Ranges sort by their lower bound, single values by themselves.
        "Voltage (V)": sorted(voltage_values, key=lambda v: int(v.split('-')[0])),
        "Current (A)": sorted(set(re.findall(current_pattern, text)), key=float),
        "Lumens": sorted(lumen_values, key=float),
        "Efficacy (lm/W)": sorted(set(re.findall(efficacy_pattern, text)), key=float),
        "CCT (K)": sorted(set(re.findall(cct_pattern, text)), key=int),
        "Beam Angles (°)": sorted(set(re.findall(beam_angle_pattern, text)), key=int),
    }

# Extract images from PDF
def extract_images_from_pdf(pdf_path):
    """Return every image referenced by the pages of a PDF as RGB PIL images.

    The document is handled by a ``with`` block so it is closed even when an
    image cannot be extracted or decoded.

    Args:
        pdf_path: Path of the PDF file to scan.

    Returns:
        List of ``PIL.Image`` objects in RGB mode, in page order. An image
        referenced on several pages appears once per reference.
    """
    images = []
    with fitz.open(pdf_path) as doc:
        for page in doc:
            for img in page.get_images(full=True):
                xref = img[0]  # first field of each entry is what extract_image expects
                image_bytes = doc.extract_image(xref)["image"]
                images.append(Image.open(io.BytesIO(image_bytes)).convert('RGB'))
    return images

# Extract image features
def extract_image_features(image):
    """Build one feature vector for ``image`` from three extractors.

    The output of the VGG19 ``features`` module, the ResNet50 output and the
    HOG descriptor are each scaled by a fixed weight (0.5, 0.3 and 0.2) and
    concatenated in that order.

    Args:
        image: Image to describe (passed unchanged to the three extractors).

    Returns:
        1-D NumPy array holding the weighted, concatenated features.
    """
    weighted_parts = (
        (extract_intermediate_features(image, vgg_model, 'features'), 0.5),
        (extract_resnet_features(image), 0.3),
        (extract_hog_features(image), 0.2),
    )
    return np.concatenate([part * weight for part, weight in weighted_parts])

def extract_intermediate_features(image, model, layer):
    """Run ``image`` through ``model`` and return one sub-module's output.

    A forward hook is attached to the direct child module named ``layer`` to
    capture its output. The hook is removed in a ``finally`` clause so it
    cannot pile up on the shared model if preprocessing or the forward pass
    raises.

    Args:
        image: Image accepted by the module-level ``preprocess`` transform.
        model: Network whose direct child ``layer`` is observed.
        layer: Name of that child in ``model._modules``.

    Returns:
        The captured output, flattened into a 1-D NumPy array.

    Raises:
        AttributeError: If ``model`` has no direct child called ``layer``.
    """
    captured = {}

    def hook_fn(module, input, output):
        captured['output'] = output

    handle = model._modules.get(layer).register_forward_hook(hook_fn)
    try:
        input_tensor = preprocess(image).unsqueeze(0)  # add a leading batch axis
        with torch.no_grad():
            model(input_tensor)
    finally:
        handle.remove()
    return captured['output'].flatten().numpy()

def extract_resnet_features(image):
    """Return the flattened output of ``resnet_model`` for ``image``.

    The image goes through the module-level ``preprocess`` transform, gets a
    leading batch axis and is evaluated without gradient tracking.
    """
    batch = preprocess(image).unsqueeze(0)
    with torch.no_grad():
        output = resnet_model(batch)
    return output.flatten().numpy()

def extract_hog_features(image):
    """Return the HOG descriptor of ``image``.

    The image is resized to 128x128 and converted to grayscale, then described
    with 8 orientations, 16x16-pixel cells and 1x1-cell blocks.

    Args:
        image: Image object providing ``resize`` (an RGB PIL image in this
            script).

    Returns:
        The descriptor as a flat feature vector.
    """
    # scikit-image is imported lazily, only when HOG features are requested.
    from skimage.color import rgb2gray
    from skimage.feature import hog

    grayscale = rgb2gray(np.array(image.resize((128, 128))))
    return hog(
        grayscale,
        orientations=8,
        pixels_per_cell=(16, 16),
        cells_per_block=(1, 1),
        visualize=False,
        feature_vector=True,
    )

# Calculate similarity scores
def calculate_similarity(input_data, pdf_data):
    """Count the specification fields on which the query and a PDF agree.

    A field scores one point when at least one requested value equals one of
    the values extracted from the PDF. Requested values are stripped first, so
    "50, 60" typed into the GUI (split into ``"50"`` and ``" 60"``) still
    matches ``"60"``. Blank values never match.

    Args:
        input_data: dict mapping each specification label to a list of
            requested values (strings).
        pdf_data: dict with the same labels mapping to the values extracted
            from one PDF (strings).

    Returns:
        Integer score between 0 and 7.

    Raises:
        KeyError: If either dict lacks one of the seven labels.
    """
    spec_keys = (
        "Power (W)",
        "Voltage (V)",
        "Current (A)",
        "Efficacy (lm/W)",
        "CCT (K)",
        "Lumens",
        "Beam Angles (°)",
    )
    score = 0
    for key in spec_keys:
        wanted = {value.strip() for value in input_data[key]}
        wanted.discard('')  # an empty entry field must not count as a match
        if wanted.intersection(pdf_data[key]):
            score += 1
    return score

def calculate_image_similarity(features1, features2):
    """Return the cosine similarity of two feature vectors.

    Args:
        features1: NumPy array of features for the first image.
        features2: NumPy array of features for the second image.

    Returns:
        ``0.0`` when either array is empty, otherwise the cosine similarity
        of the two arrays viewed as single rows.
    """
    if not (features1.size and features2.size):
        return 0.0
    row_a = features1.reshape(1, -1)
    row_b = features2.reshape(1, -1)
    return cosine_similarity(row_a, row_b)[0][0]

# Process PDFs in folder
def find_top_10_similar_pdfs(folder_path, input_data, input_image_path=None):
    """Score every PDF in ``folder_path`` and return the ten best matches.

    Each PDF gets exactly one entry. Without an input image its score is the
    specification score from ``calculate_similarity``. With an input image the
    score is ``0.7 * specification score + 0.3 * best image similarity``,
    where the best similarity is taken over the images embedded in the PDF
    (0.0 when it has none). The progress bar and the ETA and percentage
    labels are refreshed after each file.

    Args:
        folder_path: Directory to scan (not recursive).
        input_data: Requested specifications, as expected by
            ``calculate_similarity``.
        input_image_path: Optional path of a reference image; falsy to skip
            image comparison.

    Returns:
        List of up to ten ``(pdf_path, score)`` tuples, best first.
    """
    pdf_files = [
        os.path.join(folder_path, name)
        for name in os.listdir(folder_path)
        if name.lower().endswith('.pdf')  # also accept ".PDF"
    ]
    total = len(pdf_files)
    progress_bar['maximum'] = total

    # The reference image is the same for every PDF, so describe it once
    # instead of re-running both networks for each file.
    input_image_features = None
    if input_image_path:
        with Image.open(input_image_path) as query_image:
            input_image_features = extract_image_features(query_image.convert('RGB'))

    pdf_scores = []
    start_time = time.time()

    for i, pdf_file in enumerate(pdf_files):
        extracted_data = extract_specifications(extract_text_from_pdf(pdf_file))
        score = calculate_similarity(input_data, extracted_data)

        if input_image_features is not None:
            best_image_similarity = max(
                (
                    calculate_image_similarity(
                        input_image_features, extract_image_features(img)
                    )
                    for img in extract_images_from_pdf(pdf_file)
                ),
                default=0.0,
            )
            score = (0.7 * score) + (0.3 * best_image_similarity)

        # One entry per PDF, so a file cannot take several top-10 slots and
        # image-less PDFs still take part in the ranking.
        pdf_scores.append((pdf_file, score))

        done = i + 1
        progress_bar['value'] = done
        percent_complete = int(done / total * 100)
        elapsed_time = time.time() - start_time
        remaining_time = (elapsed_time / done) * (total - done)
        eta_label.config(text=f"ETA: {int(remaining_time)}s")
        percentage_label.config(text=f"{percent_complete}%")
        root.update_idletasks()

    pdf_scores.sort(key=lambda x: x[1], reverse=True)
    return pdf_scores[:10]

# GUI Functions
def browse_folder():
    """Ask for a directory and show it in the folder entry.

    Cancelling the dialog leaves the current entry untouched, matching
    ``browse_image``.
    """
    folder = filedialog.askdirectory()
    if folder:
        folder_path_entry.delete(0, tk.END)
        folder_path_entry.insert(0, folder)

def browse_image():
    """Ask for an image file and show its path in the image entry.

    Cancelling the dialog leaves the current entry untouched.
    """
    # Extensions are given as separate patterns: a single ';'-joined string is
    # not a valid Tk pattern list and filters out every file on platforms
    # whose dialog does not interpret ';' itself.
    file = filedialog.askopenfilename(
        filetypes=[("Image Files", ("*.png", "*.jpg", "*.jpeg", "*.bmp"))]
    )
    if file:
        image_path_entry.delete(0, tk.END)
        image_path_entry.insert(0, file)

def process_pdfs():
    """Run the search with the values currently entered in the GUI.

    Shows an error dialog and stops when no folder is given. Otherwise each
    specification field is split on commas, the PDFs are ranked with
    ``find_top_10_similar_pdfs`` and the results text area is replaced by one
    ``path: score`` line per returned PDF.
    """
    folder_path = folder_path_entry.get()
    input_image_path = image_path_entry.get()

    if not folder_path:
        messagebox.showerror("Error", "Please select a folder.")
        return

    spec_entries = (
        ("Power (W)", power_entry),
        ("Voltage (V)", voltage_entry),
        ("Current (A)", current_entry),
        ("Efficacy (lm/W)", efficacy_entry),
        ("CCT (K)", cct_entry),
        ("Lumens", lumens_entry),
        ("Beam Angles (°)", beam_angle_entry),
    )
    input_data = {label: entry.get().split(',') for label, entry in spec_entries}

    ranked = find_top_10_similar_pdfs(folder_path, input_data, input_image_path)

    # Old results are cleared only after the (possibly slow) search finished.
    results_text.delete(1.0, tk.END)
    for pdf, score in ranked:
        results_text.insert(tk.END, f"{pdf}: {score:.2f}\n")

# Main window of the application
root = tk.Tk()
root.title("PDF Similarity Finder")

# Path rows (grid rows 0-1): caption, 50-character entry and a Browse button
tk.Label(root, text="Folder Path:").grid(row=0, column=0)
folder_path_entry = tk.Entry(root, width=50)
folder_path_entry.grid(row=0, column=1)
tk.Button(root, text="Browse", command=browse_folder).grid(row=0, column=2)

tk.Label(root, text="Input Image Path:").grid(row=1, column=0)
image_path_entry = tk.Entry(root, width=50)
image_path_entry.grid(row=1, column=1)
tk.Button(root, text="Browse", command=browse_image).grid(row=1, column=2)

# Specification rows (grid rows 2-8): one caption and one entry per field,
# created in the same order as the captions below.
spec_captions = (
    "Power (W, comma-separated):",
    "Voltage (V, comma-separated):",
    "Current (A, comma-separated):",
    "Efficacy (lm/W, comma-separated):",
    "CCT (K, comma-separated):",
    "Lumens (comma-separated):",
    "Beam Angles (° comma-separated):",
)
spec_entries = []
for spec_row, spec_caption in enumerate(spec_captions, start=2):
    tk.Label(root, text=spec_caption).grid(row=spec_row, column=0)
    spec_entry = tk.Entry(root)
    spec_entry.grid(row=spec_row, column=1)
    spec_entries.append(spec_entry)

# Names the callbacks above read their input from
(power_entry, voltage_entry, current_entry, efficacy_entry,
 cct_entry, lumens_entry, beam_angle_entry) = spec_entries

# Run button and progress indicators (grid rows 9-11)
run_button = tk.Button(root, text="Process PDFs", command=process_pdfs)
run_button.grid(row=9, column=0, columnspan=3)

progress_bar = ttk.Progressbar(root, orient='horizontal', length=300, mode='determinate')
progress_bar.grid(row=10, column=0, columnspan=3)

eta_label = tk.Label(root, text="ETA: 0s")
eta_label.grid(row=11, column=0)

percentage_label = tk.Label(root, text="0%")
percentage_label.grid(row=11, column=1)

# Text area that receives the ranked results (grid row 12)
results_text = tk.Text(root, height=15, width=50)
results_text.grid(row=12, column=0, columnspan=3)

# Hand control to Tk; this call blocks until the window is closed
root.mainloop()



Fine-tuning attempt: adds fuzzy string matching on top of the exact specification matching.

In [3]:
import os
import re
import time
import io
import tkinter as tk
from tkinter import filedialog, messagebox, ttk
from PIL import Image
import numpy as np
import torch
import torchvision.transforms as transforms
from torchvision import models
from sklearn.metrics.pairwise import cosine_similarity
import fitz  # PyMuPDF
from fuzzywuzzy import fuzz  # For string matching
from fuzzywuzzy import process  # For matching

# Load the pre-trained models.
# NOTE(review): newer torchvision releases deprecate ``pretrained=True`` in
# favour of the ``weights=`` argument - confirm against the installed version.
vgg_model = models.vgg19(pretrained=True)
resnet_model = models.resnet50(pretrained=True)
# Put both networks in evaluation mode; the functions below only run forward
# passes under ``torch.no_grad()``.
vgg_model.eval()
resnet_model.eval()

# Image preprocessing transformations applied before an image is fed to either
# network: resize to 224x224, convert to a tensor, normalise each channel.
# NOTE(review): the mean/std values look like the usual ImageNet statistics -
# confirm against the torchvision model documentation.
preprocess = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Extract text from PDF
def extract_text_from_pdf(pdf_path):
    """Return the cleaned text of every page of a PDF.

    The document is opened straight from its path inside a ``with`` block so
    the PyMuPDF handle is released as soon as the text has been read, even if
    text extraction fails part-way through.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages, concatenated in page order and passed through
        ``clean_text``.
    """
    with fitz.open(pdf_path) as doc:
        # Join once instead of growing a string page by page.
        text = "".join(page.get_text() for page in doc)
    return clean_text(text)

# Clean extracted text
def clean_text(text):
    """Normalise extracted PDF text so the specification regexes can run on it.

    Every run of whitespace (spaces, tabs, newlines) becomes a single space,
    then non-ASCII characters are removed. The degree sign is deliberately
    kept: ``extract_specifications`` looks for it to find beam angles, and
    stripping it made that field impossible to extract.

    Args:
        text: Raw text as returned by the PDF reader.

    Returns:
        The normalised text without leading or trailing whitespace.
    """
    # r'\s+' already covers newlines, so no separate newline pass is needed.
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^\x00-\x7F°]+', '', text)
    return text.strip()

# Extract specific data from cleaned text
def extract_specifications(text):
    """Pull lighting specifications out of datasheet text with regexes.

    Args:
        text: Plain text to scan (normally the output of ``clean_text``).

    Returns:
        dict mapping each specification label to a sorted list of the distinct
        values found, kept as strings: ``"Power (W)"``, ``"Voltage (V)"``
        (single values or ``"low-high"`` ranges, ordered by the lower bound),
        ``"Current (A)"``, ``"Lumens"``, ``"Efficacy (lm/W)"``, ``"CCT (K)"``
        and ``"Beam Angles (°)"``.
    """
    power_pattern = r'\b(\d{1,4})\s*W\b'
    voltage_pattern = r'(\d{2,3})\s*-\s*(\d{2,3})\s*V|\b(\d{1,3})\s*V\b'
    current_pattern = r'(\d{1,3}\.\d{1,3})\s*A'
    lumens_pattern = r'(\d{3,6}(\.\d+)?)\s*Lumens?\b'
    efficacy_pattern = r'\b(\d{2,4}\.\d+|\d{2,4})\s*lm/W\b'
    cct_pattern = r'\b(\d{4})\s*K\b'
    # No trailing \b: '°' is not a word character, so a word boundary after it
    # exists only when a letter or digit follows directly ("120°C"). With the
    # boundary in place, "120°" and "120° beam" could never match.
    beam_angle_pattern = r'\b(\d{1,3})\s*°'

    # The voltage pattern has three groups: (low, high) for a range, or a
    # single value in the third group.
    voltage_values = set()
    for low, high, single in re.findall(voltage_pattern, text):
        if low and high:
            voltage_values.add(f"{low}-{high}")
        elif single:
            voltage_values.add(single)

    # The lumens pattern has a nested group; only the full number is wanted.
    lumen_values = {match[0] for match in re.findall(lumens_pattern, text)}

    return {
        "Power (W)": sorted(set(re.findall(power_pattern, text)), key=int),
        # Ranges sort by their lower bound, single values by themselves.
        "Voltage (V)": sorted(voltage_values, key=lambda v: int(v.split('-')[0])),
        "Current (A)": sorted(set(re.findall(current_pattern, text)), key=float),
        "Lumens": sorted(lumen_values, key=float),
        "Efficacy (lm/W)": sorted(set(re.findall(efficacy_pattern, text)), key=float),
        "CCT (K)": sorted(set(re.findall(cct_pattern, text)), key=int),
        "Beam Angles (°)": sorted(set(re.findall(beam_angle_pattern, text)), key=int),
    }

# Extract images from PDF
def extract_images_from_pdf(pdf_path):
    """Return every image referenced by the pages of a PDF as RGB PIL images.

    The document is handled by a ``with`` block so it is closed even when an
    image cannot be extracted or decoded.

    Args:
        pdf_path: Path of the PDF file to scan.

    Returns:
        List of ``PIL.Image`` objects in RGB mode, in page order. An image
        referenced on several pages appears once per reference.
    """
    images = []
    with fitz.open(pdf_path) as doc:
        for page in doc:
            for img in page.get_images(full=True):
                xref = img[0]  # first field of each entry is what extract_image expects
                image_bytes = doc.extract_image(xref)["image"]
                images.append(Image.open(io.BytesIO(image_bytes)).convert('RGB'))
    return images

# Extract image features
def extract_image_features(image):
    """Build one feature vector for ``image`` from three extractors.

    The output of the VGG19 ``features`` module, the ResNet50 output and the
    HOG descriptor are each scaled by a fixed weight (0.5, 0.3 and 0.2) and
    concatenated in that order.

    Args:
        image: Image to describe (passed unchanged to the three extractors).

    Returns:
        1-D NumPy array holding the weighted, concatenated features.
    """
    weighted_parts = (
        (extract_intermediate_features(image, vgg_model, 'features'), 0.5),
        (extract_resnet_features(image), 0.3),
        (extract_hog_features(image), 0.2),
    )
    return np.concatenate([part * weight for part, weight in weighted_parts])

def extract_intermediate_features(image, model, layer):
    """Run ``image`` through ``model`` and return one sub-module's output.

    A forward hook is attached to the direct child module named ``layer`` to
    capture its output. The hook is removed in a ``finally`` clause so it
    cannot pile up on the shared model if preprocessing or the forward pass
    raises.

    Args:
        image: Image accepted by the module-level ``preprocess`` transform.
        model: Network whose direct child ``layer`` is observed.
        layer: Name of that child in ``model._modules``.

    Returns:
        The captured output, flattened into a 1-D NumPy array.

    Raises:
        AttributeError: If ``model`` has no direct child called ``layer``.
    """
    captured = {}

    def hook_fn(module, input, output):
        captured['output'] = output

    handle = model._modules.get(layer).register_forward_hook(hook_fn)
    try:
        input_tensor = preprocess(image).unsqueeze(0)  # add a leading batch axis
        with torch.no_grad():
            model(input_tensor)
    finally:
        handle.remove()
    return captured['output'].flatten().numpy()

def extract_resnet_features(image):
    """Return the flattened output of ``resnet_model`` for ``image``.

    The image goes through the module-level ``preprocess`` transform, gets a
    leading batch axis and is evaluated without gradient tracking.
    """
    batch = preprocess(image).unsqueeze(0)
    with torch.no_grad():
        output = resnet_model(batch)
    return output.flatten().numpy()

def extract_hog_features(image):
    """Return the HOG descriptor of ``image``.

    The image is resized to 128x128 and converted to grayscale, then described
    with 8 orientations, 16x16-pixel cells and 1x1-cell blocks.

    Args:
        image: Image object providing ``resize`` (an RGB PIL image in this
            script).

    Returns:
        The descriptor as a flat feature vector.
    """
    # scikit-image is imported lazily, only when HOG features are requested.
    from skimage.color import rgb2gray
    from skimage.feature import hog

    grayscale = rgb2gray(np.array(image.resize((128, 128))))
    return hog(
        grayscale,
        orientations=8,
        pixels_per_cell=(16, 16),
        cells_per_block=(1, 1),
        visualize=False,
        feature_vector=True,
    )

# Calculate similarity scores
def calculate_similarity(input_data, pdf_data):
    """Score how well a PDF's specifications match the requested ones.

    For each of the seven specification fields:

    * one point if any requested value equals a value extracted from the PDF;
    * one more point for every requested value whose best fuzzy match among
      the PDF values scores above 80 with ``fuzz.token_sort_ratio``.

    Requested values are stripped before comparison, so comma-separated GUI
    input such as "50, 60" still matches, and blank values are skipped so an
    empty entry field is never handed to the fuzzy matcher.

    NOTE(review): an exact match normally clears the fuzzy threshold too, so
    it contributes twice - confirm this weighting is intended.

    Args:
        input_data: dict mapping each specification label to a list of
            requested values (strings).
        pdf_data: dict with the same labels mapping to the values extracted
            from one PDF (strings).

    Returns:
        The accumulated integer score.

    Raises:
        KeyError: If either dict lacks one of the seven labels.
    """
    spec_keys = (
        "Power (W)",
        "Voltage (V)",
        "Current (A)",
        "Efficacy (lm/W)",
        "CCT (K)",
        "Lumens",
        "Beam Angles (°)",
    )
    score = 0
    for key in spec_keys:
        candidates = pdf_data[key]
        wanted = [value.strip() for value in input_data[key]]
        wanted = [value for value in wanted if value]

        # Exact matching: at most one point per field.
        if any(value in candidates for value in wanted):
            score += 1

        # Fuzzy matching: one point per requested value with a close match.
        for value in wanted:
            best_match = process.extractOne(value, candidates, scorer=fuzz.token_sort_ratio)
            if best_match and best_match[1] > 80:  # threshold for a match
                score += 1
    return score

def calculate_image_similarity(features1, features2):
    """Return the cosine similarity of two feature vectors.

    Args:
        features1: NumPy array of features for the first image.
        features2: NumPy array of features for the second image.

    Returns:
        ``0.0`` when either array is empty, otherwise the cosine similarity
        of the two arrays viewed as single rows.
    """
    if not (features1.size and features2.size):
        return 0.0
    row_a = features1.reshape(1, -1)
    row_b = features2.reshape(1, -1)
    return cosine_similarity(row_a, row_b)[0][0]

# Process PDFs in folder
def find_top_10_similar_pdfs(folder_path, input_data, input_image_path=None):
    """Score every PDF in ``folder_path`` and return the ten best matches.

    The score of a PDF is its specification score from
    ``calculate_similarity``. When an input image is given, the mean cosine
    similarity between that image and the images embedded in the PDF is added
    (0.0 when the PDF has no images). The progress bar is advanced after each
    file, and a message box reports the total processing time at the end.

    Args:
        folder_path: Directory to scan (not recursive).
        input_data: Requested specifications, as expected by
            ``calculate_similarity``.
        input_image_path: Optional path of a reference image; falsy to skip
            image comparison.

    Returns:
        List of up to ten ``(score, pdf_path)`` tuples, best first.
    """
    pdf_files = [
        os.path.join(folder_path, name)
        for name in os.listdir(folder_path)
        if name.lower().endswith('.pdf')  # also accept ".PDF"
    ]
    pdf_scores = []
    progress_bar['maximum'] = len(pdf_files)

    start_time = time.time()

    # The reference image is the same for every PDF, so describe it once
    # instead of re-running both networks for each file.
    input_image_features = None
    if input_image_path:
        with Image.open(input_image_path) as query_image:
            input_image_features = extract_image_features(query_image.convert('RGB'))

    for i, pdf_file in enumerate(pdf_files):
        extracted_data = extract_specifications(extract_text_from_pdf(pdf_file))
        score = calculate_similarity(input_data, extracted_data)

        if input_image_features is not None:
            similarities = [
                calculate_image_similarity(input_image_features, extract_image_features(image))
                for image in extract_images_from_pdf(pdf_file)
            ]
            score += np.mean(similarities) if similarities else 0.0

        pdf_scores.append((score, pdf_file))
        progress_bar['value'] = i + 1
        window.update_idletasks()

    pdf_scores.sort(key=lambda x: x[0], reverse=True)
    elapsed_time = time.time() - start_time
    messagebox.showinfo("Processing Complete", f"Processed {len(pdf_files)} PDFs in {elapsed_time:.2f} seconds.")

    return pdf_scores[:10]

# GUI functions
def select_input_image():
    """Ask for an image file and store its path in ``input_image_path``.

    Cancelling the dialog leaves the stored path untouched.
    """
    # Extensions are given as separate patterns: a single ';'-joined string is
    # not a valid Tk pattern list and filters out every file on platforms
    # whose dialog does not interpret ';' itself.
    file_path = filedialog.askopenfilename(
        filetypes=[("Image Files", ("*.jpg", "*.jpeg", "*.png"))]
    )
    if file_path:
        input_image_path.set(file_path)

def select_pdf_folder():
    """Ask for a directory and store it in ``pdf_folder_path``.

    Nothing is stored when the dialog is cancelled.
    """
    chosen = filedialog.askdirectory()
    if not chosen:
        return
    pdf_folder_path.set(chosen)

def run_similarity_search():
    """Run the search with the values currently entered in the GUI.

    Shows an error dialog and stops when no PDF folder has been selected,
    because listing an empty path would raise. Otherwise every specification
    field, beam angles included, is split on commas, the PDFs are ranked with
    ``find_top_10_similar_pdfs`` and the results text box is replaced by one
    line per returned PDF.
    """
    folder_path = pdf_folder_path.get()
    if not folder_path:
        messagebox.showerror("Error", "Please select a folder.")
        return

    input_data = {
        "Power (W)": power_input.get().split(','),
        "Voltage (V)": voltage_input.get().split(','),
        "Current (A)": current_input.get().split(','),
        "Efficacy (lm/W)": efficacy_input.get().split(','),
        "CCT (K)": cct_input.get().split(','),
        "Lumens": lumens_input.get().split(','),
        # Comma-separated like every other field; splitting on the degree
        # sign kept "30,60" as one unmatched value.
        "Beam Angles (°)": beam_angle_input.get().split(','),
    }

    input_image = input_image_path.get()

    top_pdfs = find_top_10_similar_pdfs(folder_path, input_data, input_image)

    results_text.delete(1.0, tk.END)  # Clear previous results
    for score, pdf_file in top_pdfs:
        results_text.insert(tk.END, f"Score: {score:.2f} - PDF: {pdf_file}\n")

# Create GUI: a single 800x600 window whose widgets are stacked with pack(),
# so the order of the statements below is the visual order.
window = tk.Tk()
window.title("PDF Similarity Finder")
window.geometry("800x600")

# Selections made through the two Browse dialogs; read back by
# run_similarity_search(). They are not bound to any visible widget here.
pdf_folder_path = tk.StringVar()
input_image_path = tk.StringVar()

# Input Fields: one caption and one entry per specification. The *_input
# names are read by run_similarity_search().
tk.Label(window, text="Power (W):").pack()
power_input = tk.Entry(window)
power_input.pack()

tk.Label(window, text="Voltage (V):").pack()
voltage_input = tk.Entry(window)
voltage_input.pack()

tk.Label(window, text="Current (A):").pack()
current_input = tk.Entry(window)
current_input.pack()

tk.Label(window, text="Efficacy (lm/W):").pack()
efficacy_input = tk.Entry(window)
efficacy_input.pack()

tk.Label(window, text="CCT (K):").pack()
cct_input = tk.Entry(window)
cct_input.pack()

tk.Label(window, text="Lumens:").pack()
lumens_input = tk.Entry(window)
lumens_input.pack()

tk.Label(window, text="Beam Angles (°):").pack()
beam_angle_input = tk.Entry(window)
beam_angle_input.pack()

# Image selection (stores the chosen path in input_image_path)
tk.Label(window, text="Select Input Image:").pack()
tk.Button(window, text="Browse", command=select_input_image).pack()

# PDF folder selection (stores the chosen path in pdf_folder_path)
tk.Label(window, text="Select PDF Folder:").pack()
tk.Button(window, text="Browse", command=select_pdf_folder).pack()

# Run button
tk.Button(window, text="Run Similarity Search", command=run_similarity_search).pack()

# Progress Bar, updated by find_top_10_similar_pdfs() after each PDF
progress_bar = ttk.Progressbar(window, orient=tk.HORIZONTAL, length=300, mode='determinate')
progress_bar.pack()

# Results Text Box, filled by run_similarity_search()
results_text = tk.Text(window, wrap=tk.WORD, height=20)
results_text.pack()

# Start GUI loop; blocks until the window is closed
window.mainloop()



Adding a viewer so that each result's PDF opens when its path is clicked.

In [None]:
import os
import re
import time
import io
import tkinter as tk
from tkinter import filedialog, messagebox, ttk
from tkinter import Text
from PIL import Image
import numpy as np
import torch
import torchvision.transforms as transforms
from torchvision import models
from sklearn.metrics.pairwise import cosine_similarity
import fitz  # PyMuPDF
from fuzzywuzzy import fuzz  # For string matching
from fuzzywuzzy import process  # For matching
import webbrowser  # To open PDFs

# Load the pre-trained models.
# NOTE(review): newer torchvision releases deprecate ``pretrained=True`` in
# favour of the ``weights=`` argument - confirm against the installed version.
vgg_model = models.vgg19(pretrained=True)
resnet_model = models.resnet50(pretrained=True)
# Put both networks in evaluation mode; the functions below only run forward
# passes under ``torch.no_grad()``.
vgg_model.eval()
resnet_model.eval()

# Image preprocessing transformations applied before an image is fed to either
# network: resize to 224x224, convert to a tensor, normalise each channel.
# NOTE(review): the mean/std values look like the usual ImageNet statistics -
# confirm against the torchvision model documentation.
preprocess = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Extract text from PDF
def extract_text_from_pdf(pdf_path):
    """Return the cleaned text of every page of a PDF.

    The document is opened straight from its path inside a ``with`` block so
    the PyMuPDF handle is released as soon as the text has been read, even if
    text extraction fails part-way through.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages, concatenated in page order and passed through
        ``clean_text``.
    """
    with fitz.open(pdf_path) as doc:
        # Join once instead of growing a string page by page.
        text = "".join(page.get_text() for page in doc)
    return clean_text(text)

# Clean extracted text
def clean_text(text):
    """Normalise extracted PDF text so the specification regexes can run on it.

    Every run of whitespace (spaces, tabs, newlines) becomes a single space,
    then non-ASCII characters are removed. The degree sign is deliberately
    kept: ``extract_specifications`` looks for it to find beam angles, and
    stripping it made that field impossible to extract.

    Args:
        text: Raw text as returned by the PDF reader.

    Returns:
        The normalised text without leading or trailing whitespace.
    """
    # r'\s+' already covers newlines, so no separate newline pass is needed.
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^\x00-\x7F°]+', '', text)
    return text.strip()

# Extract specific data from cleaned text
def extract_specifications(text):
    """Pull lighting specifications out of datasheet text with regexes.

    Args:
        text: Plain text to scan (normally the output of ``clean_text``).

    Returns:
        dict mapping each specification label to a sorted list of the distinct
        values found, kept as strings: ``"Power (W)"``, ``"Voltage (V)"``
        (single values or ``"low-high"`` ranges, ordered by the lower bound),
        ``"Current (A)"``, ``"Lumens"``, ``"Efficacy (lm/W)"``, ``"CCT (K)"``
        and ``"Beam Angles (°)"``.
    """
    power_pattern = r'\b(\d{1,4})\s*W\b'
    voltage_pattern = r'(\d{2,3})\s*-\s*(\d{2,3})\s*V|\b(\d{1,3})\s*V\b'
    current_pattern = r'(\d{1,3}\.\d{1,3})\s*A'
    lumens_pattern = r'(\d{3,6}(\.\d+)?)\s*Lumens?\b'
    efficacy_pattern = r'\b(\d{2,4}\.\d+|\d{2,4})\s*lm/W\b'
    cct_pattern = r'\b(\d{4})\s*K\b'
    # No trailing \b: '°' is not a word character, so a word boundary after it
    # exists only when a letter or digit follows directly ("120°C"). With the
    # boundary in place, "120°" and "120° beam" could never match.
    beam_angle_pattern = r'\b(\d{1,3})\s*°'

    # The voltage pattern has three groups: (low, high) for a range, or a
    # single value in the third group.
    voltage_values = set()
    for low, high, single in re.findall(voltage_pattern, text):
        if low and high:
            voltage_values.add(f"{low}-{high}")
        elif single:
            voltage_values.add(single)

    # The lumens pattern has a nested group; only the full number is wanted.
    lumen_values = {match[0] for match in re.findall(lumens_pattern, text)}

    return {
        "Power (W)": sorted(set(re.findall(power_pattern, text)), key=int),
        # Ranges sort by their lower bound, single values by themselves.
        "Voltage (V)": sorted(voltage_values, key=lambda v: int(v.split('-')[0])),
        "Current (A)": sorted(set(re.findall(current_pattern, text)), key=float),
        "Lumens": sorted(lumen_values, key=float),
        "Efficacy (lm/W)": sorted(set(re.findall(efficacy_pattern, text)), key=float),
        "CCT (K)": sorted(set(re.findall(cct_pattern, text)), key=int),
        "Beam Angles (°)": sorted(set(re.findall(beam_angle_pattern, text)), key=int),
    }

# Extract images from PDF
def extract_images_from_pdf(pdf_path):
    """Return every image referenced by the pages of a PDF as RGB PIL images.

    The document is handled by a ``with`` block so it is closed even when an
    image cannot be extracted or decoded.

    Args:
        pdf_path: Path of the PDF file to scan.

    Returns:
        List of ``PIL.Image`` objects in RGB mode, in page order. An image
        referenced on several pages appears once per reference.
    """
    images = []
    with fitz.open(pdf_path) as doc:
        for page in doc:
            for img in page.get_images(full=True):
                xref = img[0]  # first field of each entry is what extract_image expects
                image_bytes = doc.extract_image(xref)["image"]
                images.append(Image.open(io.BytesIO(image_bytes)).convert('RGB'))
    return images

# Extract image features
def extract_image_features(image):
    """Build one feature vector for ``image`` from three extractors.

    The output of the VGG19 ``features`` module, the ResNet50 output and the
    HOG descriptor are each scaled by a fixed weight (0.5, 0.3 and 0.2) and
    concatenated in that order.

    Args:
        image: Image to describe (passed unchanged to the three extractors).

    Returns:
        1-D NumPy array holding the weighted, concatenated features.
    """
    weighted_parts = (
        (extract_intermediate_features(image, vgg_model, 'features'), 0.5),
        (extract_resnet_features(image), 0.3),
        (extract_hog_features(image), 0.2),
    )
    return np.concatenate([part * weight for part, weight in weighted_parts])

def extract_intermediate_features(image, model, layer):
    """Run ``image`` through ``model`` and return one sub-module's output.

    A forward hook is attached to the direct child module named ``layer`` to
    capture its output. The hook is removed in a ``finally`` clause so it
    cannot pile up on the shared model if preprocessing or the forward pass
    raises.

    Args:
        image: Image accepted by the module-level ``preprocess`` transform.
        model: Network whose direct child ``layer`` is observed.
        layer: Name of that child in ``model._modules``.

    Returns:
        The captured output, flattened into a 1-D NumPy array.

    Raises:
        AttributeError: If ``model`` has no direct child called ``layer``.
    """
    captured = {}

    def hook_fn(module, input, output):
        captured['output'] = output

    handle = model._modules.get(layer).register_forward_hook(hook_fn)
    try:
        input_tensor = preprocess(image).unsqueeze(0)  # add a leading batch axis
        with torch.no_grad():
            model(input_tensor)
    finally:
        handle.remove()
    return captured['output'].flatten().numpy()

def extract_resnet_features(image):
    """Return the flattened output of ``resnet_model`` for ``image``.

    The image goes through the module-level ``preprocess`` transform, gets a
    leading batch axis and is evaluated without gradient tracking.
    """
    batch = preprocess(image).unsqueeze(0)
    with torch.no_grad():
        output = resnet_model(batch)
    return output.flatten().numpy()

def extract_hog_features(image):
    """Return the HOG descriptor of ``image``.

    The image is resized to 128x128 and converted to grayscale, then described
    with 8 orientations, 16x16-pixel cells and 1x1-cell blocks.

    Args:
        image: Image object providing ``resize`` (an RGB PIL image in this
            script).

    Returns:
        The descriptor as a flat feature vector.
    """
    # scikit-image is imported lazily, only when HOG features are requested.
    from skimage.color import rgb2gray
    from skimage.feature import hog

    grayscale = rgb2gray(np.array(image.resize((128, 128))))
    return hog(
        grayscale,
        orientations=8,
        pixels_per_cell=(16, 16),
        cells_per_block=(1, 1),
        visualize=False,
        feature_vector=True,
    )

# Calculate similarity scores
def calculate_similarity(input_data, pdf_data):
    """Score how well a PDF's specifications match the requested ones.

    For each of the seven specification fields:

    * one point if any requested value equals a value extracted from the PDF;
    * one more point for every requested value whose best fuzzy match among
      the PDF values scores above 80 with ``fuzz.token_sort_ratio``.

    Requested values are stripped before comparison, so input typed with
    surrounding spaces still matches, and blank values are skipped so an
    empty entry field is never handed to the fuzzy matcher.

    NOTE(review): an exact match normally clears the fuzzy threshold too, so
    it contributes twice - confirm this weighting is intended.

    Args:
        input_data: dict mapping each specification label to a list of
            requested values (strings).
        pdf_data: dict with the same labels mapping to the values extracted
            from one PDF (strings).

    Returns:
        The accumulated integer score.

    Raises:
        KeyError: If either dict lacks one of the seven labels.
    """
    spec_keys = (
        "Power (W)",
        "Voltage (V)",
        "Current (A)",
        "Efficacy (lm/W)",
        "CCT (K)",
        "Lumens",
        "Beam Angles (°)",
    )
    score = 0
    for key in spec_keys:
        candidates = pdf_data[key]
        wanted = [value.strip() for value in input_data[key]]
        wanted = [value for value in wanted if value]

        # Exact matching: at most one point per field.
        if any(value in candidates for value in wanted):
            score += 1

        # Fuzzy matching: one point per requested value with a close match.
        for value in wanted:
            best_match = process.extractOne(value, candidates, scorer=fuzz.token_sort_ratio)
            if best_match and best_match[1] > 80:  # threshold for a match
                score += 1
    return score

def calculate_image_similarity(features1, features2):
    """Return the cosine similarity of two feature vectors.

    Args:
        features1: NumPy array of features for the first image.
        features2: NumPy array of features for the second image.

    Returns:
        ``0.0`` when either array is empty, otherwise the cosine similarity
        of the two arrays viewed as single rows.
    """
    if not (features1.size and features2.size):
        return 0.0
    row_a = features1.reshape(1, -1)
    row_b = features2.reshape(1, -1)
    return cosine_similarity(row_a, row_b)[0][0]

# Process PDFs in folder and subfolders
def find_top_10_similar_pdfs(folder_path, input_data, input_image_path=None):
    """Rank the PDFs under ``folder_path`` and list the ten best in the GUI.

    Every PDF in the folder and its sub-folders is scored with
    ``calculate_similarity``. When an input image is given, the highest
    cosine similarity between that image and the images embedded in the PDF
    is added to the score (never less than 0). Progress is reported through
    ``progress_bar`` and ``progress_label``. The ten best results are written
    to ``result_box``, each followed by its path as a clickable link that
    opens the file through ``open_pdf``.

    Args:
        folder_path: Root directory to search recursively.
        input_data: Requested specifications, as expected by
            ``calculate_similarity``.
        input_image_path: Optional path of a reference image; falsy to skip
            image comparison.

    Returns:
        None. The results are shown in ``result_box``.
    """
    pdf_files = []
    for dirpath, _dirnames, filenames in os.walk(folder_path):  # includes all subdirectories
        pdf_files.extend(
            os.path.join(dirpath, name)
            for name in filenames
            if name.lower().endswith('.pdf')  # also accept ".PDF"
        )

    total = len(pdf_files)
    pdf_scores = []
    progress_bar['maximum'] = total
    # Restart from zero so a second run does not continue from the last one.
    progress_bar['value'] = 0

    start_time = time.time()

    # The reference image is the same for every PDF, so describe it once
    # instead of re-running both networks for each file.
    input_image_features = None
    if input_image_path:
        with Image.open(input_image_path) as query_image:
            input_image_features = extract_image_features(query_image.convert('RGB'))

    for i, pdf_file in enumerate(pdf_files):
        extracted_data = extract_specifications(extract_text_from_pdf(pdf_file))
        combined_score = calculate_similarity(input_data, extracted_data)

        if input_image_features is not None:
            max_image_similarity = 0
            for img in extract_images_from_pdf(pdf_file):
                image_similarity = calculate_image_similarity(
                    input_image_features, extract_image_features(img)
                )
                max_image_similarity = max(max_image_similarity, image_similarity)
            combined_score += max_image_similarity

        pdf_scores.append((pdf_file, combined_score))
        progress_bar['value'] = i + 1
        progress_label.config(text=f"Progress: {i + 1}/{total}")
        window.update()

    pdf_scores = sorted(pdf_scores, key=lambda x: x[1], reverse=True)[:10]

    elapsed_time = time.time() - start_time
    # Nothing is left to process here. Dividing by the number of files, as
    # before, raised ZeroDivisionError when the folder held no PDF.
    eta = 0.0
    progress_label.config(text=f"Progress: Completed in {elapsed_time:.2f} seconds. ETA: {eta:.2f} seconds.")

    result_box.delete(1.0, tk.END)
    for i, (pdf_path, score) in enumerate(pdf_scores):
        result_box.insert(tk.END, f"Top {i+1}: {os.path.basename(pdf_path)} | Similarity Score: {score:.4f}\n")
        # One tag per result so each path line gets its own click handler.
        tag = f"pdf{i+1}"
        result_box.tag_configure(tag, foreground="blue", underline=True)
        result_box.insert(tk.END, f"{pdf_path}\n", tag)
        # Bind the path as a default argument; a plain closure would make
        # every link open the last PDF of the loop.
        result_box.tag_bind(tag, "<Button-1>", lambda e, p=pdf_path: open_pdf(p))

# Open PDF
def open_pdf(pdf_path):
    """Open ``pdf_path`` with the system's default handler.

    The path is turned into a proper ``file:`` URI by ``pathlib``. Prefixing
    ``file://`` by hand produces a malformed URL for Windows drive paths and
    for paths with spaces or other characters that must be percent-encoded.

    Args:
        pdf_path: Path of the PDF to open.
    """
    from pathlib import Path  # local import: only needed by this function

    webbrowser.open(Path(pdf_path).resolve().as_uri())

# GUI Setup
def browse_folder():
    """Ask for a directory and show it in the folder entry.

    Cancelling the dialog leaves the current entry untouched instead of
    wiping it.
    """
    folder_selected = filedialog.askdirectory()
    if folder_selected:
        folder_entry.delete(0, tk.END)
        folder_entry.insert(0, folder_selected)

def browse_image():
    """Ask for an image file and show its path in the image entry.

    Cancelling the dialog leaves the current entry untouched.
    """
    # Extensions are given as separate patterns: a single ';'-joined string is
    # not a valid Tk pattern list and filters out every file on platforms
    # whose dialog does not interpret ';' itself.
    image_file = filedialog.askopenfilename(
        filetypes=[("Image files", ("*.png", "*.jpg", "*.jpeg"))]
    )
    if image_file:
        image_entry.delete(0, tk.END)
        image_entry.insert(0, image_file)

def process_pdfs():
    """Read the form, validate it and start the similarity search.

    The folder is validated before anything else is read. Each specification
    field is stripped of surrounding whitespace (so ``" 50 "`` can still match
    ``"50"``) and a blank field becomes an empty list rather than ``['']``,
    so blank fields are not sent through fuzzy matching for every PDF.
    """
    folder_path = folder_entry.get().strip()
    if not folder_path:
        messagebox.showerror("Error", "Please select a folder.")
        return

    input_image_path = image_entry.get().strip()

    # Keys must match the ones produced by extract_specifications().
    field_widgets = {
        "Power (W)": power_entry,
        "Voltage (V)": voltage_entry,
        "Current (A)": current_entry,
        "Efficacy (lm/W)": efficacy_entry,
        "CCT (K)": cct_entry,
        "Lumens": lumens_entry,
        "Beam Angles (°)": beam_angle_entry,
    }
    input_data = {}
    for field, widget in field_widgets.items():
        value = widget.get().strip()
        input_data[field] = [value] if value else []

    find_top_10_similar_pdfs(folder_path, input_data, input_image_path)

# Build the main window. Everything below runs at module level and creates
# the global widgets that the callbacks above read (folder_entry, image_entry,
# the seven spec entries, result_box, progress_bar, progress_label).
window = tk.Tk()
window.title("PDF Similarity Finder")

# Folder selection (grid row 0): label, entry, Browse button
folder_label = tk.Label(window, text="Folder:")
folder_label.grid(row=0, column=0, padx=10, pady=10)
folder_entry = tk.Entry(window, width=40)
folder_entry.grid(row=0, column=1, padx=10, pady=10)
folder_button = tk.Button(window, text="Browse", command=browse_folder)
folder_button.grid(row=0, column=2, padx=10, pady=10)

# Optional query image (grid row 1): label, entry, Browse button
image_label = tk.Label(window, text="Input Image (Optional):")
image_label.grid(row=1, column=0, padx=10, pady=10)
image_entry = tk.Entry(window, width=40)
image_entry.grid(row=1, column=1, padx=10, pady=10)
image_button = tk.Button(window, text="Browse", command=browse_image)
image_button.grid(row=1, column=2, padx=10, pady=10)

# Input fields for specifications: one label/entry pair per item, placed on
# grid rows 2..8 (i + 2 for the seven labels).
labels = ["Power (W):", "Voltage (V):", "Current (A):", "Efficacy (lm/W):", "CCT (K):", "Lumens:", "Beam Angle (°):"]
entries = []
for i, label_text in enumerate(labels):
    label = tk.Label(window, text=label_text)
    label.grid(row=i+2, column=0, padx=10, pady=5)
    entry = tk.Entry(window)
    entry.grid(row=i+2, column=1, padx=10, pady=5)
    entries.append(entry)

# Unpacking relies on `entries` being in the same order as `labels`.
power_entry, voltage_entry, current_entry, efficacy_entry, cct_entry, lumens_entry, beam_angle_entry = entries

# Result box (row 10): receives the ranked PDF list with clickable paths
result_box = Text(window, height=15, width=80)
result_box.grid(row=10, column=0, columnspan=3, padx=10, pady=10)

# Progress bar and label (row 11), updated from find_top_10_similar_pdfs
progress_bar = ttk.Progressbar(window, orient="horizontal", length=400, mode="determinate")
progress_bar.grid(row=11, column=0, columnspan=2, padx=10, pady=10)
progress_label = tk.Label(window, text="Progress:")
progress_label.grid(row=11, column=2, padx=10, pady=10)

# Submit button (row 12): starts the search via process_pdfs
submit_button = tk.Button(window, text="Find Top 10 PDFs", command=process_pdfs)
submit_button.grid(row=12, column=0, columnspan=3, padx=10, pady=20)

# Enter the Tk event loop; this call does not return until the window is closed.
window.mainloop()



MuPDF error: syntax error: invalid key in dict

MuPDF error: syntax error: invalid key in dict

MuPDF error: syntax error: invalid key in dict

MuPDF error: syntax error: invalid key in dict

MuPDF error: syntax error: invalid key in dict

MuPDF error: syntax error: invalid key in dict

MuPDF error: syntax error: invalid key in dict

MuPDF error: syntax error: invalid key in dict

MuPDF error: syntax error: invalid key in dict

MuPDF error: syntax error: invalid key in dict

MuPDF error: syntax error: invalid key in dict

MuPDF error: syntax error: invalid key in dict

MuPDF error: syntax error: invalid key in dict

MuPDF error: syntax error: invalid key in dict

MuPDF error: syntax error: invalid key in dict

MuPDF error: syntax error: invalid key in dict

MuPDF error: syntax error: invalid key in dict

MuPDF error: syntax error: invalid key in dict

MuPDF error: syntax error: invalid key in dict

MuPDF error: syntax error: invalid key in dict

MuPDF error: syntax error: invalid key i



Beautifying the GUI.

In [3]:
import os
import re
import time
import io
import tkinter as tk
from tkinter import filedialog, messagebox, ttk
from tkinter import Text
from PIL import Image, ImageTk
import numpy as np
import torch
import torchvision.transforms as transforms
from torchvision import models
from sklearn.metrics.pairwise import cosine_similarity
import fitz  # PyMuPDF
from fuzzywuzzy import fuzz  # For string matching
from fuzzywuzzy import process  # For matching
import webbrowser  # To open PDFs
import warnings
warnings.filterwarnings('ignore')

# Load the pre-trained VGG-19 and ResNet-50 models once, at module level, and
# switch both to eval mode; they are only used for forward passes below.
# NOTE(review): `pretrained=True` is the legacy torchvision argument; newer
# releases prefer `weights=...` — confirm against the installed version.
vgg_model = models.vgg19(pretrained=True)
resnet_model = models.resnet50(pretrained=True)
vgg_model.eval()
resnet_model.eval()

# Image preprocessing shared by both models: resize to 224x224, convert to a
# tensor, then normalize each channel with the mean/std below
# (presumably the ImageNet statistics — confirm).
preprocess = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Extract text from PDF
def extract_text_from_pdf(pdf_path):
    """Return the cleaned text of every page of a PDF, concatenated.

    The document is opened by path inside a ``with`` block so the PyMuPDF
    handle is released even when text extraction raises; the original left
    the document open.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The page texts joined in page order and passed through ``clean_text``.
    """
    with fitz.open(pdf_path) as doc:
        # join() avoids repeated string reallocation on long documents.
        text = "".join(page.get_text() for page in doc)
    return clean_text(text)

# Clean extracted text
def clean_text(text):
    """Normalize extracted PDF text for regex-based specification parsing.

    Steps:
      1. Collapse every run of whitespace (including newlines) to one space.
      2. Drop non-ASCII characters, except the degree sign (U+00B0), which
         the beam-angle pattern needs and which was previously stripped.
      3. Collapse the double spaces that step 2 can leave behind.

    Args:
        text: Raw text as extracted from the PDF.

    Returns:
        The cleaned text without leading or trailing whitespace.
    """
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^\x00-\x7F\u00B0]+', '', text)
    text = re.sub(r' {2,}', ' ', text)
    return text.strip()

# Extract specific data from cleaned text
def extract_specifications(text):
    """Pull lighting specifications out of cleaned datasheet text.

    Args:
        text: Text to scan with the regular expressions below.

    Returns:
        A dict mapping each specification name to a sorted list of the
        distinct values found, as strings. Voltage ranges are rendered as
        ``"low-high"``.
    """
    pnumber_pattern = r'IK-[A-Z0-9-]+(?:-[0-9/]+)?(?:-\w+)?(?:-\w+)?(?:-\w+)?(?: & \w+)?'
    power_pattern = r'\b(\d{1,4})\s*W\b'
    voltage_pattern = r'(\d{2,3})\s*-\s*(\d{2,3})\s*V|\b(\d{1,3})\s*V\b'
    current_pattern = r'(\d{1,3}\.\d{1,3})\s*A'
    lumens_pattern = r'(\d{3,6}(\.\d+)?)\s*Lumens?\b'
    efficacy_pattern = r'\b(\d{2,4}\.\d+|\d{2,4})\s*lm/W\b'
    cct_pattern = r'\b(\d{4})\s*K\b'
    # No trailing \b: the degree sign is a non-word character, so a word
    # boundary after it only exists when a letter/digit follows, which made
    # the old pattern miss ordinary "120°" values. The lookahead skips
    # temperatures such as "40°C" / "104 °F".
    beam_angle_pattern = r'\b(\d{1,3})\s*°(?!\s?[CF]\b)'

    partnumber = re.findall(pnumber_pattern, text)
    power = re.findall(power_pattern, text)
    voltage = re.findall(voltage_pattern, text)
    current = re.findall(current_pattern, text)
    lumens = re.findall(lumens_pattern, text)
    efficacy = re.findall(efficacy_pattern, text)
    cct = re.findall(cct_pattern, text)
    beam_angles = re.findall(beam_angle_pattern, text)

    # Each voltage match is (range_low, range_high, single); exactly one of
    # the two alternatives is populated.
    voltage_ranges = []
    for low, high, single in voltage:
        if low and high:
            voltage_ranges.append(f"{low}-{high}")
        elif single:
            voltage_ranges.append(single)

    extracted_data = {
        "Ordering Part Number": sorted(set(partnumber)),
        "Power (W)": sorted(set(power), key=int),
        # Sort ranges by their lower bound; split() is a no-op for singles.
        "Voltage (V)": sorted(set(voltage_ranges), key=lambda x: int(x.split('-')[0])),
        "Current (A)": sorted(set(current), key=float),
        # The lumens pattern has two groups; group 0 is the full number.
        "Lumens": sorted({lum[0] for lum in lumens}, key=float),
        "Efficacy (lm/W)": sorted(set(efficacy), key=float),
        "CCT (K)": sorted(set(cct), key=int),
        "Beam Angles (°)": sorted(set(beam_angles), key=int)
    }

    return extracted_data

# Extract images from PDF
def extract_images_from_pdf(pdf_path):
    """Return every embedded image of a PDF as an RGB PIL image.

    The document is handled by a ``with`` block so it is closed even when an
    embedded image cannot be extracted or decoded; previously an exception
    skipped ``doc.close()``.

    Args:
        pdf_path: Path of the PDF file.

    Returns:
        A list of RGB ``PIL.Image`` objects, in page order.
    """
    images = []
    with fitz.open(pdf_path) as doc:
        for page in doc:
            for img in page.get_images(full=True):
                xref = img[0]  # first item of each entry is the image xref
                image_bytes = doc.extract_image(xref)["image"]
                images.append(Image.open(io.BytesIO(image_bytes)).convert('RGB'))
    return images

# Extract image features
def extract_image_features(image):
    """Build one feature vector for an image from three extractors.

    The VGG, ResNet and HOG vectors are scaled by 0.5, 0.3 and 0.2
    respectively and concatenated in that order.

    Args:
        image: The image to describe (passed unchanged to each extractor).

    Returns:
        A 1-D numpy array holding the weighted, concatenated features.
    """
    # (weight, features) pairs; the extractors run in this order.
    weighted_parts = (
        (0.5, extract_intermediate_features(image, vgg_model, 'features')),
        (0.3, extract_resnet_features(image)),
        (0.2, extract_hog_features(image)),
    )
    return np.concatenate([features * weight for weight, features in weighted_parts])

def extract_intermediate_features(image, model, layer):
    """Run ``image`` through ``model`` and return one sub-module's output.

    A forward hook on ``model._modules[layer]`` captures that module's output
    during a single no-grad forward pass. The hook is removed in a
    ``finally`` clause so a failing forward pass cannot leave it registered
    on the shared model.

    Args:
        image: Image accepted by the module-level ``preprocess`` transform.
        model: The torch module to run.
        layer: Name of the direct child module whose output is captured.

    Returns:
        The captured output flattened into a 1-D numpy array.
    """
    captured = {}

    def hook_fn(module, inputs, output):
        captured['output'] = output

    handle = model._modules.get(layer).register_forward_hook(hook_fn)
    try:
        input_tensor = preprocess(image).unsqueeze(0)  # add a batch dimension
        with torch.no_grad():
            model(input_tensor)
    finally:
        handle.remove()
    return captured['output'].flatten().numpy()

def extract_resnet_features(image):
    """Return the flattened output of ``resnet_model`` for one image.

    The image is preprocessed, given a batch dimension and passed through the
    model without gradient tracking.
    NOTE(review): this is the model's final output, not an intermediate
    layer — confirm that is the intended "feature".
    """
    batch = preprocess(image).unsqueeze(0)
    with torch.no_grad():
        model_output = resnet_model(batch)
    flat = model_output.flatten()
    return flat.numpy()

def extract_hog_features(image):
    """Compute a HOG descriptor for an image.

    The image is resized to 128x128, converted to grayscale and described
    with 8 orientations, 16x16-pixel cells and 1x1-cell blocks.

    Returns:
        The HOG descriptor as a flat feature vector.
    """
    # skimage is imported lazily, only when HOG features are requested.
    from skimage.color import rgb2gray
    from skimage.feature import hog

    resized = image.resize((128, 128))
    grayscale = rgb2gray(np.array(resized))
    return hog(
        grayscale,
        orientations=8,
        pixels_per_cell=(16, 16),
        cells_per_block=(1, 1),
        visualize=False,
        feature_vector=True,
    )

# Calculate similarity scores
def calculate_similarity(input_data, pdf_data):
    """Score how well a PDF's extracted specs match the requested ones.

    For each of the seven specification fields:
      * +1 if any requested value appears verbatim in the PDF's values;
      * +1 for every requested value whose best fuzzy match
        (``token_sort_ratio``) scores above 80.

    Args:
        input_data: Dict of field name -> list of requested value strings.
        pdf_data: Dict of field name -> list of values found in the PDF.

    Returns:
        The integer total over all fields.
    """
    fields = (
        "Power (W)",
        "Voltage (V)",
        "Current (A)",
        "Efficacy (lm/W)",
        "CCT (K)",
        "Lumens",
        "Beam Angles (°)",
    )

    total = 0
    for field in fields:
        wanted = input_data[field]
        found = pdf_data[field]

        # Exact membership: at most one point per field.
        if any(value in found for value in wanted):
            total += 1

        # Fuzzy matching: one point per requested value above the threshold.
        for value in wanted:
            best_match = process.extractOne(value, found, scorer=fuzz.token_sort_ratio)
            if best_match and best_match[1] > 80:
                total += 1

    return total

def calculate_image_similarity(features1, features2):
    """Return the cosine similarity of two feature vectors.

    Args:
        features1: First feature array.
        features2: Second feature array.

    Returns:
        ``0.0`` when either array is empty, otherwise the cosine similarity
        of the two arrays, each treated as a single row.
    """
    if not (features1.size and features2.size):
        return 0.0
    row1 = features1.reshape(1, -1)
    row2 = features2.reshape(1, -1)
    return cosine_similarity(row1, row2)[0][0]

# Process PDFs in folder
def find_top_10_similar_pdfs(folder_path, input_data, input_image_path=None):
    """Rank the PDFs under a folder by similarity and show the top ten.

    Every PDF below ``folder_path`` (searched recursively, extension matched
    case-insensitively) is scored with ``calculate_similarity``. When an
    input image is given, the best cosine similarity between that image and
    any image embedded in the PDF (never less than 0) is added to the score.
    The ten best results are written to ``result_box`` with clickable paths.

    Changes from the previous version:
      * the PDF list is collected up front, so the progress maximum is known
        before processing starts;
      * the progress value is reset at the start of each run;
      * the input image's features are computed once, not once per PDF.

    Args:
        folder_path: Root directory to search.
        input_data: Dict of field name -> list of requested values.
        input_image_path: Optional path of a query image; falsy to skip
            image comparison.
    """
    start_time = time.time()

    pdf_files = [
        os.path.join(root, name)
        for root, _, files in os.walk(folder_path)
        for name in files
        if name.lower().endswith('.pdf')
    ]
    total = len(pdf_files)
    progress_bar['maximum'] = max(total, 1)  # avoid a zero-length range
    progress_bar['value'] = 0

    # Loop-invariant: describe the query image once, and only if needed.
    input_image_features = None
    if input_image_path and pdf_files:
        with Image.open(input_image_path) as input_image:
            input_image_features = extract_image_features(input_image.convert('RGB'))

    pdf_scores = []
    for done, pdf_file in enumerate(pdf_files, start=1):
        pdf_text = extract_text_from_pdf(pdf_file)
        extracted_data = extract_specifications(pdf_text)
        combined_score = calculate_similarity(input_data, extracted_data)

        if input_image_features is not None:
            max_image_similarity = 0
            for img in extract_images_from_pdf(pdf_file):
                image_features = extract_image_features(img)
                image_similarity = calculate_image_similarity(input_image_features, image_features)
                max_image_similarity = max(max_image_similarity, image_similarity)
            combined_score += max_image_similarity

        pdf_scores.append((pdf_file, combined_score))
        progress_bar['value'] = done
        progress_label.config(text=f"Progress: {done} / {total}")
        window.update()  # keep the window responsive during the long loop

    # Sort and get top 10 results
    pdf_scores = sorted(pdf_scores, key=lambda x: x[1], reverse=True)[:10]

    elapsed_time = time.time() - start_time
    eta = 0.0  # nothing is left to process once the loop has finished
    progress_label.config(text=f"Progress: Completed in {elapsed_time:.2f} seconds. ETA: {eta:.2f} seconds.")

    result_box.delete(1.0, tk.END)
    for i, (pdf_path, score) in enumerate(pdf_scores):
        result_box.insert(tk.END, f"Top {i + 1}: {os.path.basename(pdf_path)} | Similarity Score: {score:.4f}\n")
        # One tag per result so each path line opens its own PDF on click.
        result_box.tag_configure(f"pdf{i + 1}", foreground="blue", underline=True)
        result_box.insert(tk.END, f"{pdf_path}\n", f"pdf{i + 1}")
        # p=pdf_path binds the current path; a plain closure would see the last one.
        result_box.tag_bind(f"pdf{i + 1}", "<Button-1>", lambda e, p=pdf_path: open_pdf(p))

# Open PDF
def open_pdf(pdf_path):
    """Open a PDF through the ``webbrowser`` module.

    The path is converted with ``Path.as_uri()`` instead of being glued onto
    ``file://`` by hand, so drive letters, backslashes, spaces and other
    reserved characters produce a valid, percent-encoded ``file:`` URI.

    Args:
        pdf_path: Filesystem path of the PDF to open.
    """
    from pathlib import Path  # local import keeps this change self-contained

    # as_uri() requires an absolute path, hence resolve().
    webbrowser.open(Path(pdf_path).resolve().as_uri())

# GUI Setup
def browse_folder():
    """Ask the user for a directory and write it into ``folder_entry``.

    A cancelled dialog (empty result) leaves the current entry text in place.
    Note: this name is defined again further down in the same cell; the
    later definition is the one the Browse button ends up calling.
    """
    folder_selected = filedialog.askdirectory()
    if not folder_selected:
        return
    folder_entry.delete(0, tk.END)
    folder_entry.insert(0, folder_selected)

def browse_image():
    """Ask the user for an image file and write its path into ``image_entry``.

    The file-type patterns are space-separated, the form Tk accepts on every
    platform. A cancelled dialog leaves the entry untouched.
    Note: this name is defined again further down in the same cell; the
    later definition is the one the Browse button ends up calling.
    """
    image_file = filedialog.askopenfilename(
        filetypes=[("Image files", "*.png *.jpg *.jpeg")]
    )
    if not image_file:
        return
    image_entry.delete(0, tk.END)
    image_entry.insert(0, image_file)

def process_pdfs():
    """Read the form, validate it and start the similarity search.

    The folder is validated before anything else is read. Each specification
    field is stripped of surrounding whitespace (so ``" 50 "`` can still match
    ``"50"``) and a blank field becomes an empty list rather than ``['']``,
    so blank fields are not sent through fuzzy matching for every PDF.
    """
    folder_path = folder_entry.get().strip()
    if not folder_path:
        messagebox.showerror("Error", "Please select a folder.")
        return

    input_image_path = image_entry.get().strip()

    # Keys must match the ones produced by extract_specifications().
    field_widgets = {
        "Power (W)": power_entry,
        "Voltage (V)": voltage_entry,
        "Current (A)": current_entry,
        "Efficacy (lm/W)": efficacy_entry,
        "CCT (K)": cct_entry,
        "Lumens": lumens_entry,
        "Beam Angles (°)": beam_angle_entry,
    }
    input_data = {}
    for field, widget in field_widgets.items():
        value = widget.get().strip()
        input_data[field] = [value] if value else []

    find_top_10_similar_pdfs(folder_path, input_data, input_image_path)

# Create the main window. Everything below runs at module level and creates
# the global widgets that the callbacks read (folder_entry, image_entry, the
# seven spec entries, progress_bar, progress_label, result_box, image_canvas).
window = tk.Tk()
window.title("PDF Similarity Finder")
window.geometry("1000x600")  # Increased window width for image display
window.configure(bg="#f2f2f2")  # Light grey background

# Let the layout stretch with the window: column 1 (the entries) gets the
# largest share of extra width, and row 11 (the result box) takes the extra
# height.
window.columnconfigure(0, weight=1)
window.columnconfigure(1, weight=3)
window.columnconfigure(2, weight=1)
window.columnconfigure(3, weight=1)
window.rowconfigure(11, weight=1)

# Global font settings
title_font = ("Helvetica", 16, "bold")
label_font = ("Helvetica", 12)
entry_font = ("Helvetica", 10)

# Shared keyword arguments for every tk.Button (green with white bold text)
button_style = {
    "bg": "#4CAF50",
    "fg": "white",
    "font": ("Helvetica", 10, "bold"),
    "activebackground": "#45a049",
    "relief": tk.RAISED
}

# Title label (row 0); spans columns 0-2, not the image-preview column 3
title_label = tk.Label(window, text="PDF Similarity Finder", font=title_font, bg="#f2f2f2")
title_label.grid(row=0, column=0, columnspan=3, pady=20, sticky="ew")

# Folder selection (row 1). The lambda looks `browse_folder` up at click
# time, so the definition that is current when the button is pressed is used.
folder_label = tk.Label(window, text="Select Folder:", font=label_font, bg="#f2f2f2")
folder_label.grid(row=1, column=0, padx=10, pady=10, sticky=tk.W)
folder_entry = tk.Entry(window, width=50, font=entry_font)
folder_entry.grid(row=1, column=1, padx=10, pady=10, sticky="ew")
folder_button = tk.Button(window, text="Browse", command=lambda: browse_folder(), **button_style)
folder_button.grid(row=1, column=2, padx=10, pady=10)

# Image selection (row 2); same late lookup of `browse_image` via the lambda
image_label = tk.Label(window, text="Input Image (Optional):", font=label_font, bg="#f2f2f2")
image_label.grid(row=2, column=0, padx=10, pady=10, sticky=tk.W)
image_entry = tk.Entry(window, width=50, font=entry_font)
image_entry.grid(row=2, column=1, padx=10, pady=10, sticky="ew")
image_button = tk.Button(window, text="Browse", command=lambda: browse_image(), **button_style)
image_button.grid(row=2, column=2, padx=10, pady=10)

# Input fields for specifications: one label/entry pair per item, placed on
# grid rows 3..9 (i + 3 for the seven labels).
labels = ["Power (W):", "Voltage (V):", "Current (A):", "Efficacy (lm/W):", "CCT (K):", "Lumens:", "Beam Angle (°):"]
entries = []

for i, label_text in enumerate(labels):
    label = tk.Label(window, text=label_text, font=label_font, bg="#f2f2f2")
    label.grid(row=i + 3, column=0, padx=10, pady=5, sticky=tk.W)
    entry = tk.Entry(window, width=50, font=entry_font)
    entry.grid(row=i + 3, column=1, padx=10, pady=5, sticky="ew")
    entries.append(entry)

# Unpacking relies on `entries` being in the same order as `labels`.
power_entry, voltage_entry, current_entry, efficacy_entry, cct_entry, lumens_entry, beam_angle_entry = entries

# Progress bar and label (row 10), updated from find_top_10_similar_pdfs
progress_bar = ttk.Progressbar(window, orient="horizontal", length=400, mode="determinate")
progress_bar.grid(row=10, column=0, columnspan=2, padx=10, pady=10, sticky="ew")
progress_label = tk.Label(window, text="Progress: 0%", font=label_font, bg="#f2f2f2")
progress_label.grid(row=10, column=2, padx=10, pady=10, sticky=tk.W)

# Result box (scrollable, row 11): a frame holding the Text widget and its
# vertical scrollbar, packed side by side.
result_frame = tk.Frame(window, bg="#f2f2f2")
result_frame.grid(row=11, column=0, columnspan=3, padx=10, pady=10, sticky="nsew")
result_frame.grid_columnconfigure(0, weight=1)  # Allow the frame to expand with the window

result_box = Text(result_frame, height=10, width=70, font=entry_font, wrap=tk.WORD)
result_box.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)

# Wire the scrollbar and the text widget to each other.
scrollbar = ttk.Scrollbar(result_frame, command=result_box.yview)
scrollbar.pack(side=tk.RIGHT, fill=tk.Y)
result_box.config(yscrollcommand=scrollbar.set)

# Submit button (row 12): starts the search via process_pdfs
submit_button = tk.Button(window, text="Find Top 10 PDFs", command=lambda: process_pdfs(), **button_style)
submit_button.grid(row=12, column=0, columnspan=3, pady=20, sticky="ew")

# Image display (right side of the window, column 3). `image_canvas` is a
# Label, not a Canvas; display_image() sets its image.
image_display_label = tk.Label(window, text="Image Preview", font=label_font, bg="#f2f2f2")
image_display_label.grid(row=1, column=3, padx=10, pady=10, sticky=tk.W)
image_canvas = tk.Label(window, bg="#f2f2f2")
image_canvas.grid(row=2, column=3, rowspan=9, padx=10, pady=10, sticky="nsew")  # Placeholder for image

# Function for folder selection
def browse_folder():
    """Ask the user for a directory and write it into ``folder_entry``.

    A cancelled dialog returns an empty value; the entry is then left as it
    is, matching how ``browse_image`` treats a cancelled file dialog.
    """
    folder_path = filedialog.askdirectory()
    if folder_path:
        folder_entry.delete(0, tk.END)
        folder_entry.insert(0, folder_path)

# Function for image selection and display
def browse_image():
    """Let the user pick an image, record its path and preview it.

    Nothing happens when the dialog is cancelled. Otherwise the chosen path
    replaces the text of ``image_entry`` and is handed to ``display_image``.
    """
    chosen = filedialog.askopenfilename(filetypes=[("Image files", "*.jpg *.png *.jpeg")])
    if not chosen:
        return
    image_entry.delete(0, tk.END)
    image_entry.insert(0, chosen)
    display_image(chosen)

# Function to display the image on the right side
def display_image(image_path):
    """Show a thumbnail of the chosen image in the preview label.

    The file is opened in a ``with`` block so its handle is released as soon
    as the Tk image has been built; the original never closed it.

    Args:
        image_path: Path of the image file to preview.
    """
    with Image.open(image_path) as img:
        # Shrink in place to fit within 300x300, keeping the aspect ratio.
        img.thumbnail((300, 300))
        # Build the Tk image while the source file is still open.
        img_tk = ImageTk.PhotoImage(img)

    image_canvas.config(image=img_tk)
    image_canvas.image = img_tk  # Store a reference to avoid garbage collection

# Enter the Tk event loop; this call does not return until the window is closed.
window.mainloop()



Making adjustments to the image preview and making changes to the progress bar.

In [5]:
import os
import re
import time
import io
import tkinter as tk
from tkinter import filedialog, messagebox, ttk
from tkinter import Text
from PIL import Image, ImageTk
import numpy as np
import torch
import torchvision.transforms as transforms
from torchvision import models
from sklearn.metrics.pairwise import cosine_similarity
import fitz  # PyMuPDF
from fuzzywuzzy import fuzz  # For string matching
from fuzzywuzzy import process  # For matching
import webbrowser  # To open PDFs
import warnings
warnings.filterwarnings('ignore')

# Load the pre-trained VGG-19 and ResNet-50 models once, at module level, and
# switch both to eval mode; they are only used for forward passes below.
# NOTE(review): `pretrained=True` is the legacy torchvision argument; newer
# releases prefer `weights=...` — confirm against the installed version.
vgg_model = models.vgg19(pretrained=True)
resnet_model = models.resnet50(pretrained=True)
vgg_model.eval()
resnet_model.eval()

# Image preprocessing shared by both models: resize to 224x224, convert to a
# tensor, then normalize each channel with the mean/std below
# (presumably the ImageNet statistics — confirm).
preprocess = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Extract text from PDF
def extract_text_from_pdf(pdf_path):
    """Return the cleaned text of every page of a PDF, concatenated.

    The document is opened by path inside a ``with`` block so the PyMuPDF
    handle is released even when text extraction raises; the original left
    the document open.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The page texts joined in page order and passed through ``clean_text``.
    """
    with fitz.open(pdf_path) as doc:
        # join() avoids repeated string reallocation on long documents.
        text = "".join(page.get_text() for page in doc)
    return clean_text(text)

# Clean extracted text
def clean_text(text):
    """Normalize extracted PDF text for regex-based specification parsing.

    Steps:
      1. Collapse every run of whitespace (including newlines) to one space.
      2. Drop non-ASCII characters, except the degree sign (U+00B0), which
         the beam-angle pattern needs and which was previously stripped.
      3. Collapse the double spaces that step 2 can leave behind.

    Args:
        text: Raw text as extracted from the PDF.

    Returns:
        The cleaned text without leading or trailing whitespace.
    """
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^\x00-\x7F\u00B0]+', '', text)
    text = re.sub(r' {2,}', ' ', text)
    return text.strip()

# Extract specific data from cleaned text
def extract_specifications(text):
    """Pull lighting specifications out of cleaned datasheet text.

    Args:
        text: Text to scan with the regular expressions below.

    Returns:
        A dict mapping each specification name to a sorted list of the
        distinct values found, as strings. Voltage ranges are rendered as
        ``"low-high"``.
    """
    power_pattern = r'\b(\d{1,4})\s*W\b'
    voltage_pattern = r'(\d{2,3})\s*-\s*(\d{2,3})\s*V|\b(\d{1,3})\s*V\b'
    current_pattern = r'(\d{1,3}\.\d{1,3})\s*A'
    lumens_pattern = r'(\d{3,6}(\.\d+)?)\s*Lumens?\b'
    efficacy_pattern = r'\b(\d{2,4}\.\d+|\d{2,4})\s*lm/W\b'
    cct_pattern = r'\b(\d{4})\s*K\b'
    # No trailing \b: the degree sign is a non-word character, so a word
    # boundary after it only exists when a letter/digit follows, which made
    # the old pattern miss ordinary "120°" values. The lookahead skips
    # temperatures such as "40°C" / "104 °F".
    beam_angle_pattern = r'\b(\d{1,3})\s*°(?!\s?[CF]\b)'

    power = re.findall(power_pattern, text)
    voltage = re.findall(voltage_pattern, text)
    current = re.findall(current_pattern, text)
    lumens = re.findall(lumens_pattern, text)
    efficacy = re.findall(efficacy_pattern, text)
    cct = re.findall(cct_pattern, text)
    beam_angles = re.findall(beam_angle_pattern, text)

    # Each voltage match is (range_low, range_high, single); exactly one of
    # the two alternatives is populated.
    voltage_ranges = []
    for low, high, single in voltage:
        if low and high:
            voltage_ranges.append(f"{low}-{high}")
        elif single:
            voltage_ranges.append(single)

    extracted_data = {
        "Power (W)": sorted(set(power), key=int),
        # Sort ranges by their lower bound; split() is a no-op for singles.
        "Voltage (V)": sorted(set(voltage_ranges), key=lambda x: int(x.split('-')[0])),
        "Current (A)": sorted(set(current), key=float),
        # The lumens pattern has two groups; group 0 is the full number.
        "Lumens": sorted({lum[0] for lum in lumens}, key=float),
        "Efficacy (lm/W)": sorted(set(efficacy), key=float),
        "CCT (K)": sorted(set(cct), key=int),
        "Beam Angles (°)": sorted(set(beam_angles), key=int)
    }

    return extracted_data

# Extract images from PDF
def extract_images_from_pdf(pdf_path):
    """Return every embedded image of a PDF as an RGB PIL image.

    The document is handled by a ``with`` block so it is closed even when an
    embedded image cannot be extracted or decoded; previously an exception
    skipped ``doc.close()``.

    Args:
        pdf_path: Path of the PDF file.

    Returns:
        A list of RGB ``PIL.Image`` objects, in page order.
    """
    images = []
    with fitz.open(pdf_path) as doc:
        for page in doc:
            for img in page.get_images(full=True):
                xref = img[0]  # first item of each entry is the image xref
                image_bytes = doc.extract_image(xref)["image"]
                images.append(Image.open(io.BytesIO(image_bytes)).convert('RGB'))
    return images

# Extract image features
def extract_image_features(image):
    """Combine VGG, ResNet and HOG descriptors into one feature vector.

    Each descriptor is multiplied by its weight (VGG 0.5, ResNet 0.3,
    HOG 0.2) and the three results are concatenated in that order.

    Args:
        image: The image to describe (passed unchanged to each extractor).

    Returns:
        A 1-D numpy array holding the weighted, concatenated features.
    """
    vgg_part = extract_intermediate_features(image, vgg_model, 'features') * 0.5
    resnet_part = extract_resnet_features(image) * 0.3
    hog_part = extract_hog_features(image) * 0.2
    return np.concatenate([vgg_part, resnet_part, hog_part])

def extract_intermediate_features(image, model, layer):
    """Run ``image`` through ``model`` and return one sub-module's output.

    A forward hook on ``model._modules[layer]`` captures that module's output
    during a single no-grad forward pass. The hook is removed in a
    ``finally`` clause so a failing forward pass cannot leave it registered
    on the shared model.

    Args:
        image: Image accepted by the module-level ``preprocess`` transform.
        model: The torch module to run.
        layer: Name of the direct child module whose output is captured.

    Returns:
        The captured output flattened into a 1-D numpy array.
    """
    captured = {}

    def hook_fn(module, inputs, output):
        captured['output'] = output

    handle = model._modules.get(layer).register_forward_hook(hook_fn)
    try:
        input_tensor = preprocess(image).unsqueeze(0)  # add a batch dimension
        with torch.no_grad():
            model(input_tensor)
    finally:
        handle.remove()
    return captured['output'].flatten().numpy()

def extract_resnet_features(image):
    """Return the flattened output of ``resnet_model`` for one image.

    The image is preprocessed, given a batch dimension and passed through the
    model without gradient tracking.
    NOTE(review): this is the model's final output, not an intermediate
    layer — confirm that is the intended "feature".
    """
    batch = preprocess(image).unsqueeze(0)
    with torch.no_grad():
        model_output = resnet_model(batch)
    flat = model_output.flatten()
    return flat.numpy()

def extract_hog_features(image):
    """Compute a HOG descriptor for an image.

    The image is resized to 128x128, converted to grayscale and described
    with 8 orientations, 16x16-pixel cells and 1x1-cell blocks.

    Returns:
        The HOG descriptor as a flat feature vector.
    """
    # skimage is imported lazily, only when HOG features are requested.
    from skimage.color import rgb2gray
    from skimage.feature import hog

    resized = image.resize((128, 128))
    grayscale = rgb2gray(np.array(resized))
    return hog(
        grayscale,
        orientations=8,
        pixels_per_cell=(16, 16),
        cells_per_block=(1, 1),
        visualize=False,
        feature_vector=True,
    )

# Calculate similarity scores
def calculate_similarity(input_data, pdf_data):
    """Score how well a PDF's extracted specs match the requested ones.

    Each of the seven specification fields contributes one point if any
    requested value occurs verbatim among the PDF's values, plus one point
    per requested value whose best fuzzy match (``token_sort_ratio``) scores
    above 80.

    Args:
        input_data: Dict of field name -> list of requested value strings.
        pdf_data: Dict of field name -> list of values found in the PDF.

    Returns:
        The integer total over all fields.
    """
    def field_points(field):
        """Return the exact-match plus fuzzy-match points for one field."""
        queries = input_data[field]
        candidates = pdf_data[field]
        points = 1 if any(query in candidates for query in queries) else 0
        for query in queries:
            best_match = process.extractOne(query, candidates, scorer=fuzz.token_sort_ratio)
            if best_match and best_match[1] > 80:  # threshold for a match
                points += 1
        return points

    return sum(
        field_points(field)
        for field in (
            "Power (W)",
            "Voltage (V)",
            "Current (A)",
            "Efficacy (lm/W)",
            "CCT (K)",
            "Lumens",
            "Beam Angles (°)",
        )
    )

def calculate_image_similarity(features1, features2):
    """Return the cosine similarity of two feature vectors.

    Args:
        features1: First feature array.
        features2: Second feature array.

    Returns:
        ``0.0`` when either array is empty, otherwise the cosine similarity
        of the two arrays, each treated as a single row.
    """
    if not (features1.size and features2.size):
        return 0.0
    row1 = features1.reshape(1, -1)
    row2 = features2.reshape(1, -1)
    return cosine_similarity(row1, row2)[0][0]

def find_top_10_similar_pdfs(folder_path, input_data, input_image_path=None):
    """Rank the PDFs under a folder by similarity and show the top ten.

    Every PDF below ``folder_path`` (searched recursively, extension matched
    case-insensitively) is scored with ``calculate_similarity``. When an
    input image is given, the best cosine similarity between that image and
    any image embedded in the PDF (never less than 0) is added to the score.
    The ten best results are written to ``result_box`` with clickable paths,
    and the progress bar and label are reset afterwards.

    Changes from the previous version:
      * the PDF list is collected up front, so the remaining count, and
        therefore the ETA, is real; it used to be computed as
        ``len(pdf_scores) - value``, which is always zero;
      * the input image's features are computed once, not once per PDF.

    Args:
        folder_path: Root directory to search.
        input_data: Dict of field name -> list of requested values.
        input_image_path: Optional path of a query image; falsy to skip
            image comparison.
    """
    start_time = time.time()

    pdf_files = [
        os.path.join(root, name)
        for root, _, files in os.walk(folder_path)
        for name in files
        if name.lower().endswith('.pdf')
    ]
    total = len(pdf_files)
    progress_bar['maximum'] = max(total, 1)  # avoid a zero-length range
    progress_bar['value'] = 0

    # Loop-invariant: describe the query image once, and only if needed.
    input_image_features = None
    if input_image_path and pdf_files:
        with Image.open(input_image_path) as input_image:
            input_image_features = extract_image_features(input_image.convert('RGB'))

    pdf_scores = []
    for done, pdf_file in enumerate(pdf_files, start=1):
        pdf_text = extract_text_from_pdf(pdf_file)
        extracted_data = extract_specifications(pdf_text)
        combined_score = calculate_similarity(input_data, extracted_data)

        if input_image_features is not None:
            max_image_similarity = 0
            for img in extract_images_from_pdf(pdf_file):
                image_features = extract_image_features(img)
                image_similarity = calculate_image_similarity(input_image_features, image_features)
                max_image_similarity = max(max_image_similarity, image_similarity)
            combined_score += max_image_similarity

        pdf_scores.append((pdf_file, combined_score))
        progress_bar['value'] = done

        # ETA = average time per processed PDF x PDFs still to do.
        elapsed_time = time.time() - start_time
        eta = (total - done) * (elapsed_time / done)
        progress_label.config(text=f"Progress: {done}/{total}, ETA: {eta:.2f} seconds")

        window.update()  # keep the window responsive during the long loop

    # Sort and get top 10 results
    pdf_scores = sorted(pdf_scores, key=lambda x: x[1], reverse=True)[:10]

    # Reset progress bar and label
    progress_bar['value'] = 0
    progress_label.config(text="Progress: 0%")

    result_box.delete(1.0, tk.END)
    for i, (pdf_path, score) in enumerate(pdf_scores):
        result_box.insert(tk.END, f"Top {i + 1}: {os.path.basename(pdf_path)} | Similarity Score: {score:.4f}\n")
        # One tag per result so each path line opens its own PDF on click.
        result_box.tag_configure(f"pdf{i + 1}", foreground="blue", underline=True)
        result_box.insert(tk.END, f"{pdf_path}\n", f"pdf{i + 1}")
        # p=pdf_path binds the current path; a plain closure would see the last one.
        result_box.tag_bind(f"pdf{i + 1}", "<Button-1>", lambda e, p=pdf_path: open_pdf(p))

# Open PDF
def open_pdf(pdf_path):
    """Open a PDF through the ``webbrowser`` module.

    The path is converted with ``Path.as_uri()`` instead of being glued onto
    ``file://`` by hand, so drive letters, backslashes, spaces and other
    reserved characters produce a valid, percent-encoded ``file:`` URI.

    Args:
        pdf_path: Filesystem path of the PDF to open.
    """
    from pathlib import Path  # local import keeps this change self-contained

    # as_uri() requires an absolute path, hence resolve().
    webbrowser.open(Path(pdf_path).resolve().as_uri())

# GUI Setup
def browse_folder():
    """Ask the user for a directory and write it into ``folder_entry``.

    A cancelled dialog (empty result) leaves the current entry text in place.
    Note: this name is defined again further down in the same cell; the
    later definition is the one the Browse button ends up calling.
    """
    folder_selected = filedialog.askdirectory()
    if not folder_selected:
        return
    folder_entry.delete(0, tk.END)
    folder_entry.insert(0, folder_selected)

def browse_image():
    """Ask the user for an image file and write its path into ``image_entry``.

    The file-type patterns are space-separated, the form Tk accepts on every
    platform. A cancelled dialog leaves the entry untouched.
    Note: this name is defined again further down in the same cell; the
    later definition is the one the Browse button ends up calling.
    """
    image_file = filedialog.askopenfilename(
        filetypes=[("Image files", "*.png *.jpg *.jpeg")]
    )
    if not image_file:
        return
    image_entry.delete(0, tk.END)
    image_entry.insert(0, image_file)

def process_pdfs():
    """Read the form, validate it and start the similarity search.

    The folder is validated before anything else is read. Each specification
    field is stripped of surrounding whitespace (so ``" 50 "`` can still match
    ``"50"``) and a blank field becomes an empty list rather than ``['']``,
    so blank fields are not sent through fuzzy matching for every PDF.
    """
    folder_path = folder_entry.get().strip()
    if not folder_path:
        messagebox.showerror("Error", "Please select a folder.")
        return

    input_image_path = image_entry.get().strip()

    # Keys must match the ones produced by extract_specifications().
    field_widgets = {
        "Power (W)": power_entry,
        "Voltage (V)": voltage_entry,
        "Current (A)": current_entry,
        "Efficacy (lm/W)": efficacy_entry,
        "CCT (K)": cct_entry,
        "Lumens": lumens_entry,
        "Beam Angles (°)": beam_angle_entry,
    }
    input_data = {}
    for field, widget in field_widgets.items():
        value = widget.get().strip()
        input_data[field] = [value] if value else []

    find_top_10_similar_pdfs(folder_path, input_data, input_image_path)

# Create the main window. Everything below runs at module level and creates
# the global widgets that the callbacks read (folder_entry, image_entry, the
# seven spec entries, progress_bar, progress_label, result_box, image_canvas).
window = tk.Tk()
window.title("PDF Similarity Finder")
window.geometry("1000x600")  # Increased window width for image display
window.configure(bg="#f2f2f2")  # Light grey background

# Let the layout stretch with the window: column 1 (the entries) gets the
# largest share of extra width, and row 11 (the result box) takes the extra
# height.
window.columnconfigure(0, weight=1)
window.columnconfigure(1, weight=3)
window.columnconfigure(2, weight=1)
window.columnconfigure(3, weight=1)
window.rowconfigure(11, weight=1)

# Global font settings
title_font = ("Helvetica", 16, "bold")
label_font = ("Helvetica", 12)
entry_font = ("Helvetica", 10)

# Shared keyword arguments for every tk.Button (green with white bold text)
button_style = {
    "bg": "#4CAF50",
    "fg": "white",
    "font": ("Helvetica", 10, "bold"),
    "activebackground": "#45a049",
    "relief": tk.RAISED
}

# Title label (row 0); spans columns 0-2, not the image-preview column 3
title_label = tk.Label(window, text="PDF Similarity Finder", font=title_font, bg="#f2f2f2")
title_label.grid(row=0, column=0, columnspan=3, pady=20, sticky="ew")

# Folder selection (row 1). The lambda looks `browse_folder` up at click
# time, so the definition that is current when the button is pressed is used.
folder_label = tk.Label(window, text="Select Folder:", font=label_font, bg="#f2f2f2")
folder_label.grid(row=1, column=0, padx=10, pady=10, sticky=tk.W)
folder_entry = tk.Entry(window, width=50, font=entry_font)
folder_entry.grid(row=1, column=1, padx=10, pady=10, sticky="ew")
folder_button = tk.Button(window, text="Browse", command=lambda: browse_folder(), **button_style)
folder_button.grid(row=1, column=2, padx=10, pady=10)

# Image selection (row 2); same late lookup of `browse_image` via the lambda
image_label = tk.Label(window, text="Input Image (Optional):", font=label_font, bg="#f2f2f2")
image_label.grid(row=2, column=0, padx=10, pady=10, sticky=tk.W)
image_entry = tk.Entry(window, width=50, font=entry_font)
image_entry.grid(row=2, column=1, padx=10, pady=10, sticky="ew")
image_button = tk.Button(window, text="Browse", command=lambda: browse_image(), **button_style)
image_button.grid(row=2, column=2, padx=10, pady=10)

# Input fields for specifications: one label/entry pair per item, placed on
# grid rows 3..9 (i + 3 for the seven labels).
labels = ["Power (W):", "Voltage (V):", "Current (A):", "Efficacy (lm/W):", "CCT (K):", "Lumens:", "Beam Angle (°):"]
entries = []

for i, label_text in enumerate(labels):
    label = tk.Label(window, text=label_text, font=label_font, bg="#f2f2f2")
    label.grid(row=i + 3, column=0, padx=10, pady=5, sticky=tk.W)
    entry = tk.Entry(window, width=50, font=entry_font)
    entry.grid(row=i + 3, column=1, padx=10, pady=5, sticky="ew")
    entries.append(entry)

# Unpacking relies on `entries` being in the same order as `labels`.
power_entry, voltage_entry, current_entry, efficacy_entry, cct_entry, lumens_entry, beam_angle_entry = entries

# Progress bar and label (row 10), updated from find_top_10_similar_pdfs
progress_bar = ttk.Progressbar(window, orient="horizontal", length=400, mode="determinate")
progress_bar.grid(row=10, column=0, columnspan=2, padx=10, pady=10, sticky="ew")
progress_label = tk.Label(window, text="Progress: 0%", font=label_font, bg="#f2f2f2")
progress_label.grid(row=10, column=2, padx=10, pady=10, sticky=tk.W)

# Result box (scrollable, row 11): a frame holding the Text widget and its
# vertical scrollbar, packed side by side.
result_frame = tk.Frame(window, bg="#f2f2f2")
result_frame.grid(row=11, column=0, columnspan=3, padx=10, pady=10, sticky="nsew")
result_frame.grid_columnconfigure(0, weight=1)  # Allow the frame to expand with the window

result_box = Text(result_frame, height=10, width=70, font=entry_font, wrap=tk.WORD)
result_box.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)

# Wire the scrollbar and the text widget to each other.
scrollbar = ttk.Scrollbar(result_frame, command=result_box.yview)
scrollbar.pack(side=tk.RIGHT, fill=tk.Y)
result_box.config(yscrollcommand=scrollbar.set)

# Submit button (row 12): starts the search via process_pdfs
submit_button = tk.Button(window, text="Find Top 10 PDFs", command=lambda: process_pdfs(), **button_style)
submit_button.grid(row=12, column=0, columnspan=3, pady=20, sticky="ew")

# Image display (right side of the window, column 3). `image_canvas` is a
# Label, not a Canvas; display_image() sets its image. Both widgets are
# anchored to the top-left (NW) of their grid cells.
image_display_label = tk.Label(window, text="Image Preview", font=label_font, bg="#f2f2f2")
image_display_label.grid(row=1, column=3, padx=10, pady=(10, 5), sticky=tk.NW)  # Align at top-left of the right side
image_canvas = tk.Label(window, bg="#f2f2f2")
image_canvas.grid(row=2, column=3, padx=10, pady=(0, 20), sticky=tk.NW)  # Ensure it's right under the label

# Function for folder selection
def browse_folder():
    """Ask the user for a directory and write it into ``folder_entry``.

    A cancelled dialog returns an empty value; the entry is then left as it
    is, matching how ``browse_image`` treats a cancelled file dialog.
    """
    folder_path = filedialog.askdirectory()
    if folder_path:
        folder_entry.delete(0, tk.END)
        folder_entry.insert(0, folder_path)

# Function for image selection and display
def browse_image():
    """Let the user pick an image, record its path and preview it.

    Nothing happens when the dialog is cancelled. Otherwise the chosen path
    replaces the text of ``image_entry`` and is handed to ``display_image``.
    """
    chosen = filedialog.askopenfilename(filetypes=[("Image files", "*.jpg *.png *.jpeg")])
    if not chosen:
        return
    image_entry.delete(0, tk.END)
    image_entry.insert(0, chosen)
    display_image(chosen)

# Function to display the image on the right side
def display_image(image_path):
    """Show a thumbnail of the chosen image in the preview label.

    The file is opened in a ``with`` block so its handle is released as soon
    as the Tk image has been built; the original never closed it.

    Args:
        image_path: Path of the image file to preview.
    """
    with Image.open(image_path) as img:
        # Shrink in place to fit within 300x300, keeping the aspect ratio.
        img.thumbnail((300, 300))
        # Build the Tk image while the source file is still open.
        img_tk = ImageTk.PhotoImage(img)

    image_canvas.config(image=img_tk)
    image_canvas.image = img_tk  # Store a reference to avoid garbage collection

# Enter the Tk event loop; this call does not return until the window is closed.
window.mainloop()

This is the final code that has to be implemented to get the desired UI.