In [52]:
!pip install PyPDF2 sentence-transformers

Collecting sentence-transformers
  Obtaining dependency information for sentence-transformers from https://files.pythonhosted.org/packages/87/fa/83b9890e4835f7ba7decbe52b5264785a10def6b3fa450506751bd3a28ca/sentence_transformers-3.3.0-py3-none-any.whl.metadata
  Downloading sentence_transformers-3.3.0-py3-none-any.whl.metadata (10 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence-transformers)
  Obtaining dependency information for transformers<5.0.0,>=4.41.0 from https://files.pythonhosted.org/packages/ed/ad/c9b96572ab7994e73c64588f8875741823f2daba70e746547fff9a2d9a54/transformers-4.46.2-py3-none-any.whl.metadata
  Downloading transformers-4.46.2-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.1/44.1 kB[0m [31m664.2 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting tokenizers<0.21,>=0.20 (from transformers<5.0.0,>=4.41.0->sentence-transformers)
  Obtaining dependency information for tokenizers<0.21,>=0.20 from https

In [1]:
import PyPDF2
import nltk
from sentence_transformers import SentenceTransformer, util
import re
from tkinter import Tk, Button, Entry, Label, filedialog, Text, Scrollbar, Frame
from tkinter.ttk import Progressbar
import torch
import pandas as pd
from tqdm import tqdm

device = torch.device("mps" if torch.has_mps else "cpu")
model = SentenceTransformer('all-MiniLM-L6-v2').to(device)

  device = torch.device("mps" if torch.has_mps else "cpu")


In [2]:
def read_pdf_files(pdf_paths):
    pdf_text_data = {}
    for pdf_path in pdf_paths:
        lines = []
        with open(pdf_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            for page in pdf_reader.pages:
                page_text = page.extract_text()
                if page_text:
                    cleaned_text = re.sub(r'[^a-zA-Z0-9.,\s]\n', '', page_text)
                    lines += [line.strip() for line in re.split(r'[.,]', cleaned_text) if line.strip()]
        pdf_text_data[pdf_path] = lines
    return pdf_text_data

def get_top_k_similar_lines(query_line, pdf_text_data, top_k=5):
    query_embedding = model.encode(query_line, convert_to_tensor=True).to(device)
    top_k_matches = []

    for pdf_path, lines in pdf_text_data.items():
        line_embeddings = model.encode(lines, convert_to_tensor=True).to(device)
        
        similarity_scores = util.cos_sim(query_embedding, line_embeddings)[0]
        
        top_k_indices = similarity_scores.topk(k=top_k).indices.tolist()
        
        for idx in tqdm(top_k_indices):
            score = similarity_scores[idx].item()
            matched_line = lines[idx]
            top_k_matches.append((score, pdf_path, matched_line))
    
    top_k_matches = sorted(top_k_matches, key=lambda x: x[0], reverse=True)[:top_k]
    top_k_matches = pd.DataFrame(top_k_matches)
    top_k_matches.columns = ['Similarity', 'Path', 'Line']    
    
    return top_k_matches

def upload_files():
    file_paths = filedialog.askopenfilenames(filetypes=[("PDF files", "*.pdf")])
    if file_paths:
        files_label.config(text=f"Files Uploaded: {len(file_paths)} files")
    return file_paths

def process_query():
    query_text = query_entry.get("1.0", "end-1c")
    query_lines = [line.strip() for line in query_text.split('.') if line.strip()]
    
    pdf_files = uploaded_files
    pdf_text_data = read_pdf_files(pdf_files)
    
    result_text.delete(1.0, "end")
    progress_bar['maximum'] = len(query_lines)
    progress_bar['value'] = 0
    root.update_idletasks()

    for idx, query_line in enumerate(query_lines):
        top_k_matches = get_top_k_similar_lines(query_line, pdf_text_data, top_k=5)
        result_text.insert("end", f"Query: {query_line}\n\n")
        result_text.insert("end", top_k_matches.to_string(index=False))
        result_text.insert("end", "\n\n" + "-"*50 + "\n\n")
        progress_bar['value'] = idx + 1
        root.update_idletasks()


In [None]:
root = Tk()
root.title("PDF Similarity Finder")
root.geometry("800x600")
root.config(bg="#f7f7f7")

uploaded_files = []

header_frame = Frame(root, bg="#1f4e79", padx=10, pady=5)
header_frame.pack(fill="x")

header_label = Label(header_frame, text="PDF Similarity Finder", font=("Helvetica", 18), fg="white", bg="#1f4e79")
header_label.pack()

upload_button = Button(root, text="Upload PDF Files", command=lambda: upload_files(), bg="#4CAF50", fg="white", font=("Helvetica", 12), relief="flat", width=20)
upload_button.pack(pady=20)

files_label = Label(root, text="No files uploaded yet.", font=("Helvetica", 12), bg="#f7f7f7")
files_label.pack(pady=5)

query_label = Label(root, text="Enter your query text:", font=("Helvetica", 12), bg="#f7f7f7")
query_label.pack(pady=10)

query_entry = Text(root, height=5, width=70, font=("Helvetica", 12))
query_entry.pack(pady=10)

process_button = Button(root, text="Find Similarity", command=process_query, bg="#2196F3", fg="white", font=("Helvetica", 12), relief="flat", width=20)
process_button.pack(pady=20)

progress_bar = Progressbar(root, orient="horizontal", length=500, mode="determinate")
progress_bar.pack(pady=10)

result_text = Text(root, height=15, width=90, font=("Helvetica", 12))
result_text.pack(pady=10)

scrollbar = Scrollbar(root, command=result_text.yview)
scrollbar.pack(side="right", fill="y")
result_text.config(yscrollcommand=scrollbar.set)

root.mainloop()