In [2]:
import tkinter as tk
from tkinter import filedialog, messagebox
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import fitz  # PyMuPDF for PDFs
from docx import Document
import os
import tkinter.ttk as ttk
import numpy as np
from PIL import Image, ImageTk


In [3]:
def read_text_from_file(file_path):
    """Return the plain-text content of a supported document.

    The reader is chosen from the file extension (compared case-insensitively):
    ``.pdf`` goes to ``read_text_from_pdf``, ``.docx`` goes to
    ``read_text_from_docx`` and ``.txt`` is read directly.

    Args:
        file_path: Path of the document to read.

    Returns:
        The document text as a single string.

    Raises:
        ValueError: If the extension is not ``.pdf``, ``.docx`` or ``.txt``.
    """
    ext = os.path.splitext(file_path)[1].lower()
    if ext == '.pdf':
        return read_text_from_pdf(file_path)
    if ext == '.docx':
        return read_text_from_docx(file_path)
    if ext == '.txt':
        # Decode explicitly instead of relying on the platform default code
        # page: "utf-8-sig" reads plain UTF-8 and also drops a leading BOM,
        # and errors="replace" keeps a few undecodable bytes from aborting
        # the whole comparison.
        with open(file_path, 'r', encoding='utf-8-sig', errors='replace') as file:
            return file.read()
    raise ValueError("Unsupported file format")


In [4]:
def read_text_from_pdf(file_path):
    """Extract the text of every page of a PDF and return it as one string.

    Args:
        file_path: Path of the PDF file, passed to ``fitz.open``.

    Returns:
        The concatenation of ``page.get_text()`` for each page, in page order.
    """
    # The context manager closes the document even when a page fails to
    # extract, so the file handle is never leaked.
    with fitz.open(file_path) as doc:
        # Join once instead of growing a string with += on every page.
        return "".join(page.get_text() for page in doc)


In [5]:
def read_text_from_docx(file_path):
    """Return the text of a Word document, one paragraph per line.

    Args:
        file_path: Path of the ``.docx`` file, passed to ``Document``.

    Returns:
        The ``text`` of every entry in ``doc.paragraphs`` joined with
        newline characters.
    """
    paragraph_texts = []
    for paragraph in Document(file_path).paragraphs:
        paragraph_texts.append(paragraph.text)
    return "\n".join(paragraph_texts)


In [6]:
def compute_similarity(doc1_path, doc2_path, top_n=10):
    """Compare two documents with TF-IDF weighting and cosine similarity.

    Both files are read with ``read_text_from_file``, vectorised together by a
    ``TfidfVectorizer`` (English stop words removed) and compared with
    ``cosine_similarity``.

    Args:
        doc1_path: Path of the first document.
        doc2_path: Path of the second document.
        top_n: Maximum number of shared terms to report (default 10, the
            value that used to be hard-coded). Values below zero are treated
            as zero.

    Returns:
        A ``(similarity_percentage, important_similar_words)`` tuple.
        ``similarity_percentage`` is the cosine similarity times 100, rounded
        to two decimals. ``important_similar_words`` is a list of
        ``(term, score)`` pairs for terms with a positive weight in both
        documents, ordered by descending score, where ``score`` is the product
        of the term's two TF-IDF weights rounded to four decimals.

    Raises:
        ValueError: Propagated from ``read_text_from_file`` for unsupported
            file extensions.
    """
    # NOTE(review): the vectoriser presumably raises when neither document
    # contains a usable term (e.g. both empty) - confirm against scikit-learn.
    documents = [read_text_from_file(doc1_path), read_text_from_file(doc2_path)]

    vectorizer = TfidfVectorizer(stop_words='english')
    # Fit on both documents together so they share a single vocabulary.
    vectors = vectorizer.fit_transform(documents)
    cosine_sim = cosine_similarity(vectors)

    # float() turns the NumPy scalar into a plain Python float for callers.
    similarity_percentage = round(float(cosine_sim[0][1]) * 100, 2)

    feature_names = vectorizer.get_feature_names_out()
    # Densify once; rows 0 and 1 are the weight vectors of the two documents.
    dense_vectors = vectors.toarray()
    vector1, vector2 = dense_vectors[0], dense_vectors[1]

    # Only terms that carry weight in both documents count as shared.
    shared_indices = np.flatnonzero((vector1 > 0) & (vector2 > 0))
    word_scores = vector1[shared_indices] * vector2[shared_indices]
    ranked = np.argsort(word_scores)[::-1][:max(top_n, 0)]

    important_similar_words = [
        (feature_names[shared_indices[i]], round(float(word_scores[i]), 4))
        for i in ranked
    ]

    return similarity_percentage, important_similar_words


In [7]:
def upload_file1():
    """Ask the user for the first document and remember the chosen path.

    Stores the selection in the module-level ``file1_path`` and shows its base
    name in ``file1_label``. When the dialog is cancelled, the previous
    selection and label are left untouched.
    """
    global file1_path
    selected_path = filedialog.askopenfilename(filetypes=[("PDF Files", "*.pdf"),
                                                          ("Word Files", "*.docx"),
                                                          ("Text Files", "*.txt")])
    # The dialog yields an empty value on cancel; storing it would discard a
    # valid earlier choice and blank the label.
    if not selected_path:
        return
    file1_path = selected_path
    file1_label.config(text=os.path.basename(file1_path))

def upload_file2():
    """Ask the user for the second document and remember the chosen path.

    Stores the selection in the module-level ``file2_path`` and shows its base
    name in ``file2_label``. When the dialog is cancelled, the previous
    selection and label are left untouched.
    """
    global file2_path
    selected_path = filedialog.askopenfilename(filetypes=[("PDF Files", "*.pdf"),
                                                          ("Word Files", "*.docx"),
                                                          ("Text Files", "*.txt")])
    # The dialog yields an empty value on cancel; storing it would discard a
    # valid earlier choice and blank the label.
    if not selected_path:
        return
    file2_path = selected_path
    file2_label.config(text=os.path.basename(file2_path))


In [8]:
def compute():
    """Compare the two selected documents and show the result in a dialog.

    Shows an error dialog when either document has not been selected.
    Otherwise calls ``compute_similarity`` and reports the similarity
    percentage together with the shared terms; any exception raised while
    computing is shown in an error dialog instead of propagating.
    """
    # The path globals only exist after an upload or a clear has run, so look
    # them up defensively: a bare reference raises NameError when the user
    # presses "Compute Similarity" first.
    first_path = globals().get("file1_path")
    second_path = globals().get("file2_path")
    if not (first_path and second_path):
        messagebox.showerror("Error", "Please select both documents.")
        return

    try:
        # NOTE(review): the computation runs synchronously inside this
        # callback, so the indeterminate bar presumably cannot animate while
        # it runs - confirm before relying on it as feedback.
        progress_bar.start()
        similarity_percentage, important_similar_words = compute_similarity(first_path, second_path)

        result_message = (
            f'Similarity Percentage: {similarity_percentage}%\n\nImportant Similar Words:\n'
            + "".join(f"{word}: {score}\n" for word, score in important_similar_words)
        )

        messagebox.showinfo("Similarity Result", result_message)
    except Exception as e:
        messagebox.showerror("Error", str(e))
    finally:
        progress_bar.stop()


In [9]:
def clear_selection():
    """Forget both selected documents and reset their labels.

    Sets the module-level ``file1_path`` and ``file2_path`` to ``None`` and
    restores the "No file selected" text on both file labels.
    """
    global file1_path, file2_path
    file1_path = file2_path = None
    for selection_label in (file1_label, file2_label):
        selection_label.config(text="No file selected")


In [33]:
def upload_file1():
    """Ask the user for the first document and remember the chosen path.

    Stores the selection in the module-level ``file1_path`` and shows its base
    name in ``file1_label``. When the dialog is cancelled, the previous
    selection and label are left untouched.
    """
    global file1_path
    selected_path = filedialog.askopenfilename(filetypes=[("PDF Files", "*.pdf"),
                                                          ("Word Files", "*.docx"),
                                                          ("Text Files", "*.txt")])
    # The dialog yields an empty value on cancel; storing it would discard a
    # valid earlier choice and blank the label.
    if not selected_path:
        return
    file1_path = selected_path
    file1_label.config(text=os.path.basename(file1_path))

def upload_file2():
    """Ask the user for the second document and remember the chosen path.

    Stores the selection in the module-level ``file2_path`` and shows its base
    name in ``file2_label``. When the dialog is cancelled, the previous
    selection and label are left untouched.
    """
    global file2_path
    selected_path = filedialog.askopenfilename(filetypes=[("PDF Files", "*.pdf"),
                                                          ("Word Files", "*.docx"),
                                                          ("Text Files", "*.txt")])
    # The dialog yields an empty value on cancel; storing it would discard a
    # valid earlier choice and blank the label.
    if not selected_path:
        return
    file2_path = selected_path
    file2_label.config(text=os.path.basename(file2_path))

def compute():
    """Compare the two selected documents and show the result in a dialog.

    Shows an error dialog when either document has not been selected.
    Otherwise calls ``compute_similarity`` and reports the similarity
    percentage together with the shared terms; any exception raised while
    computing is shown in an error dialog instead of propagating.
    """
    # The path globals only exist after an upload or a clear has run, so look
    # them up defensively: a bare reference raises NameError when the user
    # presses "Compute Similarity" first.
    first_path = globals().get("file1_path")
    second_path = globals().get("file2_path")
    if not (first_path and second_path):
        messagebox.showerror("Error", "Please select both documents.")
        return

    try:
        # NOTE(review): the computation runs synchronously inside this
        # callback, so the indeterminate bar presumably cannot animate while
        # it runs - confirm before relying on it as feedback.
        progress_bar.start()
        similarity_percentage, important_similar_words = compute_similarity(first_path, second_path)

        result_message = (
            f'Similarity Percentage: {similarity_percentage}%\n\nImportant Similar Words:\n'
            + "".join(f"{word}: {score}\n" for word, score in important_similar_words)
        )

        messagebox.showinfo("Similarity Result", result_message)
    except Exception as e:
        messagebox.showerror("Error", str(e))
    finally:
        progress_bar.stop()

def clear_selection():
    """Forget both selected documents and reset their labels.

    Sets the module-level ``file1_path`` and ``file2_path`` to ``None`` and
    restores the "No file selected" text on both file labels.
    """
    global file1_path, file2_path
    file1_path = file2_path = None
    for selection_label in (file1_label, file2_label):
        selection_label.config(text="No file selected")

# Selected document paths. Defined before the window is built so that
# compute() can run its "both documents selected" check even when the user
# has not uploaded or cleared anything yet (otherwise the names do not exist).
file1_path = None
file2_path = None

root = tk.Tk()
root.title("Document Similarity Checker")
root.attributes('-fullscreen', True)
# Keyboard way out of fullscreen mode, since no widget below offers one.
root.bind("<Escape>", lambda event: root.attributes('-fullscreen', False))

# Attempt to load and resize the background image using Pillow
try:
    # NOTE(review): machine-specific absolute path; when the file is missing
    # the except branch below reports it and the UI is built without a
    # background.
    image_path = "C://Users//Bhargavi//Downloads//10.png"
    bg_image_pil = Image.open(image_path)
    
    # Resize the image to fit the screen
    screen_width = root.winfo_screenwidth()
    screen_height = root.winfo_screenheight()
    
    # Use Image.Resampling.LANCZOS instead of Image.ANTIALIAS
    bg_image_pil = bg_image_pil.resize((screen_width, screen_height), Image.Resampling.LANCZOS)
    
    bg_image = ImageTk.PhotoImage(bg_image_pil)  # Convert the PIL image to PhotoImage
    bg_label = tk.Label(root, image=bg_image)
    bg_label.place(x=0, y=0, relwidth=1, relheight=1)
except Exception as e:
    messagebox.showerror("Error", f"Background image not found or unable to resize: {e}")

# Title banner, centred near the top of the window
header_label = tk.Label(root, text="Document Similarity Checker", font=("Arial", 24, "bold"), bg="#87ceeb", fg="#fff")
header_label.place(relx=0.5, rely=0.05, anchor="center")

# Centred panel holding the two upload rows
frame = tk.Frame(root, bg="#e0e0e0", padx=20, pady=20)
frame.place(relx=0.5, rely=0.5, anchor="center")

# Side by Side Document Uploads
tk.Label(frame, text="Document 1:", font=("Arial", 12), bg="#e6e6fa").grid(row=0, column=0, padx=10, pady=10)
file1_label = tk.Label(frame, text="No file selected", font=("Arial", 12), bg="#e6e6fa", fg="#333")
file1_label.grid(row=0, column=1, padx=10, pady=10)
tk.Button(frame, text="Upload Document 1", command=upload_file1, font=("Arial", 12), bg="#4caf50", fg="#fff").grid(row=0, column=2, padx=10, pady=10)

tk.Label(frame, text="Document 2:", font=("Arial", 12), bg="#e6e6fa").grid(row=1, column=0, padx=10, pady=10)
file2_label = tk.Label(frame, text="No file selected", font=("Arial", 12), bg="#e6e6fa", fg="#333")
file2_label.grid(row=1, column=1, padx=10, pady=10)
tk.Button(frame, text="Upload Document 2", command=upload_file2, font=("Arial", 12), bg="#4caf50", fg="#fff").grid(row=1, column=2, padx=10, pady=10)

# Action buttons below the upload panel
tk.Button(root, text="Compute Similarity", command=compute, font=("Arial", 16, "bold"), bg="#ff5722", fg="#fff").place(relx=0.5, rely=0.7, anchor="center")
tk.Button(root, text="Clear Selections", command=clear_selection, font=("Arial", 12), bg="#f44336", fg="#fff").place(relx=0.5, rely=0.8, anchor="center")

# Indeterminate progress bar that compute() starts and stops
progress_bar = ttk.Progressbar(root, orient="horizontal", mode="indeterminate")
progress_bar.place(relx=0.5, rely=0.9, relwidth=0.8, anchor="center")

root.mainloop()