### Step 1: Data Exploration

In [1]:
import pandas as pd
import numpy as np

# Read the raw question-pair dataset from the working directory
DATA_PATH = 'questions.csv'
df = pd.read_csv(DATA_PATH)


### Step 2: Data Preparation and Cleaning

In [None]:
import pandas as pd

# Loading the dataset
df = pd.read_csv('questions.csv')

# Step 2.1: Basic Cleaning
# Drop every pair where either question is missing. The explicit .copy()
# makes df_clean an independent frame, so the column assignments below write
# to df_clean itself and do not raise SettingWithCopyWarning.
df_clean = df.dropna(subset=['question1', 'question2']).copy()

# Converting to string
df_clean['question1'] = df_clean['question1'].astype(str)
df_clean['question2'] = df_clean['question2'].astype(str)

# Step 2.2: Extracting unique questions using pandas
# Combining both question columns into one series
all_questions_series = pd.concat([
    df_clean['question1'],
    df_clean['question2']
], ignore_index=True)

# Getting unique questions (first occurrence wins, index renumbered from 0)
unique_questions = all_questions_series.drop_duplicates().reset_index(drop=True)

# Step 2.3: Creating questions database
# question_id is simply the position of the question in the de-duplicated list
questions_db = pd.DataFrame({
    'question_id': range(len(unique_questions)),
    'question_text': unique_questions
})

# Adding metadata
questions_db['question_length'] = questions_db['question_text'].str.len()
questions_db['word_count'] = questions_db['question_text'].str.split().str.len()

# Step 2.4: Saving files
# NOTE: a question whose text is literally "NA", "null", "nan", ... is written
# as-is; code that reads this file back should pass keep_default_na=False to
# pd.read_csv, otherwise such rows come back as float NaN.
questions_db.to_csv('unique_questions.csv', index=False)


### Step 3: SBERT Embeddings Generation

In [3]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import pickle
import time

# Step 3.1: Load the pre-trained SBERT model
MODEL_NAME = 'all-MiniLM-L6-v2'  # Fast and good quality
model = SentenceTransformer(MODEL_NAME)

# Step 3.2: Load your prepared questions
# keep_default_na=False: with the default NA handling, question texts such as
# "NA", "null" or "nan" are parsed as float NaN and would then be handed to
# the encoder as non-string values.
questions_df = pd.read_csv('unique_questions.csv', keep_default_na=False)

# For testing, let's start with a smaller batch
# NOTE: BATCH_SIZE is the number of questions to embed (the sample size),
# not the number of questions sent to the model per encode() call.
BATCH_SIZE = 20000
questions_sample = questions_df.head(BATCH_SIZE).copy()

# astype(str) guarantees the encoder only ever receives strings
questions_list = questions_sample['question_text'].astype(str).tolist()
if not questions_list:
    # Fail early with a clear message; otherwise the run only breaks later
    # on embeddings.shape[1] with an opaque IndexError.
    raise ValueError("unique_questions.csv contains no questions to embed")

# Step 3.3: Generate embeddings with progress tracking
start_time = time.time()

# Generate embeddings in batches to avoid memory issues
batch_size = 500  # Process 500 questions at a time
embeddings_list = []

for i in range(0, len(questions_list), batch_size):
    batch = questions_list[i:i + batch_size]
    batch_embeddings = model.encode(batch,
                                    convert_to_tensor=False,
                                    show_progress_bar=False)
    # Extending with a 2-D array appends one row (one embedding) per question
    embeddings_list.extend(batch_embeddings)

# Convert to numpy array; row i is the embedding of questions_sample row i
embeddings = np.array(embeddings_list)
end_time = time.time()

# Step 3.4: Save embeddings and metadata

# Save embeddings
np.save('question_embeddings.npy', embeddings)

# Save metadata
embeddings_metadata = {
    'model_name': MODEL_NAME,
    'embedding_dim': embeddings.shape[1],
    'num_questions': embeddings.shape[0],
    'generation_time': end_time - start_time,
    # Kept for backward compatibility: this is the sample-size cap
    # (BATCH_SIZE), not the per-call encode chunk size.
    'batch_size': BATCH_SIZE,
    'encode_batch_size': batch_size
}

with open('embeddings_metadata.pkl', 'wb') as f:
    pickle.dump(embeddings_metadata, f)

# Save the questions with IDs for reference; rows stay aligned with the rows
# of question_embeddings.npy
questions_sample.to_csv('processed_questions.csv', index=False)


### Step 4: Implement Cosine Similarity Search

In [5]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import pickle
import time

class QuestionSimilarityFinder:
    """Cosine-similarity search over pre-computed question embeddings.

    Call ``load_data()`` once to read the embeddings, the question table and
    the sentence encoder from disk; afterwards ``find_similar_questions()``
    can be called repeatedly.
    """

    def __init__(self):
        # Everything stays None until load_data() has run.
        self.embeddings = None
        self.questions_df = None
        self.model = None
        self.metadata = None

    def load_data(self):
        """Load all necessary data and models.

        Reads ``question_embeddings.npy``, ``processed_questions.csv`` and
        ``embeddings_metadata.pkl`` from the working directory and
        instantiates the ``all-MiniLM-L6-v2`` sentence encoder.

        Raises:
            ValueError: If the number of embedding rows differs from the
                number of question rows (results would point at the wrong
                question text).
        """
        # Load embeddings
        embeddings = np.load('question_embeddings.npy')

        # Load questions. keep_default_na=False keeps question texts such as
        # "NA" or "null" as strings instead of turning them into float NaN.
        questions_df = pd.read_csv('processed_questions.csv', keep_default_na=False)

        # Row i of the embeddings is looked up as row i of the table, so the
        # two files must have the same number of rows.
        if embeddings.shape[0] != len(questions_df):
            raise ValueError(
                f"question_embeddings.npy has {embeddings.shape[0]} rows but "
                f"processed_questions.csv has {len(questions_df)} rows"
            )

        self.embeddings = embeddings
        self.questions_df = questions_df

        # Load model for encoding new questions
        self.model = SentenceTransformer('all-MiniLM-L6-v2')

        # Load metadata
        # SECURITY NOTE: pickle.load can execute arbitrary code; only load a
        # metadata file that this project produced itself.
        with open('embeddings_metadata.pkl', 'rb') as f:
            self.metadata = pickle.load(f)

    def find_similar_questions(self, query_question, top_k=5, similarity_threshold=0.8):
        """
        Find the most similar questions to a given query

        Args:
            query_question (str): The question to find similarities for
            top_k (int): Number of top similar questions to return
            similarity_threshold (float): Minimum similarity score to consider

        Returns:
            list: One dict per match, best match first, with the keys
                'rank', 'question_id', 'question_text', 'similarity_score',
                'question_length' and 'word_count'. Only candidates among the
                top_k whose score is >= similarity_threshold are included, so
                the list may be shorter than top_k or empty.

        Raises:
            RuntimeError: If load_data() has not been called yet.
        """
        if self.model is None or self.embeddings is None or self.questions_df is None:
            raise RuntimeError("load_data() must be called before find_similar_questions()")

        # A non-positive top_k asks for no results; without this guard a
        # negative value would slice "all but the last k" candidates.
        if top_k <= 0:
            return []

        # Encode the query question (a batch of one)
        query_embedding = self.model.encode([query_question])

        # Calculate cosine similarity with all questions; [0] selects the
        # single query row of the (1, num_questions) result
        similarities = cosine_similarity(query_embedding, self.embeddings)[0]

        # Get top-k similar questions (indices ordered best-first)
        top_indices = np.argsort(similarities)[::-1][:top_k]

        results = []
        for rank, idx in enumerate(top_indices, start=1):
            # Plain Python float so callers don't have to deal with numpy scalars
            similarity_score = float(similarities[idx])
            if similarity_score >= similarity_threshold:
                question_data = self.questions_df.iloc[idx]
                results.append({
                    'rank': rank,
                    'question_id': question_data['question_id'],
                    'question_text': question_data['question_text'],
                    'similarity_score': similarity_score,
                    'question_length': question_data['question_length'],
                    'word_count': question_data['word_count']
                })

        return results

    def display_results(self, results):
        """Print search results to stdout in a formatted way.

        Args:
            results (list): Result dicts as returned by
                find_similar_questions().
        """
        if not results:
            print("No similar questions found above the threshold.")
            return

        print("\n" + "=" * 80)
        print("SIMILAR QUESTIONS FOUND:")
        print("=" * 80)

        for result in results:
            print(f"\nRank {result['rank']} | Similarity: {result['similarity_score']:.4f}")
            print(f"Question: {result['question_text']}")
            print(f"Stats: {result['word_count']} words, {result['question_length']} characters")
            print("-" * 60)
    



### Step 5: Output (Tkinter GUI)

In [7]:
import queue
import threading
import tkinter as tk
from tkinter import messagebox, scrolledtext

from PIL import Image, ImageTk


class QuestionSimilarityFinderGUI:
    """Tkinter front end for QuestionSimilarityFinder.

    The similarity search runs in a background thread so the window stays
    responsive. The worker thread never touches a widget: it only puts the
    finished text on a queue, which the Tk main thread polls via ``after``.
    """

    # How often (in ms) the main thread checks for a finished search
    _POLL_INTERVAL_MS = 100

    def __init__(self, root):
        """Configure the window, load the search backend and build widgets.

        Args:
            root: The ``tk.Tk`` root window that hosts the application.
        """
        self.root = root
        self.root.title("Question Similarity Finder")
        self.root.geometry("700x600")
        self.root.configure(bg="#f6fff6")

        self.finder = QuestionSimilarityFinder()
        self.finder.load_data()

        # Hands finished result text from the worker thread to the main thread
        self._result_queue = queue.Queue()

        self.create_widgets()

    def create_widgets(self):
        """Create and lay out the image, input field, buttons and output box."""
        # GIF Image
        try:
            gif_img = Image.open("assets/Astronaut with space shuttle.gif")
            gif_img = gif_img.resize((150, 150))
            # Stored on self so the image is not garbage-collected
            self.gif_photo = ImageTk.PhotoImage(gif_img)
            gif_label = tk.Label(self.root, image=self.gif_photo, bg="#f6fff6")
            gif_label.pack(pady=(10, 0))
        except Exception:
            # The image is purely decorative, so a missing or unreadable
            # file is ignored. Unlike a bare `except:`, this still lets
            # KeyboardInterrupt and SystemExit propagate.
            pass

        # Input field
        self.input_entry = tk.Entry(self.root, font=("Segoe UI", 12), width=70)
        self.input_entry.pack(pady=10)

        # Buttons
        button_frame = tk.Frame(self.root, bg="#f6fff6")
        button_frame.pack(pady=5)

        # Kept on self so it can be disabled while a search is running
        self.search_btn = tk.Button(button_frame, text="Search", bg="#165497", fg="white",
                                    font=("Segoe UI", 10, "bold"),
                                    width=10, command=self.search_thread)
        self.search_btn.pack(side=tk.LEFT, padx=10)

        clear_btn = tk.Button(button_frame, text="Clear", bg="#DC3545", fg="white", font=("Segoe UI", 10, "bold"),
                              width=10, command=self.clear_output)
        clear_btn.pack(side=tk.LEFT, padx=10)

        # Output area (read-only for the user; show_output unlocks it briefly)
        self.output_box = scrolledtext.ScrolledText(self.root, font=("Segoe UI", 10), wrap=tk.WORD, width=85, height=20,
                                                    bg="#ffffff", bd=2, relief="solid")
        self.output_box.pack(pady=15)
        self.output_box.config(state=tk.DISABLED)

    def search_thread(self):
        """Start a search without blocking the UI (Search button handler).

        All widget access happens here on the main thread; only the search
        itself is handed to a worker thread.
        """
        query = self._begin_search()
        if query is None:
            return

        # One search at a time: prevents overlapping results in the output box
        self.search_btn.config(state=tk.DISABLED)
        # daemon=True so closing the window mid-search lets the process exit
        threading.Thread(target=self._search_worker, args=(query,), daemon=True).start()
        self.root.after(self._POLL_INTERVAL_MS, self._poll_results)

    def search_question(self):
        """Run a search synchronously and show the result.

        Touches widgets directly, so it must be called from the Tk main
        thread; it blocks the UI while searching. Use search_thread() for a
        non-blocking search.
        """
        query = self._begin_search()
        if query is None:
            return
        self.show_output(self._run_query(query))

    def _begin_search(self):
        """Read and validate the query, reset the UI; return None if empty."""
        query = self.input_entry.get().strip()
        self.clear_output()
        if not query:
            self.show_output("⚠️ Please enter a valid question.")
            return None

        self.show_output("🔍 Searching for similar questions...\n")
        return query

    def _run_query(self, query):
        """Run the similarity search and return the text to display (no widget access)."""
        try:
            results = self.finder.find_similar_questions(query, top_k=5, similarity_threshold=0.7)
        except Exception as e:
            # UI boundary: report the failure to the user instead of crashing
            return f"❌ Error: {str(e)}"
        if not results:
            return "⚠️ No similar questions found above threshold."
        return self.format_results(results)

    def _search_worker(self, query):
        """Worker-thread body: compute the result text and queue it for the main thread."""
        self._result_queue.put(self._run_query(query))

    def _poll_results(self):
        """Main-thread poller: show the queued result, or check again later."""
        try:
            text = self._result_queue.get_nowait()
        except queue.Empty:
            self.root.after(self._POLL_INTERVAL_MS, self._poll_results)
            return
        self.show_output(text)
        self.search_btn.config(state=tk.NORMAL)

    def format_results(self, results):
        """Return the results as display text, one three-line entry per match.

        Args:
            results: Dicts with 'rank', 'question_text' and
                'similarity_score' keys.

        Returns:
            str: The formatted text; empty string for an empty list.
        """
        return "".join(
            f"🔹 Rank {result['rank']}\n"
            f"   Question: {result['question_text']}\n"
            f"   Similarity: {result['similarity_score']:.4f}\n\n"
            for result in results
        )

    def show_output(self, text):
        """Append text plus a newline to the read-only output box."""
        self.output_box.config(state=tk.NORMAL)
        self.output_box.insert(tk.END, text + "\n")
        self.output_box.config(state=tk.DISABLED)

    def clear_output(self):
        """Empty both the output box and the input field."""
        self.output_box.config(state=tk.NORMAL)
        self.output_box.delete("1.0", tk.END)
        self.output_box.config(state=tk.DISABLED)
        self.input_entry.delete(0, tk.END)

# --- Application entry point ---
if __name__ == "__main__":
    # The GUI constructor loads the embeddings and the encoder before it
    # builds any widget, so the window may take a moment to appear.
    root = tk.Tk()
    app = QuestionSimilarityFinderGUI(root)
    # app.root is the same Tk instance as `root`; this blocks until the
    # window is closed.
    app.root.mainloop()
