<a href="https://colab.research.google.com/github/Manya123-max/Assesments/blob/main/Quote_Retrieval_System.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Cell 1 - Package Installation:

In [32]:
# Install Required Packages
print("Installing required packages...")

!pip install -q sentence-transformers datasets transformers torch torchvision torchaudio
!pip install -q faiss-cpu pandas numpy scikit-learn
!pip install -q gradio
!pip install -q huggingface_hub accelerate fsspec

print("All packages installed successfully!")
print("Please restart runtime if prompted, then proceed to Step 2")

Installing required packages...
All packages installed successfully!
Please restart runtime if prompted, then proceed to Step 2


Cell 2 - Import Libraries and Setup

In [33]:
# Import Libraries and Setup
import os
import json
import pandas as pd
import numpy as np
import torch
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader
import faiss
from transformers import pipeline
import gradio as gr
import warnings
warnings.filterwarnings('ignore')

# Check if GPU is available.
# `device` is a notebook-wide global: ColabQuoteEmbeddingModel.__init__ reads it
# to decide where the sentence-transformer model is loaded.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")
print("Libraries imported successfully!")

Using device: cpu
Libraries imported successfully!


Cell 3 - Data Processing Class

In [34]:
# Quote Data Processor Class
class ColabQuoteDataProcessor:
    """Load the english-quotes dataset and turn it into an embedding-ready DataFrame.

    Attributes:
        dataset: ``{"train": DataFrame}`` set by load_data() / create_sample_data().
        processed_data: cleaned DataFrame with a fresh 0..n-1 index, set by
            preprocess_data().
    """

    def __init__(self):
        self.dataset = None
        self.processed_data = None

    @staticmethod
    def _limit_samples(df, max_samples, message):
        """Down-sample ``df`` to ``max_samples`` rows (fixed seed); print ``message`` if it did.

        ``max_samples=None`` disables the limit.
        """
        if max_samples is not None and len(df) > max_samples:
            df = df.sample(n=max_samples, random_state=42).reset_index(drop=True)
            print(message)
        return df

    @staticmethod
    def _normalize_tags(value):
        """Coerce one raw ``tags`` cell to a list (list/tuple kept, non-blank str wrapped, else [])."""
        if isinstance(value, list):
            return value
        if isinstance(value, tuple):
            return list(value)
        if isinstance(value, str) and value.strip():
            return [value]
        return []

    def load_data(self, max_samples=5000):
        """Load the dataset, trying three sources in order.

        1. ``pd.read_json`` on the HuggingFace ``hf://`` path.
        2. The ``datasets`` library (``load_dataset``).
        3. The built-in sample from create_sample_data().

        Args:
            max_samples: upper bound on rows kept (randomly sampled with a fixed
                seed); ``None`` keeps everything.

        Returns:
            ``{"train": DataFrame}``, also stored on ``self.dataset``.
        """
        print(" Loading dataset from HuggingFace...")
        try:
            # Load dataset using the specified method
            df = pd.read_json("hf://datasets/Abirate/english_quotes/quotes.jsonl", lines=True)
            print(f"Dataset loaded successfully. Total size: {len(df)}")

            # Limit dataset size for Colab memory constraints
            df = self._limit_samples(
                df, max_samples,
                f"Randomly sampled {max_samples} quotes for Colab optimization",
            )

            # Store as dataset format for compatibility
            self.dataset = {"train": df}
            print(f" Dataset ready with {len(df)} quotes")
            return self.dataset
        except Exception as e:
            print(f" Error loading dataset with pandas method: {e}")
            print(" Trying alternative HuggingFace datasets library...")

        try:
            # Fallback to datasets library (imported lazily so a missing package
            # simply routes to the sample data below).
            from datasets import load_dataset
            dataset = load_dataset("Abirate/english_quotes")
            df = pd.DataFrame(dataset['train'])
            df = self._limit_samples(
                df, max_samples, f"Fallback: Limited to {max_samples} samples"
            )
            self.dataset = {"train": df}
            return self.dataset
        except Exception as e2:
            print(f" Fallback also failed: {e2}")
            # Create sample data if both methods fail
            return self.create_sample_data()

    def create_sample_data(self):
        """Create a ten-quote in-memory dataset used when every loader fails.

        Returns:
            ``{"train": DataFrame}``, also stored on ``self.dataset``.
        """
        print("🔧 Creating sample dataset as fallback...")
        sample_quotes = [
            {"quote": "The only way to do great work is to love what you do.", "author": "Steve Jobs", "tags": ["motivation", "work", "success"]},
            {"quote": "Life is what happens to you while you're busy making other plans.", "author": "John Lennon", "tags": ["life", "philosophy"]},
            {"quote": "The future belongs to those who believe in the beauty of their dreams.", "author": "Eleanor Roosevelt", "tags": ["dreams", "future", "hope"]},
            {"quote": "It is during our darkest moments that we must focus to see the light.", "author": "Aristotle", "tags": ["hope", "perseverance"]},
            {"quote": "The way to get started is to quit talking and begin doing.", "author": "Walt Disney", "tags": ["action", "motivation"]},
            {"quote": "Your time is limited, don't waste it living someone else's life.", "author": "Steve Jobs", "tags": ["life", "authenticity"]},
            {"quote": "If life were predictable it would cease to be life, and be without flavor.", "author": "Eleanor Roosevelt", "tags": ["life", "unpredictability"]},
            {"quote": "The only impossible journey is the one you never begin.", "author": "Tony Robbins", "tags": ["journey", "motivation"]},
            {"quote": "In the end, we will remember not the words of our enemies, but the silence of our friends.", "author": "Martin Luther King Jr.", "tags": ["friendship", "courage"]},
            {"quote": "Success is not final, failure is not fatal: it is the courage to continue that counts.", "author": "Winston Churchill", "tags": ["success", "failure", "courage"]}
        ]

        # Create dataset structure
        df = pd.DataFrame(sample_quotes)
        self.dataset = {"train": df}
        print(f"Sample dataset created with {len(df)} quotes")
        return self.dataset

    def preprocess_data(self):
        """Clean the loaded dataset and build the text that gets embedded.

        Steps: drop rows missing quote/author, add whitespace-stripped
        ``quote_clean`` / ``author_clean`` columns (authors also lose the trailing
        comma present in the source data), normalise ``tags`` to lists, and build
        ``search_text`` from the cleaned columns.

        Returns:
            The processed DataFrame (the same object stored on
            ``self.processed_data``), indexed 0..n-1.

        Raises:
            ValueError: if no dataset has been loaded yet.
        """
        if self.dataset is None or 'train' not in self.dataset:
            raise ValueError("Dataset not loaded. Call load_data() first.")

        print("Preprocessing data...")

        # Get DataFrame from dataset
        train = self.dataset['train']
        df = train.copy() if isinstance(train, pd.DataFrame) else pd.DataFrame(train)

        print(f"Original dataset shape: {df.shape}")
        print(f"Columns: {df.columns.tolist()}")

        # Display sample data
        print("Sample data:")
        print(df.head(2))

        # Handle missing values; reset the index right away so the returned frame
        # and self.processed_data share the same gap-free positional index.
        print("Handling missing values...")
        initial_size = len(df)
        df = df.dropna(subset=['quote', 'author']).reset_index(drop=True)
        print(f"Removed {initial_size - len(df)} rows with missing quote/author")

        # Clean text
        df['quote_clean'] = df['quote'].astype(str).str.strip()
        df['author_clean'] = (
            df['author'].astype(str).str.strip().str.rstrip(',').str.strip()
        )

        # Handle tags - check if tags column exists and handle different formats
        if 'tags' in df.columns:
            print(" Processing tags column...")
            df['tags'] = df['tags'].apply(self._normalize_tags)
        else:
            print("No tags column found, creating empty tags")
            df['tags'] = [[] for _ in range(len(df))]

        # Create search text for embedding from the cleaned columns. A list
        # comprehension (rather than DataFrame.apply) also works on an empty frame.
        df['search_text'] = [
            f"Quote: {quote} Author: {author} Tags: {', '.join(map(str, tags)) if tags else 'no tags'}"
            for quote, author, tags in zip(df['quote_clean'], df['author_clean'], df['tags'])
        ]

        self.processed_data = df
        print(f"Data preprocessing completed!")
        print(f"Final dataset size: {len(df)} quotes")
        if len(df):
            print(f"Sample search text: {df['search_text'].iloc[0][:100]}...")

        return self.processed_data

print("Data Processing Class defined successfully!")

Data Processing Class defined successfully!


Cell 4 - Embedding Model Class

In [35]:
# Quote Embedding Model Class
class ColabQuoteEmbeddingModel:
    """Small wrapper that loads a SentenceTransformer for the notebook."""

    def __init__(self, model_name='all-MiniLM-L6-v2'):
        # `device` is the notebook-level global chosen in the import cell.
        self.device = device
        self.model = None
        self.model_name = model_name

    def load_model(self):
        """Load the sentence transformer on ``self.device``, retrying on CPU on failure.

        Returns:
            The loaded SentenceTransformer (also stored on ``self.model``).
        """
        print(f" Loading model: {self.model_name}")
        try:
            loaded = SentenceTransformer(self.model_name, device=str(self.device))
        except Exception as load_error:
            print(f" Error loading model: {load_error}")
            # Fallback to CPU
            loaded = SentenceTransformer(self.model_name, device='cpu')
            status = " Loaded model on CPU"
        else:
            status = f" Model loaded successfully on {self.device}"

        self.model = loaded
        print(status)
        return self.model

print(" Embedding Model Class defined successfully!")

 Embedding Model Class defined successfully!


Cell 5 - RAG Pipeline Class

In [36]:
# Enhanced Quote RAG Pipeline Class with Multi-hop and Analytics
class ColabQuoteRAGPipeline:
    """Semantic quote retrieval on top of a FAISS inner-product index.

    Attributes:
        embedding_model: object exposing ``encode(list_of_str)`` (the notebook
            passes a SentenceTransformer).
        index: FAISS ``IndexFlatIP`` built by create_embeddings().
        quotes_data: DataFrame of quotes, re-indexed 0..n-1 so that row position
            equals the vector position in ``index`` / ``embeddings``.
        embeddings: L2-normalised float32 matrix, one row per quote.
    """

    def __init__(self, embedding_model):
        self.embedding_model = embedding_model
        self.index = None
        self.quotes_data = None
        self.embeddings = None

    @staticmethod
    def _make_result(row, similarity, rank, filter_applied=None):
        """Build one result dict; ``filter_applied`` is only included when given."""
        result = {
            'quote': row['quote'],
            'author': row['author'],
            'tags': row['tags'],
            'similarity': float(similarity),
            'rank': rank,
        }
        if filter_applied is not None:
            result['filter_applied'] = filter_applied
        return result

    def _encode_query(self, text):
        """Encode one query string as a normalised (1, dim) float32 matrix."""
        # Cast first: faiss.normalize_L2 operates in place on float32 data.
        vector = np.ascontiguousarray(self.embedding_model.encode([text]), dtype='float32')
        faiss.normalize_L2(vector)
        return vector

    def create_embeddings(self, quotes_data):
        """Embed every ``search_text`` and build the FAISS index.

        Args:
            quotes_data: DataFrame with a ``search_text`` column.

        Returns:
            The populated FAISS index (also stored on ``self.index``).

        Raises:
            ValueError: if ``quotes_data`` has no rows.
        """
        print("🔮 Creating embeddings...")
        self.quotes_data = quotes_data.reset_index(drop=True)

        texts = self.quotes_data['search_text'].tolist()
        if not texts:
            raise ValueError("No quotes to embed: quotes_data is empty.")

        # Generate embeddings in batches to manage memory
        batch_size = 32
        embeddings_list = []
        for i in range(0, len(texts), batch_size):
            batch_texts = texts[i:i+batch_size]
            batch_embeddings = self.embedding_model.encode(batch_texts, convert_to_tensor=False)
            embeddings_list.append(batch_embeddings)
            print(f"Processed {min(i+batch_size, len(texts))}/{len(texts)} texts")

        # Cast to contiguous float32 *before* normalising, then normalise in place
        # so inner product equals cosine similarity.
        self.embeddings = np.ascontiguousarray(np.vstack(embeddings_list), dtype='float32')
        faiss.normalize_L2(self.embeddings)

        # Create FAISS index (inner product for similarity)
        self.index = faiss.IndexFlatIP(self.embeddings.shape[1])
        self.index.add(self.embeddings)

        print(f"Index created with {self.index.ntotal} vectors")
        return self.index

    def retrieve_quotes(self, query, top_k=5):
        """Return up to ``top_k`` quotes ranked by cosine similarity to ``query``.

        Raises:
            ValueError: if create_embeddings() has not been called.
        """
        if self.index is None:
            raise ValueError("Index not created. Call create_embeddings() first.")

        # Never ask for more neighbours than exist: FAISS pads the answer with
        # index -1, which would otherwise be read as "last row" by iloc.
        k = min(int(top_k), self.index.ntotal)
        if k <= 0:
            return []

        similarities, indices = self.index.search(self._encode_query(query), k)

        results = []
        for similarity, idx in zip(similarities[0], indices[0]):
            if 0 <= idx < len(self.quotes_data):
                row = self.quotes_data.iloc[int(idx)]
                results.append(self._make_result(row, similarity, len(results) + 1))
        return results

    def multi_hop_search(self, tags=None, author_keywords=None, content_query=None, top_k=10):
        """Filter by tags and/or author keywords, then optionally rank semantically.

        Args:
            tags: iterable of tag names; a quote matches if it carries any of
                them (case-insensitive). Blank entries are ignored.
            author_keywords: iterable of substrings matched literally and
                case-insensitively against the author. Blank entries are ignored.
            content_query: optional free text; when given, the filtered quotes
                are ranked by cosine similarity to it.
            top_k: maximum number of results.

        Returns:
            List of result dicts (``filter_applied`` is always True). Without a
            ``content_query`` the similarity is reported as 1.0.

        Raises:
            ValueError: if create_embeddings() has not been called.
        """
        if self.quotes_data is None:
            raise ValueError("Index not created. Call create_embeddings() first.")

        top_k = max(int(top_k), 0)
        # Positional boolean mask over quotes_data; positions line up with
        # self.embeddings because create_embeddings() resets the index.
        mask = np.ones(len(self.quotes_data), dtype=bool)

        # Filter by tags if provided
        wanted_tags = {
            t.strip().lower() for t in (tags or []) if isinstance(t, str) and t.strip()
        }
        if wanted_tags:
            tag_hits = self.quotes_data['tags'].apply(
                lambda row_tags: isinstance(row_tags, (list, tuple))
                and any(str(t).lower() in wanted_tags for t in row_tags)
            )
            mask &= tag_hits.to_numpy(dtype=bool)
            print(f"After tag filter ({tags}): {int(mask.sum())} quotes")

        # Filter by author keywords if provided. Keywords are matched as plain
        # substrings (regex=False) so characters like "." or "(" are literal.
        keywords = [
            kw.strip().lower() for kw in (author_keywords or [])
            if isinstance(kw, str) and kw.strip()
        ]
        if keywords:
            authors_lower = self.quotes_data['author'].str.lower()
            author_hits = np.zeros(len(self.quotes_data), dtype=bool)
            for kw in keywords:
                author_hits |= authors_lower.str.contains(
                    kw, regex=False, na=False
                ).to_numpy(dtype=bool)
            mask &= author_hits
            print(f"👤 After author filter ({author_keywords}): {int(mask.sum())} quotes")

        positions = np.flatnonzero(mask)
        if positions.size == 0 or top_k == 0:
            return []

        # If content query provided, do semantic search on filtered data
        if content_query:
            if self.embeddings is None:
                raise ValueError("Index not created. Call create_embeddings() first.")

            # Temporary index over the surviving rows only. The stored
            # embeddings were normalised in create_embeddings().
            filtered_embeddings = np.ascontiguousarray(self.embeddings[positions], dtype='float32')
            temp_index = faiss.IndexFlatIP(filtered_embeddings.shape[1])
            temp_index.add(filtered_embeddings)

            k = min(top_k, len(positions))
            similarities, indices = temp_index.search(self._encode_query(content_query), k)

            # Map back to original row positions
            results = []
            for similarity, idx in zip(similarities[0], indices[0]):
                if 0 <= idx < len(positions):
                    row = self.quotes_data.iloc[int(positions[idx])]
                    results.append(
                        self._make_result(row, similarity, len(results) + 1, filter_applied=True)
                    )
            return results

        # Return filtered data without semantic search (similarity fixed at 1.0)
        rows = self.quotes_data.iloc[positions[:top_k]]
        return [
            self._make_result(row, 1.0, rank, filter_applied=True)
            for rank, (_, row) in enumerate(rows.iterrows(), start=1)
        ]

    def get_analytics(self):
        """Summarise the loaded quotes.

        Returns:
            ``None`` when no data is loaded, otherwise a dict with
            ``total_quotes``, ``unique_authors``, ``top_authors`` (top 10),
            ``tag_distribution`` (top 20, lower-cased), ``avg_quote_length`` and,
            only when at least one tag exists, ``total_unique_tags``.
        """
        if self.quotes_data is None:
            return None

        analytics = {
            'total_quotes': len(self.quotes_data),
            'unique_authors': self.quotes_data['author'].nunique(),
            'top_authors': self.quotes_data['author'].value_counts().head(10).to_dict(),
            'tag_distribution': {},
            'avg_quote_length': self.quotes_data['quote'].str.len().mean(),
        }

        # Tag analysis
        from collections import Counter
        tag_counts = Counter(
            str(tag).lower()
            for tags in self.quotes_data['tags'] if isinstance(tags, list)
            for tag in tags
        )
        if tag_counts:
            analytics['tag_distribution'] = dict(tag_counts.most_common(20))
            analytics['total_unique_tags'] = len(tag_counts)

        return analytics

    def query(self, user_query, top_k=5):
        """Complete RAG query processing (currently retrieval only)."""
        return self.retrieve_quotes(user_query, top_k)

print("Enhanced RAG Pipeline Class defined successfully!")

Enhanced RAG Pipeline Class defined successfully!


Cell 6 - System Initialization:

In [37]:
# Load and Process Data
# Produces the notebook-level `data_processor`, `dataset` and `processed_data`
# names; `processed_data` is what the embedding cell below indexes.
print("Loading and processing data...")

data_processor = ColabQuoteDataProcessor()
# Cap the corpus at 1000 quotes (load_data samples randomly when the dataset is larger).
dataset = data_processor.load_data(max_samples=1000)
processed_data = data_processor.preprocess_data()

print(f" Data loaded: {len(processed_data)} quotes")
print(" Ready for next step!")

Loading and processing data...
 Loading dataset from HuggingFace...
Dataset loaded successfully. Total size: 2508
Randomly sampled 1000 quotes for Colab optimization
 Dataset ready with 1000 quotes
Preprocessing data...
Original dataset shape: (1000, 3)
Columns: ['quote', 'author', 'tags']
Sample data:
                                               quote                author  \
0  “If you never did you should. These things are...             Dr. Seuss   
1         “Love all, trust a few, do wrong to none.”  William Shakespeare,   

                             tags  
0                         [suess]  
1  [do-wrong, love, trust, wrong]  
Handling missing values...
Removed 0 rows with missing quote/author
 Processing tags column...
Data preprocessing completed!
Final dataset size: 1000 quotes
Sample search text: Quote: “If you never did you should. These things are fun and fun is good.” Author: Dr. Seuss Tags: ...
 Data loaded: 1000 quotes
 Ready for next step!


In [38]:
# Load Embedding Model
# `embedding_model` is the wrapper object; `model` is the value returned by
# load_model() and is what gets passed to the RAG pipeline in the next cell.
print(" Loading embedding model...")

embedding_model = ColabQuoteEmbeddingModel()
model = embedding_model.load_model()

print(" Model loaded successfully!")
print(" Ready for embedding creation!")

 Loading embedding model...
 Loading model: all-MiniLM-L6-v2
 Model loaded successfully on cpu
 Model loaded successfully!
 Ready for embedding creation!


In [39]:
# Create RAG Pipeline and Embeddings
# Builds the FAISS index over `processed_data` and times how long it takes.
print(" Creating RAG pipeline and embeddings...")
print(" This may take a few minutes depending on dataset size...")

rag_pipeline = ColabQuoteRAGPipeline(model)

# Add progress tracking
import time
start_time = time.time()

rag_pipeline.create_embeddings(processed_data)

end_time = time.time()
print(f" Embedding creation took {end_time - start_time:.2f} seconds")

# Store in global variable for easy access.
# `rag_system` is the name every search/analytics function below reads.
rag_system = rag_pipeline

print(" System fully initialized!")
print(f" Ready to search through {len(processed_data)} quotes")
print(" Proceed to next cell for testing!")

 Creating RAG pipeline and embeddings...
 This may take a few minutes depending on dataset size...
🔮 Creating embeddings...
Processed 32/1000 texts
Processed 64/1000 texts
Processed 96/1000 texts
Processed 128/1000 texts
Processed 160/1000 texts
Processed 192/1000 texts
Processed 224/1000 texts
Processed 256/1000 texts
Processed 288/1000 texts
Processed 320/1000 texts
Processed 352/1000 texts
Processed 384/1000 texts
Processed 416/1000 texts
Processed 448/1000 texts
Processed 480/1000 texts
Processed 512/1000 texts
Processed 544/1000 texts
Processed 576/1000 texts
Processed 608/1000 texts
Processed 640/1000 texts
Processed 672/1000 texts
Processed 704/1000 texts
Processed 736/1000 texts
Processed 768/1000 texts
Processed 800/1000 texts
Processed 832/1000 texts
Processed 864/1000 texts
Processed 896/1000 texts
Processed 928/1000 texts
Processed 960/1000 texts
Processed 992/1000 texts
Processed 1000/1000 texts
Index created with 1000 vectors
 Embedding creation took 193.89 seconds
 Syste

Cell 7 - Test Search Function

In [40]:
# Quick Test to Verify System
print(" Testing the system...")

# Simple test function
def quick_test():
    """Smoke-test the global ``rag_system`` with one fixed query.

    Prints the top two hits and returns True when the query yields results;
    returns False when nothing comes back or any exception is raised.
    """
    probe = "motivation"
    try:
        print(f" Testing query: '{probe}'")
        hits = rag_system.query(probe, top_k=2)

        # Guard clause: an empty answer counts as a failed check.
        if not hits:
            print(" No results found")
            return False

        print(" System is working!")
        for position, hit in enumerate(hits, start=1):
            print(f"{position}. \"{hit['quote'][:50]}...\" - {hit['author']}")
        return True

    except Exception as err:
        print(f" Test failed: {err}")
        return False

# Run the test
if quick_test():
    print(" System ready for Gradio interface!")
else:
    print(" Please check previous cells for errors")

 Testing the system...
 Testing query: 'motivation'
 System is working!
1. "“The starting point of all achievement is DESIRE. ..." - Napoleon Hill,
2. "“Of course motivation is not permanent. But then, ..." - Zig Ziglar,
 System ready for Gradio interface!


In [41]:
# Define the main search function for Gradio
def search_quotes(query, num_results=5):
    """Run a semantic search on the global ``rag_system`` and format it as Markdown.

    Note: a function with the same name is defined again in the
    "Enhanced search functions" cell further down and replaces this one once
    that cell has run.

    Returns a Markdown string: the formatted hits, a "no quotes" notice, a
    prompt for a valid query, or a failure message if the search raised.
    """
    print(f" Searching for: '{query}'")  # Debug print

    if not query.strip():
        return " Please enter a valid query."

    try:
        # Query the system
        matches = rag_system.query(query, top_k=num_results)
        if not matches:
            return f"No quotes found for: '{query}'"

        # Collect the pieces and join once at the end.
        divider = "\n" + "─" * 50 + "\n\n"
        chunks = [f" **Search Results for:** '{query}'\n\n"]
        for position, match in enumerate(matches, start=1):
            chunks.append(f"**{position}. Quote (Similarity: {match['similarity']:.3f})**\n")
            chunks.append(f"💬 \"{match['quote']}\"\n")
            chunks.append(f"👤 **Author:** {match['author']}\n")
            if match['tags']:
                chunks.append(f" **Tags:** {', '.join(match['tags'])}\n")
            chunks.append(divider)
        return "".join(chunks)

    except Exception as err:
        print(f" Search error: {err}")  # Debug print
        return f" Search failed: {str(err)}"

print(" Search function defined!")

 Search function defined!


Cell 8 - Gradio Interface:

In [42]:
# Enhanced search functions with multi-hop queries and analytics
import json
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

def search_quotes(query, num_results=5):
    """Standard search for quotes based on user query.

    This definition replaces the earlier ``search_quotes`` from the test cell.

    Args:
        query: free-text query; ``None`` or blank input yields a prompt message
            instead of an error.
        num_results: number of hits to request; coerced to ``int`` because UI
            sliders may deliver floats.

    Returns:
        A Markdown string with the formatted hits, or a notice / failure message.
    """
    print(f"Searching for: '{query}'")

    # Guard against None as well as blank text (strip() on None would raise
    # before the try block could report it).
    if not query or not query.strip():
        return "Please enter a valid query."

    try:
        retrieved_quotes = rag_system.query(query, top_k=int(num_results))

        if not retrieved_quotes:
            return f"No quotes found for: '{query}'"

        divider = "\n" + "─" * 50 + "\n\n"
        parts = [f"**Search Results for:** '{query}'\n\n"]

        for i, quote in enumerate(retrieved_quotes, 1):
            parts.append(f"**{i}. Quote (Similarity: {quote['similarity']:.3f})**\n")
            parts.append(f"💬 \"{quote['quote']}\"\n")
            parts.append(f"👤 **Author:** {quote['author']}\n")

            if quote['tags']:
                # map(str, ...) keeps the join safe if a tag is not a string.
                parts.append(f"**Tags:** {', '.join(map(str, quote['tags']))}\n")

            parts.append(divider)

        return "".join(parts)

    except Exception as e:
        print(f"Search error: {e}")
        return f"Search failed: {str(e)}"

def multi_hop_search(tags_input, author_input, content_query, num_results=10):
    """Advanced multi-hop search with filtering (UI wrapper).

    Parses the comma-separated text fields, delegates to
    ``rag_system.multi_hop_search`` and formats the hits as Markdown.

    Args:
        tags_input: comma-separated tags; ``None``/blank means no tag filter.
        author_input: comma-separated author keywords; ``None``/blank means no
            author filter.
        content_query: free text for semantic ranking; ``None``/blank disables it.
        num_results: maximum number of hits; coerced to ``int``.

    Returns:
        A Markdown string with the results, or a notice / failure message.
    """
    print(f"Multi-hop search - Tags: {tags_input}, Authors: {author_input}, Content: {content_query}")

    def split_csv(raw):
        """Split on commas, trim each piece and drop blanks; None when nothing is left."""
        pieces = [piece.strip() for piece in (raw or "").split(',')]
        return [piece for piece in pieces if piece] or None

    # Parse inputs. Blank tokens (e.g. from "life, , love" or " , ") are dropped
    # so they cannot act as empty-string filters.
    tags = split_csv(tags_input)
    authors = split_csv(author_input)
    content = (content_query or "").strip() or None

    if not any([tags, authors, content]):
        return "Please provide at least one search criteria."

    try:
        results = rag_system.multi_hop_search(
            tags=tags,
            author_keywords=authors,
            content_query=content,
            top_k=int(num_results)
        )

        if not results:
            return "No quotes found matching your criteria."

        # Format response
        response = f"**Multi-hop Search Results**\n"
        if tags:
            response += f"**Tags:** {', '.join(tags)}\n"
        if authors:
            response += f"**Authors:** {', '.join(authors)}\n"
        if content:
            response += f"**Content:** {content}\n"
        response += f"**Found:** {len(results)} quotes\n\n"

        for i, quote in enumerate(results, 1):
            response += f"**{i}.💭Quote**"
            # A similarity of exactly 1.0 is the placeholder used when no
            # semantic ranking was done, so it is not displayed.
            if 'similarity' in quote and quote['similarity'] < 1.0:
                response += f" (Similarity: {quote['similarity']:.3f})"
            response += "\n"
            response += f"\"{quote['quote']}\"\n"
            response += f"**👤Author:** {quote['author']}\n"

            if quote['tags']:
                response += f"**Tags:** {', '.join(map(str, quote['tags']))}\n"

            response += "\n" + "─" * 50 + "\n\n"

        return response

    except Exception as e:
        print(f"Multi-hop search error: {e}")
        return f"Multi-hop search failed: {str(e)}"

def generate_analytics():
    """Generate analytics and visualizations.

    Reads the global ``rag_system``, draws a 2x2 matplotlib figure (top
    authors, top tags, quote-length histogram, text summary) and shows it.

    Returns:
        A status string: a success message, "Analytics not available" when
        ``get_analytics()`` returns nothing, or a failure message if any
        exception was raised (the exception is printed, not propagated).
    """
    try:
        analytics = rag_system.get_analytics()
        if not analytics:
            return "Analytics not available"

        # Create visualizations
        fig, axes = plt.subplots(2, 2, figsize=(15, 12))
        fig.suptitle('Quote Dataset Analytics', fontsize=16, fontweight='bold')

        # 1. Top Authors (top-left panel, horizontal bars)
        top_authors = analytics['top_authors']
        if top_authors:
            authors = list(top_authors.keys())[:8]  # Top 8
            counts = list(top_authors.values())[:8]

            axes[0,0].barh(authors, counts, color='skyblue')
            axes[0,0].set_title('👤 Top Authors by Quote Count')
            axes[0,0].set_xlabel('Number of Quotes')

            # Add value labels just to the right of each bar
            for i, v in enumerate(counts):
                axes[0,0].text(v + 0.1, i, str(v), va='center')

        # 2. Tag Distribution (top-right panel)
        tag_dist = analytics['tag_distribution']
        if tag_dist:
            tags = list(tag_dist.keys())[:10]  # Top 10
            tag_counts = list(tag_dist.values())[:10]

            axes[0,1].bar(range(len(tags)), tag_counts, color='lightgreen')
            axes[0,1].set_title('Top Tags Distribution')
            axes[0,1].set_xlabel('Tags')
            axes[0,1].set_ylabel('Frequency')
            axes[0,1].set_xticks(range(len(tags)))
            axes[0,1].set_xticklabels(tags, rotation=45, ha='right')

        # 3. Quote Length Distribution (bottom-left panel); the dashed red
        #    line marks the average length reported by get_analytics().
        quote_lengths = rag_system.quotes_data['quote'].str.len()
        axes[1,0].hist(quote_lengths, bins=30, alpha=0.7, color='coral')
        axes[1,0].set_title('Quote Length Distribution')
        axes[1,0].set_xlabel('Quote Length (characters)')
        axes[1,0].set_ylabel('Frequency')
        axes[1,0].axvline(analytics['avg_quote_length'], color='red', linestyle='--',
                         label=f'Avg: {analytics["avg_quote_length"]:.0f}')
        axes[1,0].legend()

        # 4. Summary Stats (bottom-right panel, text only, axes hidden).
        #    'total_unique_tags' is read with .get() because it may be absent.
        axes[1,1].axis('off')
        summary_text = f"""
        Dataset Summary

        Total Quotes: {analytics['total_quotes']:,}
        Unique Authors: {analytics['unique_authors']:,}
        Unique Tags: {analytics.get('total_unique_tags', 'N/A')}
        Avg Quote Length: {analytics['avg_quote_length']:.0f} chars

        Most Prolific Author:
        {list(top_authors.keys())[0] if top_authors else 'N/A'}
        ({list(top_authors.values())[0] if top_authors else 0} quotes)

        Most Common Tag:
        {list(tag_dist.keys())[0] if tag_dist else 'N/A'}
        ({list(tag_dist.values())[0] if tag_dist else 0} occurrences)
        """

        axes[1,1].text(0.1, 0.9, summary_text, transform=axes[1,1].transAxes,
                      fontsize=12, verticalalignment='top', fontfamily='monospace')

        plt.tight_layout()
        plt.show()

        return f"Analytics generated! Check the visualizations above."

    except Exception as e:
        # Failures are reported as a string rather than raised.
        print(f"Analytics error: {e}")
        return f"Analytics generation failed: {str(e)}"

def prepare_download_data(search_results_text, search_type="standard"):
    """Prepare search results for JSON download.

    Args:
        search_results_text: the formatted results text to export.
        search_type: label stored alongside the results (e.g. "standard").

    Returns:
        A JSON string. On success it holds the timestamp, search type, results
        text and metadata; on any failure it is ``{"error": "..."}``. Both
        shapes are produced with ``json.dumps`` so the output is always valid
        JSON, even when the error message contains quotes or backslashes.
    """
    try:
        # This is a simplified version - in a full implementation,
        # you'd want to store the actual search results data structure
        now = datetime.now()  # single clock read so both timestamps agree
        quotes = rag_system.quotes_data
        download_data = {
            "timestamp": now.isoformat(),
            "search_type": search_type,
            "results_text": search_results_text,
            "metadata": {
                "total_quotes_in_db": len(quotes) if quotes is not None else 0,
                "search_timestamp": now.strftime("%Y-%m-%d %H:%M:%S")
            }
        }

        # Convert to JSON string
        return json.dumps(download_data, indent=2, ensure_ascii=False)

    except Exception as e:
        return json.dumps(
            {"error": f"Failed to prepare download: {str(e)}"}, ensure_ascii=False
        )

print("Enhanced search functions defined!")

Enhanced search functions defined!


Cell 9 - Launch Interface

In [46]:
# Enhanced Gradio Interface without analytics
def create_enhanced_interface():
    """Create enhanced Gradio interface with multi-hop search (no analytics).

    Layout: a "Standard Search" tab and a "Multi-hop Search" tab, each with a
    results panel and a "Prepare Download" button, followed by quick-start
    example buttons that fill the standard query box.

    Returns:
        The assembled ``gr.Blocks`` app (not yet launched).
    """
    # Local imports: only the download helper below needs them.
    import os
    import tempfile

    def export_results(results_text, search_type, file_prefix):
        """Write the displayed results to a timestamped JSON file and reveal the File widget.

        The File component needs a path on disk, so the JSON produced by
        prepare_download_data() is written to a fresh temporary directory.
        """
        json_str = prepare_download_data(results_text, search_type)
        stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        path = os.path.join(tempfile.mkdtemp(), f"{file_prefix}_{stamp}.json")
        with open(path, "w", encoding="utf-8") as handle:
            handle.write(json_str)
        return gr.update(value=path, visible=True)

    with gr.Blocks(title="Advanced Quote Search System", theme=gr.themes.Soft()) as demo:
        gr.Markdown("# Advanced Quote Search System")
        gr.Markdown("*Semantic search with multi-hop queries and downloads*")

        with gr.Tabs():
            # Tab 1: Standard Search
            with gr.TabItem("Standard Search"):
                gr.Markdown("### Simple semantic search for quotes")

                with gr.Row():
                    with gr.Column(scale=4):
                        query_input = gr.Textbox(
                            label="Enter your search query",
                            placeholder="e.g., 'quotes about love', 'motivation quotes', 'Steve Jobs quotes'",
                            lines=2
                        )
                    with gr.Column(scale=1):
                        num_results = gr.Slider(
                            label="Number of results",
                            minimum=1,
                            maximum=15,
                            value=5,
                            step=1
                        )

                search_btn = gr.Button("Search Quotes", variant="primary", size="lg")

                standard_results = gr.Markdown(
                    label="Search Results",
                    value="Enter a query and click 'Search Quotes' to see results."
                )

                # Download section for standard search
                with gr.Row():
                    download_std_btn = gr.Button("Prepare Download", size="sm")
                    download_std_file = gr.File(label="Download Results (JSON)", visible=False)

                search_btn.click(
                    search_quotes,
                    inputs=[query_input, num_results],
                    outputs=standard_results
                )

                # One step: results text -> JSON file -> File widget. (Chaining
                # through throwaway hidden Textboxes never carried the data
                # from the first step to the second.)
                download_std_btn.click(
                    lambda results: export_results(results, "standard", "quote_search_results"),
                    inputs=standard_results,
                    outputs=download_std_file
                )

            # Tab 2: Multi-hop Search
            with gr.TabItem("Multi-hop Search"):
                gr.Markdown("### Advanced search with multiple criteria")
                gr.Markdown("*Search by tags, authors, and content simultaneously*")

                with gr.Row():
                    with gr.Column():
                        tags_input = gr.Textbox(
                            label="Tags (comma-separated)",
                            placeholder="e.g., life, love, motivation",
                            lines=1
                        )
                        author_input = gr.Textbox(
                            label="Author keywords (comma-separated)",
                            placeholder="e.g., Einstein, Jobs, Roosevelt",
                            lines=1
                        )
                        content_input = gr.Textbox(
                            label="Content query",
                            placeholder="e.g., overcoming challenges",
                            lines=2
                        )
                    with gr.Column(scale=1):
                        multi_num_results = gr.Slider(
                            label="Max results",
                            minimum=1,
                            maximum=20,
                            value=10,
                            step=1
                        )

                multi_search_btn = gr.Button("Multi-hop Search", variant="primary", size="lg")

                multi_results = gr.Markdown(
                    label="Multi-hop Search Results",
                    value="Configure your search criteria above and click 'Multi-hop Search'."
                )

                # Download section for multi-hop search
                with gr.Row():
                    download_multi_btn = gr.Button("Prepare Download", size="sm")
                    download_multi_file = gr.File(label="Download Results (JSON)", visible=False)

                multi_search_btn.click(
                    multi_hop_search,
                    inputs=[tags_input, author_input, content_input, multi_num_results],
                    outputs=multi_results
                )

                download_multi_btn.click(
                    lambda results: export_results(results, "multi_hop", "multihop_search_results"),
                    inputs=multi_results,
                    outputs=download_multi_file
                )

                # Examples for multi-hop: each button pre-fills the three inputs
                gr.Markdown("### Multi-hop Examples:")
                with gr.Row():
                    gr.Button("Life + Love quotes", size="sm").click(
                        lambda: ("life, love", "", ""),
                        outputs=[tags_input, author_input, content_input]
                    )
                    gr.Button("Einstein quotes about science", size="sm").click(
                        lambda: ("", "Einstein", "science and discovery"),
                        outputs=[tags_input, author_input, content_input]
                    )
                    gr.Button("20th century motivation", size="sm").click(
                        lambda: ("motivation, success", "Roosevelt, Churchill, Jobs", ""),
                        outputs=[tags_input, author_input, content_input]
                    )

        # Quick Examples (bottom of interface)
        gr.Markdown("---")
        gr.Markdown("### Quick Start Examples")

        example_queries = [
            "quotes about perseverance",
            "wisdom and knowledge",
            "Steve Jobs innovation",
            "life philosophy",
            "success and failure",
            "love and relationships"
        ]

        # Two rows of three buttons; `q=query` binds the current text to each
        # lambda (a plain closure would see only the last loop value).
        for row_queries in (example_queries[:3], example_queries[3:]):
            with gr.Row():
                for query in row_queries:
                    gr.Button(query, size="sm").click(
                        lambda q=query: q, outputs=query_input
                    )

    return demo

# Create the enhanced interface.
# `demo` is the Blocks app handed to launch_with_fallback_port() below.
print("Creating enhanced Gradio interface...")
demo = create_enhanced_interface()
print("Enhanced interface created successfully!")

# Fix for port error - try multiple ports
def launch_with_fallback_port(demo, start_port=7860, max_attempts=10):
    """Launch ``demo``, moving to the next port whenever the current one is busy.

    Ports ``start_port``, ``start_port + 1``, ... are tried in order, at most
    ``max_attempts`` of them. A port counts as busy when ``demo.launch`` raises
    an ``OSError`` whose message contains "Cannot find empty port"; any other
    error aborts immediately instead of burning through the remaining ports.

    Args:
        demo: Object exposing ``launch(share=..., server_name=..., server_port=...,
            show_error=..., quiet=...)`` (the interface built earlier in this cell).
        start_port: First port to try.
        max_attempts: How many consecutive ports to try before giving up.

    Returns:
        The port on which ``demo.launch`` completed without raising.

    Raises:
        ValueError: If ``max_attempts`` is less than 1.
        OSError: If every attempted port is busy, or ``launch`` fails with an
            ``OSError`` that is not a busy-port error.
        Exception: Any other error from ``demo.launch`` is re-raised unchanged.
    """
    if max_attempts < 1:
        raise ValueError("max_attempts must be at least 1")

    for attempt in range(1, max_attempts + 1):
        port = start_port + attempt - 1
        print(f"Attempting to launch on port {port}...")

        # Only the launch call is guarded, so the handlers below react to
        # launch failures and nothing else.
        try:
            demo.launch(
                share=True,  # Creates public URL for Colab
                server_name="0.0.0.0",  # Allow external connections
                server_port=port,
                show_error=True,  # Show detailed errors
                quiet=False  # Show startup logs
            )
        except OSError as e:
            port_busy = "Cannot find empty port" in str(e)
            if port_busy and attempt < max_attempts:
                print(f"Port {port} is busy, trying next port...")
                continue

            # Report what actually happened: either the retry budget is used
            # up, or this was not a busy-port error in the first place.
            if port_busy:
                print(f"Failed to launch after {attempt} attempts")
            else:
                print(f"Failed to launch on port {port}: {e}")
            print("Try restarting your runtime or manually specify a different port")
            raise
        except Exception as e:
            print(f"Unexpected error: {e}")
            raise

        print(f"SUCCESS! Interface launched on port {port}")
        return port

# Launch the interface with port fallback.
# Print the startup banner as one block before the launch attempt begins.
print("\n".join([
    "Launching interface...",
    "The interface will be accessible via the public URL below",
    "Click the link to open the Quote Search System",
]))

try:
    launch_with_fallback_port(demo)
    print("SUCCESS! Interface is now running!")
except Exception as e:
    # Best-effort cell: report the failure and recovery steps instead of
    # letting the exception propagate out of the cell.
    print(f"Launch failed: {e}")
    print("\n".join([
        "\nTroubleshooting:",
        "1. Try restarting your Colab runtime",
        "2. Run the cleanup cell (Cell 10) first",
        "3. Then re-run cells 1-9",
    ]))

Creating enhanced Gradio interface...
Enhanced interface created successfully!
Launching interface...
The interface will be accessible via the public URL below
Click the link to open the Quote Search System
Attempting to launch on port 7860...
Port 7860 is busy, trying next port...
Attempting to launch on port 7861...
Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://219039497a88332166.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


SUCCESS! Interface launched on port 7861
SUCCESS! Interface is now running!
