In [2]:
!pip install datasets transformers sentence-transformers faiss-cpu torch pandas numpy
!pip install streamlit ragas langchain openai chromadb




In [3]:
# Import Libraries and Setup

import pandas as pd
import numpy as np
import json
import re
import pickle
from typing import List, Dict, Tuple
import warnings
warnings.filterwarnings('ignore')

In [4]:
# ML and NLP
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader
import torch

In [5]:
# Vector Search
import faiss
from sklearn.metrics.pairwise import cosine_similarity

In [6]:
# Evaluation
from ragas import evaluate
from ragas.metrics import faithfulness, answer_relevancy, context_precision, context_recall

In [7]:
# Streamlit components
import streamlit as st

print("All libraries imported successfully!")

All libraries imported successfully!


In [11]:
# Data Loading and Exploration

# Load dataset from HuggingFace hub
df = pd.read_json("hf://datasets/Abirate/english_quotes/quotes.jsonl", lines=True)

# Clean and filter out missing or bad values
df = df.dropna(subset=["quote", "author"])
df = df[df["quote"].str.strip() != ""]

# Rename and keep only what RAG needs: 'title' and 'text'
df_rag = df[["quote", "author"]].copy()
df_rag.columns = ["text", "title"]  # RAG expects these column names

# Save to CSV so we can use it later
df_rag.to_csv("quotes.csv", index=False)

# Quick preview
print(f"Dataset shape: {df_rag.shape}")
print(f"Columns: {df_rag.columns.tolist()}")
print("\nFirst 5 rows:")
print(df_rag.head())

#Basic statistics
print(f"\nDataset Info:")
print(f"Total quotes: {len(df_rag)}")
print(f"Unique authors: {df_rag['title'].nunique()}")

# Also keep original df with tags for additional processing if available
if 'tags' in df.columns:
    print(f"Tags available in original dataset")
    df_with_tags = df[["quote", "author", "tags"]].copy()
    print(f"Sample tags: {df_with_tags['tags'].dropna().head(3).tolist()}")
else:
    print("No tags column found in dataset")
    df_with_tags = df[["quote", "author"]].copy()
    df_with_tags['tags'] = ""

Dataset shape: (2508, 2)
Columns: ['text', 'title']

First 5 rows:
                                                text                  title
0     “Be yourself; everyone else is already taken.”            Oscar Wilde
1  “I'm selfish, impatient and a little insecure....         Marilyn Monroe
2  “Two things are infinite: the universe and hum...        Albert Einstein
3                   “So many books, so little time.”            Frank Zappa
4  “A room without books is like a body without a...  Marcus Tullius Cicero

Dataset Info:
Total quotes: 2508
Unique authors: 880
Tags available in original dataset
Sample tags: [['be-yourself', 'gilbert-perreira', 'honesty', 'inspirational', 'misattributed-oscar-wilde', 'quote-investigator'], ['best', 'life', 'love', 'mistakes', 'out-of-control', 'truth', 'worst'], ['human-nature', 'humor', 'infinity', 'philosophy', 'science', 'stupidity', 'universe']]


In [12]:
# Data Preprocessing and Cleaning

def preprocess_text(text):
    """Clean and preprocess text"""
    if pd.isna(text):
        return ""
    # Remove extra whitespaces
    text = re.sub(r'\s+', ' ', str(text)).strip()
    # Remove special characters but keep basic punctuation
    text = re.sub(r'[^\w\s\.,!?;:\'"()-]', '', text)
    return text

def preprocess_quotes_data(df_rag, df_with_tags):
    """Preprocess the quotes dataset"""
    # Create a copy of the RAG dataframe
    df_clean = df_rag.copy()
    
    # Handle missing values (should be minimal after initial cleaning)
    df_clean['text'] = df_clean['text'].fillna('')
    df_clean['title'] = df_clean['title'].fillna('Unknown')
    
    # Clean text fields
    df_clean['text_clean'] = df_clean['text'].apply(preprocess_text)
    df_clean['title_clean'] = df_clean['title'].apply(preprocess_text)
    
    # Remove empty quotes
    df_clean = df_clean[df_clean['text_clean'] != ''].reset_index(drop=True)
    
    # Add tags if available from original dataset
    if len(df_with_tags) == len(df_clean) and 'tags' in df_with_tags.columns:
        df_clean['tags'] = df_with_tags['tags'].fillna('')
        # Create combined text for embedding with tags
        df_clean['combined_text'] = (
            df_clean['text_clean'] + ' [AUTHOR] ' + 
            df_clean['title_clean'] + ' [TAGS] ' + 
            df_clean['tags'].astype(str)
        )
    else:
        # Create combined text without tags
        df_clean['combined_text'] = (
            df_clean['text_clean'] + ' [AUTHOR] ' + 
            df_clean['title_clean']
        )
        df_clean['tags'] = ''
    
    # For compatibility with rest of code, create quote and author columns
    df_clean['quote'] = df_clean['text']
    df_clean['author'] = df_clean['title']
    
    return df_clean

# Preprocess the data
df_processed = preprocess_quotes_data(df_rag, df_with_tags)
print(f"Processed dataset shape: {df_processed.shape}")
print(f"Sample combined text:")
print(df_processed['combined_text'].iloc[0])

Processed dataset shape: (2507, 8)
Sample combined text:
Be yourself; everyone else is already taken. [AUTHOR] Oscar Wilde


In [13]:
# Fine-tuning Sentence Transformer Model

class QuotesSentenceTransformer:
    def __init__(self, base_model='all-MiniLM-L6-v2'):
        self.model = SentenceTransformer(base_model)
        self.base_model = base_model
        
    def create_training_examples(self, df, num_examples=1000):
        """Create training examples for fine-tuning"""
        examples = []
        
        # Sample data for training
        sample_df = df.sample(n=min(num_examples, len(df)), random_state=42)
        
        for _, row in sample_df.iterrows():
            # Create positive examples (query-text pairs)
            quote = row.get('text_clean', row.get('quote', ''))
            author = row.get('title_clean', row.get('author', ''))
            tags = str(row.get('tags', ''))
            combined = row['combined_text']
            
            # Generate different query styles for the same quote
            queries = [
                f"quotes by {author}",
                f"quotes about {tags.split(',')[0] if tags and tags != '' else 'life'}",
                f"{author} quotes",
                f"inspirational quotes by {author}",
                quote[:50] + "..." if len(quote) > 50 else quote
            ]
            
            for query in queries:
                if query.strip() and query.strip() != "quotes about ":
                    examples.append(InputExample(texts=[query, combined], label=1.0))
        
        return examples
    
    def fine_tune(self, df, epochs=1, batch_size=16):
        """Fine-tune the model on quotes data"""
        print("Creating training examples...")
        train_examples = self.create_training_examples(df)
        print(f"Created {len(train_examples)} training examples")
        
        # Create DataLoader
        train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=batch_size)
        
        # Define loss function
        train_loss = losses.CosineSimilarityLoss(self.model)
        
        print("Starting fine-tuning...")
        # Fine-tune the model
        self.model.fit(
            train_objectives=[(train_dataloader, train_loss)],
            epochs=epochs,
            warmup_steps=100,
            show_progress_bar=True
        )
        
        print("Fine-tuning completed!")
    
    def encode(self, texts):
        """Encode texts to embeddings"""
        return self.model.encode(texts, show_progress_bar=True)
    
    def save_model(self, path):
        """Save the fine-tuned model"""
        self.model.save(path)
        print(f"Model saved to {path}")
    
    def load_model(self, path):
        """Load a fine-tuned model"""
        self.model = SentenceTransformer(path)
        print(f"Model loaded from {path}")

# Initialize and fine-tune the model
quotes_model = QuotesSentenceTransformer()

# Fine-tune on a subset for demonstration (use more epochs and data for production)
quotes_model.fine_tune(df_processed, epochs=1, batch_size=16)

# Save the model
model_path = "./fine_tuned_quotes_model"
quotes_model.save_model(model_path)

Creating training examples...
Created 5000 training examples
Starting fine-tuning...


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/313 [00:00<?, ?it/s]

Fine-tuning completed!
Model saved to ./fine_tuned_quotes_model


In [14]:
# Vector Indexing with FAISS

class QuotesVectorIndex:
    def __init__(self, model, df):
        self.model = model
        self.df = df
        self.index = None
        self.embeddings = None
        
    def create_index(self):
        """Create FAISS index from quotes embeddings"""
        print("Generating embeddings for all quotes...")
        
        # Generate embeddings for all quotes
        texts = self.df['combined_text'].tolist()
        self.embeddings = self.model.encode(texts)
        
        # Create FAISS index
        dimension = self.embeddings.shape[1]
        self.index = faiss.IndexFlatIP(dimension)  # Inner product for cosine similarity
        
        # Normalize embeddings for cosine similarity
        faiss.normalize_L2(self.embeddings)
        
        # Add embeddings to index
        self.index.add(self.embeddings.astype(np.float32))
        
        print(f"Index created with {self.index.ntotal} quotes")
        
    def search(self, query, top_k=5):
        """Search for similar quotes"""
        if self.index is None:
            raise ValueError("Index not created. Call create_index() first.")
        
        # Encode query
        query_embedding = self.model.encode([query])
        faiss.normalize_L2(query_embedding)
        
        # Search
        scores, indices = self.index.search(query_embedding.astype(np.float32), top_k)
        
        # Get results
        results = []
        for i, (score, idx) in enumerate(zip(scores[0], indices[0])):
            if idx != -1:  # Valid result
                row = self.df.iloc[idx]
                results.append({
                    'rank': i + 1,
                    'quote': row.get('quote', row.get('text', '')),
                    'author': row.get('author', row.get('title', '')),
                    'tags': row.get('tags', ''),
                    'similarity_score': float(score),
                    'index': int(idx)
                })
        
        return results
    
    def save_index(self, index_path, data_path):
        """Save index and associated data"""
        faiss.write_index(self.index, index_path)
        with open(data_path, 'wb') as f:
            pickle.dump({
                'embeddings': self.embeddings,
                'df': self.df
            }, f)
        print(f"Index saved to {index_path}")
        print(f"Data saved to {data_path}")

# Create vector index
vector_index = QuotesVectorIndex(quotes_model, df_processed)
vector_index.create_index()

# Save index
vector_index.save_index("quotes_index.faiss", "quotes_data.pkl")

# Test the search
test_query = "quotes about hope by Oscar Wilde"
test_results = vector_index.search(test_query, top_k=3)
print(f"\nTest search for: '{test_query}'")
for result in test_results:
    print(f"Rank {result['rank']}: {result['quote'][:100]}... by {result['author']} (Score: {result['similarity_score']:.4f})")

Generating embeddings for all quotes...


Batches:   0%|          | 0/79 [00:00<?, ?it/s]

Index created with 2507 quotes
Index saved to quotes_index.faiss
Data saved to quotes_data.pkl


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


Test search for: 'quotes about hope by Oscar Wilde'
Rank 1: “Hearts are made to be broken.”... by Oscar Wilde, (Score: 0.9931)
Rank 2: “Who, being loved, is poor?”... by Oscar Wilde (Score: 0.9930)
Rank 3: “A good friend will always stab you in the front.”... by Oscar Wilde (Score: 0.9927)


In [39]:
# RAG Pipeline Implementation with LLM

import subprocess
import sys

def install_requirements():
    """Install required packages for LLM"""
    packages = ['transformers', 'torch', 'accelerate']
    for package in packages:
        try:
            __import__(package)
            print(f" {package} already installed")
        except ImportError:
            print(f" Installing {package}...")
            subprocess.check_call([sys.executable, '-m', 'pip', 'install', package])


# Import required libraries
from transformers import pipeline
import torch
import warnings
warnings.filterwarnings('ignore')

class RAGQuotesPipeline:
    def __init__(self, vector_index, use_llm=True):
        self.vector_index = vector_index
        self.use_llm = use_llm
        self.llm = None
        
        # Initialize free LLM
        if use_llm:
            self._load_llm()
        
    def _load_llm(self):
        """Load the free LLM model"""
        print(" Loading free LLM model for demonstration...")
        try:
            # Use a lightweight text generation model
            self.llm = pipeline(
                "text-generation",
                model="distilgpt2",  # Lightweight and fast
                tokenizer="distilgpt2",
                device=0 if torch.cuda.is_available() else -1,
                max_length=150,
                do_sample=True,
                temperature=0.7
            )
            print("✅ LLM loaded successfully!")
        except Exception as e:
            print(f"❌ Error loading LLM: {e}")
            print("📝 Falling back to template-based responses")
            self.llm = None
            self.use_llm = False
    
    def retrieve_quotes(self, query, top_k=5):
        """Retrieve relevant quotes for a query"""
        return self.vector_index.search(query, top_k)
    
    def generate_response_simple(self, query, retrieved_quotes):
        """Generate response using simple template (no external LLM needed)"""
        if not retrieved_quotes:
            return {
                'query': query,
                'summary': 'No relevant quotes found for your query.',
                'quotes': [],
                'total_found': 0,
                'llm_generated': False
            }
        
        # Create summary
        authors = list(set([q['author'] for q in retrieved_quotes if q['author'] != 'Unknown']))
        tags = []
        for q in retrieved_quotes:
            if q['tags']:
                tags.extend(str(q['tags']).split(','))
        
        unique_tags = list(set([tag.strip() for tag in tags if tag.strip()]))[:5]
        
        summary = f"Found {len(retrieved_quotes)} relevant quotes"
        if authors:
            summary += f" from authors including {', '.join(authors[:3])}"
        if unique_tags:
            summary += f" related to themes like {', '.join(unique_tags[:3])}"
        
        return {
            'query': query,
            'summary': summary,
            'quotes': retrieved_quotes,
            'total_found': len(retrieved_quotes),
            'authors': authors,
            'themes': unique_tags,
            'llm_generated': False
        }
    
    def generate_response_with_llm(self, query, retrieved_quotes):
        """Generate response using free LLM"""
        if not self.llm or not retrieved_quotes:
            return self.generate_response_simple(query, retrieved_quotes)
        
        try:
            # Get basic info
            authors = list(set([q['author'] for q in retrieved_quotes if q['author'] != 'Unknown']))
            
            # Prepare context for LLM (keep it concise)
            context = f"Query: {query}\n\nTop quotes:\n"
            for i, quote in enumerate(retrieved_quotes[:2], 1):  # Only use top 2 quotes
                context += f"{i}. \"{quote['quote'][:100]}...\" - {quote['author']}\n"
            
            # Create a focused prompt
            prompt = f"""Analyze these quotes about "{query}":

{context}

Key insights:"""
            
            # Generate response with LLM
            response = self.llm(
                prompt,
                max_new_tokens=80,
                num_return_sequences=1,
                pad_token_id=self.llm.tokenizer.eos_token_id,
                truncation=True
            )
            
            # Extract generated text
            generated_text = response[0]['generated_text']
            llm_analysis = generated_text[len(prompt):].strip()
            
            # Clean up the response
            if llm_analysis:
                # Take first complete sentence
                sentences = llm_analysis.split('.')
                if len(sentences) > 1:
                    llm_analysis = sentences[0] + '.'
                else:
                    llm_analysis = llm_analysis[:100] + "..."
            else:
                llm_analysis = "These quotes provide valuable insights on your query."
            
            # Combine with retrieved data
            return {
                'query': query,
                'summary': f" AI Analysis: {llm_analysis}",
                'quotes': retrieved_quotes,
                'total_found': len(retrieved_quotes),
                'authors': authors,
                'themes': [],
                'llm_generated': True
            }
            
        except Exception as e:
            print(f"⚠️ LLM generation error: {e}")
            return self.generate_response_simple(query, retrieved_quotes)
    
    def search_and_generate(self, query, top_k=5, use_llm=None):
        """Complete RAG pipeline: retrieve and generate"""
        # Use instance setting if not specified
        if use_llm is None:
            use_llm = self.use_llm
            
        # Retrieve
        retrieved_quotes = self.retrieve_quotes(query, top_k)
        
        # Generate
        if use_llm and self.llm:
            response = self.generate_response_with_llm(query, retrieved_quotes)
        else:
            response = self.generate_response_simple(query, retrieved_quotes)
        
        return response

# Initialize RAG pipeline with LLM
print("🔧 Initializing RAG Pipeline with LLM...")
rag_pipeline = RAGQuotesPipeline(vector_index, use_llm=True)

# Test the complete pipeline
test_queries = [
    "Show me quotes about courage by women authors",
    "inspirational quotes about success",
    "quotes about love by Shakespeare",
    "motivational quotes for difficult times"
]

print("\n Testing RAG Pipeline with LLM:")
for query in test_queries:
    print(f"\n{'='*50}")
    print(f" Query: {query}")
    
    # Test with LLM
    result = rag_pipeline.search_and_generate(query, top_k=3, use_llm=True)
    print(f" Summary: {result['summary']}")
    print(f" Found {result['total_found']} quotes:")
    for i, quote in enumerate(result['quotes'][:2], 1):
        print(f"   {i}. \"{quote['quote'][:80]}...\" - {quote['author']}")
    
    if result['llm_generated']:
        print(" Response generated with LLM")
    else:
        print(" Response generated with template")

print(f"\n RAG Pipeline Setup Complete!")
print(f" LLM Status: {'Active' if rag_pipeline.llm else 'Inactive'}")

🔧 Initializing RAG Pipeline with LLM...
 Loading free LLM model for demonstration...


Device set to use cpu


✅ LLM loaded successfully!

 Testing RAG Pipeline with LLM:

 Query: Show me quotes about courage by women authors


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

 Summary:  AI Analysis: 1.
 Found 3 quotes:
   1. "“My rapier wit hides my inner pain.”..." - Cassandra Clare
   2. "“A brave man acknowledges the strength of others.”..." - Veronica Roth,
 Response generated with LLM

 Query: inspirational quotes about success


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

 Summary:  AI Analysis: 1.
 Found 3 quotes:
   1. "“The best way to cheer yourself is to try to cheer someone else up.”..." - Mark Twain
   2. "“Success is stumbling from failure to failure with no loss of enthusiasm.”..." - Winston S. Churchill
 Response generated with LLM

 Query: quotes about love by Shakespeare


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

 Summary:  AI Analysis: 1.
 Found 3 quotes:
   1. "“Stars, hide your fires; Let not light see my black and deep desires.”..." - William Shakespeare,
   2. "“Words are easy, like the wind; Faithful friends are hard to find.”..." - William Shakespeare,
 Response generated with LLM

 Query: motivational quotes for difficult times


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

 Summary:  AI Analysis: 1.
 Found 3 quotes:
   1. "“Courage isn't having the strength to go on - it is going on when you don't have..." - Napoleon Bonaparte
   2. "“The most painful thing is losing yourself in the process of loving someone too ..." - Ernest Hemingway,
 Response generated with LLM

 RAG Pipeline Setup Complete!
 LLM Status: Active


In [40]:
# RAG Evaluation using RAGAS

def create_evaluation_dataset(rag_pipeline, num_samples=20):
    """Create evaluation dataset for RAGAS"""
    
    # Sample evaluation queries
    eval_queries = [
        "quotes about love",
        "inspirational quotes by Maya Angelou",
        "quotes about success and hard work",
        "funny quotes about life",
        "quotes about friendship",
        "motivational quotes for students",
        "quotes about happiness",
        "quotes by Albert Einstein",
        "quotes about courage and bravery",
        "quotes about change and growth",
        "quotes about leadership",
        "quotes about dreams and aspirations",
        "quotes about time and life",
        "quotes about family",
        "quotes about wisdom",
        "quotes by Shakespeare",
        "quotes about perseverance",
        "quotes about learning",
        "quotes about kindness",
        "quotes about hope and faith"
    ]
    
    evaluation_data = []
    
    for query in eval_queries[:num_samples]:
        try:
            # Get RAG response
            result = rag_pipeline.search_and_generate(query, top_k=3)
            
            # Create contexts from retrieved quotes
            contexts = [f"Quote: {q['quote']} - Author: {q['author']} - Tags: {q['tags']}" 
                       for q in result['quotes']]
            
            # Create ground truth (simplified)
            ground_truth = f"Relevant quotes about {query.replace('quotes about ', '').replace('quotes by ', '')}"
            
            evaluation_data.append({
                'question': query,
                'contexts': contexts,
                'answer': result['summary'],
                'ground_truth': ground_truth
            })
        except Exception as e:
            print(f"Error processing query '{query}': {e}")
            continue
    
    return evaluation_data

# Create evaluation dataset
print("Creating evaluation dataset...")
eval_data = create_evaluation_dataset(rag_pipeline, num_samples=10)
print(f"Created {len(eval_data)} evaluation samples")

# Display sample evaluation data
print("\nSample evaluation data:")
if eval_data:
    sample = eval_data[0]
    print(f"Question: {sample['question']}")
    print(f"Answer: {sample['answer']}")
    print(f"Contexts: {len(sample['contexts'])} contexts")
    print(f"Ground Truth: {sample['ground_truth']}")

# Convert to RAGAS format (simplified evaluation)
def evaluate_rag_simple(eval_data):
    """Simple RAG evaluation without external APIs"""
    scores = {
        'context_relevance': [],
        'answer_quality': [],
        'retrieval_accuracy': []
    }
    
    for item in eval_data:
        # Simple heuristic scoring
        contexts = item['contexts']
        answer = item['answer']
        question = item['question']
        
        # Context relevance (check if contexts contain relevant keywords)
        question_words = set(question.lower().split())
        context_words = set(' '.join(contexts).lower().split())
        relevance_score = len(question_words.intersection(context_words)) / len(question_words)
        scores['context_relevance'].append(min(relevance_score, 1.0))
        
        # Answer quality (length and information content)
        answer_quality = min(len(answer.split()) / 20, 1.0)  # Normalize by expected length
        scores['answer_quality'].append(answer_quality)
        
        # Retrieval accuracy (number of contexts found)
        retrieval_accuracy = min(len(contexts) / 3, 1.0)  # Expecting 3 contexts
        scores['retrieval_accuracy'].append(retrieval_accuracy)
    
    # Calculate averages
    avg_scores = {metric: np.mean(scores[metric]) for metric in scores}
    return avg_scores

# Evaluate RAG performance
print("\nEvaluating RAG Pipeline...")
evaluation_results = evaluate_rag_simple(eval_data)
print("Evaluation Results:")
for metric, score in evaluation_results.items():
    print(f"{metric}: {score:.3f}")

Creating evaluation dataset...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Created 10 evaluation samples

Sample evaluation data:
Question: quotes about love
Answer:  AI Analysis: 2.
Contexts: 3 contexts
Ground Truth: Relevant quotes about love

Evaluating RAG Pipeline...
Evaluation Results:
context_relevance: 0.202
answer_quality: 0.150
retrieval_accuracy: 1.000

  You can now view your Streamlit app in your browser.

  Local URL: http://localhost:8504
  Network URL: http://192.168.1.4:8504



2025-06-12 21:13:45.829 Examining the path of torch.classes raised:
Traceback (most recent call last):
  File "C:\Users\ASUS\anaconda3\Lib\site-packages\streamlit\web\bootstrap.py", line 347, in run
    if asyncio.get_running_loop().is_running():
       ^^^^^^^^^^^^^^^^^^^^^^^^^^
RuntimeError: no running event loop

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\ASUS\anaconda3\Lib\site-packages\streamlit\watcher\local_sources_watcher.py", line 217, in get_module_paths
    potential_paths = extract_paths(module)
                      ^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\ASUS\anaconda3\Lib\site-packages\streamlit\watcher\local_sources_watcher.py", line 210, in <lambda>
    lambda m: list(m.__path__._path),
                   ^^^^^^^^^^^^^^^^
  File "C:\Users\ASUS\anaconda3\Lib\site-packages\torch\_classes.py", line 13, in __getattr__
    proxy = torch._C._get_custom_class_python_wrapper(self.name, attr)
         


  You can now view your Streamlit app in your browser.

  Local URL: http://localhost:8503
  Network URL: http://192.168.1.4:8503



2025-06-12 21:10:20.191 Examining the path of torch.classes raised:
Traceback (most recent call last):
  File "C:\Users\ASUS\anaconda3\Lib\site-packages\streamlit\web\bootstrap.py", line 347, in run
    if asyncio.get_running_loop().is_running():
       ^^^^^^^^^^^^^^^^^^^^^^^^^^
RuntimeError: no running event loop

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\ASUS\anaconda3\Lib\site-packages\streamlit\watcher\local_sources_watcher.py", line 217, in get_module_paths
    potential_paths = extract_paths(module)
                      ^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\ASUS\anaconda3\Lib\site-packages\streamlit\watcher\local_sources_watcher.py", line 210, in <lambda>
    lambda m: list(m.__path__._path),
                   ^^^^^^^^^^^^^^^^
  File "C:\Users\ASUS\anaconda3\Lib\site-packages\torch\_classes.py", line 13, in __getattr__
    proxy = torch._C._get_custom_class_python_wrapper(self.name, attr)
         

In [35]:
# Retrieval Augmented Generation for English Quotes with AI Enhancement

import pandas as pd
import json
import pickle
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
import sys
import os
import time
from IPython.display import display, HTML, Markdown
import warnings
warnings.filterwarnings('ignore')

# Try to import LLM dependencies
try:
    from transformers import pipeline
    import torch
    LLM_AVAILABLE = True
    print(" LLM dependencies available")
except ImportError:
    LLM_AVAILABLE = False
    print("⚠️ LLM dependencies not available. Install with: pip install transformers torch")

class JupyterRAGPipeline:
    def __init__(self):
        self.model = None
        self.index = None
        self.df = None
        self.llm = None
        self.loaded = False
    
    def load_model_and_index(self):
        """Load the fine-tuned model and vector index"""
        try:
            print("🔄 Loading model and index...")
            
            # Load model
            self.model = SentenceTransformer("./fine_tuned_quotes_model")
            print("✅ Model loaded")
            
            # Load FAISS index
            self.index = faiss.read_index("quotes_index.faiss")
            print("✅ FAISS index loaded")
            
            # Load data
            with open("quotes_data.pkl", "rb") as f:
                data = pickle.load(f)
            self.df = data['df']
            print("✅ Data loaded")
            
            self.loaded = True
            print(f"📊 Dataset contains {len(self.df)} quotes")
            return True
            
        except Exception as e:
            print(f"❌ Error loading model and index: {e}")
            return False
    
    def load_llm_model(self):
        """Load the free LLM model"""
        if not LLM_AVAILABLE:
            print("⚠️ LLM not available")
            return False
        
        try:
            print("🤖 Loading AI model... This may take a moment.")
            self.llm = pipeline(
                "text-generation",
                model="distilgpt2",
                tokenizer="distilgpt2",
                device=0 if torch.cuda.is_available() else -1,
                max_length=150,
                do_sample=True,
                temperature=0.7
            )
            print(" AI model loaded successfully!")
            return True
        except Exception as e:
            print(f"⚠️ Could not load AI model: {e}")
            return False
    
    def search_quotes(self, query, top_k=5):
        """Search for quotes using the RAG pipeline"""
        if not self.loaded:
            print("❌ Model not loaded. Please run load_model_and_index() first.")
            return []
        
        try:
            # Encode query
            query_embedding = self.model.encode([query])
            faiss.normalize_L2(query_embedding)
            
            # Search
            scores, indices = self.index.search(query_embedding.astype(np.float32), top_k)
            
            # Get results
            results = []
            for i, (score, idx) in enumerate(zip(scores[0], indices[0])):
                if idx != -1:
                    row = self.df.iloc[idx]
                    results.append({
                        'rank': i + 1,
                        'quote': row.get('quote', row.get('text', '')),
                        'author': row.get('author', row.get('title', '')),
                        'tags': row.get('tags', ''),
                        'similarity_score': float(score),
                        'index': int(idx)
                    })
            
            return results
        except Exception as e:
            print(f"❌ Search error: {e}")
            return []
    
    def generate_response_simple(self, query, results):
        """Generate a structured response without LLM"""
        if not results:
            return {
                'query': query,
                'summary': 'No relevant quotes found for your query.',
                'quotes': [],
                'total_found': 0,
                'authors': [],
                'themes': [],
                'llm_generated': False
            }
        
        # Extract information
        authors = list(set([r['author'] for r in results if r['author'] != 'Unknown']))
        tags = []
        for r in results:
            if r['tags']:
                tags.extend(str(r['tags']).split(','))
        
        unique_tags = list(set([tag.strip() for tag in tags if tag.strip()]))[:5]
        
        # Generate summary
        summary = f"Found {len(results)} relevant quotes"
        if authors:
            summary += f" from authors including {', '.join(authors[:3])}"
        if unique_tags:
            summary += f" related to themes like {', '.join(unique_tags[:3])}"
        
        return {
            'query': query,
            'summary': summary,
            'quotes': results,
            'total_found': len(results),
            'authors': authors,
            'themes': unique_tags,
            'llm_generated': False
        }
    
    def generate_response_with_llm(self, query, results):
        """Generate response using LLM"""
        if not self.llm or not results:
            return self.generate_response_simple(query, results)
        
        try:
            # Get basic info
            authors = list(set([r['author'] for r in results if r['author'] != 'Unknown']))
            
            # Prepare context for LLM (keep it concise)
            context = f"Query: {query}\n\nTop quotes:\n"
            for i, quote in enumerate(results[:2], 1):  # Only use top 2 quotes
                quote_text = quote['quote'][:100] + "..." if len(quote['quote']) > 100 else quote['quote']
                context += f"{i}. \"{quote_text}\" - {quote['author']}\n"
            
            # Create a focused prompt
            prompt = f"""Analyze these quotes about "{query}":

{context}

Key insights:"""
            
            # Generate response with LLM
            print(" AI is analyzing the quotes...")
            response = self.llm(
                prompt,
                max_new_tokens=60,
                num_return_sequences=1,
                pad_token_id=self.llm.tokenizer.eos_token_id,
                truncation=True
            )
            
            # Extract generated text
            generated_text = response[0]['generated_text']
            llm_analysis = generated_text[len(prompt):].strip()
            
            # Clean up the response
            if llm_analysis:
                # Take first complete sentence or limit length
                sentences = llm_analysis.split('.')
                if len(sentences) > 1 and len(sentences[0]) > 10:
                    llm_analysis = sentences[0] + '.'
                else:
                    llm_analysis = llm_analysis[:150] + "..." if len(llm_analysis) > 150 else llm_analysis
            else:
                llm_analysis = "These quotes provide valuable insights related to your query."
            
            # Extract tags for themes
            tags = []
            for r in results:
                if r['tags']:
                    tags.extend(str(r['tags']).split(','))
            unique_tags = list(set([tag.strip() for tag in tags if tag.strip()]))[:5]
            
            return {
                'query': query,
                'summary': f" AI Analysis: {llm_analysis}",
                'quotes': results,
                'total_found': len(results),
                'authors': authors,
                'themes': unique_tags,
                'llm_generated': True
            }
            
        except Exception as e:
            print(f"⚠️ AI analysis error: {e}")
            return self.generate_response_simple(query, results)
    
    def search_and_generate(self, query, top_k=5, use_llm=False):
        """Complete RAG pipeline: retrieve and generate"""
        start_time = time.time()
        
        # Retrieve quotes
        results = self.search_quotes(query, top_k)
        
        # Generate response
        if use_llm and self.llm:
            response = self.generate_response_with_llm(query, results)
        else:
            response = self.generate_response_simple(query, results)
        
        end_time = time.time()
        response['search_time'] = end_time - start_time
        
        return response
    
    def display_results(self, response, show_scores=True, show_json=False):
        """Display search results in a formatted way"""
        query = response['query']
        
        # Header
        display(HTML(f"""
        <div style="border-left: 4px solid #2196F3; padding-left: 20px; margin: 20px 0;">
            <h2>📚 Search Results for: "{query}"</h2>
        </div>
        """))
        
        # Summary
        if response.get('llm_generated'):
            display(HTML(f"""
            <div style="background-color: #e3f2fd; padding: 15px; border-radius: 8px; margin: 10px 0;">
                <h3> AI Analysis</h3>
                <p>{response['summary']}</p>
            </div>
            """))
        else:
            display(HTML(f"""
            <div style="background-color: #f5f5f5; padding: 15px; border-radius: 8px; margin: 10px 0;">
                <h3>📊 Search Summary</h3>
                <p>{response['summary']}</p>
            </div>
            """))
        
        # Metrics
        display(HTML(f"""
        <div style="display: flex; gap: 20px; margin: 20px 0;">
            <div style="background-color: #e8f5e8; padding: 10px; border-radius: 5px; text-align: center;">
                <strong>Total Found</strong><br>{response['total_found']}
            </div>
            <div style="background-color: #fff3e0; padding: 10px; border-radius: 5px; text-align: center;">
                <strong>Authors</strong><br>{len(response['authors'])}
            </div>
            <div style="background-color: #f3e5f5; padding: 10px; border-radius: 5px; text-align: center;">
                <strong>Themes</strong><br>{len(response['themes'])}
            </div>
            <div style="background-color: #e0f2f1; padding: 10px; border-radius: 5px; text-align: center;">
                <strong>Search Time</strong><br>{response.get('search_time', 0):.2f}s
            </div>
        </div>
        """))
        
        # Display quotes
        if response['quotes']:
            display(HTML("<h3>📚 Found Quotes</h3>"))
            
            for i, quote_data in enumerate(response['quotes']):
                score_display = f" (Relevance: {quote_data['similarity_score']:.4f})" if show_scores else ""
                
                display(HTML(f"""
                <div style="border: 1px solid #ddd; padding: 15px; margin: 10px 0; border-radius: 8px; background-color: #fafafa;">
                    <h4>📝 Quote {i+1}: {quote_data['author']}{score_display}</h4>
                    <blockquote style="font-style: italic; margin: 10px 0; padding: 10px; border-left: 3px solid #2196F3;">
                        "{quote_data['quote']}"
                    </blockquote>
                    <p><strong>👤 Author:</strong> {quote_data['author']}</p>
                    {f"<p><strong>🏷️ Tags:</strong> {quote_data['tags']}</p>" if quote_data['tags'] else ""}
                </div>
                """))
            
            # Additional insights
            if response['authors'] or response['themes']:
                display(HTML("<h3>📊 Insights</h3>"))
                
                insights_html = "<div style='display: flex; gap: 30px;'>"
                
                if response['authors']:
                    authors_list = "<br>".join([f"• {author}" for author in response['authors'][:5]])
                    insights_html += f"""
                    <div style="flex: 1;">
                        <h4>Top Authors:</h4>
                        <div>{authors_list}</div>
                    </div>
                    """
                
                if response['themes']:
                    themes_list = "<br>".join([f"• {theme}" for theme in response['themes'][:5]])
                    insights_html += f"""
                    <div style="flex: 1;">
                        <h4>Related Themes:</h4>
                        <div>{themes_list}</div>
                    </div>
                    """
                
                insights_html += "</div>"
                display(HTML(insights_html))
        
        else:
            display(HTML("""
            <div style="background-color: #fff3cd; padding: 15px; border-radius: 8px; margin: 10px 0;">
                <p><strong>⚠️ No quotes found for your search query.</strong> Try different keywords!</p>
            </div>
            """))
        
        # JSON Response (optional)
        if show_json:
            display(HTML("<h3>🔧 JSON Response</h3>"))
            display(response)

# Initialize the RAG pipeline
print("🚀 Initializing Quotes RAG Application for Jupyter Notebook")
print("=" * 60)

rag = JupyterRAGPipeline()

# Example usage functions
def setup():
    """Setup function to load all models and data"""
    print("📦 Setting up Quotes RAG Application...")
    
    # Load main model and index
    if not rag.load_model_and_index():
        return False
    
    # Try to load LLM
    if LLM_AVAILABLE:
        rag.load_llm_model()
    
    print("\n✅ Setup complete! Ready to search quotes.")
    print("\n📝 Example usage:")
    print("  search('quotes about hope')")
    print("  search_with_ai('inspirational quotes about success')")
    print("  explore_examples()")
    
    return True

def search(query, top_k=5, show_scores=True, show_json=False):
    """Search for quotes without AI analysis"""
    if not rag.loaded:
        print("❌ Please run setup() first to load the models.")
        return None
    
    response = rag.search_and_generate(query, top_k, use_llm=False)
    rag.display_results(response, show_scores, show_json)
    return response

def search_with_ai(query, top_k=5, show_scores=True, show_json=False):
    """Search for quotes with AI analysis"""
    if not rag.loaded:
        print("❌ Please run setup() first to load the models.")
        return None
    
    if not rag.llm:
        print("⚠️ AI model not available. Using standard search instead.")
        return search(query, top_k, show_scores, show_json)
    
    response = rag.search_and_generate(query, top_k, use_llm=True)
    rag.display_results(response, show_scores, show_json)
    return response

def explore_examples():
    """Show example queries with results"""
    examples = [
        "quotes about hope by Oscar Wilde",
        "inspirational quotes about success",
        "quotes about love by Shakespeare",
        "motivational quotes for difficult times",
        "funny quotes about life"
    ]
    
    display(HTML("<h2>📝 Example Queries</h2>"))
    
    for i, example in enumerate(examples, 1):
        display(HTML(f"""
        <div style="background-color: #f0f8ff; padding: 10px; margin: 5px 0; border-radius: 5px;">
            <strong>Example {i}:</strong> <code>search('{example}')</code>
        </div>
        """))
    
    display(HTML("""
    <div style="background-color: #fff9c4; padding: 15px; margin: 15px 0; border-radius: 8px;">
        <h4>💡 Tips:</h4>
        <ul>
            <li>Use <code>search(query)</code> for standard search</li>
            <li>Use <code>search_with_ai(query)</code> for AI-enhanced analysis</li>
            <li>Adjust <code>top_k</code> parameter to get more results</li>
            <li>Set <code>show_json=True</code> to see the raw response data</li>
        </ul>
    </div>
    """))

def info():
    """Display information about the application"""
    display(HTML("""
    <div style="border: 2px solid #2196F3; padding: 20px; border-radius: 10px; margin: 20px 0;">
        <h2>📚 Quotes RAG Application - Jupyter Notebook Version</h2>
        <p><strong>Retrieval Augmented Generation for English Quotes with AI Enhancement</strong></p>
        
        <h3>🚀 Getting Started:</h3>
        <ol>
            <li>Run <code>setup()</code> to load models and data</li>
            <li>Use <code>search('your query')</code> to find quotes</li>
            <li>Use <code>search_with_ai('your query')</code> for AI analysis</li>
            <li>Try <code>explore_examples()</code> for sample queries</li>
        </ol>
        
        <h3>🔧 Available Functions:</h3>
        <ul>
            <li><code>setup()</code> - Initialize the application</li>
            <li><code>search(query, top_k=5)</code> - Standard search</li>
            <li><code>search_with_ai(query, top_k=5)</code> - AI-enhanced search</li>
            <li><code>explore_examples()</code> - Show example queries</li>
            <li><code>info()</code> - Display this information</li>
        </ul>
        
        <h3>🤖 AI Features:</h3>
        <ul>
            <li>Uses DistilGPT-2 for quote analysis</li>
            <li>Provides contextual insights</li>
            <li>Identifies themes and patterns</li>
            <li>Completely free and runs locally</li>
        </ul>
        
        <h3>📦 Requirements:</h3>
        <p>For AI features: <code>pip install transformers torch accelerate</code></p>
    </div>
    """))

# Display welcome message
print("🎉 Quotes RAG Application loaded successfully!")
print("📖 Run info() for detailed information")
print("🚀 Run setup() to get started")
print("=" * 60)

✅ LLM dependencies available
🚀 Initializing Quotes RAG Application for Jupyter Notebook
🎉 Quotes RAG Application loaded successfully!
📖 Run info() for detailed information
🚀 Run setup() to get started


In [34]:
# Test Complete Pipeline

def test_complete_pipeline():
    """Test the complete RAG pipeline with various queries"""
    
    test_queries = [
        "quotes about hope by Oscar Wilde",
        "inspirational quotes about success by famous authors",
        "funny quotes about life and humor",
        "quotes about love and relationships by Shakespeare",
        "motivational quotes for overcoming challenges",
        "quotes about wisdom and learning",
        "quotes by women authors about strength",
        "quotes about friendship and loyalty",
        "quotes about time and life philosophy",
        "quotes about dreams and aspirations"
    ]
    
    print("🧪 TESTING COMPLETE RAG PIPELINE")
    print("=" * 60)
    
    results_summary = []
    
    for i, query in enumerate(test_queries, 1):
        print(f"\n[TEST {i}] Query: {query}")
        print("-" * 40)
        
        try:
            # Test retrieval
            retrieved = vector_index.search(query, top_k=3)
            
            # Test RAG response
            rag_response = rag_pipeline.search_and_generate(query, top_k=3)
            
            print(f"✅ Retrieved {len(retrieved)} quotes")
            print(f"📝 Summary: {rag_response['summary']}")
            
            if retrieved:
                best_match = retrieved[0]
                print(f"🎯 Best match: \"{best_match['quote'][:100]}...\" by {best_match['author']}")
                print(f"📊 Score: {best_match['similarity_score']:.4f}")
            
            results_summary.append({
                'query': query,
                'num_results': len(retrieved),
                'best_score': retrieved[0]['similarity_score'] if retrieved else 0,
                'has_results': len(retrieved) > 0
            })
            
        except Exception as e:
            print(f"❌ Error: {e}")
            results_summary.append({
                'query': query,
                'num_results': 0,
                'best_score': 0,
                'has_results': False,
                'error': str(e)
            })
    
    # Summary statistics
    print(f"\n{'='*60}")
    print("📊 PIPELINE PERFORMANCE SUMMARY")
    print(f"{'='*60}")
    
    successful_queries = sum(1 for r in results_summary if r['has_results'])
    avg_results = np.mean([r['num_results'] for r in results_summary])
    avg_score = np.mean([r['best_score'] for r in results_summary if r['best_score'] > 0])
    
    print(f"✅ Successful queries: {successful_queries}/{len(test_queries)} ({successful_queries/len(test_queries)*100:.1f}%)")
    print(f"📈 Average results per query: {avg_results:.1f}")
    print(f"🎯 Average best match score: {avg_score:.4f}")
    
    return results_summary

# Run complete pipeline test
test_results = test_complete_pipeline()

print(f"\n{'='*60}")
print("🎉 RAG QUOTES APPLICATION SETUP COMPLETE!")
print(f"{'='*60}")
print("\n📁 Files created:")
print("• fine_tuned_quotes_model/ - Fine-tuned sentence transformer")
print("• quotes_index.faiss - FAISS vector index")  
print("• quotes_data.pkl - Processed quotes dataset")
print("• streamlit_app.py - Streamlit web application")

print(f"\n🚀 Next steps:")
print("1. Run 'streamlit run streamlit_app.py' to start the web app")
print("2. Test different query types in the interface")
print("3. Customize the model or add more sophisticated LLM integration")
print("4. Deploy to cloud platforms like Streamlit Cloud, Heroku, or AWS")

print(f"\n📊 System ready with {len(df_processed)} quotes indexed and searchable!")

🧪 TESTING COMPLETE RAG PIPELINE

[TEST 1] Query: quotes about hope by Oscar Wilde
----------------------------------------


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

✅ Retrieved 3 quotes
📝 Summary: 🤖 AI Analysis: 1.
🎯 Best match: "“Hearts are made to be broken.”..." by Oscar Wilde,
📊 Score: 0.9931

[TEST 2] Query: inspirational quotes about success by famous authors
----------------------------------------


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

✅ Retrieved 3 quotes
📝 Summary: 🤖 AI Analysis: 1.
🎯 Best match: "“Success is stumbling from failure to failure with no loss of enthusiasm.”..." by Winston S. Churchill
📊 Score: 0.9898

[TEST 3] Query: funny quotes about life and humor
----------------------------------------


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

✅ Retrieved 3 quotes
📝 Summary: 🤖 AI Analysis: 1.
🎯 Best match: "“It's hard to enjoy practical jokes when your whole life feels like one.”..." by Rick Riordan,
📊 Score: 0.9877

[TEST 4] Query: quotes about love and relationships by Shakespeare
----------------------------------------


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

✅ Retrieved 3 quotes
📝 Summary: 🤖 AI Analysis: 1.
🎯 Best match: "“Words are easy, like the wind; Faithful friends are hard to find.”..." by William Shakespeare,
📊 Score: 0.9912

[TEST 5] Query: motivational quotes for overcoming challenges
----------------------------------------


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

✅ Retrieved 3 quotes
📝 Summary: 🤖 AI Analysis: 1.
🎯 Best match: "“What you're supposed to do when you don't like a thing is change it. If you can't change it, change..." by Maya Angelou,
📊 Score: 0.9794

[TEST 6] Query: quotes about wisdom and learning
----------------------------------------


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

✅ Retrieved 3 quotes
📝 Summary: 🤖 AI Analysis: 1.
🎯 Best match: "“Wisdom comes from experience. Experience is often a result of lack of wisdom.”..." by Terry Pratchett
📊 Score: 0.9894

[TEST 7] Query: quotes by women authors about strength
----------------------------------------


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

✅ Retrieved 3 quotes
📝 Summary: 🤖 AI Analysis: 1.
🎯 Best match: "“A brave man acknowledges the strength of others.”..." by Veronica Roth,
📊 Score: 0.9881

[TEST 8] Query: quotes about friendship and loyalty
----------------------------------------


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

✅ Retrieved 3 quotes
📝 Summary: 🤖 AI Analysis: 1.
🎯 Best match: "“There is no friend as loyal as a book.”..." by Ernest Hemingway
📊 Score: 0.9901

[TEST 9] Query: quotes about time and life philosophy
----------------------------------------


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

✅ Retrieved 3 quotes
📝 Summary: 🤖 AI Analysis: 1.
🎯 Best match: "“You have power over your mind - not outside events. Realize this, and you will find strength.”..." by Marcus Aurelius,
📊 Score: 0.9904

[TEST 10] Query: quotes about dreams and aspirations
----------------------------------------


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

✅ Retrieved 3 quotes
📝 Summary: 🤖 AI Analysis: 1.
🎯 Best match: "“Dare to live the life you have dreamed for yourself. Go forward and make your dreams come true.”..." by Ralph Waldo Emerson
📊 Score: 0.9875

📊 PIPELINE PERFORMANCE SUMMARY
✅ Successful queries: 10/10 (100.0%)
📈 Average results per query: 3.0
🎯 Average best match score: 0.9887

🎉 RAG QUOTES APPLICATION SETUP COMPLETE!

📁 Files created:
• fine_tuned_quotes_model/ - Fine-tuned sentence transformer
• quotes_index.faiss - FAISS vector index
• quotes_data.pkl - Processed quotes dataset
• streamlit_app.py - Streamlit web application

🚀 Next steps:
1. Run 'streamlit run streamlit_app.py' to start the web app
2. Test different query types in the interface
3. Customize the model or add more sophisticated LLM integration
4. Deploy to cloud platforms like Streamlit Cloud, Heroku, or AWS

📊 System ready with 2507 quotes indexed and searchable!


In [2]:
!streamlit run streamlit_app.py

^C
