In [None]:
# Prints a fixed confirmation message; no setup work is performed in this cell.
print('Setup complete.')

# Enhanced RAG with AskSage and GPT-5-Mini

**Focus**: Advanced Retrieval-Augmented Generation using AskSageClient and GPT-5-Mini

This notebook demonstrates how to build an efficient RAG system that uses AskSageClient with the GPT-5-Mini model for generation and NVIDIA's NV-Embed-v2 model for embeddings.

## Learning Objectives
- Use AskSageClient with GPT-5-Mini for LLM interactions
- Implement NVIDIA NV-Embed-v2 for high-quality embeddings
- Build an efficient RAG pipeline with semantic search
- Handle document indexing and retrieval with embeddings
- Optimize API usage and manage costs effectively

In [None]:
# Install required packages
!pip install requests
!pip install asksageclient
!pip install transformers
!pip install torch
!pip install sentence-transformers
!pip install numpy
!pip install scikit-learn
!pip install faiss-cpu
!pip install pandas

print("✅ All packages installed successfully!")

In [None]:
import json
import requests
from asksageclient import AskSageClient
import os
import pathlib
import re
import numpy as np
import pandas as pd
from typing import List, Dict, Optional, Tuple
from dataclasses import dataclass
from sentence_transformers import SentenceTransformer
import faiss
from sklearn.metrics.pairwise import cosine_similarity

# Reached only when every import above succeeded (a failed import stops the cell first)
print("✅ All modules imported successfully!")

In [None]:
# Function to load credentials from a JSON file
def load_credentials(filename):
    """Load a JSON credentials file and return its parsed content.

    Args:
        filename: Path to the JSON credentials file.

    Returns:
        The decoded JSON document.

    Raises:
        FileNotFoundError: If `filename` does not exist.
        ValueError: If the file content is not valid JSON.
    """
    try:
        # Read as UTF-8 explicitly instead of relying on the platform's default
        # encoding, which can mis-decode non-ASCII values (e.g. on Windows).
        with open(filename, encoding='utf-8') as file:
            return json.load(file)
    except FileNotFoundError as exc:
        # Keep the exception type but name the path that was tried
        raise FileNotFoundError(f"The credentials file was not found: {filename}") from exc
    except json.JSONDecodeError as exc:
        raise ValueError(
            f"Failed to decode JSON from the credentials file: {filename} ({exc})"
        ) from exc

# Read the credentials file (the relative path is resolved against the kernel's working directory)
credentials = load_credentials('../../credentials.json')

# Both values needed for authentication live under the top-level 'credentials' key
credential_section = credentials['credentials']
api_key = credential_section['api_key']
email = credential_section['Ask_sage_user_info']['username']

# Create the AskSage client that the rest of the notebook uses
ask_sage_client = AskSageClient(email, api_key)

print("✅ AskSage client initialized successfully!")
print(f"📧 Connected as: {email}")

In [None]:
# Choose the generation model: prefer GPT-5-Mini, then any other GPT-5/O3 variant,
# and otherwise fall back to GPT-4o-mini.
FALLBACK_MODEL = 'gpt-4o-mini'

print("🔍 Checking available models...")
try:
    models_response = ask_sage_client.get_models()
    # `or []` also covers a 'response' field that is present but null
    available_models = models_response.get('response') or []

    # Every GPT-5 / O3 family model (case-insensitive match on the model name)
    gpt5_models = [
        model for model in available_models
        if isinstance(model, str) and ('gpt-o3' in model.lower() or 'gpt-5' in model.lower())
    ]
    # The notebook targets GPT-5-Mini specifically, so look for it before
    # settling for whichever family member happens to be listed first.
    gpt5_mini_models = [model for model in gpt5_models if 'gpt-5-mini' in model.lower()]

    if gpt5_mini_models:
        selected_model = gpt5_mini_models[0]
        print(f"✅ Found GPT-5-Mini model: {selected_model}")
    elif gpt5_models:
        selected_model = gpt5_models[0]  # No Mini variant: use the first available GPT-5/O3 model
        print(f"✅ Found GPT-5/O3 model: {selected_model}")
    else:
        # Fallback to GPT-4o-mini if no GPT-5/O3 model is available
        selected_model = FALLBACK_MODEL
        print(f"⚠️ GPT-5-Mini not found, using fallback: {selected_model}")

    print(f"📋 Total available models: {len(available_models)}")

except Exception as e:
    # Best effort: a failed model listing must not stop the notebook
    print(f"❌ Error checking models: {e}")
    selected_model = FALLBACK_MODEL  # Safe fallback

print(f"🎯 Selected model: {selected_model}")

In [None]:
# Load the NVIDIA NV-Embed-v2 sentence-embedding model used for every vector in this notebook.
print("🔄 Loading NVIDIA NV-Embed-v2 model...")
embedding_model = SentenceTransformer(
    'nvidia/NV-Embed-v2',
    trust_remote_code=True,  # allows the model repository's own Python code to run locally
)
print("✅ NVIDIA NV-Embed-v2 model loaded successfully!")

def get_embeddings(texts: List[str]) -> np.ndarray:
    """Encode texts with the module-level `embedding_model`, requesting normalized vectors.

    Args:
        texts: The texts to embed. A single string is also accepted and is
            treated as a one-element list.

    Returns:
        Whatever `embedding_model.encode` returns for the batch, with
        `normalize_embeddings=True`.
    """
    batch = [texts] if isinstance(texts, str) else texts
    return embedding_model.encode(batch, normalize_embeddings=True)

# Smoke test: embed a single sentence and report the shape of the result
test_text = "This is a test sentence for embedding generation with GPT-5-Mini."
test_embedding = get_embeddings([test_text])
print(f"✅ Test embedding shape: {test_embedding.shape}")
print(f"✅ Embedding dimension: {test_embedding.shape[1]}")
print("✅ Embedding model working correctly!")

## Enhanced RAG System with GPT-5-Mini

### Key Components
- **Document Indexing**: Process documents and create embeddings
- **Vector Storage**: Use FAISS for efficient similarity search
- **Retrieval**: Find most relevant chunks based on query embeddings
- **Generation**: Use GPT-5-Mini via AskSage for contextual responses

### Advantages of GPT-5-Mini
- **Superior Performance**: Advanced reasoning capabilities
- **Cost Efficiency**: Optimized for production use
- **Better Context Understanding**: Enhanced contextual awareness
- **Improved Accuracy**: More reliable and consistent outputs

In [None]:
@dataclass
class DocumentChunk:
    """One chunk of a source document plus the data needed to retrieve it."""
    content: str  # chunk text (whitespace-stripped)
    source: str  # label of the document the chunk came from
    chunk_id: int  # 0-based position of the chunk within its document
    embedding: Optional[np.ndarray] = None  # assigned by EnhancedRAGSystem.add_document
    metadata: Optional[Dict] = None  # character offsets into the document: {'start': ..., 'end': ...}

class EnhancedRAGSystem:
    """Enhanced RAG system using AskSage GPT-5-Mini and NV-Embed-v2.

    Pipeline: split documents into overlapping character chunks, embed them with
    the injected embedding model, index the vectors in FAISS (inner product), and
    answer questions by sending the best-matching chunks to the AskSage model.
    """

    def __init__(self, ask_sage_client, embedding_model, model_name: str, chunk_size: int = 1000, chunk_overlap: int = 200):
        """Store the collaborators and chunking configuration.

        Args:
            ask_sage_client: Client object exposing `query(...)` for generation.
            embedding_model: Object exposing `encode(texts, normalize_embeddings=True)`.
            model_name: Name of the generation model passed to the client.
            chunk_size: Maximum chunk length in characters.
            chunk_overlap: Number of characters shared by consecutive chunks.

        Raises:
            ValueError: If the chunking parameters cannot make forward progress.
        """
        self.client = ask_sage_client
        self.embedding_model = embedding_model
        self.model_name = model_name
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.document_chunks: List[DocumentChunk] = []
        self.vector_index = None
        self.embedding_dimension = None
        self._check_chunk_params()

    def _check_chunk_params(self) -> None:
        """Reject chunking settings that would make chunk_text loop forever."""
        if self.chunk_size <= 0:
            raise ValueError("chunk_size must be a positive integer")
        if not 0 <= self.chunk_overlap < self.chunk_size:
            raise ValueError("chunk_overlap must satisfy 0 <= chunk_overlap < chunk_size")

    def _embed(self, texts: List[str]) -> np.ndarray:
        """Encode texts with the injected embedding model as a contiguous float32 array."""
        vectors = self.embedding_model.encode(texts, normalize_embeddings=True)
        # FAISS expects C-contiguous float32 input
        return np.ascontiguousarray(vectors, dtype='float32')

    def chunk_text(self, text: str, source: str) -> List[DocumentChunk]:
        """Split text into overlapping chunks.

        Chunks are at most `chunk_size` characters long. When more text follows,
        a chunk is cut after its last '.' if that keeps the chunk longer than 70%
        of `chunk_size`. Consecutive chunks share `chunk_overlap` characters.

        Args:
            text: The document text.
            source: Label stored on every produced chunk.

        Returns:
            The chunks in document order; empty for empty or whitespace-only text.

        Raises:
            ValueError: If the current chunking parameters are invalid.
        """
        self._check_chunk_params()
        chunks: List[DocumentChunk] = []
        text_length = len(text)
        start = 0
        chunk_id = 0

        while start < text_length:
            # Clamp so that metadata['end'] never points past the text
            end = min(start + self.chunk_size, text_length)
            chunk_content = text[start:end]

            # Try to break at sentence boundary (only when more text follows)
            if end < text_length:
                last_sentence = chunk_content.rfind('.')
                # Don't make chunks too small, and never cut so early that the
                # next window (end - overlap) would fail to move forward.
                if last_sentence > self.chunk_size * 0.7 and last_sentence + 1 > self.chunk_overlap:
                    chunk_content = chunk_content[:last_sentence + 1]
                    end = start + last_sentence + 1

            stripped = chunk_content.strip()
            if stripped:  # whitespace-only slices carry nothing worth embedding
                chunks.append(DocumentChunk(
                    content=stripped,
                    source=source,
                    chunk_id=chunk_id,
                    metadata={'start': start, 'end': end}
                ))
                chunk_id += 1

            # The text is fully covered: stop here instead of emitting a tail
            # chunk that only repeats the overlap region of the last chunk.
            if end >= text_length:
                break
            start = end - self.chunk_overlap

        return chunks

    def add_document(self, text: str, source: str):
        """Add a document to the RAG system.

        The document is chunked and embedded, and the chunks are appended to
        `document_chunks`. If a vector index has already been built, the new
        vectors are appended to it so that retrieval stays in sync.

        Args:
            text: The document text.
            source: Label identifying the document.
        """
        print(f"📄 Processing document: {source}")

        # Create chunks
        chunks = self.chunk_text(text, source)
        if not chunks:
            print(f"⚠️ No content to index in {source}")
            return

        # Generate embeddings for chunks with the model given to the constructor
        embeddings = self._embed([chunk.content for chunk in chunks])

        # Assign embeddings to chunks
        for chunk, vector in zip(chunks, embeddings):
            chunk.embedding = vector

        self.document_chunks.extend(chunks)

        # Index rows are positional, so appending here keeps row i == document_chunks[i]
        if self.vector_index is not None:
            self.vector_index.add(embeddings)

        print(f"✅ Added {len(chunks)} chunks from {source}")

    def build_vector_index(self):
        """Build FAISS vector index for efficient similarity search.

        Raises:
            ValueError: If no chunks have been added yet.
        """
        if not self.document_chunks:
            raise ValueError("No documents added to the system")

        print("🔨 Building vector index...")

        # Get all embeddings
        embeddings = np.array([chunk.embedding for chunk in self.document_chunks])
        self.embedding_dimension = embeddings.shape[1]

        # Create FAISS index
        self.vector_index = faiss.IndexFlatIP(self.embedding_dimension)  # Inner product for normalized vectors
        self.vector_index.add(np.ascontiguousarray(embeddings, dtype='float32'))

        print(f"✅ Vector index built with {len(self.document_chunks)} chunks")
        print(f"📏 Embedding dimension: {self.embedding_dimension}")

    def retrieve_relevant_chunks(self, query: str, top_k: int = 5) -> List[Tuple[DocumentChunk, float]]:
        """Retrieve most relevant chunks for a query.

        Args:
            query: The search text.
            top_k: Maximum number of chunks to return.

        Returns:
            `(chunk, score)` pairs in the order returned by the index search.

        Raises:
            ValueError: If the vector index has not been built.
        """
        if self.vector_index is None:
            raise ValueError("Vector index not built. Call build_vector_index() first.")

        # Never ask for more neighbours than there are chunks
        k = min(top_k, len(self.document_chunks))
        if k <= 0:
            return []

        # Generate query embedding
        query_embedding = self._embed([query])[0]

        # Search for similar chunks
        scores, indices = self.vector_index.search(query_embedding.reshape(1, -1), k)

        # Return chunks with scores
        results = []
        for score, idx in zip(scores[0], indices[0]):
            idx = int(idx)
            # FAISS pads missing results with label -1, which Python would
            # silently index as "the last chunk" — require a real position.
            if 0 <= idx < len(self.document_chunks):
                results.append((self.document_chunks[idx], float(score)))

        return results

    def generate_response(self, query: str, context_chunks: List[DocumentChunk], max_context_length: int = 4000) -> str:
        """Generate response using GPT-5-Mini with retrieved context.

        Args:
            query: The user's question.
            context_chunks: Retrieved chunks, most relevant first.
            max_context_length: Character budget for the context section; chunks
                are added in order until the next one would exceed it.

        Returns:
            The model's answer, or an "Error generating response: ..." string if
            the client call raised.
        """
        # Prepare context from retrieved chunks
        context_parts = []
        current_length = 0

        for chunk in context_chunks:
            chunk_text = f"Source: {chunk.source}\n{chunk.content}\n\n"
            if current_length + len(chunk_text) > max_context_length:
                break
            context_parts.append(chunk_text)
            current_length += len(chunk_text)

        context = "".join(context_parts)

        # Create prompt for GPT-5-Mini
        prompt = f"""You are an expert assistant with access to relevant documentation. Use the provided context to answer the user's question accurately and comprehensively.

Context:
{context}

Question: {query}

Instructions:
1. Base your answer primarily on the provided context
2. If the context doesn't contain enough information, clearly state this
3. Cite sources when referencing specific information
4. Be concise but comprehensive
5. If you're unsure about something, express appropriate uncertainty

Answer:"""

        try:
            # Use AskSage to query GPT-5-Mini.
            # Per the asksageclient documentation, query() takes the prompt as its
            # first parameter (`message`) and has no `max_tokens` parameter; the
            # earlier `question=` / `max_tokens=` keywords raise TypeError, which
            # was then returned as the "answer". NOTE(review): confirm against the
            # installed client version.
            response = self.client.query(
                prompt,
                model=self.model_name,
                temperature=0.3,  # Lower temperature for more consistent responses
            )

            # Extract the response text. AskSage returns the generated text under
            # 'message'; 'response' is kept as a fallback for other payload shapes.
            if isinstance(response, dict):
                for key in ('message', 'response'):
                    value = response.get(key)
                    if isinstance(value, str) and value:
                        return value
            return str(response)

        except Exception as e:
            # Deliberate best effort: report the failure in place of an answer
            return f"Error generating response: {str(e)}"

    def query(self, question: str, top_k: int = 5) -> Dict:
        """Complete RAG pipeline: retrieve and generate.

        Args:
            question: The user's question.
            top_k: Number of chunks to retrieve as context.

        Returns:
            Dict with keys 'question', 'answer', 'retrieved_chunks'
            (list of `(chunk, score)`), and 'model_used'.
        """
        print(f"🔍 Processing query: {question[:100]}{'...' if len(question) > 100 else ''}")

        # Retrieve relevant chunks
        relevant_chunks = self.retrieve_relevant_chunks(question, top_k)

        print(f"📚 Retrieved {len(relevant_chunks)} relevant chunks")
        for i, (chunk, score) in enumerate(relevant_chunks):
            print(f"  {i+1}. {chunk.source} (score: {score:.3f})")

        # Generate response
        chunks_only = [chunk for chunk, score in relevant_chunks]
        response = self.generate_response(question, chunks_only)

        return {
            'question': question,
            'answer': response,
            'retrieved_chunks': relevant_chunks,
            'model_used': self.model_name
        }

    def get_system_stats(self) -> Dict:
        """Get system statistics (chunk count, configuration, and distinct sources)."""
        return {
            'total_chunks': len(self.document_chunks),
            'embedding_dimension': self.embedding_dimension,
            'chunk_size': self.chunk_size,
            'chunk_overlap': self.chunk_overlap,
            'model_name': self.model_name,
            # Sorted so the listing is stable from run to run
            'sources': sorted({chunk.source for chunk in self.document_chunks})
        }

print("✅ EnhancedRAGSystem class defined!")

## Demo: RAG System with GPT-5-Mini

Let's demonstrate the enhanced RAG system using a few short sample documents, with GPT-5-Mini handling generation.

In [None]:
# Create the RAG pipeline for the demo, wiring in the client, embedding model and chosen LLM
print("🚀 Initializing Enhanced RAG System with GPT-5-Mini")
rag_system = EnhancedRAGSystem(
    ask_sage_client,
    embedding_model,
    selected_model,
    chunk_size=800,     # characters per chunk
    chunk_overlap=100,  # characters shared by neighbouring chunks
)

print(f"✅ RAG system initialized with model: {selected_model}")

In [None]:
# Sample documents for demonstration.
# Each key is the label passed as `source` when the document is added below;
# each value is the raw document text (its surrounding whitespace is stripped at that point).
sample_documents = {
    "AI_Introduction.txt": """
Artificial Intelligence (AI) is a branch of computer science that focuses on creating systems capable of performing tasks that typically require human intelligence. These tasks include learning, reasoning, problem-solving, perception, and language understanding.

Machine Learning is a subset of AI that enables systems to automatically learn and improve from experience without being explicitly programmed. Deep Learning, a subset of Machine Learning, uses neural networks with multiple layers to model and understand complex patterns in data.

Natural Language Processing (NLP) is another important area of AI that deals with the interaction between computers and human language. It enables machines to read, understand, and generate human language in a valuable way.

Large Language Models (LLMs) like GPT-5-Mini represent the latest advancement in NLP, capable of understanding context, generating coherent text, and performing various language tasks with remarkable accuracy.
""",
    
    "RAG_Concepts.txt": """
Retrieval-Augmented Generation (RAG) is a powerful technique that combines the strengths of retrieval-based and generation-based approaches for natural language processing tasks. RAG enhances language models by providing them with access to external knowledge through a retrieval mechanism.

The RAG process consists of two main components: a retriever and a generator. The retriever searches through a knowledge base or document collection to find relevant information based on the input query. The generator, typically a language model like GPT-5-Mini, then uses both the original query and the retrieved information to generate a comprehensive response.

Key advantages of RAG include:
1. Access to up-to-date information not present in the model's training data
2. Improved factual accuracy by grounding responses in retrieved documents
3. Transparency through the ability to trace answers back to source documents
4. Reduced hallucination compared to pure generative approaches

Vector embeddings play a crucial role in RAG systems by enabling semantic similarity search between queries and documents.
""",
    
    "GPT5_Features.txt": """
GPT-5-Mini represents a significant advancement in language model technology, offering improved performance while maintaining efficiency. Key features include:

Enhanced Reasoning: GPT-5-Mini demonstrates superior logical reasoning capabilities, making it excellent for complex problem-solving tasks and multi-step reasoning.

Better Context Understanding: The model shows improved ability to maintain context over longer conversations and documents, enabling more coherent and relevant responses.

Reduced Hallucination: Advanced training techniques have significantly reduced the model's tendency to generate factually incorrect information.

Efficiency Optimizations: Despite its advanced capabilities, GPT-5-Mini is optimized for faster inference and lower computational costs compared to larger models.

Improved Safety: Enhanced safety measures and alignment techniques make GPT-5-Mini more reliable for production use cases.

Multimodal Capabilities: The model can process and understand various types of input beyond just text, including structured data and code.
"""
}

# Feed every sample document into the RAG system, using its dictionary key as the source label
print("📚 Adding sample documents to RAG system...")
for source in sample_documents:
    content = sample_documents[source]
    rag_system.add_document(content.strip(), source)

print(f"\n✅ Added {len(sample_documents)} documents")

In [None]:
# Index every chunk added so far
print("🔨 Building vector index for efficient retrieval...")
rag_system.build_vector_index()

# Summarize what was indexed and how the system is configured
stats = rag_system.get_system_stats()
print("\n".join([
    "\n📊 RAG System Statistics:",
    f"  Total chunks: {stats['total_chunks']}",
    f"  Embedding dimension: {stats['embedding_dimension']}",
    f"  Chunk size: {stats['chunk_size']}",
    f"  Model: {stats['model_name']}",
    f"  Sources: {', '.join(stats['sources'])}",
]))

In [None]:
# Run a fixed set of demo questions through the full retrieve-and-generate pipeline
test_queries = [
    "What are the key advantages of RAG systems?",
    "How does GPT-5-Mini improve upon previous language models?",
    "What is the relationship between AI, Machine Learning, and Deep Learning?",
    "How do vector embeddings work in RAG systems?",
]

print("🧪 Testing RAG system with sample queries...\n")

for i, query in enumerate(test_queries, start=1):
    # Banner identifying the question
    print("\n" + "=" * 80)
    print(f"Query {i}: {query}")
    print("=" * 80)

    try:
        result = rag_system.query(query, top_k=3)

        print("\n🤖 GPT-5-Mini Response:")
        print(result['answer'])

        # List each retrieved chunk with its score and a 100-character preview
        print("\n📋 Retrieved Sources:")
        for j, (chunk, score) in enumerate(result['retrieved_chunks'], start=1):
            print(
                f"  {j}. {chunk.source} (similarity: {score:.3f})\n"
                f"     Preview: {chunk.content[:100]}..."
            )

    except Exception as e:
        # Report the failure and continue with the next question
        print(f"❌ Error processing query: {e}")

    print("\n" + "-" * 50)

## Interactive RAG Query Interface

Use the cell below to ask your own questions to the RAG system powered by GPT-5-Mini.

In [None]:
# Interactive query interface
def ask_rag_system(question: str, detailed: bool = True, top_k: int = 5) -> Optional[Dict]:
    """Ask a question to the RAG system and print the answer.

    Args:
        question: The question to answer.
        detailed: When True, also print the retrieved sources with their
            scores and the name of the model that was used.
        top_k: Number of chunks to retrieve as context (defaults to 5).

    Returns:
        The result dict returned by `rag_system.query`, or None if an error
        occurred (the error is printed rather than raised).
    """
    try:
        result = rag_system.query(question, top_k=top_k)

        print("🤖 GPT-5-Mini Answer:")
        print(f"{result['answer']}")

        if detailed:
            print("\n📚 Sources Used:")
            for i, (chunk, score) in enumerate(result['retrieved_chunks'], 1):
                print(f"  {i}. {chunk.source} (relevance: {score:.3f})")

            print(f"\n⚙️ Model: {result['model_used']}")

        return result

    except Exception as e:
        # Interactive helper: show the problem instead of a traceback
        print(f"❌ Error: {e}")
        return None

# Example usage — edit `your_question` and re-run this cell to ask your own questions
your_question = "Explain how GPT-5-Mini's enhanced reasoning capabilities benefit RAG systems."

print(f"❓ Question: {your_question}")
print(f"\n{'=' * 80}")
result = ask_rag_system(your_question)

## Performance Analysis and Best Practices

### GPT-5-Mini Integration Benefits
- **Superior Context Understanding**: Better comprehension of retrieved documents
- **Enhanced Reasoning**: More logical and coherent responses
- **Reduced Hallucination**: More accurate and fact-based answers
- **Efficient Processing**: Optimized performance for production use

### RAG System Optimization Tips
1. **Chunk Size**: Balance between context and specificity (800-1200 characters; this implementation splits text by character count, not tokens)
2. **Overlap**: Ensure continuity between chunks (10-20% overlap)
3. **Embedding Quality**: Use high-quality models like NV-Embed-v2
4. **Retrieval Strategy**: Experiment with different similarity thresholds
5. **Prompt Engineering**: Craft clear instructions for the generation model

### Production Considerations
- **Caching**: Cache embeddings and frequent queries
- **Scaling**: Use vector databases for large document collections
- **Monitoring**: Track query performance and answer quality
- **Updates**: Regularly refresh the document index

In [None]:
# Performance analysis: report the system configuration and the measured chunk sizes
print("📊 RAG System Performance Analysis")
print("=" * 50)

# System statistics
stats = rag_system.get_system_stats()
print(f"📈 Total document chunks: {stats['total_chunks']}")
print(f"🔢 Embedding dimensions: {stats['embedding_dimension']}")
# chunk_size is the configured upper bound, not a measured average
# (the measured average is reported under "Content Analysis" below)
print(f"📏 Configured chunk size (max): {stats['chunk_size']} characters")
print(f"🤖 Model used: {stats['model_name']}")

# Calculate some metrics from the chunks that were actually produced
total_content_length = sum(len(chunk.content) for chunk in rag_system.document_chunks)
if rag_system.document_chunks:
    avg_chunk_length = total_content_length / len(rag_system.document_chunks)
else:
    avg_chunk_length = 0  # no chunks yet: avoid dividing by zero

print("\n📊 Content Analysis:")
print(f"  Total content length: {total_content_length:,} characters")
print(f"  Average chunk length: {avg_chunk_length:.0f} characters")
print(f"  Sources processed: {len(stats['sources'])}")

print("\n✅ RAG system ready for production use with GPT-5-Mini!")
print("💡 Try asking complex questions that require reasoning across multiple sources.")