# Pinecone vector DB

### 1. Import libraries

In [17]:
import os
from dotenv import load_dotenv

# Import DB libraries
from pinecone import Pinecone,ServerlessSpec
from langchain_groq import ChatGroq
from langchain_openai import OpenAIEmbeddings
from langchain_pinecone import PineconeVectorStore

# Import LangChain libraries
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import TextLoader, PyPDFLoader, DirectoryLoader
from langchain_core.documents import Document

load_dotenv()

True

### 2. Import ENV variables

In [2]:
os.environ["PINECONE_API_KEY"] = os.getenv("PINECONE_API_KEY")
os.environ["GROQ_API_KEY"] = os.getenv("GROQ_API_KEY")
os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")
INDEX_NAME = os.getenv("INDEX_NAME")
DIMENSION = int(os.getenv("DIMENSION", "1024"))
CLOUD = os.getenv("CLOUD", "aws")
REGION = os.getenv("REGION", "us-east-1")

### 3. Initialize Pinecone Vector DB

In [3]:
# Initialize Pinecone
pc = Pinecone(
    api_key=os.getenv("PINECONE_API_KEY"),
)
if INDEX_NAME not in pc.list_indexes().names():
    print(f"Creating Pinecone index: {INDEX_NAME}")
    pc.create_index(
        name=INDEX_NAME,
        dimension=DIMENSION,
        metric="cosine",
        spec=ServerlessSpec(
            cloud=CLOUD,
            region=REGION,
        )
    )
    # Check if the index is ready
    while not pc.describe_index(INDEX_NAME).status['ready']:
        time.sleep(5)
    print(f"Pinecone index '{INDEX_NAME}' is ready.")
else:
    print(f"Pinecone index '{INDEX_NAME}' already exists.")

Pinecone index 'rag-udemy' already exists.


### 4. Initialize Embeddings

In [22]:
embeddings = OpenAIEmbeddings(model="text-embedding-3-small", dimensions=1024)
embeddings 

OpenAIEmbeddings(client=<openai.resources.embeddings.Embeddings object at 0x17fdaacd0>, async_client=<openai.resources.embeddings.AsyncEmbeddings object at 0x17fdaabd0>, model='text-embedding-3-small', dimensions=1024, deployment='text-embedding-ada-002', openai_api_version=None, openai_api_base=None, openai_api_type=None, openai_proxy=None, embedding_ctx_length=8191, openai_api_key=SecretStr('**********'), openai_organization=None, allowed_special=None, disallowed_special=None, chunk_size=1000, max_retries=2, request_timeout=None, headers=None, tiktoken_enabled=True, tiktoken_model_name=None, show_progress_bar=False, model_kwargs={}, skip_empty=False, default_headers=None, default_query=None, retry_min_seconds=4, retry_max_seconds=20, http_client=None, http_async_client=None, check_embedding_ctx_length=True)

### 5. Load and Split documents

In [5]:
# Create sample data
news_articles = [
    {
        "id": "news_001",
        "title": "OpenAI Releases GPT-4 Turbo with Enhanced Capabilities",
        "category": "news",
        "source": "TechCrunch",
        "date": "2024-01-15",
        "content": """
OpenAI has announced the release of GPT-4 Turbo, an improved version of their flagship 
language model. The new model features a larger context window of 128,000 tokens, 
significantly improved performance on complex reasoning tasks, and reduced pricing. 
GPT-4 Turbo can process the equivalent of approximately 300 pages of text in a single 
prompt, making it ideal for analyzing lengthy documents, codebases, and research papers.

The model also includes improved instruction following and more consistent output formatting. 
OpenAI has reported that GPT-4 Turbo is 40% more accurate on factual questions compared to 
its predecessor and shows reduced hallucination rates. The API pricing has been reduced by 
50% for input tokens and 33% for output tokens, making it more accessible for developers 
and businesses.

Key features include enhanced multimodal capabilities, better performance on mathematical 
problems, and improved code generation. The model is now available through the OpenAI API 
for enterprise customers and developers.
        """
    },
    {
        "id": "news_002",
        "title": "Google Announces Gemini: A Multimodal AI System",
        "category": "news",
        "source": "The Verge",
        "date": "2024-01-20",
        "content": """
Google has unveiled Gemini, its most capable AI model to date, designed to be natively 
multimodal from the ground up. Unlike previous models that were adapted for multiple 
modalities, Gemini was trained on text, images, video, audio, and code simultaneously, 
allowing for more sophisticated understanding and reasoning across different types of data.

Gemini comes in three versions: Gemini Ultra for highly complex tasks, Gemini Pro for 
scaling across a wide range of tasks, and Gemini Nano for on-device applications. In 
benchmark tests, Gemini Ultra exceeded human expert performance on MMLU (Massive Multitask 
Language Understanding), which tests knowledge and problem-solving abilities across 57 subjects.

The model demonstrates exceptional performance in mathematics, physics, and coding tasks. 
It can understand and generate high-quality code in popular programming languages and can 
reason across text and images simultaneously. Google plans to integrate Gemini into various 
products including Search, Chrome, and Android devices, bringing advanced AI capabilities 
directly to billions of users worldwide.
        """
    },
    {
        "id": "news_003",
        "title": "AI Regulation: EU Passes Landmark Artificial Intelligence Act",
        "category": "news",
        "source": "Reuters",
        "date": "2024-01-25",
        "content": """
The European Union has passed comprehensive AI regulation legislation, establishing the 
world's first comprehensive legal framework for artificial intelligence. The AI Act 
categorizes AI systems based on risk levels and imposes corresponding obligations on 
developers and deployers.

High-risk AI systems, including those used in critical infrastructure, education, 
employment, and law enforcement, will face stringent requirements including risk 
assessments, data governance, technical documentation, and human oversight. The regulation 
explicitly bans certain AI applications deemed unacceptable, such as social scoring systems, 
real-time biometric identification in public spaces (with limited exceptions), and AI systems 
that manipulate human behavior.

General-purpose AI models like GPT-4 and Claude face specific transparency requirements, 
including disclosure of training data sources and compliance with copyright law. Foundation 
model providers must conduct thorough risk assessments and report serious incidents. The 
regulation includes substantial penalties for non-compliance, with fines up to 35 million 
euros or 7% of global annual turnover. The Act will be phased in over two years, with bans 
on prohibited practices taking effect within six months.
        """
    },
    {
        "id": "news_004",
        "title": "AI Startups Raise Record $50 Billion in 2024",
        "category": "news",
        "source": "Bloomberg",
        "date": "2024-02-01",
        "content": """
Artificial intelligence startups raised a record-breaking $50 billion in venture capital 
funding in 2024, marking a 150% increase from the previous year. The surge in investment 
reflects growing confidence in AI's transformative potential across industries and the 
success of early AI applications.

Generative AI companies led the funding boom, with firms focused on text, image, video, 
and code generation attracting significant capital. Enterprise AI solutions for healthcare, 
finance, and manufacturing also saw substantial investments. Notable deals include 
Anthropic's $7 billion Series C, Cohere's $4.5 billion Series D, and numerous smaller 
rounds for specialized AI applications.

Investors are particularly interested in companies building AI infrastructure, including 
vector databases, model optimization tools, and AI security solutions. The trend extends 
beyond traditional tech hubs, with AI startups in Europe, Asia, and emerging markets 
receiving unprecedented attention. However, experts caution that the market may be 
overheated, drawing comparisons to previous tech bubbles and emphasizing the importance 
of sustainable business models and real-world value creation.
        """
    },
    {
        "id": "news_005",
        "title": "Breakthrough in AI Safety: New Alignment Techniques Show Promise",
        "category": "news",
        "source": "MIT Technology Review",
        "date": "2024-02-10",
        "content": """
Researchers at leading AI labs have announced significant progress in AI alignment 
techniques, addressing one of the field's most critical challenges. The new methods, 
called Constitutional AI and Debate-based Training, aim to ensure AI systems behave in 
accordance with human values and intentions even as they become more powerful.

Constitutional AI involves training models to follow explicit principles and values, 
creating a framework for ethical decision-making. In testing, models trained with this 
approach showed dramatically reduced rates of harmful outputs while maintaining 
performance on beneficial tasks. The technique has been successfully applied to large 
language models with over 100 billion parameters.

Debate-based training pits multiple AI systems against each other to identify flaws in 
reasoning and potential failure modes. This adversarial approach has proven effective at 
uncovering edge cases and vulnerabilities that traditional testing methods miss. Early 
results suggest these techniques could scale to future AI systems orders of magnitude 
more capable than current models, providing a pathway to safer artificial general intelligence.
        """
    }
]
tweets = [
    {
        "id": "tweet_001",
        "username": "@AIResearcher",
        "category": "tweet",
        "date": "2024-01-16",
        "likes": 15234,
        "retweets": 3421,
        "content": """
Just tested GPT-4 Turbo on our internal benchmark suite. The improvement in logical 
reasoning is remarkable - it's now solving problems that stumped GPT-4. The larger 
context window is a game-changer for document analysis. üöÄ #AI #MachineLearning
        """
    },
    {
        "id": "tweet_002",
        "username": "@DataScientistPro",
        "category": "tweet",
        "date": "2024-01-18",
        "likes": 8932,
        "retweets": 2156,
        "content": """
RAG (Retrieval Augmented Generation) is the most underrated AI technique right now. 
It's solving the hallucination problem while keeping costs reasonable. Every company 
should be exploring this for their AI applications. Thread üßµüëá
        """
    },
    {
        "id": "tweet_003",
        "username": "@MLEngineer",
        "category": "tweet",
        "date": "2024-01-22",
        "likes": 12445,
        "retweets": 2891,
        "content": """
Hot take: Vector databases will be as important as traditional databases in 3 years. 
The infrastructure layer for AI is forming right now, and semantic search is just the 
beginning. Pinecone, Weaviate, and Chroma are leading the charge. #VectorDB #AI
        """
    },
    {
        "id": "tweet_004",
        "username": "@TechFounder",
        "category": "tweet",
        "date": "2024-01-25",
        "likes": 23567,
        "retweets": 5432,
        "content": """
We built an AI customer support system using LangChain + Pinecone + GPT-4. Results 
after 2 months:
- 70% tickets resolved without human intervention
- Response time down from 4hrs to 30 seconds
- Customer satisfaction up 40%
- Support costs down 60%

AI is real. üìà
        """
    },
    {
        "id": "tweet_005",
        "username": "@AIEthicist",
        "category": "tweet",
        "date": "2024-01-28",
        "likes": 9823,
        "retweets": 3214,
        "content": """
The EU AI Act is a watershed moment. While some see it as restrictive, it's actually 
providing the clarity businesses need to invest confidently in AI. Responsible AI isn't 
just ethical - it's good business. #AIRegulation #ResponsibleAI
        """
    },
    {
        "id": "tweet_006",
        "username": "@DeepLearningDaily",
        "category": "tweet",
        "date": "2024-02-02",
        "likes": 18234,
        "retweets": 4123,
        "content": """
Transformer architecture turns 7 years old next month. In that time, it's:
- Revolutionized NLP
- Enabled GPT, BERT, and modern LLMs
- Extended to vision (ViT) and multimodal models
- Changed the entire AI landscape

One paper to rule them all. üß† #Transformers
        """
    },
    {
        "id": "tweet_007",
        "username": "@StartupAI",
        "category": "tweet",
        "date": "2024-02-05",
        "likes": 14567,
        "retweets": 3789,
        "content": """
Stop building "ChatGPT wrappers" and start building real AI products:
1. Solve a specific problem deeply
2. Fine-tune or use RAG for domain expertise
3. Build defensible data moats
4. Focus on user experience, not just the model

The AI gold rush needs more pickaxe sellers. ‚õèÔ∏è
        """
    },
    {
        "id": "tweet_008",
        "username": "@PromptEngineer",
        "category": "tweet",
        "date": "2024-02-08",
        "likes": 11234,
        "retweets": 2567,
        "content": """
Prompt engineering tips that actually work:
- Be specific and detailed
- Use examples (few-shot learning)
- Break complex tasks into steps
- Ask the model to explain its reasoning
- Iterate based on outputs

Treat it like teaching a smart intern. üí° #PromptEngineering
        """
    }
]

In [6]:
blog_posts = [
    {
        "id": "blog_001",
        "title": "Understanding RAG: The Future of AI-Powered Applications",
        "category": "blog",
        "author": "Sarah Chen",
        "website": "AI Engineering Blog",
        "date": "2024-01-12",
        "tags": ["RAG", "LLM", "Vector Database", "AI Engineering"],
        "word_count": 847,
        "reading_time_minutes": 5,
        "sentiment": "positive",
        "difficulty": "intermediate",
        "content": """
Retrieval-Augmented Generation (RAG) has emerged as one of the most practical and powerful 
techniques for building AI applications. In this comprehensive guide, we'll explore what 
RAG is, why it matters, and how to implement it effectively.

What is RAG?

RAG combines the power of large language models with external knowledge retrieval. Instead 
of relying solely on the information encoded in the model's parameters during training, 
RAG systems retrieve relevant information from a knowledge base and use it to generate 
more accurate, up-to-date, and contextually appropriate responses.

The RAG Architecture

A typical RAG system consists of three main components:

1. Document Processing: Documents are split into chunks, converted into embeddings using 
models like OpenAI's text-embedding-3-small, and stored in a vector database such as 
Pinecone, Weaviate, or Chroma.

2. Retrieval: When a user asks a question, the query is embedded and used to find the 
most relevant document chunks through semantic similarity search.

3. Generation: The retrieved context is combined with the user's query and sent to a 
large language model (like GPT-4, Claude, or Llama 2) to generate a grounded response.

Why RAG Matters

RAG addresses several key limitations of standalone LLMs:
- Reduces hallucinations by grounding responses in factual sources
- Enables access to current information without retraining
- Allows for domain-specific knowledge without fine-tuning
- Provides transparency through source attribution
- More cost-effective than constantly retraining models

Implementation Best Practices

Based on deploying dozens of RAG systems in production, here are key best practices:

Chunking Strategy: Use semantic chunking rather than fixed-size splits. Aim for chunks 
between 500-1000 tokens with 10-20% overlap to maintain context.

Embedding Models: Choose models based on your use case. OpenAI's embeddings offer great 
quality, while open-source options like all-MiniLM-L6-v2 provide cost savings for 
high-volume applications.

Retrieval Configuration: Start with k=3-5 retrieved chunks. Use MMR (Maximum Marginal 
Relevance) for diverse results. Consider hybrid search combining semantic and keyword-based 
retrieval for better precision.

Prompt Engineering: Include clear instructions for using retrieved context. Ask the model 
to cite sources and admit when information is insufficient. Structure prompts to minimize 
hallucination.

Evaluation: Measure both retrieval quality (recall, precision) and generation quality 
(accuracy, relevance, coherence). Use human evaluation alongside automated metrics.

Common Pitfalls to Avoid

1. Poor chunking leading to incomplete context
2. Using too many or too few retrieved chunks
3. Not handling retrieval failures gracefully
4. Ignoring latency optimization
5. Insufficient testing on edge cases

The Future of RAG

As RAG systems mature, we're seeing exciting developments:
- Agentic RAG with iterative retrieval
- Multi-modal RAG combining text, images, and structured data
- Conversational RAG maintaining context across turns
- Self-RAG where models evaluate and refine retrievals

RAG is not just a temporary solution - it represents a fundamental approach to building 
reliable, maintainable AI systems. Whether you're building a customer support chatbot, 
a research assistant, or an enterprise knowledge base, RAG should be in your toolkit.
        """
    },
    {
        "id": "blog_002",
        "title": "Vector Databases Explained: A Comprehensive Guide",
        "category": "blog",
        "author": "Michael Rodriguez",
        "website": "Database Insider",
        "date": "2024-01-18",
        "tags": ["Vector Database", "Embeddings", "Semantic Search", "Infrastructure"],
        "word_count": 923,
        "reading_time_minutes": 6,
        "sentiment": "neutral",
        "difficulty": "intermediate",
        "content": """
Vector databases have become essential infrastructure for AI applications. This guide 
covers everything you need to know about vector databases, from basic concepts to 
production deployment.

What Are Vector Databases?

Vector databases are specialized systems designed to store, index, and query high-dimensional 
vector embeddings. Unlike traditional databases that store and query structured data, vector 
databases excel at semantic similarity search, finding items that are conceptually similar 
even if they don't share exact keywords.

How Vector Embeddings Work

Embeddings are dense numerical representations of data (text, images, audio) that capture 
semantic meaning. Similar items have similar embeddings, which can be measured using distance 
metrics like cosine similarity, Euclidean distance, or dot product. For example, the 
embeddings for "king" and "monarch" would be very close in vector space, while "king" and 
"banana" would be distant.

Key Features of Vector Databases

Modern vector databases provide several critical capabilities:

Approximate Nearest Neighbor (ANN) Search: Efficiently find similar vectors among millions 
or billions of entries using algorithms like HNSW (Hierarchical Navigable Small World) or 
IVF (Inverted File Index).

Metadata Filtering: Combine semantic search with traditional filtering, like finding similar 
documents from a specific date range or category.

Scalability: Handle massive datasets with horizontal scaling and distributed architectures.

Real-time Updates: Support continuous ingestion and immediate availability of new vectors.

Hybrid Search: Combine vector similarity with keyword search and business logic.

Popular Vector Databases

Pinecone: Fully managed, serverless vector database with excellent performance and ease of 
use. Great for production applications requiring high availability.

Weaviate: Open-source vector database with strong GraphQL support and multimodal capabilities. 
Offers both cloud and self-hosted options.

Chroma: Lightweight, embeddable vector database perfect for development and small to 
medium-scale applications. Easy to get started with.

Qdrant: High-performance open-source vector database with advanced filtering and a focus on 
efficiency. Good for self-hosted deployments.

Milvus: Highly scalable open-source vector database designed for massive datasets and high 
throughput scenarios.

Use Cases

Vector databases power numerous AI applications:
- Semantic search engines
- Recommendation systems
- RAG systems for question answering
- Duplicate detection and deduplication
- Anomaly detection
- Image and video similarity search
- Personalization engines

Performance Considerations

When choosing a vector database, consider:

Latency: Query response time is critical for user-facing applications. Aim for sub-100ms 
latency for most use cases.

Throughput: How many queries per second can the system handle? This affects cost and 
scalability.

Accuracy vs Speed: ANN algorithms trade perfect accuracy for speed. Configure based on your 
tolerance for approximate results.

Cost: Factor in storage costs, compute costs, and data transfer costs. Serverless options 
can be more economical for variable workloads.

Best Practices for Production

1. Choose appropriate embedding dimensions (384-1536 typically)
2. Implement retry logic and error handling
3. Monitor query latency and adjust ANN parameters
4. Use metadata filtering to narrow search space
5. Batch operations when possible for efficiency
6. Implement caching for frequently accessed vectors
7. Plan for data growth and scaling needs

The vector database market is rapidly evolving, with new features and optimizations 
appearing regularly. As AI applications become more sophisticated, vector databases will 
play an increasingly central role in the infrastructure stack.
        """
    },
    {
        "id": "blog_003",
        "title": "Prompt Engineering: From Basics to Advanced Techniques",
        "category": "blog",
        "author": "Emily Watson",
        "website": "AI Practitioner",
        "date": "2024-01-24",
        "tags": ["Prompt Engineering", "LLM", "AI", "Best Practices"],
        "word_count": 1056,
        "reading_time_minutes": 7,
        "sentiment": "positive",
        "difficulty": "beginner",
        "content": """
Prompt engineering has evolved from a curiosity to a critical skill for anyone working with 
large language models. This guide covers fundamental principles and advanced techniques for 
getting the best results from AI models.

The Foundation: Clear Communication

The most important principle in prompt engineering is clarity. LLMs are powerful but literal 
- they respond to exactly what you ask, not what you meant to ask. Start with these basics:

Be Specific: Instead of "Write about dogs," try "Write a 500-word article about golden 
retrievers, focusing on their temperament, exercise needs, and suitability for families 
with children."

Provide Context: Include relevant background information. "You are an experienced software 
architect reviewing code for security vulnerabilities" sets expectations for the model's 
perspective and expertise level.

Define the Output Format: Specify exactly how you want the response structured. "Provide 
your answer as a JSON object with keys for 'summary', 'pros', 'cons', and 'recommendation'" 
removes ambiguity.

Few-Shot Learning

One of the most powerful techniques is providing examples of desired input-output pairs:

Example:
Classify the sentiment of these reviews:

Review: "This product exceeded my expectations!"
Sentiment: Positive

Review: "Terrible quality, broke after one use."
Sentiment: Negative

Review: "The new software update is amazing!"
Sentiment: [model completes]

Few-shot prompting dramatically improves accuracy, especially for specialized or nuanced 
tasks. Use 2-5 examples for most tasks, more for complex scenarios.

Chain-of-Thought Prompting

For reasoning tasks, ask the model to show its work:

"Solve this problem step by step, explaining your reasoning at each stage:
If a train travels 120 miles in 2 hours, then stops for 30 minutes, then travels another 
90 miles in 1.5 hours, what is its average speed including the stop?"

This technique significantly improves performance on mathematical, logical, and analytical 
tasks.

Role-Playing and Perspective

Assigning a role or perspective can dramatically change output quality:

"You are a senior financial analyst with 15 years of experience in tech startups. Analyze 
this company's balance sheet and provide investment recommendations."

The model will adopt appropriate vocabulary, consider relevant factors, and provide more 
sophisticated analysis.

Constraints and Guardrails

Add explicit constraints to prevent unwanted behaviors:

- "Do not include any information not explicitly stated in the provided context"
- "If you're uncertain, say 'I don't know' rather than guessing"
- "Avoid technical jargon; explain as if to a high school student"
- "Limit your response to 3 bullet points"

Iterative Refinement

Treat prompt engineering as an iterative process:

1. Start with a basic prompt
2. Analyze the output for issues
3. Refine the prompt to address specific problems
4. Test with multiple inputs
5. Document what works

Advanced Techniques

Self-Consistency: Generate multiple responses and aggregate them for more reliable results 
on critical tasks.

Constitutional AI: Define explicit principles and values for the model to follow, creating 
ethical and aligned outputs.

Prompt Chaining: Break complex tasks into subtasks, using the output of one prompt as input 
to the next.

Meta-Prompting: Have the model help optimize its own prompts through iterative feedback.

Common Mistakes to Avoid

1. Vague instructions leading to inconsistent outputs
2. Overloading prompts with too many requirements
3. Not providing sufficient context
4. Failing to specify output format
5. Not testing with edge cases
6. Ignoring token limits and costs

Tools and Frameworks

Modern prompt engineering benefits from frameworks like:
- LangChain for complex prompt chains and workflows
- Guidance for structured output generation
- OpenAI's Playground for rapid experimentation
- Custom testing harnesses for evaluation

The Future of Prompting

As models become more capable, prompt engineering is evolving from an art to an engineering 
discipline. Expect to see:
- Automated prompt optimization
- Standard libraries of proven prompts
- Better tools for testing and validation
- Integration with software development practices

Mastering prompt engineering multiplies your effectiveness with AI systems. Invest time in 
learning these techniques - it's one of the highest-leverage skills in modern technology.
        """
    },
    {
        "id": "blog_004",
        "title": "Building Production-Ready LLM Applications: Lessons Learned",
        "category": "blog",
        "author": "David Kim",
        "website": "Engineering at Scale",
        "date": "2024-02-03",
        "tags": ["Production", "LLM", "DevOps", "Reliability"],
        "word_count": 1234,
        "reading_time_minutes": 8,
        "sentiment": "neutral",
        "difficulty": "advanced",
        "content": """
After deploying large language model applications serving millions of users, we've learned 
valuable lessons about what it takes to build production-ready AI systems. Here are the 
key insights from our journey.

Architecture Decisions

The first major decision is choosing between API-based models (OpenAI, Anthropic, Cohere) 
and self-hosted open-source models (Llama 2, Mistral, Falcon). API-based solutions offer 
faster time-to-market and zero infrastructure overhead, while self-hosted options provide 
more control and potentially lower costs at scale.

We started with OpenAI's API and later added self-hosted models for specific use cases. 
This hybrid approach optimized for both quality and cost. Critical user-facing features 
use premium API models for best results, while background processing uses efficient 
open-source alternatives.

Reliability and Error Handling

LLMs can fail in various ways: API timeouts, rate limits, model errors, or inappropriate 
outputs. Production systems need comprehensive error handling:

Implement exponential backoff with jitter for retries. Use circuit breakers to prevent 
cascading failures. Have fallback strategies when primary models are unavailable. Log all 
failures for analysis and improvement.

We maintain a hot standby with a different provider. If OpenAI experiences issues, we 
automatically route traffic to Anthropic or our self-hosted models, ensuring continuity.

Cost Management

LLM costs can spiral quickly without proper controls:

Cache responses for repeated queries - we achieved a 60% cache hit rate, dramatically 
reducing costs. Implement rate limiting per user to prevent abuse. Use cheaper models for 
simpler tasks - not everything needs GPT-4. Monitor token usage and set budget alerts.

Our caching strategy alone saved $50,000 monthly. Redis stores embeddings and common 
responses, with TTLs based on content type. Breaking up requests into smaller chunks and 
reusing computed embeddings provided additional savings.

Latency Optimization

Users expect fast responses. Our optimizations:

Stream responses instead of waiting for completion - users see output within 500ms. 
Parallel process independent subtasks. Optimize prompt lengths to reduce processing time. 
Use faster models when appropriate. Implement CDN caching for static context.

We reduced median response time from 8 seconds to 1.2 seconds through these optimizations, 
dramatically improving user satisfaction.

Quality Assurance

Testing LLM applications is challenging due to non-deterministic outputs:

Create comprehensive test suites with expected output patterns. Use LLMs to evaluate LLM 
outputs - GPT-4 can assess response quality automatically. Implement human review for 
critical applications. Track quality metrics over time. A/B test prompt variations.

We built an evaluation framework using GPT-4 to score responses on accuracy, relevance, 
and helpfulness. This automated testing reduced manual QA time by 80% while improving 
consistency.

Security and Safety

Production LLM applications face unique security challenges:

Implement input validation to prevent prompt injection attacks. Filter outputs for 
inappropriate content. Use rate limiting to prevent abuse. Sanitize user data before 
storage. Monitor for data exfiltration attempts.

We experienced attempted prompt injection attacks within the first week of launch. 
Comprehensive input sanitization and output filtering caught 99.7% of malicious attempts.

Observability

Understanding system behavior is crucial:

Log all prompts and completions (with proper data handling). Track token usage, latency, 
and costs per request. Monitor model quality metrics. Set up alerts for anomalies. Create 
dashboards for key metrics.

Our monitoring caught a subtle prompt regression that reduced accuracy by 8% - something 
manual testing had missed. Continuous monitoring is non-negotiable.

User Experience

The best technical implementation fails without good UX:

Set clear expectations about AI capabilities and limitations. Provide feedback during 
processing. Allow users to refine queries easily. Show sources and reasoning when possible. 
Enable easy reporting of issues.

We added a "regenerate" button and thumbs up/down feedback, which provided invaluable data 
for improvement while empowering users.

Continuous Improvement

LLM applications require ongoing optimization:

Analyze failure modes and update prompts. Fine-tune models on production data. Implement 
user feedback loops. Stay current with new models and techniques. Regularly audit for bias 
and fairness issues.

We improved our system's accuracy by 23% over six months through continuous refinement 
based on production data and user feedback.

Key Takeaways

Building production LLM applications requires more than just calling an API. Success demands 
careful architecture, robust error handling, cost management, quality assurance, security 
measures, and continuous improvement. Treat LLM integration as you would any critical system 
component - with rigorous engineering practices and thorough testing.

The AI landscape evolves rapidly, but these fundamental principles remain constant. Build 
with reliability, security, and user experience at the forefront, and your LLM applications 
will deliver lasting value.
        """
    },
    {
        "id": "blog_005",
        "title": "Fine-Tuning vs RAG: Choosing the Right Approach",
        "category": "blog",
        "author": "Jennifer Martinez",
        "website": "AI Engineering Blog",
        "date": "2024-02-10",
        "tags": ["Fine-Tuning", "RAG", "LLM", "Model Training"],
        "word_count": 892,
        "reading_time_minutes": 6,
        "sentiment": "neutral",
        "difficulty": "intermediate",
        "content": """
One of the most common questions when building AI applications is whether to use RAG or 
fine-tune a model. The answer isn't always straightforward, and often the best solution 
involves both. Let's explore when to use each approach.

Understanding the Differences

RAG (Retrieval-Augmented Generation) enhances model responses by retrieving relevant 
information from external sources at inference time. Fine-tuning modifies the model's 
weights through additional training on domain-specific data. These approaches solve 
different problems and have distinct trade-offs.

When to Use RAG

RAG excels in scenarios where:

Dynamic Information: Your knowledge base changes frequently. Product catalogs, news, 
documentation, and policies benefit from RAG since you can update the retrieval corpus 
without retraining.

Transparency Required: RAG provides source attribution, showing users where information 
comes from. This is crucial for legal, medical, or financial applications.

Lower Technical Overhead: RAG requires less ML expertise than fine-tuning. You don't need 
training infrastructure or extensive datasets.

Cost Constraints: Fine-tuning can be expensive, especially for large models. RAG's ongoing 
costs are more predictable and often lower.

Broad Knowledge Needs: When your application needs to access diverse information that 
would require massive training data to fine-tune.

When to Use Fine-Tuning

Fine-tuning is better when:

Consistent Style/Format: You need the model to always respond in a specific way, using 
particular terminology or following strict formatting rules.

Specialized Tasks: Domain-specific tasks like medical diagnosis, legal analysis, or 
technical code generation benefit from fine-tuning on expert data.

Low Latency Critical: Fine-tuned models don't need retrieval steps, reducing latency. This 
matters for real-time applications.

Proprietary Knowledge: When your competitive advantage comes from specialized knowledge 
embedded in the model itself.

Limited Context Windows: If your use case requires more context than fits in a prompt, 
fine-tuning can encode that knowledge in weights.

The Hybrid Approach

Many successful applications combine both:

Fine-tune a base model on your domain to establish baseline knowledge and communication 
style. Then use RAG on top for dynamic, factual information. This gives you the benefits 
of both approaches.

Example: A customer service bot might be fine-tuned on your company's communication style 
and common workflows, while using RAG to retrieve current product specs, pricing, and 
policies.

Practical Considerations

Data Requirements:
- RAG: Needs organized, retrievable documents. Quality matters more than quantity.
- Fine-tuning: Requires hundreds to thousands of high-quality training examples.

Cost Structure:
- RAG: Ongoing inference costs for embeddings and retrieval. Storage costs for vector DB.
- Fine-tuning: Upfront training costs. Potentially lower per-request costs afterward.

Maintenance:
- RAG: Easy to update - just modify the document corpus.
- Fine-tuning: Requires retraining for updates, which can be expensive and time-consuming.

Complexity:
- RAG: Simpler to implement initially but requires managing retrieval infrastructure.
- Fine-tuning: Requires ML expertise and training infrastructure.

Decision Framework

Ask yourself these questions:

1. How often does the underlying information change?
2. Do you need source attribution?
3. What's your ML team's expertise level?
4. What's your budget for training vs. inference?
5. How critical is response latency?
6. Do you need consistent style/format?
7. How much training data do you have?

Common Mistakes

Don't fine-tune when RAG would suffice: Fine-tuning is overkill for many use cases. Start 
with RAG unless you have specific reasons to fine-tune.

Not considering hybrid approaches: Many assume it's either/or. Combining both often yields 
the best results.

Underestimating maintenance: Fine-tuned models need regular updates as information changes. 
Plan for ongoing retraining.

Ignoring evaluation: Rigorously test both approaches on your specific use case before 
committing.

Getting Started

Start with RAG for most applications. It's faster to implement, easier to iterate on, and 
provides transparency. Once you've validated your use case and gathered production data, 
consider whether fine-tuning would provide meaningful improvements.

If you do fine-tune, start small. Fine-tune smaller models first to validate your approach 
before investing in training larger models. Use parameter-efficient techniques like LoRA 
to reduce costs.

The Future

The line between RAG and fine-tuning is blurring. Techniques like retrieval-enhanced 
fine-tuning and continuous learning systems combine both approaches. As models improve and 
tools mature, implementing hybrid solutions will become easier.

Choose based on your specific requirements, not trends or hype. The best approach depends 
on your use case, resources, and constraints. Both RAG and fine-tuning are powerful tools 
- understanding when to use each is key to building effective AI applications.
        """
    },
    {
        "id": "blog_006",
        "title": "LangChain Deep Dive: Building Complex AI Workflows",
        "category": "blog",
        "author": "Alex Thompson",
        "website": "Developer's Corner",
        "date": "2024-02-17",
        "tags": ["LangChain", "Python", "AI Framework", "Agents"],
        "word_count": 1045,
        "reading_time_minutes": 7,
        "sentiment": "positive",
        "difficulty": "intermediate",
        "content": """
LangChain has become the go-to framework for building sophisticated LLM applications. This 
deep dive explores its architecture, key components, and best practices for production use.

What is LangChain?

LangChain is an open-source framework for building applications powered by large language 
models. It provides abstractions and tools for common patterns like prompt management, 
chain composition, memory management, and agent creation. Think of it as the Rails or 
Django of LLM development - opinionated patterns that accelerate development.

Core Concepts

Models: LangChain supports multiple LLM providers (OpenAI, Anthropic, Hugging Face, etc.) 
through a unified interface. Switch providers without rewriting code.

Prompts: Reusable prompt templates with variable substitution. Version control your prompts 
alongside code.

Chains: Sequence multiple LLM calls and operations. Output from one step becomes input to 
the next.

Agents: LLMs that can use tools and make decisions about which actions to take based on 
user input.

Memory: Persist state across interactions for conversational applications.

Building Blocks

Prompt Templates:
```python
from langchain import PromptTemplate

template = PromptTemplate(
    input_variables=["product", "features"],
    template="Write a product description for {product} highlighting {features}"
)
```

Templates ensure consistency and make prompts maintainable. Use them for all but the 
simplest one-off queries.

Chains:

Simple chains connect operations linearly. Use LLMChain for basic prompt-to-response flows. 
SequentialChain runs multiple chains in sequence, passing outputs as inputs. More complex 
applications use custom chains with conditional logic.

Example use case: A chain that summarizes a document, extracts key points, then generates 
social media posts based on those points.

Agents:

Agents are the most powerful but complex LangChain feature. An agent uses an LLM to decide 
which tools to use and in what order. Tools can be APIs, databases, calculators, search 
engines, or custom functions.

Agent types include:
- Zero-shot ReAct: Decides tool usage based on descriptions
- Conversational: Maintains dialogue context
- OpenAI Functions: Uses function calling for tool selection
- Plan-and-Execute: Creates plans before execution

Agents enable building assistants that can accomplish complex, multi-step tasks 
autonomously.

Memory Management

LLMs are stateless - they don't remember previous interactions. LangChain provides memory 
types to maintain context:

ConversationBufferMemory: Stores entire conversation history
ConversationSummaryMemory: Summarizes old messages to save tokens
ConversationBufferWindowMemory: Keeps only recent N interactions
VectorStoreMemory: Stores conversations in vector DB for semantic retrieval

Choose based on your context window limits and use case requirements.

RAG with LangChain

LangChain simplifies RAG implementation:

```python
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.chains import RetrievalQA

vectorstore = Chroma.from_documents(documents, OpenAIEmbeddings())
qa = RetrievalQA.from_chain_type(
    llm=ChatOpenAI(),
    chain_type="stuff",
    retriever=vectorstore.as_retriever()
)
```

This pattern works for most RAG applications. Customize retrieval parameters and chain 
types for specific needs.

Production Considerations

Caching: LangChain supports caching to reduce costs and improve latency. Enable for 
repeated queries.

Callbacks: Hook into chain execution for logging, monitoring, and debugging. Essential for 
production observability.

Error Handling: LLMs can fail unexpectedly. Implement retry logic and fallbacks. LangChain 
provides utilities for this.

Token Management: Monitor and limit token usage to control costs. Use the token counting 
utilities.

Async Support: For high-throughput applications, use LangChain's async interfaces to 
improve performance.

Common Pitfalls

Over-chaining: Don't create complex chains when simple prompts suffice. Each chain step 
adds latency and potential failure points.

Ignoring Costs: Complex chains can consume many tokens quickly. Monitor usage carefully.

Not Testing Individual Components: Test each chain component independently before 
composing them.

Premature Agent Usage: Agents are powerful but add complexity and unpredictability. Start 
with simpler approaches.

Alternatives and Comparisons

While LangChain is popular, consider alternatives:

LlamaIndex: Better for RAG-focused applications with sophisticated retrieval needs.

Semantic Kernel: Microsoft's framework with strong .NET support.

Haystack: Focuses on NLP pipelines and document processing.

Custom Solutions: For simple use cases, direct API calls might be clearer than framework 
abstractions.

Choose based on your team's expertise, use case requirements, and ecosystem preferences.

Advanced Patterns

Self-Ask with Search: Agent asks itself clarifying questions and searches for answers.

Constitutional AI: Define principles that guide agent behavior and decision-making.

Multi-Agent Systems: Multiple specialized agents collaborate on complex tasks.

Human-in-the-Loop: Pause execution for human review before critical actions.

Getting Started

Begin with simple chains before moving to complex agents. LangChain's abstractions make 
sense once you understand basic LLM patterns. If you're new to LLMs, build a few simple 
applications with direct API calls first.

The documentation is comprehensive but can be overwhelming. Start with the quickstart 
guides and build progressively more complex applications.

The Future

LangChain evolves rapidly, with new features and improvements released frequently. The 
framework is moving toward better production-readiness with improved monitoring, evaluation 
tools, and deployment patterns.

LangSmith, the companion platform, provides hosted services for prompt management, 
evaluation, and monitoring. Consider it for production applications.

LangChain accelerates LLM application development but isn't magic. It's a tool that 
requires understanding both its abstractions and the underlying LLM concepts. Master both 
to build powerful, maintainable AI applications.
        """
    }
]

In [7]:
type(blog_posts)

list

In [8]:
print(news_articles[0].keys())
print(len(news_articles))

dict_keys(['id', 'title', 'category', 'source', 'date', 'content'])
5


In [9]:

# Convert news articles to Document objects 
news_articles_doc = []
for article in news_articles:
    doc = Document(
        page_content=article["content"],
        metadata={
            "id": article["id"],
            "title": article["title"],
            "category": article["category"],
            "date": article["date"],
            "source": article["source"]
        }
    )
    news_articles_doc.append(doc)
news_articles_doc

[Document(metadata={'id': 'news_001', 'title': 'OpenAI Releases GPT-4 Turbo with Enhanced Capabilities', 'category': 'news', 'date': '2024-01-15', 'source': 'TechCrunch'}, page_content='\nOpenAI has announced the release of GPT-4 Turbo, an improved version of their flagship \nlanguage model. The new model features a larger context window of 128,000 tokens, \nsignificantly improved performance on complex reasoning tasks, and reduced pricing. \nGPT-4 Turbo can process the equivalent of approximately 300 pages of text in a single \nprompt, making it ideal for analyzing lengthy documents, codebases, and research papers.\n\nThe model also includes improved instruction following and more consistent output formatting. \nOpenAI has reported that GPT-4 Turbo is 40% more accurate on factual questions compared to \nits predecessor and shows reduced hallucination rates. The API pricing has been reduced by \n50% for input tokens and 33% for output tokens, making it more accessible for developers \n

In [10]:
tweets[0].keys()

dict_keys(['id', 'username', 'category', 'date', 'likes', 'retweets', 'content'])

In [11]:
# Convert tweets to Document objects
tweets_doc = []
for tweet in tweets:
    doc = Document(
        page_content=tweet["content"],
        metadata={
            "id": tweet["id"],
            "username": tweet["username"],
            "date": tweet["date"],
            "retweets": tweet["retweets"],
            "likes": tweet["likes"]
        }
    )
    tweets_doc.append(doc)
tweets_doc


[Document(metadata={'id': 'tweet_001', 'username': '@AIResearcher', 'date': '2024-01-16', 'retweets': 3421, 'likes': 15234}, page_content="\nJust tested GPT-4 Turbo on our internal benchmark suite. The improvement in logical \nreasoning is remarkable - it's now solving problems that stumped GPT-4. The larger \ncontext window is a game-changer for document analysis. üöÄ #AI #MachineLearning\n        "),
 Document(metadata={'id': 'tweet_002', 'username': '@DataScientistPro', 'date': '2024-01-18', 'retweets': 2156, 'likes': 8932}, page_content="\nRAG (Retrieval Augmented Generation) is the most underrated AI technique right now. \nIt's solving the hallucination problem while keeping costs reasonable. Every company \nshould be exploring this for their AI applications. Thread üßµüëá\n        "),
 Document(metadata={'id': 'tweet_003', 'username': '@MLEngineer', 'date': '2024-01-22', 'retweets': 2891, 'likes': 12445}, page_content='\nHot take: Vector databases will be as important as tradi

In [12]:
blog_posts[0].keys()

dict_keys(['id', 'title', 'category', 'author', 'website', 'date', 'tags', 'word_count', 'reading_time_minutes', 'sentiment', 'difficulty', 'content'])

In [13]:
blog_posts_doc = []
for blog in blog_posts:
    doc = Document(
        page_content=blog['content'],
        metadata={
            'id': blog['id'],
            'title': blog['title'],
            'category': blog['category'],
            'author': blog['author'],
            'website': blog['website'],
            'date': blog['date'],
            'tags': blog['tags'],
            'word_count': blog['word_count'],
            'reading_time_minutes': blog['reading_time_minutes'],
            'sentiment': blog['sentiment'],
            'difficulty': blog['difficulty']
        }
    )
    blog_posts_doc.append(doc)
blog_posts_doc

[Document(metadata={'id': 'blog_001', 'title': 'Understanding RAG: The Future of AI-Powered Applications', 'category': 'blog', 'author': 'Sarah Chen', 'website': 'AI Engineering Blog', 'date': '2024-01-12', 'tags': ['RAG', 'LLM', 'Vector Database', 'AI Engineering'], 'word_count': 847, 'reading_time_minutes': 5, 'sentiment': 'positive', 'difficulty': 'intermediate'}, page_content="\nRetrieval-Augmented Generation (RAG) has emerged as one of the most practical and powerful \ntechniques for building AI applications. In this comprehensive guide, we'll explore what \nRAG is, why it matters, and how to implement it effectively.\n\nWhat is RAG?\n\nRAG combines the power of large language models with external knowledge retrieval. Instead \nof relying solely on the information encoded in the model's parameters during training, \nRAG systems retrieve relevant information from a knowledge base and use it to generate \nmore accurate, up-to-date, and contextually appropriate responses.\n\nThe RAG 

In [14]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=350,
    length_function=len,
    separators=['\n\n','\n',' ', '']
)
news_article_chunks = text_splitter.split_documents(news_articles_doc)
tweets_chunks = text_splitter.split_documents(tweets_doc)
blog_posts_chunks = text_splitter.split_documents(blog_posts_doc)

In [15]:
print(f"Number of news articles: {len(news_articles_doc)}")
print(f"Number of news article chunks: {len(news_article_chunks)}")
print(f"Number of tweets: {len(tweets_doc)}")
print(f"Number of tweet chunks: {len(tweets_chunks)}")
print(f"Number of blog posts: {len(blog_posts_doc)}")
print(f"Number of blog post chunks: {len(blog_posts_chunks)}")

Number of news articles: 5
Number of news article chunks: 10
Number of tweets: 8
Number of tweet chunks: 8
Number of blog posts: 6
Number of blog post chunks: 43


In [None]:
vector_store = PineconeVectorStore.from_documents(
    documents=news_article_chunks + tweets_chunks + blog_posts_chunks,
    embedding=embeddings,
    index_name=INDEX_NAME
)
print(f"Added {len(news_article_chunks) + len(tweets_chunks) + len(blog_posts_chunks)} documents to Pinecone index '{INDEX_NAME}'")

Added 61 documents to Pinecone index 'rag-udemy'


In [24]:
# Create retriever
retriever = vector_store.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3}  # Retrieve top 3 similar chunks
)
retriever

VectorStoreRetriever(tags=['PineconeVectorStore', 'OpenAIEmbeddings'], vectorstore=<langchain_pinecone.vectorstores.PineconeVectorStore object at 0x16acedd10>, search_kwargs={'k': 3})

In [25]:
# Initlialize LLM
llm = ChatGroq(
    model="llama-3.3-70b-versatile",
    temperature=0,
    max_tokens=1024
)
llm

ChatGroq(profile={'max_input_tokens': 131072, 'max_output_tokens': 32768, 'image_inputs': False, 'audio_inputs': False, 'video_inputs': False, 'image_outputs': False, 'audio_outputs': False, 'video_outputs': False, 'reasoning_output': False, 'tool_calling': True}, client=<groq.resources.chat.completions.Completions object at 0x169b9b4d0>, async_client=<groq.resources.chat.completions.AsyncCompletions object at 0x300160e90>, model_name='llama-3.3-70b-versatile', temperature=1e-08, model_kwargs={}, groq_api_key=SecretStr('**********'), max_tokens=1024)

In [26]:
# Create a prompt template
prompt_template = """You are a helpful AI assistant. Answer the question based on the provided context.
If you cannot find the answer in the context, say "I don't have enough information to answer that question."

Context:
{context}

Question: {question}
Answer:"""

prompt = ChatPromptTemplate.from_template(prompt_template)
prompt

ChatPromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template='You are a helpful AI assistant. Answer the question based on the provided context.\nIf you cannot find the answer in the context, say "I don\'t have enough information to answer that question."\n\nContext:\n{context}\n\nQuestion: {question}\nAnswer:'), additional_kwargs={})])

In [27]:
# Building RAG chain
def format_docs(docs):
    """ Format retrieved documents for context """
    return "\n\n".join(doc.page_content for doc in docs)

rag_chain = (
    {"context": retriever | format_docs, "question" : RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)
rag_chain

{
  context: VectorStoreRetriever(tags=['PineconeVectorStore', 'OpenAIEmbeddings'], vectorstore=<langchain_pinecone.vectorstores.PineconeVectorStore object at 0x16acedd10>, search_kwargs={'k': 3})
           | RunnableLambda(format_docs),
  question: RunnablePassthrough()
}
| ChatPromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template='You are a helpful AI assistant. Answer the question based on the provided context.\nIf you cannot find the answer in the context, say "I don\'t have enough information to answer that question."\n\nContext:\n{context}\n\nQuestion: {question}\nAnswer:'), additional_kwargs={})])
| ChatGroq(profile={'max_input_tokens': 131072, 'max_output_tokens': 32768, 'image_inputs': False, 'audio_inputs': False, 'video_inputs': False, 'image_outputs': False, 'audio_outputs': False, 'video_

In [32]:
# Test RAG Chain
query = "What is RAG?"
response = rag_chain.invoke(query)
context = retriever.invoke(query)
print(f"\n"+"=" * 60 + "\n")
print(f"Query: {query}\n")
print(f"Number of context chunks retrieved: {len(context)}\n")
for i, context_chunk in enumerate(context):
    print(f"Context Chunk {i+1}:\n{context_chunk.page_content}\n")
print(f"Response: {response}\n")



Query: What is RAG?

Number of context chunks retrieved: 3

Context Chunk 1:
The Future of RAG

As RAG systems mature, we're seeing exciting developments:
- Agentic RAG with iterative retrieval
- Multi-modal RAG combining text, images, and structured data
- Conversational RAG maintaining context across turns
- Self-RAG where models evaluate and refine retrievals

RAG is not just a temporary solution - it represents a fundamental approach to building 
reliable, maintainable AI systems. Whether you're building a customer support chatbot, 
a research assistant, or an enterprise knowledge base, RAG should be in your toolkit.

Context Chunk 2:
The RAG Architecture

A typical RAG system consists of three main components:

1. Document Processing: Documents are split into chunks, converted into embeddings using 
models like OpenAI's text-embedding-3-small, and stored in a vector database such as 
Pinecone, Weaviate, or Chroma.

2. Retrieval: When a user asks a question, the query is embedded

In [36]:
# Batch queries
queries = [
    "What are the benefits of using RAG in AI applications?",
    "Explain the key features of vector databases.",
    "What are some best practices for prompt engineering?"
]

contexts = retriever.batch(queries)
responses = rag_chain.batch(queries)

# Display results
results = {
    query: {"context": context, "response": response }
    for query, context, response in zip(queries, contexts, responses)
}

print(f"\n"+"=" * 60 + "\n")
print("BATCH PROCESSING RESULTS:\n")
print("=" * 60 + "\n")

for query, result in results.items():
    print(f"Query: {query}\n")
    print(f"Number of context chunks retrieved: {len(result['context'])}\n")
    for i, context_chunk in enumerate(result['context']):
        print(f"Context Chunk {i+1}:\n{context_chunk.page_content}\n")
    print("-" * 60 + "\n")
    print(f"Response: {result['response']}\n")
    print("-" * 60 + "\n")




BATCH PROCESSING RESULTS:


Query: What are the benefits of using RAG in AI applications?

Number of context chunks retrieved: 3

Context Chunk 1:
The Future of RAG

As RAG systems mature, we're seeing exciting developments:
- Agentic RAG with iterative retrieval
- Multi-modal RAG combining text, images, and structured data
- Conversational RAG maintaining context across turns
- Self-RAG where models evaluate and refine retrievals

RAG is not just a temporary solution - it represents a fundamental approach to building 
reliable, maintainable AI systems. Whether you're building a customer support chatbot, 
a research assistant, or an enterprise knowledge base, RAG should be in your toolkit.

Context Chunk 2:
Retrieval-Augmented Generation (RAG) has emerged as one of the most practical and powerful 
techniques for building AI applications. In this comprehensive guide, we'll explore what 
RAG is, why it matters, and how to implement it effectively.

What is RAG?

RAG combines the power 

In [41]:
# Similarity search

### Direct similarity search
search_results = vector_store.similarity_search(
    "What is RAG?", k=3
)
print(f"\n{'='*60}\n")
print("SIMILARITY SEARCH RESULTS:")
print("="*60)

for i, doc in enumerate(search_results):
    print(f"\nResult {i+1}:")
    print(doc.page_content[:200])
    print(f"Metadata: {doc.metadata}")

### Similarity search with scores
search_results_with_scores = vector_store.similarity_search_with_score(
    "What is RAG?", k=3
)
print(f"\n{'='*60}\n")
print("SIMILARITY SEARCH WITH SCORES RESULTS:")
print("="*60)

for doc, score in search_results_with_scores:
    print(f"Scores: {score:.4f}")
    print(f"Content: {doc.page_content[:200]}")
    print(f"Metadata: {doc.metadata}")




SIMILARITY SEARCH RESULTS:

Result 1:
The Future of RAG

As RAG systems mature, we're seeing exciting developments:
- Agentic RAG with iterative retrieval
- Multi-modal RAG combining text, images, and structured data
- Conversational RAG 
Metadata: {'author': 'Sarah Chen', 'category': 'blog', 'date': '2024-01-12', 'difficulty': 'intermediate', 'id': 'blog_001', 'reading_time_minutes': 5.0, 'sentiment': 'positive', 'tags': ['RAG', 'LLM', 'Vector Database', 'AI Engineering'], 'title': 'Understanding RAG: The Future of AI-Powered Applications', 'website': 'AI Engineering Blog', 'word_count': 847.0}

Result 2:
The RAG Architecture

A typical RAG system consists of three main components:

1. Document Processing: Documents are split into chunks, converted into embeddings using 
models like OpenAI's text-embed
Metadata: {'author': 'Sarah Chen', 'category': 'blog', 'date': '2024-01-12', 'difficulty': 'intermediate', 'id': 'blog_001', 'reading_time_minutes': 5.0, 'sentiment': 'positive', 'tag

In [44]:
# Advanced retrieval - Filtering

filtered_results = vector_store.similarity_search(
    'GPT-4 Turbo',
    k=3,
    filter={'source':{"$eq": 'TechCrunch'}}
)
print(f"\n"+"="*60)
print(f"FILTERED SEARCH RESULTS")
print("="*60+"\n")
for doc in filtered_results:
    print(f"\nContent: {doc.page_content}")
    print(f"\nMetadata: {doc.metadata}")


FILTERED SEARCH RESULTS


Content: OpenAI has announced the release of GPT-4 Turbo, an improved version of their flagship 
language model. The new model features a larger context window of 128,000 tokens, 
significantly improved performance on complex reasoning tasks, and reduced pricing. 
GPT-4 Turbo can process the equivalent of approximately 300 pages of text in a single 
prompt, making it ideal for analyzing lengthy documents, codebases, and research papers.

The model also includes improved instruction following and more consistent output formatting. 
OpenAI has reported that GPT-4 Turbo is 40% more accurate on factual questions compared to 
its predecessor and shows reduced hallucination rates. The API pricing has been reduced by 
50% for input tokens and 33% for output tokens, making it more accessible for developers 
and businesses.

Metadata: {'category': 'news', 'date': '2024-01-15', 'id': 'news_001', 'source': 'TechCrunch', 'title': 'OpenAI Releases GPT-4 Turbo with Enhance

In [54]:
# Utility functions
def create_rag_chain_from_existing_index():
    """ Loading existing Pinecone index """
    vectorstore = PineConeVectorStore(
        index_name=INDEX_NAME,
        embedding=embeddings
    )

    retriever = vectorstore.as_retriever(search_kwargs={'k':3})

    rag_chain = (
        {
            "context": retriever | format_docs,
            "question": RunnablePassthrough()
        }
        | prompt
        | llm
        | StrOutputParser()
    )
    return rag_chain

def add_data(new_docs):
        """ Add more data to vector store """
        vectorstore = PineConeVectorStore(
            index_name=INDEX_NAME,
            embedding=embeddings,
        )
        # Split documents into chunks
        text_splitter=RecursiveCharacterTextSplitter.from_documents(
            chunk_size=1000,
            chunk_overlap=350,
        )
        new_doc_chunks = text_splitter.split_documents(new_docs)

        # Add to vectostore
        vectorstore.add_documents(new_doc_chunks)
        print(f"Added {len(new_doc_chunks)} chunks to the vector store {INDEX_NAME}")
    
def get_index_stats():
        """ Get statistics about the PineCone Index """
        index = pc.Index(INDEX_NAME)
        stats = index.describe_index_stats()
        print(f"\nIndex statistics:")
        print(f" Total vectos: {stats['total_vector_count']}")
        print(f" Dimension: {stats['dimension']}")
        print(f" Index fullness: {stats.get('index_fullness', 'N/A')}")
        return stats
    
def delete_index():
        """ Delete the Pinecone Index """
        pc.delete_index(INDEX_NAME)
        print(f"Index {INDEX_NAME} is deleted! ")
    
def clear_index():
        """ Clear the Pinecone Index """
        index = pc.Index(INDEX_NAME)
        index.delete(delete_all=True)
        print(f"All vectors in the index {INDEX_NAME} are cleared ")


In [55]:
if __name__=="__main__":
    # Test single query
    print("\n"+"="*60)
    print("TEST RAG CHAIN")
    print("="*60 + "\n")

    query = "What is GPT-4 Turbo?"
    result = rag_chain.invoke(query)
    print(f"\nQuery: {query}")
    print(f"Answer: {result}")
    get_index_stats()


TEST RAG CHAIN


Query: What is GPT-4 Turbo?
Answer: GPT-4 Turbo is an improved version of OpenAI's flagship language model, featuring a larger context window, significantly improved performance on complex reasoning tasks, and reduced pricing. It can process approximately 300 pages of text in a single prompt and has improved instruction following, output formatting, and accuracy, with a 40% increase in accuracy on factual questions and reduced hallucination rates compared to its predecessor.

Index statistics:
 Total vectos: 61
 Dimension: 1024
 Index fullness: 0.0


#### PineCone features:
- Serverless Vector database
- Ultra-fast similarity search
- Real-time updates
- Metadata filtering
- Horizontal scaling
- Low latency (p95 < 100ms)

#### GROQ features:
- Ultra-fast inference (500+ tokens/sec)
- Cost-effective
- Multiple open-source models
- High throughput

#### Embedding Dimensions:
- OpenAI text-embedding-3-small: 1536
- OpenAI text-embedding-ada-002: 1536
- HuggingFace all-MiniLM-L6-v2: 384
- HuggingFace all-mpnet-base-v2: 768

#### Note: Dimension must match between embeddings and PineCone index!

#### Search Types:
- Similarity: Basic similarity search
- mmr: Maximum Marginal Relevance (diverse results)

#### Metrics:
- cosine: Good for text (default)
- euclidean: Good for images/audio
- dotproduct: Fast but requires normalized vectors

#### Best Practices:
1. Use serverless for variable workloads
2. Use cosine similarity for text embeddings
3. Set appropriate chunk size (500-1500 tokens)
4. Use metadata filtering for multi-tenant apps
5. Monitor index stats regularly
6. Use namespaces for data isolation