In [8]:
from langchain_openai import OpenAIEmbeddings
from dotenv import load_dotenv
import os

# Load environment variables from .env file
load_dotenv()

# Now you can access them
api_key = os.getenv('OPENAI_API_KEY')

model = OpenAIEmbeddings(model='text-embedding-3-small', openai_api_key=api_key)
embeddings = model.embed_documents([
  'hi there!',
  'oh, hello!',
])

print(embeddings)


[[-0.015712833032011986, -0.031052187085151672, -0.012031389400362968, -0.02075481228530407, -0.03465360030531883, -0.03214595094323158, 0.02721068076789379, 0.06034368649125099, -0.02008788473904133, -0.04126953333616257, -0.0045784637331962585, -0.02284896746277809, 0.019020799547433853, 0.0008803455275483429, 0.01042409148067236, 0.053434304893016815, -0.004541782662272453, -0.013511969707906246, -0.02516987919807434, 0.06834682822227478, 0.048419006168842316, 0.0008407466812059283, 0.01564614102244377, 0.028597891330718994, 0.03299961984157562, 0.022382117807865143, 0.014792473055422306, 0.025329941883683205, 0.018167130649089813, -0.013158498331904411, 0.02436956577003002, -0.026517074555158615, 0.014859165996313095, -0.027450773864984512, 0.0015847886679694057, -0.01431228406727314, -0.010217343457043171, 0.031398989260196686, -0.0036214215215295553, -0.038841910660266876, -0.0031495695002377033, -0.013105143792927265, 0.03161240741610527, 0.011271090246737003, -0.039615549147129

In [9]:
# Setup PGVector and SQLRecordManager for RAG indexing
import uuid
from dotenv import load_dotenv
import os

# Load environment variables
load_dotenv()

# Get database connection URL
pgvector_url = os.getenv('PGVECTOR_URL')

if not pgvector_url:
    print("Warning: PGVECTOR_URL not found in environment variables")
    print("Please add PGVECTOR_URL=postgresql://user:password@localhost:5432/dbname to your .env file")
else:
    print(f"PGVector URL configured: {pgvector_url.split('@')[0]}@***")

PGVector URL configured: postgresql+psycopg2://langchain:langchain@***


In [5]:
# Import required libraries for RAG indexing
from langchain_community.indexes import *
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
import psycopg2
from psycopg2.extensions import ISOLATION_LEVEL_AUTOCOMMIT

In [None]:
# Initialize embeddings model
embeddings = OpenAIEmbeddings(
    model='text-embedding-3-small',
    openai_api_key=os.getenv('OPENAI_API_KEY')
)

# Define collection name for the vector store
COLLECTION_NAME = "rag_documents"
namespace = f"pgvector/{COLLECTION_NAME}"

print(f"Initializing embeddings model: text-embedding-3-small")
print(f"Collection name: {COLLECTION_NAME}")
print(f"Namespace: {namespace}")

In [None]:
# Create PGVector store instance
try:
    vectorstore = PGVector(
        embeddings=embeddings,
        collection_name=COLLECTION_NAME,
        connection=pgvector_url,
        use_jsonb=True,
    )
    print("✅ PGVector store created successfully")
    
    # Test connection
    print(f"Vector store collection: {vectorstore.collection_name}")
    
except Exception as e:
    print(f"❌ Error creating PGVector store: {e}")
    print("Make sure PostgreSQL with pgvector extension is running and accessible")

In [None]:
# Create SQLRecordManager instance for tracking indexed documents
try:
    record_manager = SQLRecordManager(
        namespace=namespace,
        db_url=pgvector_url
    )
    
    # Create the schema for record manager (run this once)
    record_manager.create_schema()
    print("✅ SQLRecordManager created successfully")
    print(f"Record manager namespace: {record_manager.namespace}")
    
except Exception as e:
    print(f"❌ Error creating SQLRecordManager: {e}")
    print("This might be normal if the schema already exists")

In [None]:
# Create sample documents for indexing
sample_documents = [
    Document(
        page_content="""
        Artificial Intelligence (AI) is a broad field of computer science focused on creating 
        intelligent machines that can perform tasks typically requiring human intelligence. 
        AI encompasses various subfields including machine learning, natural language processing, 
        computer vision, and robotics. Modern AI systems use neural networks and deep learning 
        to achieve remarkable performance in tasks like image recognition, language translation, 
        and decision-making.
        """,
        metadata={
            "source": "ai_overview.txt",
            "topic": "artificial_intelligence",
            "category": "technology",
            "author": "AI Research Team",
            "date": "2024-01-15"
        }
    ),
    
    Document(
        page_content="""
        Machine Learning is a subset of artificial intelligence that focuses on algorithms 
        and statistical models that enable computer systems to improve their performance 
        on a specific task through experience. There are three main types of machine learning: 
        supervised learning (learning with labeled data), unsupervised learning (finding 
        patterns in unlabeled data), and reinforcement learning (learning through interaction 
        with an environment). Popular algorithms include linear regression, decision trees, 
        neural networks, and support vector machines.
        """,
        metadata={
            "source": "ml_basics.txt",
            "topic": "machine_learning",
            "category": "technology",
            "author": "Data Science Team",
            "date": "2024-01-20"
        }
    ),
    
    Document(
        page_content="""
        Natural Language Processing (NLP) is a field of AI that focuses on the interaction 
        between computers and human language. NLP combines computational linguistics with 
        machine learning and deep learning to help computers understand, interpret, and 
        generate human language in a valuable way. Common NLP tasks include sentiment analysis, 
        named entity recognition, machine translation, text summarization, and question answering. 
        Modern NLP heavily relies on transformer architectures like BERT, GPT, and T5.
        """,
        metadata={
            "source": "nlp_guide.txt",
            "topic": "natural_language_processing",
            "category": "technology",
            "author": "NLP Research Lab",
            "date": "2024-01-25"
        }
    ),
    
    Document(
        page_content="""
        Vector databases are specialized databases designed to store and efficiently search 
        high-dimensional vector embeddings. They are essential for modern AI applications 
        like semantic search, recommendation systems, and retrieval-augmented generation (RAG). 
        Vector databases use advanced indexing techniques like HNSW (Hierarchical Navigable 
        Small World) or IVF (Inverted File) to enable fast similarity searches across millions 
        or billions of vectors. Popular vector databases include Pinecone, Weaviate, Qdrant, 
        and PostgreSQL with pgvector extension.
        """,
        metadata={
            "source": "vector_db_intro.txt",
            "topic": "vector_databases",
            "category": "database",
            "author": "Database Team",
            "date": "2024-02-01"
        }
    ),
    
    Document(
        page_content="""
        Retrieval-Augmented Generation (RAG) is a powerful technique that combines 
        information retrieval with text generation to create more accurate and contextual 
        responses. RAG works by first retrieving relevant documents from a knowledge base 
        using semantic search, then using these documents as context for a large language 
        model to generate responses. This approach helps reduce hallucinations and provides 
        more factual, up-to-date information. RAG systems typically consist of a vector 
        database for document storage, an embedding model for creating vector representations, 
        and a language model for generation.
        """,
        metadata={
            "source": "rag_explained.txt",
            "topic": "retrieval_augmented_generation",
            "category": "ai_techniques",
            "author": "AI Applications Team",
            "date": "2024-02-05"
        }
    )
]

print(f"Created {len(sample_documents)} sample documents for indexing")
print("Sample topics:", [doc.metadata["topic"] for doc in sample_documents])

In [None]:
# Optional: Split documents into smaller chunks for better retrieval
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
    length_function=len,
    separators=["\n\n", "\n", " ", ""]
)

# Split the documents
split_documents = []
for doc in sample_documents:
    chunks = text_splitter.split_documents([doc])
    
    # Add chunk information to metadata
    for i, chunk in enumerate(chunks):
        chunk.metadata.update({
            "chunk_id": f"{doc.metadata['source']}_chunk_{i}",
            "chunk_index": i,
            "total_chunks": len(chunks)
        })
    
    split_documents.extend(chunks)

print(f"Original documents: {len(sample_documents)}")
print(f"After splitting: {len(split_documents)} chunks")
print(f"Average chunk size: {sum(len(doc.page_content) for doc in split_documents) // len(split_documents)} characters")

In [None]:
# Index documents using the record manager for deduplication and tracking
try:
    print("🚀 Starting document indexing...")
    
    # Use the index function with record manager for smart indexing
    # This will handle deduplication and track what's been indexed
    result = index(
        docs_source=split_documents,
        record_manager=record_manager,
        vector_store=vectorstore,
        cleanup="incremental",  # Only add new/changed documents
        source_id_key="chunk_id",  # Use chunk_id as unique identifier
    )
    
    print("✅ Document indexing completed!")
    print(f"Documents added: {result['num_added']}")
    print(f"Documents updated: {result['num_updated']}")
    print(f"Documents skipped: {result['num_skipped']}")
    print(f"Documents deleted: {result['num_deleted']}")
    
except Exception as e:
    print(f"❌ Error during indexing: {e}")
    print("Make sure PGVector and SQLRecordManager are properly configured")

In [None]:
# Test the vector store with a similarity search
try:
    query = "What is machine learning and how does it work?"
    print(f"🔍 Testing similarity search with query: '{query}'")
    
    # Perform similarity search
    results = vectorstore.similarity_search(
        query=query,
        k=3  # Return top 3 most similar documents
    )
    
    print(f"\n📋 Found {len(results)} similar documents:")
    
    for i, doc in enumerate(results, 1):
        print(f"\n--- Result {i} ---")
        print(f"Source: {doc.metadata.get('source', 'Unknown')}")
        print(f"Topic: {doc.metadata.get('topic', 'Unknown')}")
        print(f"Chunk ID: {doc.metadata.get('chunk_id', 'Unknown')}")
        print(f"Content preview: {doc.page_content[:200]}...")
        
except Exception as e:
    print(f"❌ Error during similarity search: {e}")

In [None]:
# Verify indexing status and get statistics
try:
    print("📊 Vector Store Statistics:")
    print("-" * 30)
    
    # Get total document count
    all_docs = vectorstore.similarity_search("", k=100)  # Get all docs (up to 100)
    print(f"Total indexed documents: {len(all_docs)}")
    
    # Show unique topics
    topics = set(doc.metadata.get('topic', 'unknown') for doc in all_docs)
    print(f"Unique topics: {len(topics)}")
    print(f"Topics: {', '.join(sorted(topics))}")
    
    # Show sources
    sources = set(doc.metadata.get('source', 'unknown') for doc in all_docs)
    print(f"Unique sources: {len(sources)}")
    print(f"Sources: {', '.join(sorted(sources))}")
    
    # Record manager status
    print(f"\nRecord Manager Namespace: {record_manager.namespace}")
    
    print("\n✅ RAG indexing setup complete!")
    print("You can now use the vector store for retrieval-augmented generation")
    
except Exception as e:
    print(f"❌ Error getting statistics: {e}")