In [4]:
import json
import os
from typing import List
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct
from langchain_ollama import OllamaEmbeddings
from langchain_qdrant import QdrantVectorStore
from langchain_core.documents import Document
from tqdm import tqdm
import uuid

In [5]:
def load_book_chunks(file_path: str) -> List[dict]:
    """Load chunks from the JSON file."""
    print(f"📚 Loading chunks from {file_path}")
    
    with open(file_path, 'r', encoding='utf-8') as f:
        chunks = json.load(f)
    
    print(f"✅ {len(chunks)} chunks loaded")
    return chunks

def convert_to_documents(chunks: List[dict]) -> List[Document]:
    """Convert JSON chunks to LangChain Documents."""
    print("🔄 Converting to LangChain Documents...")
    
    documents = []
    for chunk in tqdm(chunks, desc="Conversion"):
        # Create the Document with content and metadata
        doc = Document(
            page_content=chunk["page_content"],
            metadata=chunk["metadata"]
        )
        documents.append(doc)
    
    print(f"✅ {len(documents)} documents created")
    return documents

def setup_qdrant_with_books(
    qdrant_url: str = "http://localhost:6333",
    collection_name: str = "puppy_books",
    embedding_model: str = "mxbai-embed-large",
    ollama_base_url: str = "http://localhost:11434"
):
    """Set up QDrant and embeddings."""
    
    print("🔧 Setting up QDrant and embeddings...")
    
    # QDrant Client
    client = QdrantClient(url=qdrant_url)
    
    # Embeddings via Ollama
    embeddings = OllamaEmbeddings(
        base_url=ollama_base_url,
        model=embedding_model
    )
    
    # Test embeddings
    print("🧪 Testing embeddings...")
    test_embedding = embeddings.embed_query("test embedding")
    embedding_dim = len(test_embedding)
    print(f"✅ Embeddings OK (dimension: {embedding_dim})")
    
    # Check if collection exists, if not create it
    try:
        client.get_collection(collection_name)
        print(f"📝 Collection '{collection_name}' already exists")
    except Exception:
        print(f"📝 Creating collection '{collection_name}'...")
        client.create_collection(
            collection_name=collection_name,
            vectors_config=VectorParams(size=embedding_dim, distance=Distance.COSINE)
        )
        print(f"✅ Collection '{collection_name}' created")
    
    # VectorStore
    vectorstore = QdrantVectorStore(
        client=client,
        collection_name=collection_name,
        embedding=embeddings
    )
    
    return vectorstore, client, embedding_dim

def populate_qdrant(
    file_path: str,
    qdrant_url: str = "http://localhost:6333",
    collection_name: str = "puppy_books",
    embedding_model: str = "mxbai-embed-large",
    ollama_base_url: str = "http://localhost:11434",
    batch_size: int = 50  # Process in batches to avoid timeouts
):
    """Populate QDrant with book chunks."""
    
    print("🐕 === POPULATING QDRANT WITH PUPPY BOOKS ===\n")
    
    # 1. Load data
    chunks = load_book_chunks(file_path)
    
    # 2. Convert to Documents
    documents = convert_to_documents(chunks)
    
    # 3. Initial client for collection management
    client = QdrantClient(url=qdrant_url)
    
    # 4. Check if the collection already exists and delete if needed
    try:
        collection_info = client.get_collection(collection_name)
        print(f"⚠️  Collection '{collection_name}' already exists with {collection_info.points_count} points")
        
        response = input("Do you want to delete and recreate it? (y/N): ")
        if response.lower() == 'y':
            client.delete_collection(collection_name)
            print("🗑️  Collection deleted")
        else:
            print("➕ Adding new documents to the existing collection")
    except Exception:
        print(f"📝 Collection '{collection_name}' doesn't exist yet")
    
    # 5. Setup QDrant after potential deletion
    vectorstore, client, embedding_dim = setup_qdrant_with_books(
        qdrant_url, collection_name, embedding_model, ollama_base_url
    )
    
    # 6. Add documents in batches
    print(f"\n📥 Adding {len(documents)} documents to QDrant...")
    print(f"💡 Processing in batches of {batch_size} documents")
    
    total_added = 0
    for i in tqdm(range(0, len(documents), batch_size), desc="Batch adding"):
        batch = documents[i:i + batch_size]
        
        try:
            vectorstore.add_documents(batch)
            total_added += len(batch)
            
            # Progress display
            if (i // batch_size + 1) % 5 == 0:  # Every 5 batches
                print(f"   📊 {total_added}/{len(documents)} documents added")
                
        except Exception as e:
            print(f"❌ Error while adding batch {i//batch_size + 1}: {e}")
            continue
    
    # 7. Final check
    try:
        collection_info = client.get_collection(collection_name)
        print(f"\n✅ Population complete!")
        print(f"📊 Collection '{collection_name}': {collection_info.points_count} points")
        print(f"📏 Vector dimension: {embedding_dim}")
        
        # Search test
        print("\n🔍 Search test...")
        test_results = vectorstore.similarity_search(
            "how to choose a puppy", 
            k=3
        )
        
        print(f"✅ Search test OK: {len(test_results)} results found")
        for i, doc in enumerate(test_results[:2]):
            print(f"   📄 Result {i+1}: {doc.page_content[:100]}...")
            print(f"      📚 Source: {doc.metadata.get('book_title', 'N/A')}")
        
        return True
        
    except Exception as e:
        print(f"❌ Error during verification: {e}")
        return False

def search_books(
    query: str,
    collection_name: str = "puppy_books",
    k: int = 5,
    qdrant_url: str = "http://localhost:6333",
    embedding_model: str = "mxbai-embed-large",
    ollama_base_url: str = "http://localhost:11434"
):
    """Utility function to test searches."""
    
    # Setup
    client = QdrantClient(url=qdrant_url)
    embeddings = OllamaEmbeddings(
        base_url=ollama_base_url,
        model=embedding_model
    )
    vectorstore = QdrantVectorStore(
        client=client,
        collection_name=collection_name,
        embedding=embeddings
    )
    
    # Search
    results = vectorstore.similarity_search_with_score(query, k=k)
    
    print(f"\n🔍 Search: '{query}'")
    print(f"📊 {len(results)} results found\n")
    
    for i, (doc, score) in enumerate(results, 1):
        print(f"📄 Result {i} (Score: {score:.3f})")
        print(f"   📚 Book: {doc.metadata.get('book_title', 'N/A')}")
        print(f"   📄 Page: {doc.metadata.get('page', 'N/A')}")
        print(f"   📝 Content: {doc.page_content[:200]}...")
        print()

In [None]:
# Configuration
FILE_PATH = "all_books_preprocessed_chunks.json"  # Adjust the path if needed

# Check if the file exists
if not os.path.exists(FILE_PATH):
    print(f" File not found: {FILE_PATH}")


# Populate QDrant
populate_qdrant(FILE_PATH)



In [6]:
query = input("Type your question about puppies: ")
search_books(query)


🔍 Search: ''
5 results found

Result 1 (Score: 0.479)
Book: Puppies For Dummies
Page: 30
Content: In­Chapter 6,­I­also­focus­on­describing­your­puppy’s­daily­needs­and­how­to­ structure­a­schedule­around­them.­Knowing­how­your­puppy­likes­to­organize­ their­day­takes­the­guesswork­out­of­this­expe...

Result 2 (Score: 0.462)
Book: Don't Shoot the Dog
Page: 113
Content: when compliance is obtained; reinforce even halfhearted efforts at ﬁrst.) Shine a strong light on doghouse when dog barks. Turn the light off when the dog stops barking. When the decibel level meets t...

Result 3 (Score: 0.460)
Book: Don't Shoot the Dog
Page: 127
Content: Kids too noisy in the car. Wait for a quiet time and then say "You all

have been so quiet today that I'm going to stop at McDonald's." (Say this right near

Surly bus driver is rude to you and makes ...

Result 4 (Score: 0.458)
Book: Don't Shoot the Dog
Page: 131
Content: messy person could shape the tidy one to be more casual. Barking dogs are lonel