# Recommendation System with Pinecone Vector Database

In [1]:
import os
import sys
import time

import dotenv

# Make the project root importable so that `src.*` packages resolve.
# `__file__` is not reliably defined inside a notebook kernel, so the current
# working directory serves as the project root.
project_root = os.getcwd()
# Guard the append so re-running this cell does not add duplicate entries.
if project_root not in sys.path:
    sys.path.append(project_root)

from src.config.pinecone_config import initialize_pinecone

# Load environment variables through the dotenv module.
# This call is the last expression of the cell, so its return value is what
# the notebook displays (True in the saved output).
dotenv.load_dotenv()

  from .autonotebook import tqdm as notebook_tqdm


True

In [None]:
# Initialize Pinecone using the project helper imported from
# src.config.pinecone_config. The saved output of this cell shows the helper
# reporting that an existing index ("recommendation-index") is being used.
initialize_pinecone()

Using existing Pinecone index: recommendation-index


## 1. Load Document Data

In [3]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader

# List of URLs to load documents from.
# NOTE(review): all three entries are direct links to PDF files, while
# load_documents() fetches them with WebBaseLoader — confirm that this loader
# extracts usable text from PDFs (a PDF-specific loader may be required).
urls = [
    "https://lpi.oregonstate.edu/sites/lpi.oregonstate.edu/files/pdf/mic/micronutrients_for_health.pdf",
    "https://www.accessdata.fda.gov/scripts/InteractiveNutritionFactsLabel/assets/InteractiveNFL_Vitamins%26MineralsChart_October2021.pdf",
    "https://www.hilarispublisher.com/open-access/essential-nutrients-in-human-body.pdf",
]

# Load and split documents - this would be done only once
def load_documents(source_urls=None, chunk_size=250, chunk_overlap=0):
    """Fetch every source URL and split the loaded documents into chunks.

    Args:
        source_urls: Iterable of URLs to fetch with ``WebBaseLoader``. When
            omitted (``None``), the notebook-level ``urls`` list is used.
        chunk_size: Chunk size handed to
            ``RecursiveCharacterTextSplitter.from_tiktoken_encoder``.
        chunk_overlap: Chunk overlap handed to the same splitter.

    Returns:
        Whatever ``split_documents`` returns for the combined documents
        (the chunked documents).
    """
    # `is None` (not truthiness) so an explicitly empty list stays empty.
    if source_urls is None:
        source_urls = urls

    # Each loader call returns a list of documents; flatten them in one pass.
    docs_list = [doc for url in source_urls for doc in WebBaseLoader(url).load()]

    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    return text_splitter.split_documents(docs_list)

## 2. Process Documents and Store in Pinecone

In [None]:
from src.services.recommendation_engine import RecommendationEngine


print("Processing and storing documents in Pinecone...")

# Engine that the later cells use to store items and fetch recommendations
recommendation_engine = RecommendationEngine()

# Keep the demo small: only the first 50 entries returned by load_documents()
docs = load_documents()[:50]
print(f"Documents loaded and processed: {len(docs)}")

# Build the text list and the parallel metadata list in a single pass;
# every metadata entry is a copy tagged with its document type
contents, metadatas = [], []
for doc in docs:
    contents.append(doc.page_content)
    metadatas.append({**doc.metadata, "document_type": "nutrition_article"})

Processing and storing documents in Pinecone...
Documents loaded and processed: 50


In [5]:
# Upload to the recommendation engine in fixed-size slices rather than all at once
batch_size = 10
for start in range(0, len(contents), batch_size):
    stop = start + batch_size
    batch_contents = contents[start:stop]
    batch_metadatas = metadatas[start:stop]

    # The saved output shows the engine reusing a cached embedding for content
    # it has already seen. Note that `ids` is overwritten on every iteration,
    # so after the loop it holds only the last batch's return value.
    ids = recommendation_engine.bulk_add_items(
        contents=batch_contents,
        metadatas=batch_metadatas,
        item_types=["nutrition_document"] * len(batch_contents),
    )

print("All documents stored in Pinecone!")

Generating new embedding for: Micronutrients for Health...
Generating new embedding for: Information about the amount of vitamins and minerals...
Generating new embedding for: Vitamins & Minerals Chart...
Using cached embedding for: Micronutrients for Health...
All documents stored in Pinecone!


## 3. Make Recommendations

In [6]:
# Ask the engine for the three best matches to a free-text question
query = "What are the best sources of vitamin C?"
results = recommendation_engine.get_recommendations(query, top_k=3)

# Print a 1-based ranked list: similarity score first, then the stored content
print(f"Top recommendations for '{query}':\n")
for rank, match in enumerate(results, start=1):
    score = match['score']
    print(f"{rank}. Score: {score:.2f}")
    content = match['metadata']['content']
    print(f"Content: {content}\n")

Generating new embedding for: What are the best sources of vitamin C?...
Top recommendations for 'What are the best sources of vitamin C?':

1. Score: 0.82
Content: Information about the amount of vitamins and minerals you should consume based on your personal calorie level. Information about the impact of vitamins and minerals on your health—and the impact your lifestyle may have on your nutritional needs.

2. Score: 0.78
Content: Micronutrients for Health

3. Score: 0.71
Content: Vitamins & Minerals Chart



## 4. Test Caching Functionality

In [7]:
# Use the same query again to demonstrate caching; the saved output shows the
# engine reporting "Using cached embedding" for it.
# Time the call so the speed claim printed below is backed by a measurement
# instead of being asserted without evidence.
started = time.perf_counter()
results = recommendation_engine.get_recommendations(query, top_k=3)
elapsed_seconds = time.perf_counter() - started

print("Same query again demonstrates the caching mechanism!\n")
print("Retrieval times should be much faster now that embeddings are cached")
print(f"Repeated query took {elapsed_seconds:.3f} seconds")

Using cached embedding for: What are the best sources of vitamin C?...
Same query again demonstrates the caching mechanism!

Retrieval times should be much faster now that embeddings are cached
