In [1]:
import pymongo
import logging
import os
import torch
from datetime import datetime
import time
from haystack.components.embedders import SentenceTransformersDocumentEmbedder
from haystack import Pipeline, Document
from haystack.document_stores.types import DuplicatePolicy
from haystack.components.writers import DocumentWriter
from haystack.components.generators import OpenAIGenerator
from haystack.components.builders.prompt_builder import PromptBuilder
from haystack.components.embedders import SentenceTransformersDocumentEmbedder, SentenceTransformersTextEmbedder
from haystack_integrations.document_stores.mongodb_atlas import MongoDBAtlasDocumentStore
from haystack_integrations.components.retrievers.mongodb_atlas import MongoDBAtlasEmbeddingRetriever
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever
from pymongo import MongoClient, errors
from typing import List, Dict, Any
from getpass import getpass
from haystack.components.generators import OpenAIGenerator
from haystack.document_stores.types import DuplicatePolicy
import traceback

In [25]:
os.environ['MONGO_CONNECTION_STRING'] = ''
os.environ['OPENAI_API_KEY'] = ''

In [1]:
from bs4 import BeautifulSoup
import re
from typing import List, Dict, Union
import logging
from datetime import datetime
from pymongo import MongoClient, errors
import time
import traceback
import os

logger = logging.getLogger(__name__)

def clean_html_content(text: str) -> str:
    """
    Clean HTML and unnecessary formatting from text content.
    
    Args:
        text (str): Raw HTML text to clean
        
    Returns:
        str: Cleaned text with HTML removed and formatting standardized
    """
    if not text:
        return ""
        
    # Remove HTML tags
    soup = BeautifulSoup(text, 'html.parser')
    text = soup.get_text(separator=' ')
    
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text)
    
    # Remove special characters but keep basic punctuation
    text = re.sub(r'[^\w\s.,!?-]', '', text)
    
    return text.strip()

def extract_body_paragraphs(json_body: List[Dict]) -> str:
    """
    Extract and combine content from body paragraphs in the JSON body.
    Includes HTML cleaning for paragraph content.
    
    Args:
        json_body: List of JSON body objects
    Returns:
        Combined string of all cleaned paragraph contents
    """
    if not isinstance(json_body, list):
        logger.warning(f"Expected list, got {type(json_body)}")
        return ""
    
    paragraphs = []
    for item in json_body:
        if (isinstance(item, dict) and 
            item.get('__typename') == 'BodyParagraph' and 
            item.get('content')):
            # Clean the HTML content before adding to paragraphs
            cleaned_content = clean_html_content(item['content'])
            if cleaned_content:
                paragraphs.append(cleaned_content)
    
    # Join all paragraphs with space
    return " ".join(paragraphs)

def extract_metadata(doc: Dict) -> Dict:
    """
    Extract required metadata fields from the document.
    Includes HTML cleaning for text fields.
    
    Args:
        doc: MongoDB document
    Returns:
        Dictionary containing cleaned metadata fields
    """
    return {
        'original_id': str(doc['_id']),
        'headline': clean_html_content(doc.get('headline', '')),
        'dek': clean_html_content(doc.get('dek', '')),
        'searchDek': clean_html_content(doc.get('searchDek', ''))
    }

def vectorize_mongodb_documents(
    embedding_model: str = "intfloat/e5-base-v2",
    database_name: str = "contentDeliveryApi",
    collection_name: str = "Article",
    max_documents: int = None
) -> Dict[str, Dict]:
    """
    Retrieve documents from MongoDB, combine all paragraphs for each document,
    clean HTML content, and create vector embeddings stored in a dictionary, 
    including metadata.
    
    Args:
        embedding_model: Name of the embedding model to use
        database_name: Name of the MongoDB database
        collection_name: Name of the MongoDB collection
        max_documents: Maximum number of documents to process (None for all)
    
    Returns:
        Dict[str, Dict]: Dictionary with document IDs as keys and values containing:
            - content: str (combined document text)
            - embedding: List[float] (vector embedding)
            - meta: Dict (additional metadata)
    """
    start_time = time.time()
    embeddings_dict = {}

    # Get MongoDB connection string from environment variable
    mongodb_uri = os.getenv('MONGO_CONNECTION_STRING')
    if not mongodb_uri:
        raise ValueError("MONGO_CONNECTION_STRING environment variable is not set")
    
    try:
        # Connect to MongoDB
        client = MongoClient(mongodb_uri)
        db = client[database_name]
        collection = db[collection_name]

        # Query filter
        query_filter = {
            "displayDate": {"$gte": datetime(2024, 1, 1), "$lte": datetime(2024, 12, 31)},
            "distributor.type": "staff",
            "jsonBody": {"$exists": True, "$ne": []}
        }

        # Projection to retrieve only necessary fields
        projection = {
            "_id": 1,
            "jsonBody": 1,
            "headline": 1,
            "searchDek": 1,
            "dek": 1
        }

        # Count matching documents
        total_docs = collection.count_documents(query_filter)
        logger.info(f"Total matching documents: {total_docs}")

        # Adjust total documents based on max_documents
        if max_documents and max_documents < total_docs:
            total_docs = max_documents

        # Initialize embedder
        doc_embedder = SentenceTransformersDocumentEmbedder(model=embedding_model)
        doc_embedder.warm_up()

        # Process documents in batches
        documents = []
        processed_docs = 0
        error_docs = 0
        
        cursor = collection.find(query_filter, projection).batch_size(100)
        if max_documents:
            cursor = cursor.limit(max_documents)

        # Start processing
        for doc in cursor:
            try:
                combined_text = extract_body_paragraphs(doc.get("jsonBody", []))
                
                if combined_text:  # Skip if no content
                    documents.append(Document(
                        content=combined_text,
                        meta=extract_metadata(doc)
                    ))
                
                processed_docs += 1
                
                # Batch process every 1,000 documents
                if len(documents) >= 1000:
                    docs_with_embeddings = doc_embedder.run(documents)
                    for doc_with_embedding in docs_with_embeddings["documents"]:
                        embeddings_dict[doc_with_embedding.meta["original_id"]] = {
                            "content": doc_with_embedding.content,
                            "embedding": doc_with_embedding.embedding,
                            "meta": doc_with_embedding.meta
                        }
                    documents = []  # Clear the batch

            except Exception as doc_error:
                error_docs += 1
                logger.error(f"Error processing document {doc.get('_id')}: {doc_error}")
                logger.error(traceback.format_exc())

            # Break early if max_documents reached
            if max_documents and processed_docs >= max_documents:
                break

            # Log progress every 1,000 documents
            if processed_docs % 1000 == 0:
                elapsed_time = time.time() - start_time
                logger.info(f"Processed {processed_docs}/{total_docs} documents in {elapsed_time:.2f} seconds")

        # Process remaining documents
        if documents:
            docs_with_embeddings = doc_embedder.run(documents)
            for doc_with_embedding in docs_with_embeddings["documents"]:
                embeddings_dict[doc_with_embedding.meta["original_id"]] = {
                    "content": doc_with_embedding.content,
                    "embedding": doc_with_embedding.embedding,
                    "meta": doc_with_embedding.meta
                }

        total_time = time.time() - start_time
        logger.info(f"Completed processing in {total_time / 60:.2f} minutes")
        logger.info(f"Total docs processed: {processed_docs}, Errors: {error_docs}")
        
        return embeddings_dict

    except errors.ConnectionFailure as e:
        logger.error(f"Failed to connect to MongoDB: {e}")
        raise
    except Exception as e:
        logger.error(f"Unexpected error: {e}")
        logger.error(traceback.format_exc())
        raise

In [3]:
#embeddings_dict = {}
embeddings_dict= vectorize_mongodb_documents()

ValueError: MONGO_CONNECTION_STRING environment variable is not set

In [17]:
import pandas as pd

In [19]:
pd.DataFrame.from_dict(embeddings_dict, orient='index',
                       columns=['content', 'embedding', 'meta'])

Unnamed: 0,content,embedding,meta
