In [41]:
#pip install "pymongo[srv]"==3.12
#pip install --quiet pymongo gpt4all sentence_transformers
from pymongo import MongoClient
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import pymongo


In [1]:
# Read the Atlas URI from the ATLAS_CONNECTION_STRING environment variable so a
# real credential never has to be saved inside the notebook. The original
# placeholder is kept as the fallback when the variable is not set.
import os

ATLAS_CONNECTION_STRING = os.environ.get("ATLAS_CONNECTION_STRING", "connstringhere")


In [23]:
def pad_embedding(embedding, target_dim=3072):
    """Return ``embedding`` as a new list holding exactly ``target_dim`` values.

    Inputs shorter than ``target_dim`` are right-padded with ``0.0``; longer
    inputs are truncated. Besides plain lists, tuples and other iterables are
    accepted, as are array-like objects exposing ``tolist()`` (with the old
    ``embedding + [0.0] * n`` form a tuple raised ``TypeError`` and a NumPy
    array was added element-wise instead of being extended).

    Args:
        embedding: Iterable of numbers, or an object with a ``tolist()`` method.
        target_dim: Desired length of the result. Must be non-negative.

    Returns:
        A new list of length ``target_dim``. The input is never mutated and is
        never returned as-is, so callers may modify the result freely.

    Raises:
        ValueError: If ``target_dim`` is negative.
    """
    if target_dim < 0:
        raise ValueError(f"target_dim must be non-negative, got {target_dim}")

    # Prefer tolist() so array-likes yield plain Python values; list() copies
    # everything else, which also avoids aliasing the caller's list.
    if hasattr(embedding, "tolist"):
        values = list(embedding.tolist())
    else:
        values = list(embedding)

    shortfall = target_dim - len(values)
    if shortfall > 0:
        values.extend([0.0] * shortfall)
        return values
    # Slicing is a no-op when the length already matches and truncates otherwise.
    return values[:target_dim]

def get_embedding(text):
    """Embed ``text`` with the notebook-level ``model`` and size the vector to 3072 values.

    The encoder output is converted to a plain list and handed to
    ``pad_embedding`` with ``target_dim=3072``.

    NOTE: a later cell defines another ``get_embedding`` that returns the raw
    vector wrapped in a list; whichever definition ran last is the one in effect.
    """
    raw_vector = model.encode(text)
    as_list = raw_vector.tolist()
    return pad_embedding(as_list, target_dim=3072)


In [36]:

# Connect to your local Atlas deployment or Atlas cluster and bind the
# "Article" collection of the "contentDeliveryApi" database.
client = MongoClient(ATLAS_CONNECTION_STRING)
collection = client["contentDeliveryApi"]["Article"]
# Optional: download the embedding model once, save it locally, and reload it from disk
# (model card: https://huggingface.co/mixedbread-ai/mxbai-embed-large-v1).
# NOTE(review): these lines are disabled, so `model` is not defined by this cell;
# get_embedding() relies on the `model` assigned in a later cell — confirm the
# intended execution order.
#model_path = "<model-path>"
#model = SentenceTransformer('mixedbread-ai/mxbai-embed-large-v1')
#model.save(model_path)
#model = SentenceTransformer(model_path)
# Define function that runs a vector search for a text query
def get_query_results(query, limit=5):
    """Run a vector search for ``query`` and return the best-matching articles.

    The query text is embedded with ``get_embedding`` and sent as the
    ``queryVector`` of a ``$vectorSearch`` stage over the ``vector_index``
    index, matching against the ``jsonBody.content_embedding`` field.

    NOTE: this needs the ``get_embedding`` variant that returns a flat list of
    numbers. A later cell redefines ``get_embedding`` to return the vector
    wrapped in a list; if that definition is the active one, the query vector
    sent here will have the wrong shape.

    Args:
        query: Text to search for.
        limit: Maximum number of documents to return. Defaults to 5, the value
            that used to be hard-coded.

    Returns:
        A list of result documents projected to ``headline``, ``urlSlug`` and
        ``score`` (the ``vectorSearchScore`` metadata), with ``_id`` excluded.
    """
    pipeline = [
        {
            "$vectorSearch": {
                "index": "vector_index",  # Name of the vector index to search
                "queryVector": get_embedding(query),
                "path": "jsonBody.content_embedding",  # Field holding the stored embeddings
                "exact": True,
                "limit": limit,
            }
        },
        {
            "$project": {
                "_id": 0,
                "headline": 1,
                "urlSlug": 1,
                "score": {"$meta": "vectorSearchScore"},  # Relevance score of each hit
            }
        },
    ]

    # Drain the cursor so callers get a plain list rather than a live cursor.
    return list(collection.aggregate(pipeline))


In [32]:
# Fetch a single article (no filter is given) so its field layout can be inspected.
sample_doc = collection.find_one()
print(sample_doc)  # Prints the whole document; the output can be very long


{'_id': '700000683', 'cmsId': '6QLF53JO6JA5LFCQ52CWLHU7JM', 'site': 'DEFAULT', 'subtype': 'DEFAULT', 'paywallStatus': 'DEFAULT', 'redirectUrl': None, 'urlSlug': 'static-images', 'headline': 'Swinging Bridge at Jay Cooke State Park', 'searchHeadline': 'Swinging Bridge at Jay Cooke State Park', 'homepageHeadline': 'Swinging Bridge at Jay Cooke State Park', 'socialHeadline': 'Swinging Bridge at Jay Cooke State Park', 'dek': 'This is a TEST Story with Inline Images', 'searchDek': 'This is a TEST Story with Inline Images', 'jsonBody': [{'__typename': 'BodyParagraph', 'content': 'There have been five versions of the heavily tramped Swinging Bridge at Jay Cooke State Park in Carlton, but there is one number that’s relevant this summer: 100.', 'orientation': 'LEFT', 'html': '<p id="UMGNHV2Q5FGRPD6K3LCF2MQOZI" class="text-left">There have been five versions of the heavily tramped Swinging Bridge at Jay Cooke State Park in Carlton, but there is one number that’s relevant this summer: 100.</p>'},

In [38]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import pymongo

In [69]:

from pymongo import MongoClient
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import pymongo

# Load the embedding model. Among the cells shown here this is the only active
# assignment of `model`, so both get_embedding() definitions depend on this
# line having been executed first.
model = SentenceTransformer('mixedbread-ai/mxbai-embed-large-v1')

# Function to generate embeddings
def get_embedding(text):
    """Encode ``text`` with the global ``model`` and return it as a one-row batch.

    The encoder output is wrapped in a single-element list so it can be passed
    directly to ``cosine_similarity``, which expects 2-D input.

    NOTE: this shadows the ``get_embedding`` from an earlier cell, which returns
    a flat, zero-padded list instead; code written for that version (such as
    ``get_query_results``) will receive this shape once this cell has run.
    """
    vector = model.encode(text)
    return [vector]

# Re-create the MongoDB client and collection handle (same database and
# collection as the earlier connection cell).
client = pymongo.MongoClient(ATLAS_CONNECTION_STRING)
collection = client["contentDeliveryApi"]["Article"]

# Example query; its embedding comes from the get_embedding defined in this
# cell, i.e. a one-element list wrapping the encoded vector.
query = "bridge"
query_embedding = get_embedding(query)

# Use MongoDB Atlas Search ($search stage) to pre-select candidate articles by
# text match; the embedding-based ranking happens afterwards in Python.
pipeline = [
    {
        "$search": {
            "index": "bridex",  # Replace with your actual index name
            "text": {
                "query": query,
                "path": "headline",  # Currently matches against the headline field only
                #"path": ["jsonBody.content", "searchHeadline"],  # Alternative: match paragraph text and the search headline
                "fuzzy": {
                    "maxEdits": 2  # Fuzzy-matching tolerance; see the Atlas Search text operator docs
                }
            }
        }
    },
    {
        "$limit": 50  # Keep at most 50 candidate documents
    }
]

# Execute the aggregation pipeline and materialize the candidates as a list
documents = list(collection.aggregate(pipeline))

# Function to calculate similarity between query embedding and document content
def calculate_similarity(query_embedding, document_content):
    """Return the cosine similarity between a query embedding and a piece of text.

    ``document_content`` is embedded with ``get_embedding`` and compared to
    ``query_embedding`` via ``cosine_similarity``; the single entry at
    ``[0][0]`` of the resulting matrix is returned.
    """
    return cosine_similarity(query_embedding, get_embedding(document_content))[0][0]

# Score every paragraph of every candidate article against the query.
# Each entry is a (document id, paragraph text, similarity score) tuple.
similar_documents = []

for doc in documents:
    body = doc.get('jsonBody')
    if not isinstance(body, list):
        continue  # No paragraph list to score for this article
    for paragraph in body:
        # Only dict entries carrying non-empty text can be embedded; other
        # body entries are skipped instead of raising or scoring blank text.
        if not isinstance(paragraph, dict):
            continue
        content = paragraph.get('content')
        if not isinstance(content, str) or not content.strip():
            continue
        similarity = calculate_similarity(query_embedding, content)
        similar_documents.append((doc['_id'], content, similarity))

# Highest similarity first
similar_documents.sort(key=lambda x: x[2], reverse=True)

# Keep the five best-scoring paragraphs (a slice already yields a new list)
col = similar_documents[:5]

# Print the top 5 similar paragraphs
for doc in col:
    print(f"Document ID: {doc[0]}, Content: {doc[1]}, Similarity Score: {doc[2]}")

Document ID: 700000683, Content: Envision Jay Cooke and the mind’s eye goes to standing on the iconic suspension bridge over the churning, and at times mesmerizing, St. Louis River and its gorge. First built in 1924 by the U.S. Forest Service, the bridge is marking its 100 year with special programs that capture its rise, its fall (more than once), and rebirth. It’s that longevity and resilience that make stepping on it special, said Kris Hiller, the park’s interpretive naturalist., Similarity Score: 0.5928773283958435
Document ID: 700000683, Content: Those first users in 1924 were on a saggy, wooden boardwalk, 18 feet above the river. (Today, the bridge sits about 25 feet above the river.), Similarity Score: 0.5780940055847168
Document ID: 700000683, Content: “When you are walking across the bridge, you are connecting to other generations of visitors who have been walking across this bridge for 100 years,” she said. “I think it just is very cool to think about that.”, Similarity Score

In [68]:
# Display the top-5 (document id, paragraph text, similarity) tuples collected
# by the ranking cell. NOTE(review): this cell's execution count (68) is lower
# than the ranking cell's (69), so the recorded output presumably predates the
# latest ranking run — re-run after the ranking cell to refresh it.
col

[]