In [1]:
import os
import time
from dotenv import load_dotenv
import pymongo
from llama_index.core.vector_stores.types import VectorStore
from typing import List
from sentence_transformers import SentenceTransformer
import numpy as np

#Data Loaders
from llama_index.core import SimpleDirectoryReader
from llama_index.readers.github import GithubClient,GithubRepositoryReader
from llama_index.readers.file import PDFReader
from llama_index.core.node_parser import SentenceSplitter
#Indices and Storage
import pymongo
from llama_index.storage.kvstore.mongodb import MongoDBKVStore as MongoDBCache
from llama_index.storage.docstore.mongodb import MongoDocumentStore
from llama_index.vector_stores.mongodb import MongoDBAtlasVectorSearch
#Pipeline
from llama_index.core.ingestion import IngestionPipeline, IngestionCache, DocstoreStrategy
#Vector Embedding Model
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

  from .autonotebook import tqdm as notebook_tqdm


In [5]:

class SentenceTransformerEmbeddings:
    
    def __init__(self, model_name='all-MiniLM-L6-v2'):
        self.model_name = model_name
        self.model = SentenceTransformer(self.model_name)
    
    def get_embeddings(self, sentences):
        return self.model.encode(sentences).tolist()
    
class MongoVectorStore(VectorStore):
    def __init__(self, db_name, collection_name, host, port):
        self.mongo_client = pymongo.MongoClient(host, port)
        self.db = self.mongo_client[db_name]
        self.collection = self.db[collection_name]
        self.embedding_model = SentenceTransformerEmbeddings()

    def add(self, texts: List[str]):
        embeddings = self.embedding_model.get_embeddings(texts)
        for line, embedding in zip(texts, embeddings):
            self.collection.insert_one({"text": line, "embedding": embedding})

    def cosine_similarity(self, vec1, vec2):
        vec1 = np.array(vec1)
        vec2 = np.array(vec2)
        return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))

    def similarity_search(self, query: str, k: int):
        query_embedding = self.embedding_model.get_embeddings([query])[0]
        all_docs = self.collection.find({})
        
        similarities = []
        for doc in all_docs:
            embedding = doc['embedding']
            similarity = self.cosine_similarity(query_embedding, embedding)
            similarities.append((doc['text'], similarity))

        # Sort the results based on similarity in descending order
        similarities.sort(key=lambda x: x[1], reverse=True)

        # Return the top k most similar results
        return [text for text, similarity in similarities[:k]]

# Example usage
vector_store = MongoVectorStore("test", "test", "localhost", 27017)
vector_store.add(["This is a test", "Another test", "Another test 2"])

In [6]:

# Search for similar texts
results = vector_store.similarity_search("is", 2)
print(results)


['This is a test', 'This is a test']


In [None]:

print('Initialize')

load_dotenv(os.path.join(os.path.dirname(__file__), 'config', '.env'))

MONGODB_URL          = os.getenv('MONGODB_URL')
MONGODB_DBNAME       = os.getenv('MONGODB_DBNAME')
MONGODB_COLLECTION_NAME       = os.getenv('MONGODB_COLLECTION_NAME')
MONGODB_CLIENT       = pymongo.MongoClient(MONGODB_URL)
MONGODB_CACHE        = IngestionCache(cache=MongoDBCache(mongo_client=MONGODB_CLIENT, db_name = MONGODB_DBNAME))
MONGODB_DOCSTORE     = MongoDocumentStore.from_uri(uri=MONGODB_URL, db_name = MONGODB_DBNAME)
LOGS_DIR             = os.getenv('LOGS_DIR')
EMBED_MODEL          = 'BAAI/bge-small-en-v1.5'
EMBEDDINGS           = HuggingFaceEmbedding(model_name=EMBED_MODEL)
print('->Ingestion Data Sources:')
print('  LOGS_DIR(SimpleDirectoryReader)  = '+LOGS_DIR)
print('->Embedding Model:')
print('  HuggingFaceEmbedding','->',EMBED_MODEL)
print('->Storage:')
print('  MongoDB','->',MONGODB_DBNAME)

def ingest_logs():
  print('->Ingest Logs')
  start         = time.time()
  splitter      = SentenceSplitter(chunk_size=280, chunk_overlap=30)
  documents     = SimpleDirectoryReader(LOGS_DIR, filename_as_id = True).load_data()
  pipeline      = IngestionPipeline(
                        transformations   = [splitter,EMBEDDINGS],
                        vector_store      = MongoDBAtlasVectorSearch(
                                              mongodb_client  = MONGODB_CLIENT,
                                              db_name         = MONGODB_DBNAME,
                                              collection_name = MONGODB_COLLECTION_NAME,
                                              vector_index_name      = 'logs_idx'),
                        cache             = MONGODB_CACHE,
                        docstore          = MONGODB_DOCSTORE,
                        docstore_strategy = DocstoreStrategy.UPSERTS,
                  )
  nodes         = pipeline.run(documents = documents)
  end           = time.time()
  print(f'  Total Time = {end-start}', f'Total Documents = {len(documents)}', f'Total Nodes = {len(nodes)}')

ingest_logs()

print('Manually create atlas vector search index:','logs_idx')

In [None]:
  def add(self, texts: List[str]):
        
        embeddings = self.embedding_model.get_embeddings(texts)
        embeddings_np = np.array(embeddings).astype('float32')
        
        chunk_size = 600  
        documents = []
        for i in range(0, len(texts), chunk_size):
            chunk_texts = texts[i:i + chunk_size]
            chunk_embeddings = embeddings_np[i:i + chunk_size]

            for line, embedding in zip(chunk_texts, chunk_embeddings):
                embedding_list = embedding.tolist()
                documents.append({"text": line, "embedding": embedding_list})
        num_chunks = (len(embeddings_np) + chunk_size - 1) // chunk_size
        chunks = [embeddings_np[i * chunk_size:(i + 1) * chunk_size] for i in range(num_chunks)]
        return chunks
        if documents:
            self.collection.insert_many(documents, ordered=False)
            self.faiss_idx.add(embeddings_np)
