In [None]:
import os
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from llama_index import SimpleDirectoryReader

In [None]:
import dotenv
# Load variables from a .env file into the process environment.
# Presumably this is where MONGODB_URL (read via os.getenv later) comes from — confirm.
dotenv.load_dotenv()

In [None]:
class PDFToSentenceEmbedding():
    """Build sentence-embedding records for the documents loaded from a file.

    Documents are read with ``SimpleDirectoryReader`` and their text is
    encoded with the ``BAAI/bge-small-en-v1.5`` SentenceTransformer model.
    """

    def __init__(self):
        """Load the embedding model that every encode call reuses."""
        self.model = SentenceTransformer('BAAI/bge-small-en-v1.5')

    def load_document(self, file_path):
        """Read ``file_path`` and return the document objects the reader yields."""
        reader = SimpleDirectoryReader(input_files=[file_path])
        return reader.load_data()

    def generate_embedding(self, file_path):
        """Embed every document loaded from ``file_path``.

        Returns:
            A list with one dict per document, in load order, with the keys
            ``fileName``, ``textIdx``, ``pageLabel``, ``text`` and
            ``embedding`` (the embedding row converted with ``tolist()``).
        """
        loaded_docs = self.load_document(file_path)
        # Encode all texts in one call; the model normalises each vector.
        vectors = self.model.encode(
            [entry.text for entry in loaded_docs],
            normalize_embeddings=True,
        )

        records = []
        for position, entry in enumerate(loaded_docs):
            records.append({
                "fileName": entry.metadata['file_name'],
                "textIdx": position,
                "pageLabel": entry.metadata['page_label'],
                "text": entry.text,
                "embedding": vectors[position].tolist(),
            })
        return records

    def __call__(self, file_path):
        """Shorthand for ``generate_embedding(file_path)``."""
        return self.generate_embedding(file_path)

In [None]:
# Instantiate the embedder once; its constructor loads the SentenceTransformer model.
embedding_generator = PDFToSentenceEmbedding()

In [None]:
# Calling the instance runs generate_embedding and returns one record (dict) per loaded document.
doc_meta_list = embedding_generator("./eBook-How-to-Build-a-Career-in-AI.pdf")

In [None]:
# Query whose embedding is compared against the document embeddings below.
question = "What are steps to take when finding projects to build your experience?"

In [None]:
# Encode the question with the same model and the same normalize_embeddings=True
# setting that generate_embedding uses for the document texts.
question_embedding = embedding_generator.model.encode(
    question, normalize_embeddings=True)

In [None]:
# One row per document record; columns come from the record keys
# (fileName, textIdx, pageLabel, text, embedding).
df_tmp = pd.DataFrame(doc_meta_list)

In [None]:
# Attach the (same) question embedding to every row of the frame.
df_tmp['question'] = [question_embedding] * len(df_tmp)

In [None]:
# Score every document against the question with a single matrix-vector
# product instead of a row-wise apply. Both the document texts and the
# question were encoded with normalize_embeddings=True, so the dot product
# is the cosine similarity.
df_tmp['similarity'] = (
    np.array(df_tmp['embedding'].tolist()) @ np.asarray(question_embedding)
)

In [None]:
# Most similar documents first.
df_tmp = df_tmp.sort_values("similarity", ascending=False)

In [None]:
# Show the text of the first five rows (the highest-scoring ones once sorted).
df_tmp['text'].head(5).tolist()

In [None]:
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi

In [None]:
# MongoDB connection string taken from the environment (None when the variable is unset).
MONGODB_URL = os.environ.get("MONGODB_URL")

In [None]:

# Build the MongoDB client for the deployment at MONGODB_URL, declaring
# Stable API version '1' for the session.
mongo_client = MongoClient(
    host=MONGODB_URL,
    server_api=ServerApi('1'),
)

In [None]:
# Handle to the "RAG" database on the connected client.
db = mongo_client["RAG"]

In [None]:
# Handle to the "Document" collection inside the RAG database.
collection = db["Document"]

In [None]:
# Fetch a single stored document whose fileName matches the eBook, if there is one.
results = collection.find_one(
    {"fileName": "eBook-How-to-Build-a-Career-in-AI.pdf"})

In [None]:
# Display the outcome of the find_one lookup above.
results

In [None]:
# Register the file only when no document with this fileName exists yet.
# A plain insert_one would add another duplicate every time the notebook is
# re-run; the upsert makes this cell idempotent. $setOnInsert is applied only
# when the upsert actually inserts, so an existing document is left untouched.
collection.update_one(
    {'fileName': "eBook-How-to-Build-a-Career-in-AI.pdf"},
    {"$setOnInsert": {'fileName': "eBook-How-to-Build-a-Career-in-AI.pdf"}},
    upsert=True,
)

In [None]:
# Optional connectivity check (disabled): uncomment to ping the deployment.
# try:
#     mongo_client.admin.command('ping')
#     print("Pinged your deployment. You successfully connected to MongoDB!")
# except Exception as e:
#     print(e)