In [1]:
# ! pip install psycopg2-binary pgvector
# ! pip install pymongo
# ! python -m pip install "pymongo[srv]"

In [2]:
from langchain.embeddings import OpenAIEmbeddings
from langchain.schema import Document
from langchain.vectorstores import MongoDBAtlasVectorSearch
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi
from dotenv import load_dotenv
import os

load_dotenv()

True

In [3]:
data_source_path = "data/data_english_edited.txt"
with open(data_source_path, "r") as f:
    data = f.read()
splitted_text = data.split("$$")
print("Total section in data: ", len(splitted_text))

Total section in data:  24


In [4]:
document_list = []
tag_variable = None
for indx, chunk in enumerate(splitted_text):
    if indx % 2 == 0:
        tag_variable = chunk
        print("Current Tag data: ", tag_variable, end=" ")
    else:
        tmp_doc = Document(
            page_content=chunk,
            metadata={"source": data_source_path, "tag": tag_variable}
        )
        print(len(tmp_doc.page_content))
        document_list.append(tmp_doc)

print("Total document in data: ", len(document_list))

Current Tag data:  Introduction 670
Current Tag data:  Education 708
Current Tag data:  Projects 909
Current Tag data:  Internships 1045
Current Tag data:  Hard skills 1123
Current Tag data:  Soft skills 584
Current Tag data:  Awards and Recognitions 196
Current Tag data:  Languages 76
Current Tag data:  Certifications 251
Current Tag data:  Volunteering 164
Current Tag data:  Contacts 143
Current Tag data:  Activities and Hobbies 78
Total document in data:  12


In [5]:
embeddings = OpenAIEmbeddings()

In [6]:
uri = os.getenv("MONGO_URI")

# Create a new client and connect to the server
client = MongoClient(uri, server_api=ServerApi('1'))

# Send a ping to confirm a successful connection
try:
    client.admin.command('ping')
    print("Pinged your deployment. You successfully connected to MongoDB!")
except Exception as e:
    print(e)

Pinged your deployment. You successfully connected to MongoDB!


In [7]:
DB_NAME = "jobFinder"
COLLECTION_NAME = "cv_mehdi"
ATLAS_VECTOR_SEARCH_INDEX_NAME = "vector_search_index"

MONGODB_COLLECTION = client[DB_NAME][COLLECTION_NAME]
type(MONGODB_COLLECTION)

pymongo.collection.Collection

In [8]:
# insert the documents in MongoDB Atlas with their embedding
vector_search = MongoDBAtlasVectorSearch.from_documents(
    documents=document_list,
    embedding=OpenAIEmbeddings(disallowed_special=()),
    collection=MONGODB_COLLECTION,
    index_name=ATLAS_VECTOR_SEARCH_INDEX_NAME,
)

In [9]:
query = "Comment je peux contacter Mehdi ?"

vector_search.similarity_search_with_score(query, k=2)

[(Document(page_content=' - Phone: +33 7 52 07 57 72 - Email: mehdi.hayani_mechkouri@ensam.eu - LinkedIn: https://www.linkedin.com/in/mehdi-hayani-mechkouri-9ba3161bb/\n', metadata={'_id': ObjectId('6583185e743f9eb02c03f4a6'), 'embedding': [-0.03208892861485767, 0.005472988939288696, -0.001626949443086871, -0.03894823128315147, 0.002472043323418214, 0.018777674467943158, -0.026202016142345456, -0.011175935397034901, 0.010078709789344104, -0.004510452602219783, 0.02446747996387888, 0.009264002753384962, 0.017503051463746404, -0.024204670040895163, 0.0022420858049607053, -0.028330766273965018, 0.015242898233280176, -0.011011679893662025, -0.007805415131406095, 0.007995951161416045, -0.0183571804538144, -0.0031635582661864, -0.006997278576352195, -0.01391571510713187, -0.013495221093003114, -0.005443422706537704, 0.02455946297126188, -0.033455532763792234, 0.012903902025918833, -0.04488770760290561, 0.015650252216921044, -0.012956463079192982, -0.006691763787113491, -0.01800238752344768, 

In [10]:
query = "Formations et diplômes"

vector_search.similarity_search_with_score(query, k=2)

[(Document(page_content="\n1. Arts et Métiers – ParisTech: Grande École Program: Digital Engineering / Equivalent engineering degree BAC +5, Paris France campus, started in 2022 - expected completion in 2024\n2. École Nationale Supérieure d'Arts et Métiers, Meknes 2020-2022: Industrial Engineering cycle, Artificial Intelligence and Data Science option. Equivalent engineering degree BAC +5, Meknes Morocco campus. Started in 2020 - expected completion in 2024\n3. École Nationale Supérieure d'Arts et Métiers, Meknes 2018-2020: Integrated preparatory class, Equivalent to Bac +2. Meknes Morocco campus. Started in 2018 - finished in 2020\n4. Lycée Jeanne d'Arc Bac Science, with honors in Physics. Casablanca Morocco. Obtained in 2018\n", metadata={'_id': ObjectId('6583185e743f9eb02c03f49d'), 'embedding': [-0.004517510173373284, -0.015547995148006487, -0.007801712334631599, -0.031567142623678775, -0.018194755952167612, 0.02239354707680668, -0.02258755086686642, 0.003403722037364273, -0.0191647

In [12]:
query = "Internship"

vector_search.similarity_search_with_score(query, k=2)

[(Document(page_content='\n1. Full Stack Developer at GE Grid Solutions - Type: `Internship` between [June - September 2023], Role: [Intern], Title: [Web Developer]. Tasks and Achievements: Digitized the quality control process by creating an ETL system. Automated the data pipeline using RPA. Standardized the database (SQL) and developed a modular user interface. Created a dashboard for Failure Mode and Effects Analysis (DFMEA) using Power BI.\n2. Data Scientist at AI-Inside - Type: `Internship` between [August 2022 - October 2022], Role: [Intern], Title: [Data Scientist]. Tasks and Achievements: Developed and deployed an application using AI and data scraping to predict car sales prices and identify the most advantageous offers (Flask, MongoDB, TensorFlow, Selenium, Pandas).\n3. Industrial Engineer at Groupe Renault - Type: `Internship` between [August 2021 - October 2021], Role: [Intern], Title: [Industrial Engineer]. Tasks and Achievements: Actively collaborated on a production line