In [1]:
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
import os
from dotenv import load_dotenv

load_dotenv()

# OpenAI credential, looked up in the process environment (None when unset)
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
# Name of the chat model used for answer generation
MODEL = "gpt-3.5-turbo"

MongoDB connection

In [2]:
from pymongo.mongo_client import MongoClient
import os

from urllib.parse import quote_plus

# Read the Atlas credentials from environment variables
username = os.getenv('MONGO_CLUSTER_USER')
password = os.getenv('MONGO_CLUSTER_PASS')
if not username or not password:
    # Fail early with a clear message instead of building a "None:None@..." URI
    raise RuntimeError(
        "MONGO_CLUSTER_USER and MONGO_CLUSTER_PASS must be set in the environment (or .env file)"
    )
cluster_url = 'la19.fjkkeei.mongodb.net'

# Percent-escape the credentials: characters such as '@', ':' or '/' inside a
# username or password would otherwise produce an invalid connection string.
uri = (
    f"mongodb+srv://{quote_plus(username)}:{quote_plus(password)}@{cluster_url}"
    "/?retryWrites=true&w=majority&appName=LA19"
)

# Create a new client and connect to the server
mongo_client = MongoClient(uri)

# Send a ping to confirm a successful connection
try:
    mongo_client.admin.command('ping')
    print("Pinged your deployment. You successfully connected to MongoDB!")
except Exception as e:
    # Best-effort connectivity check: report the failure without stopping the notebook
    print(e)

Pinged your deployment. You successfully connected to MongoDB!


## Step 5: Vector Database Setup and Data Ingestion

MongoDB acts as both an operational and a vector database. It offers a database solution that efficiently stores, queries and retrieves vector embeddings—the advantages of this lie in the simplicity of database maintenance, management and cost.

**To create a new MongoDB database, set up a database cluster:**

1. Head over to MongoDB official site and register for a [free MongoDB Atlas account](https://www.mongodb.com/cloud/atlas/register), or for existing users, [sign into MongoDB Atlas](https://account.mongodb.com/account/login?nds=true).

2. Select the 'Database' option on the left-hand pane, which will navigate to the Database Deployment page, where there is a deployment specification of any existing cluster. Create a new database cluster by clicking on the "+Create" button.

3.   Select all the applicable configurations for the database cluster. Once all the configuration options are selected, click the “Create Cluster” button to deploy the newly created cluster. MongoDB also enables the creation of free clusters on the “Shared Tab”.

 *Note: Don’t forget to whitelist the IP for the Python host or 0.0.0.0/0 for any IP when creating proof of concepts.*

4. After successfully creating and deploying the cluster, the cluster becomes accessible on the ‘Database Deployment’ page.

5. Click on the “Connect” button of the cluster to view the option to set up a connection to the cluster via various language drivers.

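6. This tutorial only requires the cluster's URI (uniform resource identifier). Grab the URI and copy it into the Google Colab Secrets environment in a variable named `MONGO_URI`, or place it in a `.env` file or equivalent. (This notebook instead assembles the URI from the `MONGO_CLUSTER_USER` and `MONGO_CLUSTER_PASS` environment variables.)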




In [3]:
# Get handles to the 'cv-bot' database and its 'my_documents' collection.
# Nothing is written here; the collection handle is what the later ingestion
# and search cells operate on.
db = mongo_client['cv-bot']
collection = db['my_documents']

In [5]:
# Delete every existing document in the collection (the empty filter matches all).
# NOTE(review): destructive — presumably here so that re-running the ingestion
# does not leave duplicate chunks; confirm before running against a collection
# that holds data you want to keep.
collection.delete_many({})

DeleteResult({'n': 1, 'electionId': ObjectId('7fffffff0000000000000267'), 'opTime': {'ts': Timestamp(1717335206, 59), 't': 615}, 'ok': 1.0, '$clusterTime': {'clusterTime': Timestamp(1717335206, 59), 'signature': {'hash': b'\xe1\n\xc8\x18\xafD!\xd6\xf9\xfcW\x9b\xa4"f\x00!?\xfd\x82', 'keyId': 7320377173091549194}}, 'operationTime': Timestamp(1717335206, 59)}, acknowledged=True)

### Embeddings

In [11]:
from langchain_openai import OpenAIEmbeddings
# Embedding client shared by the ingestion loop (chunk vectors) and by
# vector_search (query vectors), so both sides are embedded with the same model.
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

In [7]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import TextLoader
import os

# Folder containing the plain-text source documents. The CV_BOT_DOCS_DIR
# environment variable overrides the original local path, so the notebook can
# run on another machine without editing this cell.
doc_folder = os.getenv('CV_BOT_DOCS_DIR', r'C:\Users\leoac\OneDrive\Work\CV\cv-bot\docs')

# The splitter configuration is the same for every file, so build it once
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0, add_start_index=True)

# Initialize the list to hold the documents
documents_list = []

# sorted() makes the ingestion order reproducible (os.listdir order is arbitrary)
for filename in sorted(os.listdir(doc_folder)):
    file_path = os.path.join(doc_folder, filename)

    # Skip sub-directories: only regular files can be loaded as text
    if not os.path.isfile(file_path):
        continue

    loader = TextLoader(file_path, encoding='utf-8')
    data = loader.load()

    # Title is the file name without its final extension; splitext keeps dotted
    # names intact ('cv.v2.txt' -> 'cv.v2' instead of being cut down to 'cv')
    document_title = os.path.splitext(filename)[0]

    # Split the document into chunks
    all_splits = text_splitter.split_documents(data)

    # One record per chunk: source title, raw text and the chunk's embedding vector
    for chunk in all_splits:
        chunk_text = chunk.page_content
        documents_list.append({
            'doc_title': document_title,
            'text': chunk_text,
            'embeddings': embeddings.embed_query(chunk_text),
        })

[{'doc_title': 'cv', 'text': "Leonardo Acquaroli is a brilliant Data Scientist with a solid academic background and relevant experience in analytics and entrepreneurship. He is currently studying Data Science for Economics at the University of Milan, where he has achieved a GPA of 29.8 and founded the student group UnimAI. Previously, he completed his degree in Economics and Commerce at the Marche Polytechnic University with top honors (110L), also serving as the vice president of the Starting Finance Club Polimarche. His experimental thesis explored the impact of financial education through his startup B come Bill.\n\nLeonardo works as an intern at Soccerment, one of the most advanced football analytics companies in Europe, where he has contributed to the creation of new features for AIDA, a chatbot powered by LLM and RAG. His master's thesis focuses on automating football data collection using Computer Vision techniques.", 'embeddings': [-0.0003818571228504785, 0.03676246805479104, -

In [16]:
# Bulk-insert every chunk record built by the ingestion loop into the collection
collection.insert_many(documents_list)

print("Data ingestion into MongoDB completed")

Data ingestion into MongoDB completed


<h2> Test similarity search</h2>

In [45]:
# Sample query used to try out the embedding step in the next cell
query = "Leonardo's skills"

In [46]:
# Test similarity search starting from a vector
# NOTE(review): vector_search below computes its own query embedding, so this
# module-level value does not appear to be used by the later visible cells.
query_embeddings = embeddings.embed_query(query) # the query text is turned into an embedding vector here

## Step 6: Create a Vector Search Index

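At this point, make sure that a vector search index has been created for the collection in MongoDB Atlas.
The search code in this notebook expects the index to be named `textual_docs_vector_index` and to cover the `embeddings` field; follow the documentation linked below to create it.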

This next step is mandatory for conducting efficient and accurate vector-based searches based on the vector embeddings stored within the documents in the `my_documents` collection. Creating a Vector Search Index makes it possible to traverse the documents efficiently and retrieve those whose embeddings match the query embedding based on vector similarity. Go here to read more about [MongoDB Vector Search Index](https://www.mongodb.com/docs/atlas/atlas-search/field-types/knn-vector/).


## Step 7: Perform Vector Search on User Queries

This step combines all the activities in the previous steps to provide the functionality of conducting vector search on stored records based on embedded user queries.

This step implements a function that returns a vector search result by generating a query embedding and defining a MongoDB aggregation pipeline. The pipeline, consisting of the `$vectorSearch` and `$project` stages, queries using the generated vector and formats the results to include only the required information, namely the document title and text (plus the stored embedding), while incorporating a search score for each result.

This selective projection enhances query performance by reducing data transfer and optimizes the use of network and memory resources, which is especially critical when handling large datasets. For AI engineers and developers who consider data security at an early stage, the chance of sensitive data leaking to the client side can be minimized by carefully excluding fields irrelevant to the user's query.


In [12]:
def vector_search(user_query, collection, limit=2, num_candidates=100):
    """
    Perform a vector search in the MongoDB collection based on the user query.

    Args:
    user_query (str): The user's query string.
    collection (MongoCollection): The MongoDB collection to search.
    limit (int): Maximum number of matches to return. Defaults to 2.
    num_candidates (int): Number of candidate matches considered before the
        top `limit` are kept. Defaults to 100; raised to `limit` when smaller.

    Returns:
    list: A list of matching documents, each a dict with the keys `doc_title`,
        `text`, `embeddings` and `score`. Empty when the query is blank or no
        embedding could be generated.

    Raises:
    ValueError: If `limit` is smaller than 1.
    """
    if limit < 1:
        raise ValueError("limit must be a positive integer")

    # A blank query carries nothing to search for; skip the embedding call
    if not user_query or not user_query.strip():
        return []

    # Generate embedding for the user query
    query_embeddings = embeddings.embed_query(user_query)

    # Keep the documented list return type so callers can always iterate the result
    if query_embeddings is None:
        return []

    # Define the vector search pipeline
    pipeline = [
        {
            "$vectorSearch": {
                "index": "textual_docs_vector_index",
                "queryVector": query_embeddings,
                "path": "embeddings",
                # Candidate pool must be at least as large as the result size
                "numCandidates": max(num_candidates, limit),
                "limit": limit  # Return at most `limit` matches
            }
        },
        {
            "$project": {
                "_id": 0,  # Exclude the _id field
                "doc_title": 1,  # Include the doc_title field
                "text": 1,  # Include the text field
                "embeddings": 1,  # Include the stored embedding vector
                "score": {
                    "$meta": "vectorSearchScore"  # Include the search score
                }
            }
        }
    ]

    # Execute the search and materialize the cursor
    return list(collection.aggregate(pipeline))

## Step 8: Handling User Query and Result

The final step in the implementation phase focuses on the practical application of our vector search functionality and AI integration to handle user queries effectively.

The handle_user_query function performs a vector search on the MongoDB collection based on the user's query and utilizes OpenAI's GPT-3.5 model to generate context-aware responses.


In [13]:
from langchain.prompts import HumanMessagePromptTemplate, ChatPromptTemplate
from langchain_core.messages import SystemMessage

def handle_user_query(query, collection):
    """
    Answer a user query with the chat model, using retrieved documents as context.

    Runs `vector_search` for the query, concatenates the title and text of each
    match into a context string, and asks the chat model (`MODEL`) to answer
    the query from that context.

    Args:
    query (str): The user's question.
    collection (MongoCollection): The MongoDB collection to search.

    Returns:
    tuple: `(answer, search_result)`, where `answer` is the content of the
        model's response and `search_result` is the context string that was
        passed to the model.
    """
    get_knowledge = vector_search(query, collection)

    # vector_search may hand back an error string instead of a list; iterating
    # a string would fail on `.get`, so treat that case as "no context found".
    if isinstance(get_knowledge, str):
        get_knowledge = []

    search_result = ''
    for result in get_knowledge:
        search_result += f'''Document title: {result.get('doc_title', 'N/A')},
                             Document text: {result.get('text', 'N/A')}\n'''

    # Initialize the ChatOpenAI instance with your model and API key
    llm = ChatOpenAI(model=MODEL, temperature=0.0001, api_key=OPENAI_API_KEY)

    # Prepare the system prompt
    system_prompt = '''You are a hiring assistant built by Leonardo Acquaroli and you have to give recruiters information about Leonardo that can convince them to hire him without showing too much that you are biased towards him.
        Provide detailed answers mostly based on the passed context.'''

    # Template with placeholders rather than an f-string: the query and the
    # retrieved text are supplied as variables, so any '{' or '}' they contain
    # is not mistaken for a template field.
    user_prompt = 'Answer this user query: "{query}", with the following context: "{context}"'

    # Create the ChatPromptTemplate
    prompt_template = ChatPromptTemplate.from_messages(
        [
            SystemMessage(content=system_prompt),
            HumanMessagePromptTemplate.from_template(user_prompt)
        ]
    )

    # Build a message list so the system prompt is sent with the system role
    # instead of being flattened into a single string
    messages = prompt_template.format_messages(query=query, context=search_result)

    # Invoke the LLM with the formatted messages
    response = llm.invoke(messages)

    return response.content, search_result

In [14]:
# 6. Run a query and also retrieve the source text that was passed as context
query = "Leonardo's three main skills"
response, source_information = handle_user_query(query, collection)

print(f"Response: {response}")


Response: Based on the information provided in the document titled "cv," Leonardo's three main skills can be identified as:

1. Strong passion for football data analysis: Leonardo has a deep interest in analyzing football data, which showcases his dedication and expertise in this specific field.

2. Advanced skills in Statistics & Machine Learning, Computer Vision, and Reinforcement Learning: Leonardo possesses advanced technical skills in key areas such as Statistics & Machine Learning, Computer Vision, and Reinforcement Learning, demonstrating his proficiency in these critical areas of data science.

3. Proficiency in Python, R, SQL, and Excel: Leonardo is proficient in a variety of programming languages and tools commonly used in data analysis, including Python, R, SQL, and Excel. This broad skill set allows him to effectively work with data and derive valuable insights.

Overall, Leonardo's combination of technical expertise, passion for football data analysis, and proficiency in k