In [1]:
from sentence_transformers import SentenceTransformer

model_name = 'multi-qa-distilbert-cos-v1'
embedding_model = SentenceTransformer(model_name)

user_question = "I just discovered the course. Can I still join it?"
v = embedding_model.encode(user_question)
print(v[0])  # Print the first value of the resulting vector


0.078222655


In [2]:
v[0]


0.078222655

In [3]:
import requests
import numpy as np

base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/documents-with-ids.json'
docs_url = f'{base_url}/{relative_url}?raw=1'
docs_response = requests.get(docs_url)
documents = docs_response.json()

# Filter documents for "machine-learning-zoomcamp"
ml_documents = [doc for doc in documents if doc['course'] == 'machine-learning-zoomcamp']

# Create embeddings
embeddings = []
for doc in ml_documents:
    qa_text = f"{doc['question']} {doc['text']}"
    embedding = embedding_model.encode(qa_text)
    embeddings.append(embedding)

X = np.array(embeddings)
print(X.shape)


(375, 768)


In [4]:
scores = X.dot(v)
print(scores.max())

0.6506573


In [5]:
class VectorSearchEngine():
    def __init__(self, documents, embeddings):
        self.documents = documents
        self.embeddings = embeddings

    def search(self, v_query, num_results=10):
        scores = self.embeddings.dot(v_query)
        idx = np.argsort(-scores)[:num_results]
        return [self.documents[i] for i in idx]

search_engine = VectorSearchEngine(documents=ml_documents, embeddings=X)
search_results = search_engine.search(v, num_results=5)

import pandas as pd

relative_url = '03-vector-search/eval/ground-truth-data.csv'
ground_truth_url = f'{base_url}/{relative_url}?raw=1'
df_ground_truth = pd.read_csv(ground_truth_url)
df_ground_truth = df_ground_truth[df_ground_truth.course == 'machine-learning-zoomcamp']
ground_truth = df_ground_truth.to_dict(orient='records')

def hit_rate(search_engine, ground_truth, num_results=5):
    hits = 0
    for record in ground_truth:
        query = record['question']
        v_query = embedding_model.encode(query)
        results = search_engine.search(v_query, num_results=num_results)
        ids = [res['id'] for res in results]
        if record['document'] in ids:
            hits += 1
    return hits / len(ground_truth)

hitrate = hit_rate(search_engine, ground_truth, num_results=5)
print(hitrate)


0.9398907103825137


In [6]:
pip install tqdm


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [7]:
from elasticsearch import Elasticsearch

es_client = Elasticsearch('http://localhost:9200')

index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"},
            "id": {"type": "keyword"},
            "question_vector": {
                "type": "dense_vector",
                "dims": 384,
                "index": True,
                "similarity": "cosine"
            }
        }
    }
}

index_name = "course-questions"

es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

for doc in ml_documents:
    qa_text = f"{doc['question']} {doc['text']}"
    doc['question_vector'] = embedding_model.encode(qa_text)
    es_client.index(index=index_name, document=doc)

query_vector = embedding_model.encode(user_question)

search_query = {
    "size": 1,
    "query": {
        "script_score": {
            "query": {
                "match_all": {}
            },
            "script": {
                "source": "cosineSimilarity(params.query_vector, 'question_vector') + 1.0",
                "params": {
                    "query_vector": query_vector
                }
            }
        }
    }
}

response = es_client.search(index=index_name, body=search_query)
top_hit_id = response['hits']['hits'][0]['_source']['id']
print(top_hit_id)


BadRequestError: BadRequestError(400, 'mapper_parsing_exception', 'failed to parse')

In [9]:
from elasticsearch import Elasticsearch
import json

es_client = Elasticsearch('http://localhost:9200')

index_name = "documents"
dimension = 384  # Ensure this matches the embedding dimension

# Define the index mapping
mapping = {
    "mappings": {
        "properties": {
            "id": {"type": "keyword"},
            "question": {"type": "text"},
            "text": {"type": "text"},
            "question_vector": {
                "type": "dense_vector",
                "dims": dimension
            }
        }
    }
}

# Create the index
es_client.indices.create(index=index_name, body=mapping)


ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'documents'})

In [10]:
from sentence_transformers import SentenceTransformer
import requests

# Initialize the model
model_name = "multi-qa-distilbert-cos-v1"
embedding_model = SentenceTransformer(model_name)

# Load documents
base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/documents-with-ids.json'
docs_url = f'{base_url}/{relative_url}?raw=1'
docs_response = requests.get(docs_url)
documents = docs_response.json()

# Ensure only a subset of the documents
documents = [doc for doc in documents if doc['course'] == 'machine-learning-zoomcamp']

# Debugging: Print a document to check its structure
print("Sample document before indexing:", json.dumps(documents[0], indent=2))

# Index documents
for doc in documents:
    qa_text = f"{doc['question']} {doc['text']}"
    doc['question_vector'] = embedding_model.encode(qa_text).tolist()
    
    # Debugging: Print the document to check its structure before indexing
    print("Document to index:", json.dumps(doc, indent=2))
    
    es_client.index(index=index_name, document=doc)

# Create the query vector
user_question = "I just discovered the course. Can I still join it?"
query_vector = embedding_model.encode(user_question)

# Perform the search query
search_query = {
    "size": 1,
    "query": {
        "script_score": {
            "query": {"match_all": {}},
            "script": {
                "source": "cosineSimilarity(params.query_vector, 'question_vector') + 1.0",
                "params": {"query_vector": query_vector}
            }
        }
    }
}

# Execute the search
response = es_client.search(index=index_name, body=search_query)
print("Search response:", response)


Sample document before indexing: {
  "text": "Machine Learning Zoomcamp FAQ\nThe purpose of this document is to capture frequently asked technical questions.\nWe did this for our data engineering course and it worked quite well. Check this document for inspiration on how to structure your questions and answers:\nData Engineering Zoomcamp FAQ\nIn the course GitHub repository there\u2019s a link. Here it is: https://airtable.com/shryxwLd0COOEaqXo\nwork",
  "section": "General course-related questions",
  "question": "How do I sign up?",
  "course": "machine-learning-zoomcamp",
  "id": "0227b872"
}
Document to index: {
  "text": "Machine Learning Zoomcamp FAQ\nThe purpose of this document is to capture frequently asked technical questions.\nWe did this for our data engineering course and it worked quite well. Check this document for inspiration on how to structure your questions and answers:\nData Engineering Zoomcamp FAQ\nIn the course GitHub repository there\u2019s a link. Here it is: h

BadRequestError: BadRequestError(400, 'mapper_parsing_exception', 'failed to parse')

In [11]:
response = es_client.search(index=index_name, body=search_query)
top_hit_id = response['hits']['hits'][0]['_source']['id']
print(top_hit_id)

NameError: name 'search_query' is not defined

In [13]:
from elasticsearch import Elasticsearch

# Initialize Elasticsearch client
es_client = Elasticsearch('http://localhost:9200')

index_name = "documents"
dimension = 384  # Ensure this matches the embedding dimension

# Define the index mapping
mapping = {
    "mappings": {
        "properties": {
            "id": {"type": "keyword"},
            "question": {"type": "text"},
            "text": {"type": "text"},
            "question_vector": {
                "type": "dense_vector",
                "dims": dimension
            }
        }
    }
}

# Create the index
es_client.indices.create(index=index_name, body=mapping)


BadRequestError: BadRequestError(400, 'resource_already_exists_exception', 'index [documents/CRxzX9rcSvaIPnBnCsNYMw] already exists')

In [14]:
# Create the query vector
user_question = "I just discovered the course. Can I still join it?"
query_vector = embedding_model.encode(user_question).tolist()

# Perform the search query
search_query = {
    "size": 1,
    "query": {
        "script_score": {
            "query": {"match_all": {}},
            "script": {
                "source": "cosineSimilarity(params.query_vector, 'question_vector') + 1.0",
                "params": {"query_vector": query_vector}
            }
        }
    }
}

# Execute the search
response = es_client.search(index=index_name, body=search_query)

# Print the ID of the document with the highest score
highest_score_doc_id = response['hits']['hits'][0]['_id']
print("The ID of the document with the highest score is:", highest_score_doc_id)


IndexError: list index out of range

In [15]:
response

ObjectApiResponse({'took': 125, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 0, 'relation': 'eq'}, 'max_score': None, 'hits': []}})

In [16]:
response['hits']

{'total': {'value': 0, 'relation': 'eq'}, 'max_score': None, 'hits': []}

In [17]:
def es_hit_rate(ground_truth, num_results=5):
    hits = 0
    for record in ground_truth:
        query = record['question']
        query_vector = embedding_model.encode(query)

        search_query = {
            "size": num_results,
            "query": {
                "script_score": {
                    "query": {
                        "match_all": {}
                    },
                    "script": {
                        "source": "cosineSimilarity(params.query_vector, 'question_vector') + 1.0",
                        "params": {
                            "query_vector": query_vector
                        }
                    }
                }
            }
        }

        response = es_client.search(index=index_name, body=search_query)
        ids = [hit['_source']['id'] for hit in response['hits']['hits']]
        if record['document'] in ids:
            hits += 1
    return hits / len(ground_truth)

es_hitrate = es_hit_rate(ground_truth, num_results=5)
print(es_hitrate)


0.0


In [19]:
from elasticsearch import Elasticsearch

# Initialize Elasticsearch client
es_client = Elasticsearch("http://localhost:9200")  # Adjust the URL if needed

index_name = "documents"
dimension = 768  # Ensure this matches the embedding dimension

# Define the index mapping
mapping = {
    "mappings": {
        "properties": {
            "id": {"type": "keyword"},
            "question": {"type": "text"},
            "text": {"type": "text"},
            "question_vector": {
                "type": "dense_vector",
                "dims": dimension
            }
        }
    }
}

# Create the index
es_client.indices.create(index=index_name, body=mapping)


BadRequestError: BadRequestError(400, 'resource_already_exists_exception', 'index [documents/CRxzX9rcSvaIPnBnCsNYMw] already exists')

In [20]:
from sentence_transformers import SentenceTransformer
import requests

# Initialize the model
model_name = "multi-qa-distilbert-cos-v1"
embedding_model = SentenceTransformer(model_name)

# Load documents
base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/documents-with-ids.json'
docs_url = f'{base_url}/{relative_url}?raw=1'
docs_response = requests.get(docs_url)
documents = docs_response.json()

# Ensure only a subset of the documents
documents = [doc for doc in documents if doc['course'] == 'machine-learning-zoomcamp']

# Index documents
for doc in documents:
    qa_text = f"{doc['question']} {doc['text']}"
    doc['question_vector'] = embedding_model.encode(qa_text).tolist()
    es_client.index(index=index_name, document=doc)


BadRequestError: BadRequestError(400, 'mapper_parsing_exception', 'failed to parse')

In [None]:
def es_hit_rate(ground_truth, num_results=5):
    hits = 0
    for record in ground_truth:
        query = record['question']
        query_vector = embedding_model.encode(query)

        search_query = {
            "size": num_results,
            "query": {
                "script_score": {
                    "query": {
                        "match_all": {}
                    },
                    "script": {
                        "source": "cosineSimilarity(params.query_vector, 'question_vector') + 1.0",
                        "params": {
                            "query_vector": query_vector
                        }
                    }
                }
            }
        }

        response = es_client.search(index=index_name, body=search_query)
        ids = [hit['_source']['id'] for hit in response['hits']['hits']]
        if record['document'] in ids:
            hits += 1
    return hits / len(ground_truth)

es_hitrate = es_hit_rate(ground_truth, num_results=5)
print(es_hitrate)
