In [None]:
import numpy as np
from fastembed import TextEmbedding


embedding_model = TextEmbedding(model_name="jinaai/jina-embeddings-v2-small-en")

query = 'I just discovered the course. Can I join now?'

query_embedding = list(embedding_model.embed([query]))[0]

print(f"Vector's size: {query_embedding.shape}")

min_value = np.min(query_embedding)
print(f"Smallest value in the victor: {min_value}")

  from .autonotebook import tqdm as notebook_tqdm


Vector's size: (512,)
Smallest value in the victor: -0.11726373885183883


In [None]:
doc_text = 'Can I still join the course after the start date?'

# Convert generator to list before indexing
doc_embedding = list(embedding_model.embed([doc_text]))[0]

similarity = np.dot(query_embedding, doc_embedding)

print(f"Cosine Similarity:  {similarity}")

Cosine Similarity:  0.9008528895674548


In [None]:
documents = [
    {'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.", 'section': 'General course-related questions', 'question': 'Course - Can I still join the course after the start date?', 'course': 'data-engineering-zoomcamp'},
    {'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.', 'section': 'General course-related questions', 'question': 'Course - Can I follow the course after it finishes?', 'course': 'data-engineering-zoomcamp'},
    {'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.", 'section': 'General course-related questions', 'question': 'Course - When will the course start?', 'course': 'data-engineering-zoomcamp'},
    {'text': 'You can start by installing and setting up all the dependencies and requirements:\nGoogle cloud account\nGoogle Cloud SDK\nPython 3 (installed with Anaconda)\nTerraform\nGit\nLook over the prerequisites and syllabus to see if you are comfortable with these subjects.', 'section': 'General course-related questions', 'question': 'Course - What can I do before the course starts?', 'course': 'data-engineering-zoomcamp'},
    {'text': 'Star the repo! Share it with friends if you find it useful ❣️\nCreate a PR if you see you can improve the text or the structure of the repository.', 'section': 'General course-related questions', 'question': 'How can we contribute to the course?', 'course': 'data-engineering-zoomcamp'}
]


texts = [doc['text'] for doc in documents]

doc_embeddings = list(embedding_model.embed(texts))

V = np.array(doc_embeddings)

scores = V.dot(query_embedding)

highest_score_index = np.argmax(scores)

print(f"Simillarity degress: {scores}")
print(f"index of highest similarity: {highest_score_index}")

Simillarity degress: [0.76296845 0.81823782 0.80853974 0.71330788 0.73044992]
index of highest similarity: 1


In [None]:

full_texts = [f"{doc['question']} {doc['text']}" for doc in documents]

full_text_embeddings = list(embedding_model.embed(full_texts))

V_full = np.array(full_text_embeddings)

scores_full = V_full.dot(query_embedding)

highest_score_index_full = np.argmax(scores_full)

print(f"Degree of similarity: {scores_full}")
print(f"The index of highest similarity: {highest_score_index_full}")

درجات التشابه للنص المدمج: [0.85145432 0.8436594  0.84082872 0.77551577 0.80860079]
فهرس المستند الأعلى تشابهًا (نص مدمج) هو: 0


In [None]:
from fastembed import TextEmbedding

embedder = TextEmbedding(model_name="BAAI/bge-small-en")

# There is no callable interface for TextEmbedding to list models.
# If you want to print the model name and dimension, you can access the attributes directly.
print(f"Model: {embedder.model_name}")

# If you want to check the embedding dimension, you can embed a sample and check its shape:
sample_embedding = list(embedder.embed(["sample text"]))[0]
print(f"Dimensions: {sample_embedding.shape[0]}")



Model: BAAI/bge-small-en
Dimensions: 384


In [None]:
from fastembed import TextEmbedding
from qdrant_client import QdrantClient
from qdrant_client.http.models import PointStruct, VectorParams, Distance, CollectionStatus
import requests

# Step 1: Load documents
docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
documents_raw = requests.get(docs_url).json()

documents = []
for course in documents_raw:
    if course['course'] != 'machine-learning-zoomcamp':
        continue
    for doc in course['documents']:
        doc['course'] = course['course']
        documents.append(doc)

# Step 2: Initialize embedding model and Qdrant client
model_name = 'BAAI/bge-small-en'
embedder = TextEmbedding(model_name=model_name)

client = QdrantClient(":memory:")  # in-memory; replace with host/port if persistent

# Step 3: Create Qdrant collection
collection_name = "faq_collection"
client.recreate_collection(
    collection_name=collection_name,
    vectors_config=VectorParams(size=384, distance=Distance.COSINE)
)

# Step 4: Insert documents
texts = [f"{doc['question']} {doc['text']}" for doc in documents]
embeddings = list(embedder.embed(texts))

points = [
    PointStruct(id=i, vector=embeddings[i], payload=documents[i])
    for i in range(len(documents))
]

client.upsert(collection_name=collection_name, points=points)

# Step 5: Query with question
query = "I just discovered the course. Can I join now?"
query_vector = list(embedder.embed([query]))[0]

search_result = client.search(
    collection_name=collection_name,
    query_vector=query_vector,
    limit=1
)

print("Top Score:", search_result[0].score)

  client.recreate_collection(


In [2]:
%pip install fastembed

Collecting fastembed
  Downloading fastembed-0.7.1-py3-none-any.whl.metadata (10 kB)
Collecting loguru<0.8.0,>=0.7.2 (from fastembed)
  Downloading loguru-0.7.3-py3-none-any.whl.metadata (22 kB)
Collecting mmh3<6.0.0,>=4.1.0 (from fastembed)
  Downloading mmh3-5.1.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (16 kB)
Collecting onnxruntime!=1.20.0,>=1.17.0 (from fastembed)
  Downloading onnxruntime-1.22.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.5 kB)
Collecting py-rust-stemmers<0.2.0,>=0.1.0 (from fastembed)
  Downloading py_rust_stemmers-0.1.5-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Collecting coloredlogs (from onnxruntime!=1.20.0,>=1.17.0->fastembed)
  Downloading coloredlogs-15.0.1-py2.py3-none-any.whl.metadata (12 kB)
Collecting humanfriendly>=9.1 (from coloredlogs->onnxruntime!=1.20.0,>=1.17.0->fastembed)
  Downloading humanfriendly-10.0-py2.py3-none-any.whl.metadata (9.2

In [4]:
%pip install qdrant-client

Collecting qdrant-client
  Downloading qdrant_client-1.14.3-py3-none-any.whl.metadata (10 kB)
Collecting portalocker<3.0.0,>=2.7.0 (from qdrant-client)
  Downloading portalocker-2.10.1-py3-none-any.whl.metadata (8.5 kB)
Downloading qdrant_client-1.14.3-py3-none-any.whl (328 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m329.0/329.0 kB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading portalocker-2.10.1-py3-none-any.whl (18 kB)
Installing collected packages: portalocker, qdrant-client
Successfully installed portalocker-2.10.1 qdrant-client-1.14.3


In [5]:
from fastembed import TextEmbedding
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance, PointStruct
import requests

# 1. Download and filter ML Zoomcamp documents
url = "https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json"
docs = [doc for c in requests.get(url).json() if c["course"] == "machine-learning-zoomcamp" for doc in c["documents"]]

# 2. Prepare text: combine question and answer
texts = [doc["question"] + " " + doc["text"] for doc in docs]

# 3. Embed the texts using a small embedding model
embedder = TextEmbedding("BAAI/bge-small-en")
vectors = list(embedder.embed(texts))

# 4. Index vectors in an in-memory Qdrant collection
client = QdrantClient(":memory:")
client.recreate_collection("faq", vectors_config=VectorParams(size=384, distance=Distance.COSINE))
client.upsert("faq", [PointStruct(id=i, vector=vectors[i], payload={"text": texts[i]}) for i in range(len(texts))])

# 5. Embed the query
query = "I just discovered the course. Can I join now?"
query_vector = list(embedder.embed([query]))[0]

# 6. Search and print top result score
results = client.search("faq", query_vector=query_vector, limit=1)
print("✅ Top match score:", round(results[0].score, 2))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

config.json:   0%|          | 0.00/701 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

model_optimized.onnx:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

  client.recreate_collection("faq", vectors_config=VectorParams(size=384, distance=Distance.COSINE))


✅ Top match score: 0.87


  results = client.search("faq", query_vector=query_vector, limit=1)
