In [1]:
from sentence_transformers import SentenceTransformer
import numpy as np
import pandas as pd
from datasets import load_dataset
import os

In [4]:
# Read the ARCD sample from its JSONL file and take the "train" split.
dataset = load_dataset("json", data_files="../data/arcd_sample50.jsonl")["train"]

# Arabic sentence-embedding model published under the
# Omartificial-Intelligence-Space collection on Hugging Face.
model_name = "Omartificial-Intelligence-Space/Arabic-Triplet-Matryoshka-V2"
model = SentenceTransformer(model_name)

# Embed the context passage of every row; the result is a NumPy array
# with one row per dataset entry.
texts = dataset["context"]
embeddings = model.encode(
    texts,
    show_progress_bar=True,
    convert_to_numpy=True,
)

# Write the vectors to disk (creating the data directory if it is missing).
os.makedirs("../data", exist_ok=True)
np.save("../data/context_embeddings.npy", embeddings)

# Build the companion metadata table, in the same row order as the embeddings.
# Only the first answer text of each entry is kept.
answer_texts = [entry["text"][0] for entry in dataset["answers"]]
metadata_columns = {
    "id": dataset["id"],
    "context": dataset["context"],
    "question": dataset["question"],
    "answer_text": answer_texts,
}
metadata = pd.DataFrame(metadata_columns)
metadata.to_csv("../data/context_metadata.csv", index=False)

# Report what was written and the shape of the embedding matrix.
print("Saved embeddings to ../data/context_embeddings.npy ✔️")
print("Saved metadata to ../data/context_metadata.csv ✔️")
print("Embedding shape:", embeddings.shape)


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

<IPython.core.display.Javascript object>

Saved embeddings to ../data/context_embeddings.npy ✔️
Saved metadata to ../data/context_metadata.csv ✔️
Embedding shape: (50, 768)


In [None]:
import numpy as np
import pandas as pd
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, PointStruct, VectorParams

In [None]:
# Load the vectors and the metadata table written by the embedding step.
embeddings = np.load("../data/context_embeddings.npy")
metadata = pd.read_csv("../data/context_metadata.csv")

# Row i of the metadata must describe row i of the embeddings. Fail early
# instead of uploading vectors paired with the wrong (or missing) payload.
if len(embeddings) != len(metadata):
    raise ValueError(
        f"Row count mismatch: {len(embeddings)} embeddings vs "
        f"{len(metadata)} metadata rows"
    )

client = QdrantClient(host="localhost", port=6333)

# Start from an empty collection for the ARCD embeddings. This is the explicit
# form of the deprecated `recreate_collection`: drop the collection if it is
# already there, then create it with the embedding dimensionality and cosine
# distance.
if client.collection_exists("arcd_collection"):
    client.delete_collection("arcd_collection")
client.create_collection(
    collection_name="arcd_collection",
    vectors_config=VectorParams(size=embeddings.shape[1], distance=Distance.COSINE),
)

# Build one point per row. Rows and vectors are paired positionally, so the
# result does not depend on the DataFrame's index labels.
points = [
    PointStruct(
        id=int(row.id),  # the id column must hold integer-convertible values
        vector=vector.tolist(),
        payload={
            "context": row.context,
            "question": row.question,
            "answer_text": row.answer_text,
        },
    )
    for row, vector in zip(metadata.itertuples(index=False), embeddings)
]

# Upload the points; upsert overwrites any point that shares an id.
client.upsert(
    collection_name="arcd_collection",
    points=points,
)

print("Embeddings uploaded to Qdrant ✔️")

In [None]:
def retrieve_context(query, top_k=5):
    """Return the stored contexts most similar to ``query``.

    The query is embedded with the notebook-level ``model`` and matched
    against the ``arcd_collection`` collection through the notebook-level
    Qdrant ``client``.

    Args:
        query: Question or search text to look up.
        top_k: Maximum number of contexts to return.

    Returns:
        A list of context strings in the order Qdrant returned them. The list
        is empty when ``query`` is empty/blank or ``top_k`` is not positive.
    """
    # Nothing meaningful to search for: skip the model call and the request.
    if not query or not query.strip() or top_k <= 0:
        return []

    # Embed the query with the same model used for the stored contexts.
    query_embedding = model.encode([query], convert_to_numpy=True)[0]

    # Nearest-neighbour lookup in Qdrant; the vector is sent as a plain list.
    response = client.query_points(
        collection_name="arcd_collection",
        query=query_embedding.tolist(),
        limit=top_k,
    )

    # NOTE(review): if several stored rows share the same context text, the
    # result may contain duplicates — confirm whether de-duplication is wanted.
    return [hit.payload["context"] for hit in response.points]

# Smoke test: send one Arabic question through retrieve_context and print the
# contexts it returns. The query asks "What is the definition of artificial
# intelligence?".
query = "ما هو تعريف الذكاء الاصطناعي؟"
retrieved = retrieve_context(query)
print("Retrieved contexts:", retrieved)