In [1]:
import numpy as np
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct, Distance, VectorParams

In [4]:
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

collection

In [5]:
documents = [
    "A function that sorts a list using bubble sort.",
    "A function that computes factorial of a number recursively.",
    "A class that implements a binary search tree.",
    "A function that finds the maximum element in a list.",
    "A function that calculates the sum of two numbers.",
]

generating embidings

In [6]:
vectors = model.encode(documents)

In [7]:
from datasets import load_dataset

In [8]:
corpus = load_dataset("parquet", data_files="data/corpus-00000-of-00001.parquet")["train"]
queries = load_dataset("parquet", data_files="data/queries-00000-of-00001.parquet")["train"]

train_rel = load_dataset("parquet", data_files="data/train-00000-of-00001.parquet")["train"]
test_rel = load_dataset("parquet", data_files="data/test-00000-of-00001.parquet")["train"]
valid_rel = load_dataset("parquet", data_files="data/valid-00000-of-00001.parquet")["train"]

print("Corpus:", corpus)
print("Queries:", queries)
print("Train relations:", train_rel)
print("Test relations:", test_rel)
print("Valid relations:", valid_rel)

Corpus: Dataset({
    features: ['_id', 'partition', 'text', 'title', 'language', 'meta_information'],
    num_rows: 20604
})
Queries: Dataset({
    features: ['_id', 'partition', 'text', 'title', 'language', 'meta_information'],
    num_rows: 20604
})
Train relations: Dataset({
    features: ['query-id', 'corpus-id', 'score'],
    num_rows: 19604
})
Test relations: Dataset({
    features: ['query-id', 'corpus-id', 'score'],
    num_rows: 500
})
Valid relations: Dataset({
    features: ['query-id', 'corpus-id', 'score'],
    num_rows: 500
})


In [9]:
corpus_embeddings = model.encode(corpus["text"], show_progress_bar=True)

Batches:   0%|          | 0/644 [00:00<?, ?it/s]

In [10]:
query_text = queries["text"][0]
query_embedding = model.encode([query_text])

In [11]:
from sklearn.metrics.pairwise import cosine_similarity

In [12]:
def search_code(query, top_k=5):
    query_emb = model.encode([query])
    similarities = cosine_similarity(query_embedding, corpus_embeddings)[0]
    top_k_idx = np.argsort(similarities)[::-1][:5]
    
    results = []
    for idx in top_k_idx:
        results.append({
            "score": float(similarities[idx]),
            "code": corpus["text"][idx],
            "title": corpus["title"][idx] if "title" in corpus.column_names else None
        })
    return results

In [13]:
for q in [
    "python function to reverse a string",
    "read csv file with pandas",
    "convert list to json string",
]:
    print(f"\n=== Query: {q} ===\n")
    results = search_code(q)
    for r in results:
        print(f"({r['score']:.3f}) {r['code'][:200]}...\n")


=== Query: python function to reverse a string ===

(0.716) def to_bool(value):
    # type: (Any) -> bool
    """
    Convert a value into a bool but handle "truthy" strings eg, yes, true, ok, y
    """
    if isinstance(value, _compat.string_types):
        r...

(0.715) def boolean(flag):
    """
    Convert string in boolean
    """
    s = flag.lower()
    if s in ('1', 'yes', 'true'):
        return True
    elif s in ('0', 'no', 'false'):
        return False
   ...

(0.710) def to_bool(value: Any) -> bool:
    """Convert string or other Python object to boolean.

    **Rationalle**

    Passing flags is one of the most common cases of using environment vars and
    as va...

(0.710) def to_bool(value: Any) -> bool:
    """Convert string or other Python object to boolean.

    **Rationalle**

    Passing flags is one of the most common cases of using environment vars and
    as va...

(0.681) def _if(ctx, logical_test, value_if_true=0, value_if_false=False):
    """
    Returns 

In [14]:
from sklearn.metrics import ndcg_score

In [None]:
def evaluate_retrieval(model, queries, corpus, relations, top_k=10):

    corpus_map = {row["_id"]: row["text"] for row in corpus}
    query_map = {row["_id"]: row["text"] for row in queries}

    reciprocal_ranks = []
    recall_hits = 0
    ndcg_scores = []

    for rel in relations:
        qid = rel["query-id"]
        cid = rel["corpus-id"]

        query_emb = model.encode(query_map[qid])
        corpus_embs = model.encode(list(corpus_map.values()))

        sims = np.dot(corpus_embs, query_emb) / (
            np.linalg.norm(corpus_embs, axis=1) * np.linalg.norm(query_emb)
        )

        ranked_ids = np.argsort(sims)[::-1]
        top_ids = [list(corpus_map.keys())[i] for i in ranked_ids[:top_k]]

        if cid in top_ids:
            rank = top_ids.index(cid) + 1
            reciprocal_ranks.append(1.0 / rank)
        else:
            reciprocal_ranks.append(0.0)

        if cid in top_ids:
            recall_hits += 1

        relevance = [1 if i == cid else 0 for i in list(corpus_map.keys())]
        ndcg_scores.append(ndcg_score([relevance], [sims]))

    mrr = np.mean(reciprocal_ranks)
    recall = recall_hits / len(relations)
    ndcg = np.mean(ndcg_scores)

    return {"MRR@k": mrr, "Recall@k": recall, "nDCG@k": ndcg}

metrics = evaluate_retrieval(model, queries, corpus, train_rel, top_k=10)
print(metrics)