In [50]:
import numpy as np
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct, Distance, VectorParams

In [51]:
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

collection

In [5]:
documents = [
    "A function that sorts a list using bubble sort.",
    "A function that computes factorial of a number recursively.",
    "A class that implements a binary search tree.",
    "A function that finds the maximum element in a list.",
    "A function that calculates the sum of two numbers.",
]

generating embidings

In [6]:
vectors = model.encode(documents)

In [52]:
from datasets import load_dataset

In [53]:
corpus = load_dataset("parquet", data_files="data/corpus-00000-of-00001.parquet")["train"]
queries = load_dataset("parquet", data_files="data/queries-00000-of-00001.parquet")["train"]

train_rel = load_dataset("parquet", data_files="data/train-00000-of-00001.parquet")["train"]
test_rel = load_dataset("parquet", data_files="data/test-00000-of-00001.parquet")["train"]
valid_rel = load_dataset("parquet", data_files="data/valid-00000-of-00001.parquet")["train"]


In [9]:
corpus_embeddings = model.encode(corpus["text"], show_progress_bar=True)

Batches:   0%|          | 0/644 [00:00<?, ?it/s]

In [10]:
query_text = queries["text"][0]
query_embedding = model.encode([query_text])

In [54]:
from sklearn.metrics.pairwise import cosine_similarity

In [55]:
def search_code(query):
    #query_emb = model.encode([query])
    similarities = cosine_similarity(query_embedding, corpus_embeddings)[0]
    top_k_idx = np.argsort(similarities)[::-1][:5]
    
    results = []
    for idx in top_k_idx:
        results.append({
            "score": float(similarities[idx]),
            "code": corpus["text"][idx],
            "title": corpus["title"][idx] if "title" in corpus.column_names else None
        })
    return results

In [56]:
for q in [
    "python function to reverse a string",
    "read csv file with pandas",
    "convert list to json string",
]:
    print(f"\n=== Query: {q} ===\n")
    results = search_code(q)
    for r in results:
        print(f"({r['score']:.3f}) {r['code'][:200]}...\n")


=== Query: python function to reverse a string ===

(0.716) def to_bool(value):
    # type: (Any) -> bool
    """
    Convert a value into a bool but handle "truthy" strings eg, yes, true, ok, y
    """
    if isinstance(value, _compat.string_types):
        r...

(0.715) def boolean(flag):
    """
    Convert string in boolean
    """
    s = flag.lower()
    if s in ('1', 'yes', 'true'):
        return True
    elif s in ('0', 'no', 'false'):
        return False
   ...

(0.710) def to_bool(value: Any) -> bool:
    """Convert string or other Python object to boolean.

    **Rationalle**

    Passing flags is one of the most common cases of using environment vars and
    as va...

(0.710) def to_bool(value: Any) -> bool:
    """Convert string or other Python object to boolean.

    **Rationalle**

    Passing flags is one of the most common cases of using environment vars and
    as va...

(0.681) def _if(ctx, logical_test, value_if_true=0, value_if_false=False):
    """
    Returns 

In [57]:
from sklearn.metrics import ndcg_score

In [15]:
corpus_texts = [row["text"] for row in corpus]
query_texts = [row["text"] for row in queries]

corpus_embeddings = model.encode(corpus_texts, convert_to_numpy=True, show_progress_bar=True)
query_embeddings = model.encode(query_texts, convert_to_numpy=True, show_progress_bar=True)


Batches:   0%|          | 0/644 [00:00<?, ?it/s]

Batches:   0%|          | 0/644 [00:00<?, ?it/s]

In [58]:
def evaluate_retrieval_fast(relations,top_k=10):
    corpus_ids = [row["_id"] for row in corpus]
    query_ids = [row["_id"] for row in queries]

    reciprocal_ranks = []
    recall_hits = 0
    ndcg_scores = []

    corpus_norms = np.linalg.norm(corpus_embeddings, axis=1)

    for rel in relations:
        qid = rel["query-id"]
        cid = rel["corpus-id"]

        q_idx = query_ids.index(qid)
        query_emb = query_embeddings[q_idx]
        query_norm = np.linalg.norm(query_emb)

        sims = np.dot(corpus_embeddings, query_emb) / (corpus_norms * query_norm)
        top_indices = np.argsort(sims)[::-1][:top_k]
        top_ids = [corpus_ids[i] for i in top_indices]

        if cid in top_ids:
            rank = top_ids.index(cid) + 1
            reciprocal_ranks.append(1.0 / rank)
            recall_hits += 1
        else:
            reciprocal_ranks.append(0.0)

        relevance = [1 if id == cid else 0 for id in corpus_ids]
        ndcg_scores.append(ndcg_score([relevance], [sims]))

    return {
        "MRR@k": np.mean(reciprocal_ranks),
        "Recall@k": recall_hits / len(relations),
        "nDCG@k": np.mean(ndcg_scores)
    }


In [59]:
sample_rel = train_rel.select(range(100))
metrics = evaluate_retrieval_fast(sample_rel, top_k=10)
print(metrics)

{'MRR@k': 0.2288809523809524, 'Recall@k': 0.57, 'nDCG@k': 0.38898190136831773}


In [60]:
from sentence_transformers import losses, InputExample
from torch.utils.data import DataLoader

In [61]:
query_map = {row["_id"]: row["text"] for row in queries}
corpus_map = {row["_id"]: row["text"] for row in corpus}

train_samples = []
for rel in train_rel:
    qid, cid, score = rel["query-id"], rel["corpus-id"], rel["score"]

    if qid in query_map and cid in corpus_map:
        query_text = query_map[qid]
        code_text = corpus_map[cid]
        train_samples.append(InputExample(texts=[query_text, code_text], label=float(score)))

In [62]:
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=16)

In [63]:
train_loss = losses.CosineSimilarityLoss(model)

In [64]:
import accelerate
print("Accelerate version:", accelerate.__version__)

Accelerate version: 1.11.0


In [65]:
from accelerate import Accelerator
accelerator = Accelerator()
model._target_device = accelerator.device

In [2]:
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=1,
    warmup_steps=100
)

NameError: name 'model' is not defined

In [None]:
model.save("fine_tuned_cosqa_model")

In [None]:
fine_tuned_model = SentenceTransformer("fine_tuned_cosqa_model")

metrics = evaluate_retrieval_fast(
    fine_tuned_model,
    queries,
    corpus,
    sample_rel,
    query_embeddings,
    corpus_embeddings,
    top_k=10
)

print(metrics)
