# Setup

In [1]:
# Install Python dependencies (datasets is pinned to 2.16.1)
%pip install  omegaconf scikit-learn datasets==2.16.1 tqdm numpy
# Optional Python packages for a better user experience
%pip install ipywidgets nbconvert 

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
# Import necessary libraries
from datasets import load_dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import tqdm
import numpy as np

In [None]:
# 2. Load dataset
# The `retrieval_nl_code` and `retrieval_corpus` configurations are read from the
# same repository, split and pinned revision, so those arguments are shared.
xcodeeval_repo = "NTU-NLP-sg/xCodeEval"
shared_load_args = {
    "split": "test",
    "revision": "467d25a839086383794b58055981221b82c0d107",
    "trust_remote_code": True,
}
nl_code_test = load_dataset(xcodeeval_repo, "retrieval_nl_code", **shared_load_args)
corpus = load_dataset(xcodeeval_repo, "retrieval_corpus", **shared_load_args)

# 3. Fit TF-IDF on the corpus
tfidf_options = {
    # Input is a sequence of strings, tokenized at word level.
    "input": "content",
    "analyzer": "word",
    # A token is either a run of word characters (identifiers, keywords, numbers)
    # or a single non-space, non-word character (operators, brackets, ...).
    "token_pattern": r'\w+|[^\s\w]',
    # Code is case-sensitive, so the text is not lowercased.
    "lowercase": False,
    # Vocabulary size cap. Original author's note: 10000 ~ 32 GB RAM.
    "max_features": 10000,
}
vectorizer = TfidfVectorizer(**tfidf_options)
corpus_source_texts = corpus["source_code"]
corpus_tfidf = vectorizer.fit_transform(corpus_source_texts)



Downloading builder script: 0.00B [00:00, ?B/s]

Downloading readme: 0.00B [00:00, ?B/s]

In [None]:
# 5. Retrieval and evaluation
def evaluate_tfidf_topk(k_vals=(100,)):
    """Compute top-k retrieval accuracy of the TF-IDF baseline.

    For every example in ``nl_code_test`` the ``nl`` text is vectorized with the
    fitted ``vectorizer`` and scored against every row of ``corpus_tfidf`` by
    cosine similarity. The example counts as a hit for ``k`` when its
    ``src_uid`` appears among the ``src_uid`` values of the ``k`` highest-scoring
    corpus rows.

    Relies on the notebook globals ``nl_code_test``, ``corpus``, ``vectorizer``
    and ``corpus_tfidf`` being defined before the call.

    Args:
        k_vals: Iterable of cut-offs ``k`` to evaluate. Defaults to ``(100,)``.

    Returns:
        dict mapping each ``k`` to (number of hits) / (number of queries).
        Every value is ``0.0`` when ``nl_code_test`` is empty.
    """
    # Imported locally under an alias: the notebook's `import tqdm` binds the
    # *module*, and calling a module raises "TypeError: 'module' object is not
    # callable". The alias avoids shadowing that module-level name.
    from tqdm.auto import tqdm as progress_bar

    # Materialize once so generators and other one-shot iterables work too.
    k_vals = list(k_vals)
    results = {k: 0 for k in k_vals}
    total = len(nl_code_test)
    if not k_vals or total == 0:
        # Nothing to score; also avoids dividing by zero below.
        return {k: 0.0 for k in k_vals}

    # Only the first max(k) ranked rows are ever inspected, so the per-query uid
    # list is truncated to that length. A negative k slices from the end, which
    # needs the full ranking, so truncation is skipped in that case.
    limit = max(k_vals) if min(k_vals) >= 0 else None

    src_uids = corpus["src_uid"]
    for example in progress_bar(nl_code_test):
        # Transform the NL query to a TF-IDF vector
        query_vec = vectorizer.transform([example["nl"]])
        # Cosine similarity of the query against every corpus row
        sims = cosine_similarity(query_vec, corpus_tfidf).flatten()
        # Corpus row indices ordered by descending similarity
        top_indices = np.argsort(sims)[::-1][:limit]
        retrieved_uids = [src_uids[i] for i in top_indices]
        target_uid = example["src_uid"]
        for k in k_vals:
            if target_uid in retrieved_uids[:k]:
                results[k] += 1
    return {k: hits / total for k, hits in results.items()}



In [None]:
# 6. Run evaluation
# Calls evaluate_tfidf_topk() with its default cut-offs and prints the returned
# {k: accuracy} mapping.
topk_acc = evaluate_tfidf_topk()
print("Top-K Accuracy:", topk_acc)
