# Setup

In [1]:
# Install python dependencies
%pip install torch transformers huggingface_hub omegaconf datasets==2.16.1 tqdm 
# Optional Python packages for a better user experience
%pip install ipywidgets nbconvert
# Install conda dependencies
%conda install -c conda-forge -c pytorch -c nvidia faiss-gpu=1.11.0 

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Channels:
 - conda-forge
 - pytorch
 - nvidia
 - defaults
Platform: linux-64
Collecting package metadata (repodata.json): done
Solving environment: done


    current version: 25.1.1
    latest version: 25.5.1

Please update conda by running

    $ conda update -n base -c defaults conda



# All requested packages already installed.


Note: you may need to restart the kernel to use updated packages.


In [2]:
# Import necessary libraries
import torch
import omegaconf
import collections
import os
import re
import numpy as np
import faiss
from tqdm import tqdm
from typing import Any
from collections import OrderedDict
from transformers import DPRQuestionEncoder, DPRContextEncoder, AutoTokenizer, DPRConfig, GPT2Tokenizer
from huggingface_hub import hf_hub_download
from datasets import load_dataset

# Setup external services authentication
HF_TOKEN = os.getenv('HF_TOKEN')

# Model Loading

In [3]:
def rename_keys_substring(ordered_dict: OrderedDict[str, Any], find_pattern, replace_pattern):
    """
    Select the entries whose key matches a regex and rewrite those keys.

    Only entries whose key contains a match for ``find_pattern`` are kept;
    every other entry is left out of the result. In each kept key, the
    matched portion(s) are substituted with ``replace_pattern`` while the
    rest of the key is preserved. Values are passed through untouched and the
    input mapping is not modified.

    Args:
        ordered_dict: Source mapping whose keys are inspected.
        find_pattern: Regular expression searched for inside each key.
        replace_pattern: Substitution text; may use backreferences such as
            \\1 or \\2 to reuse groups captured by ``find_pattern``.

    Returns:
        A new OrderedDict holding the renamed entries in source order.
    """
    key_regex = re.compile(find_pattern)

    # Lazily produce (renamed_key, value) pairs for the matching keys only.
    renamed_pairs = (
        (key_regex.sub(replace_pattern, old_key), payload)
        for old_key, payload in ordered_dict.items()
        if key_regex.search(old_key)
    )
    return OrderedDict[str, Any](renamed_pairs)


In [4]:
# Allowlist the non-tensor types pickled inside the checkpoint so it can be
# read by the restricted (weights-only) unpickler rather than by full pickle.
torch.serialization.add_safe_globals(
    [
        omegaconf.dictconfig.ContainerMetadata,
        omegaconf.dictconfig.DictConfig,
        omegaconf.base.Metadata,
        omegaconf.nodes.AnyNode,
        omegaconf.listconfig.ListConfig,
        collections.defaultdict,
        Any,
        dict,
        list,
        int,
    ]
)

# Check if CUDA is available and set device accordingly
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Download the fine-tuned bi-encoder checkpoint from the Hugging Face Hub
checkpoint_path = hf_hub_download(
    repo_id="NTU-NLP-sg/xCodeEval-nl-code-starencoder-ckpt-37",
    filename="dpr_biencoder.37.pt",
    repo_type="model",
    token=HF_TOKEN,
)
# weights_only is spelled out so the load does not depend on the installed
# torch version's default: the downloaded file is never fully unpickled, and
# the allowlist registered above is what makes this restricted load succeed.
state_dict = torch.load(checkpoint_path, map_location=device, weights_only=True)

# Retrieve fine-tuned weights.
# rename_keys_substring keeps only the keys that match the pattern, so each
# call both filters the checkpoint down to one encoder's weights and rewrites
# the prefix: question_model.<embeddings|encoder>.* -> question_encoder.bert_model.*
question_state_dict = rename_keys_substring(
    state_dict["model_dict"],
    r"question_model\.(embeddings|encoder)\.([Ll]ayer|token|word|position_embeddings)",
    r"question_encoder.bert_model.\1.\2",
)
# Same filtering/renaming for the context (code) encoder weights.
ctx_state_dict = rename_keys_substring(
    state_dict["model_dict"],
    r"ctx_model\.(embeddings|encoder)\.([Ll]ayer|token|word|position_embeddings)",
    r"ctx_encoder.bert_model.\1.\2",
)


# Initialize encoders
# The base model identifier is read from the training configuration stored in
# the checkpoint itself.
pretrained_model_name = state_dict["encoder_params"]["encoder"]["pretrained_model_cfg"]
encoder_config = DPRConfig.from_pretrained(
    pretrained_model_name,
    token=HF_TOKEN,
)

# No model name is given (None): the architecture comes from `config` and the
# weights come from the explicit `state_dict`.
question_encoder = DPRQuestionEncoder.from_pretrained(
    None, state_dict=question_state_dict, config=encoder_config, token=HF_TOKEN
)
ctx_encoder = DPRContextEncoder.from_pretrained(
    None, state_dict=ctx_state_dict, config=encoder_config, token=HF_TOKEN
)

# Inference only: move to the selected device and switch to eval mode.
question_encoder = question_encoder.to(device).eval()
ctx_encoder = ctx_encoder.to(device).eval()

# Pass the same token as for the config download above; both are fetched from
# the same repository, so the tokenizer needs the same credentials.
tokenizer: GPT2Tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name, token=HF_TOKEN)


Using device: cpu


You are using a model of type bert to instantiate a model of type dpr. This is not supported for all configurations of models and can yield errors.


# Data preparation

In [None]:
# Fetch the test split of both xCodeEval retrieval configurations, in order:
# first the NL queries ("retrieval_nl_code"), then the code corpus
# ("retrieval_corpus"). Both are pinned to the same dataset revision.
nl_code_test, corpus_test = (
    load_dataset(
        "NTU-NLP-sg/xCodeEval",
        config_name,
        trust_remote_code=True,
        split="test",
        revision="467d25a839086383794b58055981221b82c0d107",
        token=HF_TOKEN,
    )
    for config_name in ("retrieval_nl_code", "retrieval_corpus")
)

# Pull the corpus columns used later: the raw code strings, and the src_uid
# of every corpus entry (used to map retrieved row positions back to a uid).
all_code_strings, src_uids = corpus_test["source_code"], corpus_test["src_uid"]


def embed_codes(batch):
    """
    Encode a batch of corpus rows with the context encoder.

    Args:
        batch: Batched rows; only the "source_code" entry is read.

    Returns:
        A dict with a single "embedding" entry holding the context encoder's
        pooled output for the batch as a NumPy array.
    """
    # Every snippet is truncated/padded to exactly 1024 tokens.
    encoded = tokenizer(
        batch["source_code"],
        padding="max_length",
        truncation=True,
        max_length=1024,
        return_tensors="pt",
    )
    encoded = encoded.to(device)

    # Inference only: no autograd graph is recorded.
    with torch.no_grad():
        pooled = ctx_encoder(**encoded).pooler_output

    return {"embedding": pooled.cpu().numpy()}


# Run embed_codes over the corpus in batches of 48 rows. The result (carrying
# the "embedding" values returned by embed_codes) is rebound to corpus_test,
# which is where the index-building cell reads the embeddings from.
corpus_test = corpus_test.map(embed_codes, batched=True, batch_size=48)

Traceback (most recent call last):
  File "/home/henrique/source/repos/nlp-tf/.conda/lib/python3.12/site-packages/debugpy/_vendored/pydevd/_pydevd_bundle/pydevd_vars.py", line 622, in change_attr_expression
    value = eval(expression, frame.f_globals, frame.f_locals)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "<string>", line 1, in <module>
NameError: name 'Dataset' is not defined


In [None]:
# Build the FAISS inner-product index (GPU-accelerated when a GPU is present)
# With the dataset's default output format the "embedding" column comes back
# as nested Python lists, which have no `.shape` and cannot be handed to
# FAISS. Convert once to the C-contiguous float32 matrix FAISS expects.
embeddings = np.ascontiguousarray(corpus_test["embedding"], dtype=np.float32)
if embeddings.ndim != 2 or embeddings.shape[0] == 0:
    raise ValueError(
        f"Expected a non-empty 2-D embedding matrix, got shape {embeddings.shape}"
    )
dimension = embeddings.shape[1]

# Exact (flat) index scored by inner product.
index = faiss.IndexFlatIP(dimension)
# Move the index to GPU 0 only when FAISS can see a GPU; otherwise keep the
# CPU index so the notebook still runs on CPU-only machines.
if faiss.get_num_gpus() > 0:
    index = faiss.index_cpu_to_gpu(faiss.StandardGpuResources(), 0, index)
index.add(embeddings)

In [None]:
def evaluate_topk(k_vals=(1, 5, 10, 100)):
    """
    Compute top-k retrieval accuracy of the NL queries against the code index.

    For every example in ``nl_code_test`` the "nl" text is encoded with the
    question encoder, the ``max(k_vals)`` nearest corpus entries are fetched
    from ``index``, and the example counts as a hit at k when its "src_uid"
    appears among the src_uids of the first k retrieved entries.

    Relies on the notebook-level ``tokenizer``, ``question_encoder``,
    ``device``, ``index``, ``src_uids`` and ``nl_code_test``.

    Args:
        k_vals: Iterable of cut-offs k to evaluate.

    Returns:
        Dict mapping each k to the fraction of queries with a hit in the
        top k. Empty when ``k_vals`` is empty; all 0.0 when there are no
        queries.
    """
    k_vals = list(k_vals)
    if not k_vals:
        return {}

    hits = {k: 0 for k in k_vals}
    # Loop-invariant: one search at the largest cut-off serves every k.
    max_k = max(k_vals)

    for example in tqdm(nl_code_test):
        # Encode NL query
        inputs = tokenizer(
            example["nl"],
            padding="max_length",
            truncation=True,
            max_length=1024,
            return_tensors="pt",
        ).to(device)
        with torch.no_grad():
            query_embed = question_encoder(**inputs).pooler_output.cpu().numpy()

        # Retrieve top-k results
        _, indices = index.search(query_embed, max_k)

        # FAISS fills missing results with label -1 when the index holds fewer
        # than max_k vectors. Those must be skipped: as a Python index, -1
        # would silently select the last corpus entry and fake a match.
        retrieved_uids = [src_uids[i] for i in indices[0] if i >= 0]

        # Check for relevant matches
        for k in k_vals:
            if example["src_uid"] in retrieved_uids[:k]:
                hits[k] += 1

    # Calculate accuracy (guarding against an empty evaluation set)
    total = len(nl_code_test)
    if total == 0:
        return {k: 0.0 for k in hits}
    return {k: count / total for k, count in hits.items()}

# Run evaluation with the default cut-offs and print the resulting
# {k: accuracy} mapping returned by evaluate_topk.
topk_acc = evaluate_topk()
print("Top-K Accuracy:", topk_acc)