In [1]:
# Install python dependencies
%pip install torch transformers huggingface_hub omegaconf datasets==2.16.1 
# Optinal python packages for better user experience
%pip install ipywidgets nbconvert

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
# Import necessary libraries
import torch
import omegaconf
import collections
import os
import re
import numpy as np
import gc
from pathlib import Path
from typing import Any
from collections import OrderedDict
from transformers import DPRContextEncoder, AutoTokenizer, DPRConfig, GPT2TokenizerFast
from huggingface_hub import hf_hub_download
from datasets import load_dataset, Dataset, DatasetDict

# Setup external services authentication
HF_TOKEN = os.getenv('HF_TOKEN')

In [3]:
def rename_keys_substring(ordered_dict: OrderedDict[str, Any], find_pattern, replace_pattern):
    """
    Rename keys in an OrderedDict by replacing substring occurrences using regular expressions.
    
    Args:
        ordered_dict: The OrderedDict to modify
        find_pattern: The regex pattern to find in keys
        replace_pattern: The replacement pattern (can include backreferences like \\1, \\2)
    
    Returns:
        New Mapping with renamed keys
    """
    new_dict = OrderedDict[str, Any]()
    compiled_pattern = re.compile(find_pattern)
    
    for key, value in ordered_dict.items():
        if not compiled_pattern.search(key):
            continue
            
        new_key = compiled_pattern.sub(replace_pattern, key)
        new_dict[new_key] = value
    return new_dict


In [None]:
# Ensure that the necessary types are registered for safe deserialization
torch.serialization.add_safe_globals(
    [
        omegaconf.dictconfig.ContainerMetadata,
        omegaconf.dictconfig.DictConfig,
        omegaconf.base.Metadata,
        omegaconf.nodes.AnyNode,
        omegaconf.listconfig.ListConfig,
        collections.defaultdict,
        Any,
        dict,
        list,
        int,
    ]
)

# Check if CUDA is available and set device accordingly
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Load model from checkpoint
checkpoint_path = hf_hub_download(
    repo_id="NTU-NLP-sg/xCodeEval-nl-code-starencoder-ckpt-37",
    filename="dpr_biencoder.37.pt",
    repo_type="model",
    token=HF_TOKEN,
)
state_dict = torch.load(checkpoint_path, map_location=device)

# Retrieve fine-tuned weights
# Pattern matches: question_model.embeddings/encoder.* -> question_encoder.bert_model.*
ctx_state_dict = rename_keys_substring(
    state_dict["model_dict"],
    r"ctx_model\.(embeddings|encoder)\.([Ll]ayer|token|word|position_embeddings)",
    r"ctx_encoder.bert_model.\1.\2",
)

# Initialize encoders
pretrained_model_name = state_dict["encoder_params"]["encoder"]["pretrained_model_cfg"]
encoder_config = DPRConfig.from_pretrained(
    pretrained_model_name,
    token=HF_TOKEN,
)

ctx_encoder = DPRContextEncoder.from_pretrained(
    None, state_dict=ctx_state_dict, config=encoder_config, token=HF_TOKEN
)
ctx_encoder = ctx_encoder.to(device).eval()

# Compile for optimization (keeps same numerical results)
if hasattr(torch, 'compile'):
    ctx_encoder = torch.compile(ctx_encoder, mode='default')  # Use 'default' not 'max-autotune'
    print("Model compiled for optimization")

# Distribute workload across multiple GPUs if available
# Not so efficient as DistributedDataParallel, but simpler for single-node setups
# DataParallel will split the input across the GPUs and gather the outputs
# This is useful for inference or when the model is not too large
if torch.cuda.device_count() > 1:
    ctx_encoder = torch.nn.DataParallel(ctx_encoder)
    print(f"Using {torch.cuda.device_count()} GPUs for context encoder")

# Initialize tokenizer
tokenizer: GPT2TokenizerFast = AutoTokenizer.from_pretrained(
    pretrained_model_name, config=encoder_config
)
tokenizer.pad_token = tokenizer.eos_token

Using device: cuda


You are using a model of type bert to instantiate a model of type dpr. This is not supported for all configurations of models and can yield errors.


Model compiled for optimization


In [5]:
def print_memory_usage():
    """Print current memory usage statistics"""
    if torch.cuda.is_available():
        torch.cuda.synchronize()  # Ensure all GPU operations are complete
        for i in range(torch.cuda.device_count()):
            allocated = torch.cuda.memory_allocated(i) / (1024**3)
            reserved = torch.cuda.memory_reserved(i) / (1024**3)
            total = torch.cuda.get_device_properties(i).total_memory / (1024**3)
            print(f"GPU {i}: {allocated:.2f}GB allocated, {reserved:.2f}GB reserved, {total:.2f}GB total")
    else:
        print("CUDA is not available. No GPU memory stats to report.")

def cleanup_memory():
    """Comprehensive memory cleanup"""
    # Clear Python garbage
    gc.collect()
    
    # Clear PyTorch cache
    torch.cuda.empty_cache()
    
    # Print memory stats
    print_memory_usage()

In [6]:
def embed_codes(batch: DatasetDict[str, list]):
    inputs = tokenizer(
        batch["source_code"],
        padding="max_length",
        truncation=True,
        max_length=1024,
        return_tensors="pt"  # Use PyTorch tensors 
    )
    # Move to device in one operation
    inputs = {k: v.to(device, non_blocking=True) for k, v in inputs.items()}
    
    with torch.no_grad(), torch.amp.autocast(device_type=device, dtype=torch.bfloat16, enabled=True):
        embeddings = ctx_encoder(**inputs).pooler_output
        print_memory_usage()
        # Convert to float16 to save memory if precision allows
        embeddings_cpu = embeddings.detach().cpu().to(torch.float32).tolist()
        return {"embedding": embeddings_cpu}
    

In [7]:
# Configure cache settings
CACHE_DIR = Path("./cache")
CACHE_DIR.mkdir(exist_ok=True)
CORPUS_CACHE_DIR = CACHE_DIR / "corpus_embeddings"

# Check if cache exists and load, otherwise process corpus
if CORPUS_CACHE_DIR.exists():
    try:
        print(f"Loading corpus cache from {CORPUS_CACHE_DIR}")
        corpus_with_embeddings = Dataset.load_from_disk(str(CORPUS_CACHE_DIR))
        print(f"Cache loaded successfully. Documents: {len(corpus_with_embeddings)}")
    except Exception as e:
        print(f"Failed to load cache: {e}")
        print("Cache directory exists but contains invalid data. Recreating cache...")
        corpus_with_embeddings = None
else:
    corpus_with_embeddings = None

if corpus_with_embeddings is None:
    print("No cache found. Processing corpus...")
    
    # Load corpus dataset
    corpus = load_dataset(
        "NTU-NLP-sg/xCodeEval",
        "retrieval_corpus",
        trust_remote_code=True,
        split="test",
        revision="467d25a839086383794b58055981221b82c0d107",
        token=HF_TOKEN,
    )
    
    # Generate embeddings
    corpus_with_embeddings = corpus.map(
        embed_codes,
        batched=True,
        batch_size=48
    )
    
    print("Embeddings generated successfully!")
    print(f"Saving corpus cache to {CORPUS_CACHE_DIR}")
    corpus_with_embeddings.save_to_disk(str(CORPUS_CACHE_DIR))
    print("Cache saved successfully!")

# Display information about the processed corpus
print(f"\nCorpus information:")
print(f"Number of documents: {len(corpus_with_embeddings)}")
if len(corpus_with_embeddings) > 0:
    print(f"Embedding dimension: {len(corpus_with_embeddings[0]['embedding'])}")
    print(f"Sample document keys: {list(corpus_with_embeddings[0].keys())}")
    print(f"Sample source code (first 200 chars): {corpus_with_embeddings[0]['source_code'][:200]}...")

No cache found. Processing corpus...


Map:   0%|          | 0/25043700 [00:00<?, ? examples/s]

GPU 0: 3.39GB allocated, 6.20GB reserved, 79.25GB total
GPU 0: 3.39GB allocated, 6.20GB reserved, 79.25GB total
GPU 0: 3.39GB allocated, 6.20GB reserved, 79.25GB total
GPU 0: 3.39GB allocated, 6.20GB reserved, 79.25GB total
GPU 0: 3.39GB allocated, 6.20GB reserved, 79.25GB total
GPU 0: 3.39GB allocated, 6.20GB reserved, 79.25GB total
GPU 0: 3.39GB allocated, 6.20GB reserved, 79.25GB total
GPU 0: 3.39GB allocated, 6.20GB reserved, 79.25GB total
GPU 0: 3.39GB allocated, 6.20GB reserved, 79.25GB total
GPU 0: 3.39GB allocated, 6.20GB reserved, 79.25GB total
GPU 0: 3.39GB allocated, 6.20GB reserved, 79.25GB total
GPU 0: 3.39GB allocated, 6.20GB reserved, 79.25GB total
GPU 0: 3.39GB allocated, 6.20GB reserved, 79.25GB total
GPU 0: 3.39GB allocated, 6.20GB reserved, 79.25GB total
GPU 0: 3.39GB allocated, 6.20GB reserved, 79.25GB total
GPU 0: 3.39GB allocated, 6.20GB reserved, 79.25GB total
GPU 0: 3.39GB allocated, 6.20GB reserved, 79.25GB total
GPU 0: 3.39GB allocated, 6.20GB reserved, 79.25G

KeyboardInterrupt: 

Allocated GPU: 6.60GB
400 docs/s