In [6]:
from transformers import pipeline, RobertaTokenizer, T5ForConditionalGeneration, TrainingArguments, Trainer
from sentence_transformers import SentenceTransformer, util
import torch
import pandas as pd
import numpy as np
from collections import Counter
from math import sqrt
import shutil
from datasets import Dataset

In [7]:
# Read the dataset (code, documentation string and summary columns) and preview it.
SUMMARIES_CSV = 'deep_seek_summaries.csv'
df = pd.read_csv(SUMMARIES_CSV)
df

Unnamed: 0,func_code_string,func_documentation_string,Summaries
0,protected final void fastPathOrderedEmit(U val...,Makes sure the fast-path emits in order.\n@par...,The `fastPathOrderedEmit` method handles the a...
1,@CheckReturnValue\n @NonNull\n @Schedule...,Mirrors the one ObservableSource in an Iterabl...,"This Java function, `amb`, creates an Observab..."
2,"@SuppressWarnings(""unchecked"")\n @CheckRetu...",Mirrors the one ObservableSource in an array o...,The `ambArray` function is a static method tha...
3,"@SuppressWarnings({ ""unchecked"", ""rawtypes"" })...",Concatenates elements of each ObservableSource...,"This Java function, `concat`, generates an `Ob..."
4,"@SuppressWarnings({ ""unchecked"", ""rawtypes"" })...",Returns an Observable that emits the items emi...,"This Java function, `concat`, takes an `Observ..."
...,...,...,...
6220,"public Object setProperty(final String iName, ...",Sets a property value\n\n@param iName Propert...,The `setProperty` function updates or removes ...
6221,"public Object execute(final Map<Object, Object...",Execute the CREATE CLASS.,The `execute` function is designed to conditio...
6222,@Override\n public long count() {\n if (ha...,(Blueprints Extension) Counts the total items ...,This function calculates the count of edges co...
6223,"public Object execute(final Map<Object, Object...",Execute the SYNC DATABASE.,The `execute` function synchronizes a database...


In [8]:
# Give the columns the short names used by the rest of the notebook.
# All three renames are applied in one in-place call; the column order is unchanged.
df.rename(
    columns={
        'func_code_string': 'Code',
        'func_documentation_string': 'Query',
        'Summaries': 'Summary',
    },
    inplace=True,
)
df

Unnamed: 0,Code,Query,Summary
0,protected final void fastPathOrderedEmit(U val...,Makes sure the fast-path emits in order.\n@par...,The `fastPathOrderedEmit` method handles the a...
1,@CheckReturnValue\n @NonNull\n @Schedule...,Mirrors the one ObservableSource in an Iterabl...,"This Java function, `amb`, creates an Observab..."
2,"@SuppressWarnings(""unchecked"")\n @CheckRetu...",Mirrors the one ObservableSource in an array o...,The `ambArray` function is a static method tha...
3,"@SuppressWarnings({ ""unchecked"", ""rawtypes"" })...",Concatenates elements of each ObservableSource...,"This Java function, `concat`, generates an `Ob..."
4,"@SuppressWarnings({ ""unchecked"", ""rawtypes"" })...",Returns an Observable that emits the items emi...,"This Java function, `concat`, takes an `Observ..."
...,...,...,...
6220,"public Object setProperty(final String iName, ...",Sets a property value\n\n@param iName Propert...,The `setProperty` function updates or removes ...
6221,"public Object execute(final Map<Object, Object...",Execute the CREATE CLASS.,The `execute` function is designed to conditio...
6222,@Override\n public long count() {\n if (ha...,(Blueprints Extension) Counts the total items ...,This function calculates the count of edges co...
6223,"public Object execute(final Map<Object, Object...",Execute the SYNC DATABASE.,The `execute` function synchronizes a database...


In [9]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import euclidean
from transformers import AutoTokenizer, AutoModel
import torch

# Load the tokenizer and encoder used to embed summaries and queries.
# trust_remote_code is deliberately left at its default (False): enabling it allows
# Python files hosted in the Hub repository to be executed locally, and this checkpoint
# is expected to load with the stock transformers classes.
# NOTE(review): re-enable it only if loading fails and the repository code has been audited.
EMBEDDING_MODEL_NAME = "BAAI/bge-base-en-v1.5"
tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_MODEL_NAME)
model = AutoModel.from_pretrained(EMBEDDING_MODEL_NAME)

In [10]:
# Embedding helper built on the notebook-level `tokenizer` and `model`
def get_embedding(text):
    """Return the embedding of `text` as a NumPy array.

    The text is tokenized into PyTorch tensors (with padding and truncation
    enabled), passed through `model` with gradient tracking disabled, and the
    last-layer hidden state at sequence position 0 (the CLS token) is returned
    with singleton dimensions squeezed out.
    """
    encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
    cls_vector = hidden_states[:, 0, :]  # first token of every sequence
    return cls_vector.squeeze().numpy()

In [12]:
# Embed every summary, one text at a time; these vectors are what the queries are compared against.
summary_texts = df["Summary"].tolist()
summary_embeddings = np.array([get_embedding(text) for text in summary_texts])

In [13]:
# Embed every query (the docstrings from CodeSearchNet), one text at a time.
query_texts = df["Query"].tolist()
query_embeddings = np.array([get_embedding(text) for text in query_texts])

loading the embeddings

In [15]:
# Replace the in-memory embeddings with arrays previously saved to disk.
# NOTE(review): the directory name suggests these were produced by an e5-base-v2 model
# rather than the BGE model loaded earlier in this notebook — confirm which model the
# evaluation results below belong to.
summary_embeddings = np.load("intfloate5-base-v2/summary_embeddings.npy")
query_embeddings = np.load("intfloate5-base-v2/query_embeddings.npy")

cosine function

In [16]:
from sklearn.metrics.pairwise import cosine_similarity

def cosine_top_matches(query_embedding, summary_embeddings, df, top_n):
    """Return the "Code" values of the `top_n` summaries most cosine-similar to a query.

    Args:
        query_embedding: 1-D embedding vector of the query.
        summary_embeddings: 2-D collection of summary embeddings; row ``i`` is
            looked up positionally as row ``i`` of `df`.
        df: DataFrame with a "Code" column.
        top_n: Number of matches to return.

    Returns:
        List of "Code" values ordered from most to least similar. Empty when
        `top_n` is 0.
    """
    similarities = cosine_similarity([query_embedding], summary_embeddings)[0]
    # Reverse first, then truncate: slicing with [-top_n:] would return every
    # row when top_n == 0 because -0 == 0.
    top_indices = np.argsort(similarities)[::-1][:top_n]
    # One positional lookup on the column instead of materialising a row per index.
    return df["Code"].iloc[top_indices].tolist()

euclidean function

In [17]:
# Function to find the top matches for a query by Euclidean distance
def find_top_matches_euclidean(query_embedding, summary_embeddings, df, top_n):
    """Return the "Code" values of the `top_n` summaries closest to a query.

    Args:
        query_embedding: 1-D embedding vector of the query.
        summary_embeddings: 2-D collection of summary embeddings; row ``i`` is
            looked up positionally as row ``i`` of `df`.
        df: DataFrame with a "Code" column.
        top_n: Number of matches to return.

    Returns:
        List of "Code" values ordered from smallest to largest Euclidean
        distance. Empty when there are no summary embeddings or `top_n` is 0.
    """
    if len(summary_embeddings) == 0:
        return []

    corpus = np.asarray(summary_embeddings)
    # Vectorised distance to every row at once, replacing one Python-level
    # distance call per summary.
    distances = np.linalg.norm(corpus - np.asarray(query_embedding), axis=1)
    top_indices = np.argsort(distances)[:top_n]  # smallest distance first

    # One positional lookup on the column instead of materialising a row per index.
    return df["Code"].iloc[top_indices].tolist()

DOT function

In [18]:
# Function to find top matches using dot product similarity
def find_top_matches_dot(query_embedding, summary_embeddings, df, top_n):
    """Return the "Code" values of the `top_n` summaries with the largest dot product.

    Args:
        query_embedding: 1-D embedding vector of the query.
        summary_embeddings: 2-D collection of summary embeddings; row ``i`` is
            looked up positionally as row ``i`` of `df`.
        df: DataFrame with a "Code" column.
        top_n: Number of matches to return.

    Returns:
        List of "Code" values ordered from highest to lowest dot product.
        Empty when `top_n` is 0.
    """
    similarities = np.dot(summary_embeddings, query_embedding)  # one score per summary
    # Reverse first, then truncate: slicing with [-top_n:] would return every
    # row when top_n == 0 because -0 == 0.
    top_indices = np.argsort(similarities)[::-1][:top_n]

    # One positional lookup on the column instead of materialising a row per index.
    return df["Code"].iloc[top_indices].tolist()


Cosine top matches

In [19]:
# List of K cut-offs at which Recall@K and MRR are reported
k_values = [1, 3, 5, 10, 100, 1000]
results_path = "cosine_summary_results_by_k.csv"

# Retrieve once per query at the largest cut-off. The top-K list for any smaller K
# is a prefix of this ranking, so one rank per query is enough for every K and the
# similarity search is not repeated for each entry of k_values.
max_k = max(k_values)
ranks = []  # 1-based position of the expected code per query; 0 when not in the top max_k
for expected_code, query_emb in zip(df["Code"], query_embeddings):
    top_matches = cosine_top_matches(query_emb, summary_embeddings, df, top_n=max_k)
    ranks.append(next(
        (position for position, code in enumerate(top_matches, start=1) if code == expected_code),
        0,
    ))
ranks = np.asarray(ranks)

summary_results = []

# Derive the metrics for each K from the per-query ranks
for k in k_values:
    print(f"\nEvaluating for Top-{k}...")

    # A query is a hit at K when its expected code appears within the first K results
    recall_at_k = (ranks > 0) & (ranks <= k)
    # Reciprocal rank for hits, 0 for misses (np.maximum keeps the 0 sentinel out of the division)
    mrr_scores = np.where(recall_at_k, 1.0 / np.maximum(ranks, 1), 0.0)

    # Calculate final metrics for this K
    final_mrr = round(np.mean(mrr_scores), 4)
    final_recall = round(np.mean(recall_at_k), 4)

    # Append to summary results
    summary_results.append({
        "K": k,
        "Recall": final_recall,
        "MRR": final_mrr
    })

    print(f"K={k}: Recall={final_recall}, MRR={final_mrr}")

# Convert the summary to a dataframe
summary_df = pd.DataFrame(summary_results)

# Save the summary table to CSV
summary_df.to_csv(results_path, index=False)

# Display the summary
from IPython.display import display
display(summary_df)

# Report the file that was actually written
print(f"\nSummary completed! Results saved to '{results_path}'.")


Evaluating for Top-1...
K=1: Recall=0.3288, MRR=0.3288

Evaluating for Top-3...
K=3: Recall=0.5181, MRR=0.412

Evaluating for Top-5...
K=5: Recall=0.5965, MRR=0.4299

Evaluating for Top-10...
K=10: Recall=0.6837, MRR=0.4415

Evaluating for Top-100...
K=100: Recall=0.8771, MRR=0.4498

Evaluating for Top-1000...
K=1000: Recall=0.9622, MRR=0.4501


Unnamed: 0,K,Recall,MRR
0,1,0.3288,0.3288
1,3,0.5181,0.412
2,5,0.5965,0.4299
3,10,0.6837,0.4415
4,100,0.8771,0.4498
5,1000,0.9622,0.4501



Summary completed! Results saved to 'summary_results_by_k.csv'.


DOT top matches

In [20]:
# List of K cut-offs at which Recall@K and MRR are reported
k_values = [1, 3, 5, 10, 100, 1000]
results_path = "dot_summary_results_by_k.csv"

# Retrieve once per query at the largest cut-off. The top-K list for any smaller K
# is a prefix of this ranking, so one rank per query is enough for every K and the
# similarity search is not repeated for each entry of k_values.
max_k = max(k_values)
ranks = []  # 1-based position of the expected code per query; 0 when not in the top max_k
for expected_code, query_emb in zip(df["Code"], query_embeddings):
    top_matches = find_top_matches_dot(query_emb, summary_embeddings, df, top_n=max_k)
    ranks.append(next(
        (position for position, code in enumerate(top_matches, start=1) if code == expected_code),
        0,
    ))
ranks = np.asarray(ranks)

summary_results = []

# Derive the metrics for each K from the per-query ranks
for k in k_values:
    print(f"\nEvaluating for Top-{k}...")

    # A query is a hit at K when its expected code appears within the first K results
    recall_at_k = (ranks > 0) & (ranks <= k)
    # Reciprocal rank for hits, 0 for misses (np.maximum keeps the 0 sentinel out of the division)
    mrr_scores = np.where(recall_at_k, 1.0 / np.maximum(ranks, 1), 0.0)

    # Calculate final metrics for this K
    final_mrr = round(np.mean(mrr_scores), 4)
    final_recall = round(np.mean(recall_at_k), 4)

    # Append to summary results
    summary_results.append({
        "K": k,
        "Recall": final_recall,
        "MRR": final_mrr
    })

    print(f"K={k}: Recall={final_recall}, MRR={final_mrr}")

# Convert the summary to a dataframe
summary_df = pd.DataFrame(summary_results)

# Save the summary table to CSV
summary_df.to_csv(results_path, index=False)

# Display the summary
from IPython.display import display
display(summary_df)

# Report the file that was actually written
print(f"\nSummary completed! Results saved to '{results_path}'.")


Evaluating for Top-1...
K=1: Recall=0.2472, MRR=0.2472

Evaluating for Top-3...
K=3: Recall=0.3936, MRR=0.3113

Evaluating for Top-5...
K=5: Recall=0.4644, MRR=0.3274

Evaluating for Top-10...
K=10: Recall=0.5465, MRR=0.3386

Evaluating for Top-100...
K=100: Recall=0.7904, MRR=0.3489

Evaluating for Top-1000...
K=1000: Recall=0.9375, MRR=0.3496


Unnamed: 0,K,Recall,MRR
0,1,0.2472,0.2472
1,3,0.3936,0.3113
2,5,0.4644,0.3274
3,10,0.5465,0.3386
4,100,0.7904,0.3489
5,1000,0.9375,0.3496



Summary completed! Results saved to 'summary_results_by_k.csv'.


Euclidean top matches

In [21]:
# List of K cut-offs at which Recall@K and MRR are reported
k_values = [1, 3, 5, 10, 100, 1000]
results_path = "euclidean_summary_results_by_k.csv"

# Retrieve once per query at the largest cut-off. The top-K list for any smaller K
# is a prefix of this ranking, so one rank per query is enough for every K and the
# distance search is not repeated for each entry of k_values.
max_k = max(k_values)
ranks = []  # 1-based position of the expected code per query; 0 when not in the top max_k
for expected_code, query_emb in zip(df["Code"], query_embeddings):
    top_matches = find_top_matches_euclidean(query_emb, summary_embeddings, df, top_n=max_k)
    ranks.append(next(
        (position for position, code in enumerate(top_matches, start=1) if code == expected_code),
        0,
    ))
ranks = np.asarray(ranks)

summary_results = []

# Derive the metrics for each K from the per-query ranks
for k in k_values:
    print(f"\nEvaluating for Top-{k}...")

    # A query is a hit at K when its expected code appears within the first K results
    recall_at_k = (ranks > 0) & (ranks <= k)
    # Reciprocal rank for hits, 0 for misses (np.maximum keeps the 0 sentinel out of the division)
    mrr_scores = np.where(recall_at_k, 1.0 / np.maximum(ranks, 1), 0.0)

    # Calculate final metrics for this K
    final_mrr = round(np.mean(mrr_scores), 4)
    final_recall = round(np.mean(recall_at_k), 4)

    # Append to summary results
    summary_results.append({
        "K": k,
        "Recall": final_recall,
        "MRR": final_mrr
    })

    print(f"K={k}: Recall={final_recall}, MRR={final_mrr}")

# Convert the summary to a dataframe
summary_df = pd.DataFrame(summary_results)

# Save the summary table to CSV
summary_df.to_csv(results_path, index=False)

# Display the summary
from IPython.display import display
display(summary_df)

# Report the file that was actually written
print(f"\nSummary completed! Results saved to '{results_path}'.")


Evaluating for Top-1...
K=1: Recall=0.2541, MRR=0.2541

Evaluating for Top-3...
K=3: Recall=0.4336, MRR=0.3333

Evaluating for Top-5...
K=5: Recall=0.5108, MRR=0.351

Evaluating for Top-10...
K=10: Recall=0.6048, MRR=0.3636

Evaluating for Top-100...
K=100: Recall=0.8363, MRR=0.3734

Evaluating for Top-1000...
K=1000: Recall=0.952, MRR=0.3739


Unnamed: 0,K,Recall,MRR
0,1,0.2541,0.2541
1,3,0.4336,0.3333
2,5,0.5108,0.351
3,10,0.6048,0.3636
4,100,0.8363,0.3734
5,1000,0.952,0.3739



Summary completed! Results saved to 'summary_results_by_k.csv'.


cross-encoder from huggingface

In [14]:
# Persist the current summary and query embedding arrays as .npy files.
# NOTE(review): this cell's execution count (14) precedes the cell that reloads
# `summary_embeddings` / `query_embeddings` from "intfloate5-base-v2/" (count 15), yet it
# sits after that cell in the notebook. On a top-to-bottom run the arrays written here
# would be the reloaded ones, saved under BGE file names — confirm where this cell belongs.
np.save("summary_BAAI_bge-base-en-v1.5.npy", summary_embeddings)
np.save("query_BAAI_bge-base-en-v1.5.npy", query_embeddings)