In [1]:
import sys
import json
import os
import numpy as np
import matplotlib.pyplot as plt
import scipy
from scipy.spatial.distance import euclidean
from scipy.signal import savgol_filter

from transformers import pipeline
from sklearn.decomposition import PCA

sys.path.append(os.path.abspath(".."))

from data.dataset_factory import get_dataset_generator
from data.data_generators.sourcecodeplag_dataset_gen import original_plag_triplet_generator
from preprocessing.embedding_chunks import get_ready_to_embed_chunks
from preprocessing.context_chunker import safe_get_ready_to_embed_context_chunks
from preprocessing.mean_pool_chunks import mean_pool_chunks
from preprocessing.block_splitter import deverbose_ast
from visualizer.smoothing import smooth_embeddings, smooth_multiple_embeddings

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import os
print(os.getcwd())
gen = get_dataset_generator(
    dataset_name="codeclonedataset",
    mode="pairs",
    dataset_root="data/code-clone-dataset/dataset",
    clone_type="type-3"
)


# Get a random set of functions
c = get_dataset_generator(
    dataset_name='sourcecodeplag',
    mode='plagiarized',
    **{}
)

nc = get_dataset_generator(
    dataset_name='sourcecodeplag',
    mode='non_plagiarized',
    **{}
)



/Users/jonas/Documents/NTNU/Bachelor/code-model-embeddings/notebooks


In [3]:
import random

def triplet_generator(c, nc):
    """
    Yields (anchor, clone, non_clone) triplets.

    - anchor, clone come from the plagiarized generator `c`
    - non_clone is randomly chosen from one side of the non-plagiarized generator `nc`
    """

    nc_iter = iter(nc)

    for plag_sample in c:
        try:
            non_plag_sample = next(nc_iter)
        except StopIteration:
            # Restart nc if it runs out
            nc_iter = iter(nc)
            non_plag_sample = next(nc_iter)

        anchor = plag_sample.code_a
        clone = plag_sample.code_b

        non_clone = non_plag_sample.code_b

        yield anchor, clone, non_clone

In [4]:
unixcoder = "microsoft/unixcoder-base"
unixoder_pipe = pipeline("feature-extraction", model=unixcoder)

codebert = "microsoft/codebert-base"
codebert_pipe = pipeline("feature-extraction", model=codebert)

graphcodebert = "microsoft/graphcodebert-base"
graphcodebert_pipe = pipeline("feature-extraction", model=graphcodebert)

models = {
    "UniXcoder": unixoder_pipe,
    "CodeBERT": codebert_pipe,
    "GraphCodeBERT": graphcodebert_pipe
}

Loading weights: 100%|██████████| 199/199 [00:00<00:00, 1447.44it/s, Materializing param=pooler.dense.weight]                               
RobertaModel LOAD REPORT from: microsoft/unixcoder-base
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
Loading weights: 100%|██████████| 199/199 [00:00<00:00, 2049.66it/s, Materializing param=pooler.dense.weight]                               
Loading weights: 100%|██████████| 197/197 [00:00<00:00, 1977.51it/s, Materializing param=encoder.layer.11.output.dense.weight]              
RobertaModel LOAD REPORT from: microsoft/graphcodebert-base
Key                             | Status     | 
--------------------------------+------------+-
lm_head.dense.bias              | UNEXPECTED | 
lm_head.bias                    | UNEXPECTED | 
lm_head.decoder.bia

In [5]:
# Helper function: embed a list of snippets and mean-pool
def embed_and_pool(snippets, model_pipe):
    """
    Embed a list of code snippets and return mean-pooled vector.
    Works even if snippets produce nested token embeddings.
    """
    if not snippets:
        return None
    
    embeddings = model_pipe(snippets)  # list of snippets
    snippet_vecs = []

    for e in embeddings:
        # Flatten one level: sometimes e is [[vec1, vec2, ...]] or [[[vec1], [vec2]]]
        flat_tokens = []
        for tok in e:
            if isinstance(tok[0], list) or isinstance(tok[0], np.ndarray):
                # tok is [ [x1,x2,...], [y1,y2,...] ]
                flat_tokens.append(np.array(tok[0]))
            else:
                flat_tokens.append(np.array(tok))
        
        # Mean pool tokens per snippet
        token_mean = np.mean(np.stack(flat_tokens, axis=0), axis=0)
        snippet_vecs.append(token_mean)
    
    # Mean pool across snippets
    mean_vec = np.mean(np.stack(snippet_vecs, axis=0), axis=0)
    return mean_vec

In [6]:
all_embeddings = []

In [7]:
genidx = 0

for sample in original_plag_triplet_generator():
#for sample in gen:
    MAX_TOKENS = 50

    # Karnalim
    anchor = sample["anchor"]
    clone = sample["clone"]
    nonclone = sample["nonclone"]

    # Our dataset
    #anchor = sample[0]
    #clones = sample[1]
    #nonclones = sample[2]

    
    genidx += 1
    print(genidx)

    anchor_line_chunks_with_ast = get_ready_to_embed_chunks(anchor)
    anchor_context_chuks_with_ast = safe_get_ready_to_embed_context_chunks(anchor, max_tokens=MAX_TOKENS)

    anchor_line_chunks = [snippet for _, snippet, _ in anchor_line_chunks_with_ast]
    anchor_context_chunks = [snippet for _, snippet, _ in anchor_context_chuks_with_ast]

    clone = clone # first clone only

    clone_line_chunks_with_ast = get_ready_to_embed_chunks(clone)
    clone_context_chuks_with_ast = safe_get_ready_to_embed_context_chunks(clone, max_tokens=MAX_TOKENS)

    clone_line_chunks = [snippet for _, snippet, _ in clone_line_chunks_with_ast]
    clone_context_chunks = [snippet for _, snippet, _ in clone_context_chuks_with_ast]

    
    nc = nonclone # first nonclone only

    nc_line_chunks_with_ast = get_ready_to_embed_chunks(nc)
    nc_context_chuks_with_ast = safe_get_ready_to_embed_context_chunks(nc, max_tokens=MAX_TOKENS)

    nc_line_chunks = [snippet for _, snippet, _ in nc_line_chunks_with_ast]
    nc_context_chunks = [snippet for _, snippet, _ in nc_context_chuks_with_ast]

    # Embed all models and all chunk types
    sample_embeddings = {}
    for model_name, pipe in models.items():
        sample_embeddings[model_name] = {
            "anchor": {
                "line": embed_and_pool(anchor_line_chunks, pipe),
                "context": embed_and_pool(anchor_context_chunks, pipe)
            },
            "clone": {
                "line": embed_and_pool(clone_line_chunks, pipe),
                "context": embed_and_pool(clone_context_chunks, pipe)
            },
            "nonclone": {
                "line": embed_and_pool(nc_line_chunks, pipe),
                "context": embed_and_pool(nc_context_chunks, pipe)
            }
        }
        


    # Store per sample
    all_embeddings.append(sample_embeddings)


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17


KeyboardInterrupt: 

In [None]:
def plot_embedding_lines(sample_embeddings, model_name):
    """
    Plot embeddings as smoothed lines, compute cumulative L2 distances,
    and show ratio (Anchor-NonClone / Anchor-Clone) in title.
    """
    emb = sample_embeddings[model_name]

    # Prepare embeddings
    anchor_line = emb["anchor"]["line"]
    clone_line = emb["clone"]["line"]
    nonclone_line = emb["nonclone"]["line"]

    anchor_ctx = emb["anchor"]["context"]
    clone_ctx = emb["clone"]["context"]
    nonclone_ctx = emb["nonclone"]["context"]

    # Smoothing using Savitzky-Golay
    window = 101  # must be odd and <= length of vector
    poly = 3
    def smooth(vec):
        w = min(window, len(vec) - (len(vec)+1)%2)  # ensure odd and <= len
        return savgol_filter(vec, w, poly, mode='nearest')

    # Smooth embeddings
    anchor_line_s = smooth(anchor_line)
    clone_line_s = smooth(clone_line)
    nonclone_line_s = smooth(nonclone_line)

    anchor_ctx_s = smooth(anchor_ctx)
    clone_ctx_s = smooth(clone_ctx)
    nonclone_ctx_s = smooth(nonclone_ctx)

    # --- FUNCTION TO COMPUTE CUMULATIVE L2 ---
    def cumulative_l2(vec1, vec2):
        diff = vec1 - vec2
        cum_l2 = np.sqrt(np.cumsum(diff ** 2))
        final_l2 = cum_l2[-1]
        return cum_l2, final_l2

    # Compute cumulative L2 for line chunks
    _, final_l2_ac_line = cumulative_l2(anchor_line_s, clone_line_s)
    _, final_l2_anc_line = cumulative_l2(anchor_line_s, nonclone_line_s)
    ratio_line = final_l2_anc_line / final_l2_ac_line if final_l2_ac_line != 0 else np.nan

    # Compute cumulative L2 for context chunks
    _, final_l2_ac_ctx = cumulative_l2(anchor_ctx_s, clone_ctx_s)
    _, final_l2_anc_ctx = cumulative_l2(anchor_ctx_s, nonclone_ctx_s)
    ratio_ctx = final_l2_anc_ctx / final_l2_ac_ctx if final_l2_ac_ctx != 0 else np.nan

    # X-axis is embedding index
    x_line = np.arange(len(anchor_line_s))
    x_ctx = np.arange(len(anchor_ctx_s))

    # --- PLOTTING LINE CHUNKS ---
    plt.figure(figsize=(12,4))
    plt.plot(x_line, anchor_line_s, color='darkgreen', label='Anchor', linewidth=2)
    plt.plot(x_line, clone_line_s, color='lightgreen', label='Clone', linewidth=2)
    plt.plot(x_line, nonclone_line_s, color='red', label='Non-clone', linewidth=2)
    plt.title(f"{model_name} - Line Chunks\n"
              f"Final L2 Anchor-Clone: {final_l2_ac_line:.3f}, "
              f"Anchor-NonClone: {final_l2_anc_line:.3f}, "
              f"Ratio (NC/Clone): {ratio_line:.2f}")
    plt.xlabel("Embedding Index")
    plt.ylabel("Embedding Value")
    plt.legend()
    plt.show()

    # --- PLOTTING CONTEXT CHUNKS ---
    plt.figure(figsize=(12,4))
    plt.plot(x_ctx, anchor_ctx_s, color='darkgreen', label='Anchor', linewidth=2)
    plt.plot(x_ctx, clone_ctx_s, color='lightgreen', label='Clone', linewidth=2)
    plt.plot(x_ctx, nonclone_ctx_s, color='red', label='Non-clone', linewidth=2)
    plt.title(f"{model_name} - Context Chunks\n"
              f"Final L2 Anchor-Clone: {final_l2_ac_ctx:.3f}, "
              f"Anchor-NonClone: {final_l2_anc_ctx:.3f}, "
              f"Ratio (NC/Clone): {ratio_ctx:.2f}")
    plt.xlabel("Embedding Index")
    plt.ylabel("Embedding Value")
    plt.legend()
    plt.show()


In [None]:
# Loop through all samples
"""
for i, sample in enumerate(all_embeddings):
    print(f"\n\n{'='*50}\nSample {i+1}\n{'='*50}\n")

    for model_name in ["UniXcoder", "CodeBERT", "GraphCodeBERT"]:
        print(f"\n--- Model: {model_name} ---\n")
        plot_embedding_lines(sample, model_name)
        """

'\nfor i, sample in enumerate(all_embeddings):\n    print(f"\n\n{\'=\'*50}\nSample {i+1}\n{\'=\'*50}\n")\n\n    for model_name in ["UniXcoder", "CodeBERT", "GraphCodeBERT"]:\n        print(f"\n--- Model: {model_name} ---\n")\n        plot_embedding_lines(sample, model_name)\n        '

In [None]:
def final_cuml2(a, b):
    """Final cumulative L2 distance"""
    diff = a - b
    return np.sqrt(np.cumsum(diff ** 2))[-1]


def average_cuml2_ratios(all_embeddings):
    """
    Compute average cumulative L2 ratio (ANC / AC) for line and context embeddings.
    Returns number of valid samples per model.
    """
    results = {}

    for model_name in all_embeddings[0].keys():
        line_ratios = []
        context_ratios = []

        for emb in all_embeddings:
            a_line = emb[model_name]["anchor"]["line"]
            c_line = emb[model_name]["clone"]["line"]
            nc_line = emb[model_name]["nonclone"]["line"]

            a_ctx = emb[model_name]["anchor"]["context"]
            c_ctx = emb[model_name]["clone"]["context"]
            nc_ctx = emb[model_name]["nonclone"]["context"]

            # Skip sample if any embedding is None
            if a_line is None or c_line is None or nc_line is None:
                continue
            if a_ctx is None or c_ctx is None or nc_ctx is None:
                continue

            # Compute final cumulative L2
            def final_cuml2(x, y):
                diff = x - y
                return np.sqrt(np.cumsum(diff ** 2))[-1]

            # LINE chunks
            ac_line = final_cuml2(a_line, c_line)
            anc_line = final_cuml2(a_line, nc_line)
            if ac_line > 0:
                line_ratios.append(anc_line / ac_line)

            # CONTEXT chunks
            ac_ctx = final_cuml2(a_ctx, c_ctx)
            anc_ctx = final_cuml2(a_ctx, nc_ctx)
            if ac_ctx > 0:
                context_ratios.append(anc_ctx / ac_ctx)

        results[model_name] = {
            "line_avg_ratio": np.mean(line_ratios) if line_ratios else None,
            "context_avg_ratio": np.mean(context_ratios) if context_ratios else None,
            "n_samples": len(line_ratios),  # or context_ratios; should be the same
        }

    return results




In [None]:
results = average_cuml2_ratios(all_embeddings)

for model, stats in results.items():
    print(f"{model}:")
    print(f"  LINE chunks    → ANC / AC = {stats['line_avg_ratio']:.2f}")
    print(f"  CONTEXT chunks → ANC / AC = {stats['context_avg_ratio']:.2f}")
    print(f"  Samples used   = {stats['n_samples']}\n")


UniXcoder:
  LINE chunks    → ANC / AC = 2.88
  CONTEXT chunks → ANC / AC = 2.61
  Samples used   = 349

CodeBERT:
  LINE chunks    → ANC / AC = 1.62
  CONTEXT chunks → ANC / AC = 1.22
  Samples used   = 349

GraphCodeBERT:
  LINE chunks    → ANC / AC = 1.60
  CONTEXT chunks → ANC / AC = 1.40
  Samples used   = 349

