In [None]:
import sys
import json
import os
import numpy as np
import matplotlib.pyplot as plt
import scipy
from scipy.spatial.distance import euclidean
from scipy.signal import savgol_filter
from dataclasses import dataclass

from transformers import pipeline
from sklearn.decomposition import PCA

sys.path.append(os.path.abspath(".."))

from data.dataset_factory import get_dataset_generator
from data.data_generators.sourcecodeplag_dataset_gen import original_plag_triplet_generator
from preprocessing.embedding_chunks import get_ready_to_embed_chunks
from preprocessing.context_chunker import safe_get_ready_to_embed_context_chunks
from preprocessing.mean_pool_chunks import mean_pool_chunks
from preprocessing.block_splitter import deverbose_ast
from visualizer.smoothing import smooth_embeddings, smooth_multiple_embeddings

In [None]:
# Ask the dataset factory for the 'sourcecodeplag' dataset in 'plagiarized' mode.
# No additional keyword arguments are forwarded to the factory.
gen = get_dataset_generator(dataset_name='sourcecodeplag', mode='plagiarized')

In [None]:
# Hugging Face checkpoint identifiers for the three encoders.
unixcoder = "microsoft/unixcoder-base"
# NOTE(review): `unixoder_pipe` is misspelled (missing "c"); the name is kept
# unchanged because other cells may reference it.
unixoder_pipe = pipeline("feature-extraction", model=unixcoder)

codebert = "microsoft/codebert-base"
codebert_pipe = pipeline("feature-extraction", model=codebert)

graphcodebert = "microsoft/graphcodebert-base"
graphcodebert_pipe = pipeline("feature-extraction", model=graphcodebert)

# Display name -> feature-extraction pipeline.
models = {}
models["UniXcoder"] = unixoder_pipe
models["CodeBERT"] = codebert_pipe
models["GraphCodeBERT"] = graphcodebert_pipe

In [None]:
@dataclass
class Embeddings:
    """
    Embedding results for a single chunk.

    Attributes:
        code: embedding produced for the chunk's code text.
        ast: embedding produced for the chunk's AST text.
        combined: result of combining `code` and `ast` with a combine strategy.
    """
    # The fields hold embedding values, not text, so they are annotated `list`
    # rather than `str`. Presumably these are nested lists of floats as
    # returned by the feature-extraction pipelines -- TODO confirm.
    code: list
    ast: list
    combined: list

In [None]:
def embed(chunks, model, combine_strategy: callable):
    """
    Embed every chunk with `model` and combine its code and AST embeddings.

    Args:
        chunks: iterable of indexable chunks; index 1 holds the code text and
            index 2 holds the AST text.
        model: callable mapping a text to its embedding (for example one of
            the feature-extraction pipelines).
        combine_strategy: callable taking (code_embedding, ast_embedding) and
            returning the combined embedding.

    Returns:
        A list with one `Embeddings` instance per chunk, in input order.

    Raises:
        TypeError: if `model` is not callable, e.g. a model name was passed
            instead of the pipeline itself.
    """
    if not callable(model):
        raise TypeError(
            f"model must be a callable pipeline, got {type(model).__name__}"
        )

    embeddings_list: list = []

    for chunk in chunks:
        # 1 = code, 2 = ast
        # Use the `model` argument; the previous code called an undefined
        # global named `pipe`.
        code = model(chunk[1])
        ast = model(chunk[2])
        embeddings_list.append(
            Embeddings(code=code, ast=ast, combined=combine_strategy(code, ast))
        )

    return embeddings_list

In [None]:
def mean_pool(vec1, vec2):
    """
    Element-wise average of two equally shaped vectors.

    Both inputs are converted to float NumPy arrays; the averaged values are
    returned via `ndarray.tolist()`.

    Raises:
        ValueError: if the two converted arrays differ in shape.
    """
    first, second = (np.array(vec, dtype=float) for vec in (vec1, vec2))

    if not first.shape == second.shape:
        raise ValueError("Vectors must have the same length")

    return ((first + second) / 2.0).tolist()

In [None]:
def process_sample(sample):
    """
    Embed both programs of a sample with every registered model.

    Each program (`sample.code_a`, `sample.code_b`) is chunked with two
    strategies (line chunks and context chunks); every chunk set is embedded
    with each pipeline in `models`, using `mean_pool` as the combine strategy.

    Args:
        sample: object exposing `code_a` and `code_b` attributes.

    Returns:
        A dict keyed by model name. Each value is a dict with the keys
        "line_a", "context_a", "line_b" and "context_b", holding the
        embeddings returned by `embed` for the corresponding chunk set.
    """
    code_a = sample.code_a
    line_chunks_a = get_ready_to_embed_chunks(code_a)
    context_chunks_a = safe_get_ready_to_embed_context_chunks(code_a)

    code_b = sample.code_b
    line_chunks_b = get_ready_to_embed_chunks(code_b)
    context_chunks_b = safe_get_ready_to_embed_context_chunks(code_b)

    # Mean pooling strategy
    results = {}
    # Iterate over items(): iterating the dict directly yields only the model
    # names, and `embed` needs the pipeline object itself.
    for model_name, model_pipe in models.items():
        results[model_name] = {
            "line_a": embed(line_chunks_a, model_pipe, mean_pool),
            "context_a": embed(context_chunks_a, model_pipe, mean_pool),
            "line_b": embed(line_chunks_b, model_pipe, mean_pool),
            "context_b": embed(context_chunks_b, model_pipe, mean_pool),
        }

    # Return the embeddings so they are not discarded after each iteration.
    return results
    
    

In [None]:
# Process every sample yielded by the dataset generator.
for sample in gen:
    # Pass the sample through; `process_sample` requires it as its argument.
    process_sample(sample)