In [2]:
import os
import re
import torch
import pandas as pd
import numpy as np
import keyword
from transformers import RobertaTokenizer, RobertaModel
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

# Leer dataset
df = pd.read_csv("pares.csv")
lista_de_todos_los_archivos = pd.concat([df["codigo_1"], df["codigo_2"]]).unique().tolist()

# === Inicializar CodeBERT ===
vectorizer = TfidfVectorizer(ngram_range=(1, 3), analyzer='word')
tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base")
model = RobertaModel.from_pretrained("microsoft/codebert-base", output_hidden_states=True)

def anonymizar_codigo(code):
    palabras_reservadas = set(keyword.kwlist)
    tokens = re.findall(r'\b[a-zA-Z_][a-zA-Z0-9_]*\b', code)

    usados = {}
    nuevo_codigo = code
    contador = 1

    for tok in tokens:
        # Evita palabras reservadas y ya usadas
        if tok not in usados and tok not in palabras_reservadas:
            usados[tok] = f"VAR_{contador}"
            contador += 1

    # Sustituye solo tokens enteros (usando límites de palabra)
    for original, nuevo in usados.items():
        nuevo_codigo = re.sub(rf'\b{re.escape(original)}\b', nuevo, nuevo_codigo)

    return nuevo_codigo

# Extraer función principal del archivo
def extraer_funcion_principal(path):
    with open(path, "r", encoding="utf-8", errors="ignore") as f:
        code = f.read()
    funciones = re.findall(r"(def\s+[a-zA-Z_][a-zA-Z0-9_]*\(.*?\):(?:\n(?:\s{4}|\t).*)*)", code)
    return funciones[0] if funciones else code  # Si no hay funciones, usar todo

# Embedding con CodeBERT
def get_embedding(text, capa=4):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    hidden_states = outputs.hidden_states  # Lista de 13 tensores (incluye embedding inicial)
    capa_oculta = hidden_states[capa]      # Selecciona la capa que quieras

    # Tomamos el vector del token [CLS] de esa capa
    return capa_oculta[:, 0, :].squeeze().numpy()

def cargar_codigo(path):
    with open(path, "r", encoding="utf-8", errors="ignore") as f:
        return f.read()

# Entrenar TF-IDF con todo el corpus primero (usa todos los códigos)
corpus = [cargar_codigo(path) for path in lista_de_todos_los_archivos]
vectorizer.fit(corpus)

# Función para vector TF-IDF
def get_tfidf_vector(text):
    return vectorizer.transform([text]).toarray()[0]

# === Función para anonimizar código ===
def anonimizar_codigo(code):
    tokens = re.findall(r'\b[a-zA-Z_][a-zA-Z0-9_]*\b', code)
    usados = {}
    nuevo_codigo = code
    contador = 1
    for tok in tokens:
        if tok not in usados and tok not in {"def", "if", "else", "for", "while", "return", "print", "input"}:
            usados[tok] = f"VAR_{contador}"
            contador += 1
    for original, nuevo in usados.items():
        nuevo_codigo = re.sub(rf'\b{original}\b', nuevo, nuevo_codigo)
    return nuevo_codigo

# === Extraer todas las funciones de un archivo ===
def extraer_todas_funciones(path):
    with open(path, "r", encoding="utf-8", errors="ignore") as f:
        code = f.read()
    funciones = re.findall(r"(def\s+[a-zA-Z_][a-zA-Z0-9_]*\(.*?\):(?:\n(?:\s{4}|\t).*)*)", code)
    return funciones if funciones else [code]

# Función para vector combinado
def get_combined_vector(code1, code2):
    anon1 = anonimizar_codigo(code1)
    anon2 = anonimizar_codigo(code2)

    tfidf1 = get_tfidf_vector(anon1)
    tfidf2 = get_tfidf_vector(anon2)
    emb1 = get_embedding(anon1, capa=4)
    emb2 = get_embedding(anon2, capa=4)

    vector = np.concatenate([
        tfidf1, tfidf2, np.abs(tfidf1 - tfidf2),
        emb1, emb2, np.abs(emb1 - emb2)
    ])
    return vector


code1 = cargar_codigo("dataset/par-1-a-juego-puntuacion.py")
code2 = cargar_codigo("dataset/par-1-b-juego-puntuacion.py")
vector = get_combined_vector(code1, code2)

print(f"Vector combinado: {vector.shape}")


Vector combinado: (113253,)


In [None]:
import Levenshtein
def comparar_funciones_combinado(path1, path2):
    funciones_1 = extraer_todas_funciones(path1)
    funciones_2 = extraer_todas_funciones(path2)

    print(f"🔍 {len(funciones_1)} funciones en {path1}")
    print(f"🔍 {len(funciones_2)} funciones en {path2}")

    # Anonimizar funciones
    funcs_anon_1 = [anonimizar_codigo(f) for f in funciones_1]
    funcs_anon_2 = [anonimizar_codigo(f) for f in funciones_2]

    # TF-IDF
    tfidf_1 = [get_tfidf_vector(f) for f in funcs_anon_1]
    tfidf_2 = [get_tfidf_vector(f) for f in funcs_anon_2]

    # CodeBERT
    emb_1 = [get_embedding(f, capa=1) for f in funcs_anon_1]
    emb_2 = [get_embedding(f, capa=1) for f in funcs_anon_2]

    similitudes = []
    for i, (e1, t1) in enumerate(zip(emb_1, tfidf_1)):
        fila = []
        for j, (e2, t2) in enumerate(zip(emb_2, tfidf_2)):
            sim_bert = cosine_similarity([e1], [e2])[0][0]
            sim_tfidf = cosine_similarity([t1], [t2])[0][0]
            sim_combinada = 0.3 * sim_bert + 0.7 * sim_tfidf
            fila.append(sim_combinada)
            print(f"Sim f{i} vs f{j}: TF-IDF={sim_tfidf:.4f}, CodeBERT={sim_bert:.4f}, Combinada={sim_combinada:.4f}")
        similitudes.append(fila)

    return np.array(similitudes)

def features_por_par(path1, path2):
    if not os.path.exists(path1) or not os.path.exists(path2):
        return None

    funcs1 = extraer_todas_funciones(path1)
    funcs2 = extraer_todas_funciones(path2)

    # Anonimizamos funciones
    funcs1_anon = [anonimizar_codigo(f) for f in funcs1]
    funcs2_anon = [anonimizar_codigo(f) for f in funcs2]

    # Embeddings con CodeBERT (por función)
    emb1 = [get_embedding(f, capa=1) for f in funcs1_anon]
    emb2 = [get_embedding(f, capa=1) for f in funcs2_anon]

    if not emb1 or not emb2:
        return None
       
    # TF-IDF similarity (cosine) entre todo el código de cada archivo
    all_code1 = " ".join(funcs1_anon)
    all_code2 = " ".join(funcs2_anon)
    
    lev_sim = 1 - Levenshtein.distance(all_code1, all_code2) / max(len(all_code1), len(all_code2))
    len_total_1 = len(all_code1.split())
    len_total_2 = len(all_code2.split())
    len_diff = abs(len_total_1 - len_total_2)

    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform([all_code1, all_code2])
    sim_tfidf = cosine_similarity(tfidf_matrix[0], tfidf_matrix[1])[0][0]

    # Similitud de Jaccard (a nivel de tokens)
    tokens1 = set(all_code1.split())
    tokens2 = set(all_code2.split())

    inter = tokens1.intersection(tokens2)
    union = tokens1.union(tokens2)
    sim_jaccard = len(inter) / len(union) if union else 0.0
    
    # Cosine similarity máxima entre pares de funciones (CodeBERT)
    sim_cos_max = max(cosine_similarity([e1], [e2])[0][0] for e1 in emb1 for e2 in emb2)

    # O si quieres algo más balanceado:
    maximos_1_a_2 = [max(cosine_similarity([e1], [e2])[0][0] for e2 in emb2) for e1 in emb1]
    maximos_2_a_1 = [max(cosine_similarity([e2], [e1])[0][0] for e1 in emb1) for e2 in emb2]
    sim_cos_avg = sum(maximos_1_a_2 + maximos_2_a_1) / (len(maximos_1_a_2) + len(maximos_2_a_1))


    return [sim_cos_avg,sim_cos_max,sim_tfidf, sim_jaccard, lev_sim, len_total_1, len_total_2, len_diff]

if __name__ == "__main__":
    archivo_a = "Data/Cipher_algorithms/hill_cipher/hill_cipher.py"
    archivo_b = "Data/models/XGB/XGB.py"
    
    # Entrenar TF-IDF antes de comparar
    df = pd.read_csv("pares.csv")
    lista_archivos = pd.concat([df["codigo_1"], df["codigo_2"]]).unique().tolist()
    corpus = [cargar_codigo(p) for p in lista_archivos]
    vectorizer.fit(corpus)

    matriz = comparar_funciones_combinado(archivo_a, archivo_b)
    
    arr_feat = features_por_par(archivo_a,archivo_b)
    
    print(arr_feat)

    print("\n🧾 Matriz de similitud combinada:")
    print(np.round(matriz, 4))



🔍 1 funciones en Data/Cipher_algorithms/hill_cipher/hill_cipher.py
🔍 1 funciones en Data/models/XGB/XGB.py
Sim f0 vs f0: TF-IDF=0.4785, CodeBERT=1.0000, Combinada=0.6350
[np.float32(0.99997646), np.float32(0.99997646), np.float64(0.7903247924281183), 0.09375, 0.45807127882599585, 169, 89, 80]

🧾 Matriz de similitud combinada:
[[0.635]]


In [57]:
X, y = [], []
for _, row in df.iterrows():
        path1, path2, label = row["codigo_1"], row["codigo_2"], row["tipo_plagio"]
        feats = features_por_par(path1, path2)
        if feats is not None:
            X.append(feats)
            y.append(label)
            print("feats: ", feats)
            print("label", label)

feats:  [-0.0, np.float64(-0.24088039994239807), 2, 2, 2, 2, np.float64(1.0000000000000007)]
label 1
feats:  [-0.24622848629951477, np.float64(-0.582899197936058), 0, 0, 4, 4, np.float64(0.8467201372922605)]
label 4
feats:  [-0.27711865305900574, np.float64(-0.3260754644870758), 0, 0, 2, 1, np.float64(0.4005012247747568)]
label 4
feats:  [-0.1023884043097496, np.float64(-0.1023884043097496), 0, 0, 1, 1, np.float64(0.8187214624245548)]
label 3


KeyboardInterrupt: 

In [None]:
import pandas as pd
import numpy as np
import os
from sklearn.linear_model import LogisticRegression
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from collections import Counter
from scipy.spatial.distance import euclidean

# Asegúrate de tener definidas estas funciones (ya las tienes):
# - anonimizar_codigo()
# - extraer_todas_funciones()
# - obtener_embedding()

def features_por_par(path1, path2):
    if not os.path.exists(path1) or not os.path.exists(path2):
        return None

    funcs1 = extraer_todas_funciones(path1)
    funcs2 = extraer_todas_funciones(path2)

    # Anonimizamos funciones
    funcs1_anon = [anonimizar_codigo(f) for f in funcs1]
    funcs2_anon = [anonimizar_codigo(f) for f in funcs2]

    # Embeddings con CodeBERT (por función)
    emb1 = [get_embedding(f, capa=4) for f in funcs1_anon]
    emb2 = [get_embedding(f, capa=4) for f in funcs2_anon]

    if not emb1 or not emb2:
        return None

    # Distancias Euclidianas negativas entre todas las combinaciones
    distancias = [-euclidean(e1, e2) for e1 in emb1 for e2 in emb2]
    
    max_sim = max(distancias)
    avg_sim = np.mean(distancias)
    count_95 = sum(1 for s in distancias if s > -0.05)
    count_90 = sum(1 for s in distancias if s > -0.1)

    # TF-IDF similarity (cosine) entre todo el código de cada archivo
    all_code1 = " ".join(funcs1_anon)
    all_code2 = " ".join(funcs2_anon)

    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform([all_code1, all_code2])
    sim_tfidf = cosine_similarity(tfidf_matrix[0], tfidf_matrix[1])[0][0]

    return [max_sim, avg_sim, count_95, count_90, len(emb1), len(emb2), sim_tfidf]


def entrenar_modelo_funcion_por_funcion(csv_path):
    df = pd.read_csv(csv_path)
    X, y = [], []

    for _, row in df.iterrows():
        path1, path2, label = row["codigo_1"], row["codigo_2"], row["tipo_plagio"]
        feats = features_por_par(path1, path2)
        if label != 4:
            if feats is not None:
                X.append(feats)
                y.append(label)

    print(f"Pares válidos: {len(X)}")
    print("Distribución:", Counter(y))

    if len(set(y)) > 1:
        try:
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y)
        except ValueError:
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

        clf = LogisticRegression(class_weight='balanced', max_iter=1000)
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)

        print(classification_report(y_test, y_pred))
    else:
        print("⚠️ Solo hay una clase en el dataset. No se puede entrenar.")

entrenar_modelo_funcion_por_funcion("pares.csv")

ValueError: Found array with dim 3. check_pairwise_arrays expected <= 2.

In [3]:
pip install python-Levenshtein


Collecting python-Levenshtein
  Downloading python_levenshtein-0.27.1-py3-none-any.whl.metadata (3.7 kB)
Collecting Levenshtein==0.27.1 (from python-Levenshtein)
  Downloading levenshtein-0.27.1-cp311-cp311-win_amd64.whl.metadata (3.6 kB)
Collecting rapidfuzz<4.0.0,>=3.9.0 (from Levenshtein==0.27.1->python-Levenshtein)
  Downloading rapidfuzz-3.13.0-cp311-cp311-win_amd64.whl.metadata (12 kB)
Downloading python_levenshtein-0.27.1-py3-none-any.whl (9.4 kB)
Downloading levenshtein-0.27.1-cp311-cp311-win_amd64.whl (100 kB)
Downloading rapidfuzz-3.13.0-cp311-cp311-win_amd64.whl (1.6 MB)
   ---------------------------------------- 0.0/1.6 MB ? eta -:--:--
   ---------------------------------------- 1.6/1.6 MB 17.5 MB/s eta 0:00:00
Installing collected packages: rapidfuzz, Levenshtein, python-Levenshtein

   ---------------------------------------- 0/3 [rapidfuzz]
   ---------------------------------------- 3/3 [python-Levenshtein]

Successfully installed Levenshtein-0.27.1 python-Levenshtein

In [17]:
import pandas as pd
import numpy as np
import os
import Levenshtein
from sklearn.linear_model import LogisticRegression
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from collections import Counter
from scipy.spatial.distance import euclidean


# Asegúrate de tener definidas estas funciones (ya las tienes):
# - anonimizar_codigo()
# - extraer_todas_funciones()
# - obtener_embedding()

def features_por_par(path1, path2):
    if not os.path.exists(path1) or not os.path.exists(path2):
        return None

    funcs1 = extraer_todas_funciones(path1)
    funcs2 = extraer_todas_funciones(path2)

    # Anonimizamos funciones
    funcs1_anon = [anonimizar_codigo(f) for f in funcs1]
    funcs2_anon = [anonimizar_codigo(f) for f in funcs2]

    # Embeddings con CodeBERT (por función)
    emb1 = [get_embedding(f, capa=4) for f in funcs1_anon]
    emb2 = [get_embedding(f, capa=4) for f in funcs2_anon]

    if not emb1 or not emb2:
        return None
       
    # TF-IDF similarity (cosine) entre todo el código de cada archivo
    all_code1 = " ".join(funcs1_anon)
    all_code2 = " ".join(funcs2_anon)
    
    lev_sim = 1 - Levenshtein.distance(all_code1, all_code2) / max(len(all_code1), len(all_code2))
    len_total_1 = len(all_code1.split())
    len_total_2 = len(all_code2.split())
    len_diff = abs(len_total_1 - len_total_2)

    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform([all_code1, all_code2])
    sim_tfidf = cosine_similarity(tfidf_matrix[0], tfidf_matrix[1])[0][0]

    # Similitud de Jaccard (a nivel de tokens)
    tokens1 = set(all_code1.split())
    tokens2 = set(all_code2.split())

    inter = tokens1.intersection(tokens2)
    union = tokens1.union(tokens2)
    sim_jaccard = len(inter) / len(union) if union else 0.0
    
    # Cosine similarity máxima entre pares de funciones (CodeBERT)
    sim_cos_max = max(cosine_similarity([e1], [e2])[0][0] for e1 in emb1 for e2 in emb2)

    # O si quieres algo más balanceado:
    maximos_1_a_2 = [max(cosine_similarity([e1], [e2])[0][0] for e2 in emb2) for e1 in emb1]
    maximos_2_a_1 = [max(cosine_similarity([e2], [e1])[0][0] for e1 in emb1) for e2 in emb2]
    sim_cos_avg = sum(maximos_1_a_2 + maximos_2_a_1) / (len(maximos_1_a_2) + len(maximos_2_a_1))


    return [sim_cos_avg,sim_cos_max,sim_tfidf, sim_jaccard, lev_sim, len_total_1, len_total_2, len_diff]


def entrenar_modelo_funcion_por_funcion(csv_path):
    df = pd.read_csv(csv_path)
    X, y = [], []

    for _, row in df.iterrows():
        path1, path2, label = row["codigo_1"], row["codigo_2"], row["tipo_plagio"]
        feats = features_por_par(path1, path2)
        if label != 4:
            if feats is not None:
                X.append(feats)
                y.append(label)

    print(f"Pares válidos: {len(X)}")
    print("Distribución:", Counter(y))

    if len(set(y)) > 1:
        try:
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y)
        except ValueError:
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

        clf = LogisticRegression(
            max_iter=1000,
            multi_class='multinomial',
            solver='lbfgs'
        )
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)

        print(classification_report(y_test, y_pred))
    else:
        print("⚠️ Solo hay una clase en el dataset. No se puede entrenar.")


entrenar_modelo_funcion_por_funcion("pares_copy.csv")

Pares válidos: 270
Distribución: Counter({3: 91, 2: 91, 1: 88})




              precision    recall  f1-score   support

           1       0.64      0.67      0.65        27
           2       0.39      0.33      0.36        27
           3       0.53      0.59      0.56        27

    accuracy                           0.53        81
   macro avg       0.52      0.53      0.53        81
weighted avg       0.52      0.53      0.53        81



STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
