In [None]:
%pip install javalang
%pip install simhash
%pip install apted

In [None]:
import re, numpy as np, pandas as pd, javalang
from simhash import Simhash
from apted import APTED, Config                       # ← reemplazo de zss
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from tensorflow.keras.layers import Input, Conv1D, GlobalMaxPooling1D, Dense, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences
import matplotlib.pyplot as plt

In [None]:
# SimHash
def simhash_bits(code: str, nbits: int = 64) -> int:
    """Return the SimHash fingerprint of ``code`` as an integer.

    The text is stripped, split on whitespace, and the resulting tokens are
    handed to ``Simhash`` as its features.

    Args:
        code: Source text to fingerprint.
        nbits: Fingerprint width, forwarded to ``Simhash`` as ``f``.

    Returns:
        The ``.value`` of the resulting ``Simhash`` object.
    """
    words = code.strip().split()
    fingerprint = Simhash(words, f=nbits)
    return fingerprint.value

def simhash_sim(c1: str, c2: str, nbits: int = 64) -> float:
    """Similarity of two texts derived from their SimHash fingerprints.

    Computes ``1 - hamming_distance / nbits``, where the Hamming distance is
    the number of bit positions in which the two fingerprints differ.

    Args:
        c1: First text.
        c2: Second text.
        nbits: Fingerprint width used for both hashes and for normalisation.

    Returns:
        1.0 when the fingerprints are identical, decreasing as more bits differ.
    """
    fp_a = simhash_bits(c1, nbits)
    fp_b = simhash_bits(c2, nbits)
    # XOR leaves a 1 exactly where the fingerprints disagree.
    differing = format(fp_a ^ fp_b, "b").count("1")
    return 1 - differing / nbits

In [None]:
def java_ast(code: str):
    """Parse Java source and convert its AST into a tree of ``Node`` objects.

    Every javalang AST node becomes a ``Node`` labelled with the javalang
    class name (``type(node).__name__``). Attribute values that are not AST
    nodes (or containers of them) are ignored.

    Args:
        code: Java source text, passed unchanged to ``javalang.parse.parse``.

    Returns:
        The root ``Node`` of the converted tree.

    Raises:
        Any error raised by ``javalang.parse.parse`` propagates to the caller.
    """
    tree = javalang.parse.parse(code)

    def to_node(node):
        kids = []

        def collect(value):
            # Attribute values may be a single AST node, a (possibly nested)
            # list/tuple of nodes, or a non-node value that is skipped.
            if isinstance(value, javalang.ast.Node):
                kids.append(to_node(value))
            elif isinstance(value, (list, tuple)):
                for item in value:
                    collect(item)

        # Walk the node's direct attribute values via `.children`.
        # Iterating the node itself (`for _, v in node`) walks the WHOLE
        # subtree as (path, node) pairs and yields the node itself first,
        # which made this conversion recurse on the same node forever.
        for value in node.children:
            collect(value)
        return Node(type(node).__name__, kids)

    return to_node(tree)

class Node:
    """Minimal labelled tree node used as APTED input."""

    __slots__ = ("label", "children")

    def __init__(self, lbl, kids=None):
        self.label = lbl
        # A falsy `kids` (None or an empty sequence) becomes a fresh empty
        # list, so instances never share a default child list.
        self.children = kids if kids else []

# Zhang-Shasha-style unit-cost configuration, used here with APTED
class ZhangConfig(Config):
    """Cost model for APTED operating on ``Node`` trees.

    Insertions and deletions always cost 1; a rename costs 0 when the two
    labels are equal and 1 otherwise.
    """

    def rename(self, a, b):
        """Cost of relabelling ``a`` as ``b``."""
        if a.label == b.label:
            return 0
        return 1

    def children(self, node):
        """Return the child list of ``node``."""
        return node.children

    def insert(self, node):
        """Cost of inserting ``node``."""
        return 1

    def delete(self, node):
        """Cost of deleting ``node``."""
        return 1

def ast_sim(c1: str, c2: str) -> float:
    """Structural similarity of two Java sources based on tree edit distance.

    Both sources are converted with ``java_ast``; the APTED edit distance
    between the trees (unit costs from ``ZhangConfig``) is divided by the node
    count of the larger tree and subtracted from 1.

    With unit costs the distance can exceed the size of the larger tree
    (e.g. a 5-node chain vs. a 5-node star has distance 6), so the raw score
    may drop below zero. It is clamped at 0.0 so the result stays in [0, 1].

    Args:
        c1: First Java source text.
        c2: Second Java source text.

    Returns:
        A float in [0, 1]; 1.0 means the two trees are identical.

    Raises:
        Whatever ``java_ast`` or APTED raise; nothing is caught here.
    """
    t1, t2 = java_ast(c1), java_ast(c2)
    dist = APTED(t1, t2, ZhangConfig()).compute_edit_distance()
    # `Node` declares __slots__ = ("label", "children"), so it can never carry
    # a `size` attribute: always count the nodes explicitly.
    base = max(count_nodes(t1), count_nodes(t2))
    return max(0.0, 1 - dist / base)

def count_nodes(n):
    """Return the number of nodes in the tree rooted at ``n`` (root included).

    The count is used as the normalisation size for the tree edit distance.
    """
    total = 0
    pending = [n]
    # Explicit stack instead of recursion: each popped node counts once and
    # contributes its children to the work list.
    while pending:
        current = pending.pop()
        total += 1
        pending.extend(current.children)
    return total

In [None]:
def pair_feats(row):
    """Compute the similarity features for one row of code pairs.

    Each feature is computed independently, so a failure in one (for example
    Java code that does not parse) no longer discards the other.

    Args:
        row: A DataFrame row exposing ``code1_sanitized`` and
            ``code2_sanitized``; ``row.name`` is used in the failure message.

    Returns:
        ``pd.Series`` with keys ``'simhash_sim'`` and ``'ast_sim'``. A feature
        whose computation raised is reported as ``-1.0``.
    """
    feats = {'simhash_sim': -1.0, 'ast_sim': -1.0}
    for key, metric in (('simhash_sim', simhash_sim), ('ast_sim', ast_sim)):
        try:
            feats[key] = metric(row.code1_sanitized, row.code2_sanitized)
        except Exception as e:
            # Deliberate best-effort: if something fails (memory, parsing,
            # etc.) log it and keep the -1 marker for this feature only.
            print("fallo en fila", row.name, "→", e)
    return pd.Series(feats)


In [None]:
# Load the three pre-split CSV files.
train_df, val_df, test_df = (
    pd.read_csv(path) for path in ("train.csv", "val.csv", "test.csv")
)

# Tag every frame with its split name so the rows can be separated again
# after concatenation.
for tag, df_ in zip(('train', 'validation', 'test'), (train_df, val_df, test_df)):
    df_["split"] = tag
df = pd.concat([train_df, val_df, test_df], ignore_index=True)

# Fail early if any column used further down is missing.
assert set(df.columns) >= {'code1_sanitized', 'code2_sanitized',
                           'code1_vecMark', 'code2_vecMark', 'result'}

# Row-wise feature extraction; pair_feats reports failures as -1.0.
df[['simhash_sim', 'ast_sim']] = df.apply(pair_feats, axis=1)

In [None]:
def pad_vecs(col, maxlen=200):
    """Turn a column of stringified vectors into a fixed-width float32 array.

    Reads ``df[col]`` from the notebook-global ``df``, evaluates every cell
    into a NumPy array, and pads the result with ``pad_sequences``.

    Args:
        col: Name of the column in ``df`` that holds the vector strings.
        maxlen: Target length passed to ``pad_sequences``.

    Returns:
        The array returned by ``pad_sequences`` (``dtype='float32'``,
        padding appended at the end).
    """
    def parse(x):
        # NOTE(review): eval() executes whatever expression the CSV cell
        # contains. If the cells are plain Python literals, consider
        # ast.literal_eval — confirm the column format first.
        return np.array(eval(x))

    arrays = df[col].apply(parse)
    # NOTE(review): pad_sequences truncates from the front by default
    # (truncating='pre') while padding here is 'post' — confirm intended.
    return pad_sequences(arrays, maxlen=maxlen, padding='post', dtype='float32')

MAX_LEN = 200  # sequence length used for both code-vector inputs

# Padded code vectors with an extra trailing axis, matching Input((MAX_LEN, 1)).
x1 = pad_vecs("code1_vecMark", MAX_LEN)[..., np.newaxis]
x2 = pad_vecs("code2_vecMark", MAX_LEN)[..., np.newaxis]
f_vec = df[['simhash_sim', 'ast_sim']].values.astype('float32')
y = df["result"].values
splits = df["split"].values

# Slice every array with the boolean mask of each split.
sel = splits == 'train'
x1_train, x2_train, f_train, y_train = (arr[sel] for arr in (x1, x2, f_vec, y))
sel = splits == 'validation'
x1_val, x2_val, f_val, y_val = (arr[sel] for arr in (x1, x2, f_vec, y))
sel = splits == 'test'
x1_test, x2_test, f_test, y_test = (arr[sel] for arr in (x1, x2, f_vec, y))

In [None]:
from tensorflow.keras.optimizers import Adam

def branch(inp_shape):
    """Build one convolutional encoder branch.

    The branch is Input -> Conv1D(128 filters, kernel size 5, ReLU, 'same'
    padding) -> GlobalMaxPooling1D. Every call creates new layer objects, so
    two branches built with this function do not share weights.

    Args:
        inp_shape: Shape tuple forwarded to ``Input(shape=...)``.

    Returns:
        ``(input_tensor, pooled_output_tensor)``.
    """
    source = Input(shape=inp_shape)
    conv = Conv1D(filters=128, kernel_size=5, padding='same', activation='relu')(source)
    pooled = GlobalMaxPooling1D()(conv)
    return source, pooled

# Two convolutional encoders, one per code-vector input; each branch() call
# instantiates its own layers.
in1, out1 = branch((MAX_LEN,1))
in2, out2 = branch((MAX_LEN,1))
# Third input: a 2-element feature vector, projected through a 32-unit ReLU layer.
in_feat   = Input(shape=(2,))
feat_dense= Dense(32, activation='relu')(in_feat)

# Concatenate both encodings with the projected features, then classify
# through a 128-unit hidden layer and a single sigmoid output unit.
merged = Concatenate()([out1, out2, feat_dense])
x = Dense(128, activation='relu')(merged)
output = Dense(1, activation='sigmoid')(x)

# Binary cross-entropy with Adam (argument 1e-3), tracking accuracy.
model = Model([in1,in2,in_feat], output)
model.compile(optimizer=Adam(1e-3), loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

In [None]:
# Train for 50 epochs with batches of 32, validating on the validation split.
model.fit(
    x=[x1_train, x2_train, f_train],
    y=y_train,
    batch_size=32,
    epochs=50,
    validation_data=([x1_val, x2_val, f_val], y_val),
)

In [None]:
# Evaluate on the test split; `acc` is the 'accuracy' metric requested in compile().
loss, acc = model.evaluate([x1_test, x2_test, f_test], y_test)
print(f"\nTest accuracy: {acc:.4f}")

In [None]:
# Evaluate on the test split (the same call also appears in the previous cell).
loss, acc = model.evaluate([x1_test, x2_test, f_test], y_test)
print(f"\nTest accuracy: {acc:.4f}")

# Threshold the sigmoid outputs at 0.5 to obtain hard 0/1 predictions.
y_pred = (model.predict([x1_test, x2_test, f_test]) > 0.5).astype(int).flatten()
# NOTE(review): the label names assume class 0 = "Original" and
# class 1 = "Plagiado" — confirm against the `result` column encoding.
print("\nReporte de clasificación:\n",
      classification_report(y_test, y_pred, target_names=["Original","Plagiado"]))

# Confusion matrix rendered as a heat map with integer cell counts.
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=["Original","Plagiado"])
fig, ax = plt.subplots(figsize=(6,6))
disp.plot(ax=ax, cmap="Blues", values_format='d')
plt.title("Matriz de Confusión")
plt.tight_layout()
plt.show()