In [9]:
import pandas as pd
import random
from collections import defaultdict

# Load the dataset; every row carries an (idcode1, idcode2) pair used below.
df = pd.read_csv("dataset.csv")
total_rows = df.shape[0]

# unión‑find para agrupar cada archivo en su componente
parent = {}  # maps each id to its parent id; a root maps to itself
def find(x):
    """Return the representative (root) of the set that contains ``x``.

    An id seen for the first time is registered in ``parent`` as its own
    root. Once the root is known, every node visited on the way up is
    re-pointed directly at it (path compression).

    The walk is iterative on purpose: ``union`` attaches one root under
    another without any balancing, so parent chains can grow longer than
    Python's recursion limit, and a recursive lookup would then fail with
    ``RecursionError``.
    """
    # setdefault registers unseen ids and returns the stored parent of x.
    root = parent.setdefault(x, x)

    # First pass: climb until reaching a node that is its own parent.
    while parent[root] != root:
        root = parent[root]

    # Second pass: point every node on the path straight at the root.
    node = x
    while node != root:
        next_node = parent[node]
        parent[node] = root
        node = next_node

    return root

def union(a, b):
    """Merge the sets containing ``a`` and ``b``.

    When the two ids already share a root nothing changes; otherwise the
    root of ``b`` is attached beneath the root of ``a``.
    """
    root_a = find(a)
    root_b = find(b)
    if root_a == root_b:
        return
    parent[root_b] = root_a

# Merge the two ids of every row into the same union-find set.
for id_pair in zip(df["idcode1"], df["idcode2"]):
    union(*id_pair)

# Group row positions by the root of their connected component. Both ids of
# a row share one root after the merge above, so looking up idcode1 suffices.
comp_rows = defaultdict(list)
for row_pos, first_id in enumerate(df["idcode1"]):
    comp_rows[find(first_id)].append(row_pos)

# Shuffle the (root, row positions) pairs with a fixed seed so that the
# resulting split is reproducible.
components = [*comp_rows.items()]
random.Random(42).shuffle(components)

# Row-count goals for a 60 / 20 / 20 split; "val" absorbs whatever the two
# truncated targets leave over, so the three goals add up to total_rows.
target_train = int(total_rows * 0.60)
target_test = int(total_rows * 0.20)
target_val = total_rows - (target_train + target_test)

splits = {"train": [], "test": [], "val": []}
split_sizes = dict.fromkeys(splits, 0)

# Greedy assignment: each whole component goes to the split that is currently
# furthest below its goal, so a component is never divided between splits.
for comp_id, rows in components:
    remaining = {}
    remaining["train"] = target_train - split_sizes["train"]
    remaining["test"] = target_test - split_sizes["test"]
    remaining["val"] = target_val - split_sizes["val"]

    # Only splits that still have room compete; fall back to "train" when
    # none has any. Ties resolve in train, test, val order.
    candidates = [name for name in remaining if remaining[name] > 0]
    if not candidates:
        candidates = ["train"]
    chosen = max(candidates, key=remaining.get)

    splits[chosen] += rows
    split_sizes[chosen] += len(rows)

# Build the final DataFrames. `splits` holds row *positions* collected with
# enumerate(), so rows are selected with the positional indexer .iloc. The
# label-based .loc returns the same rows only while df keeps its default
# 0..n-1 index and would pick wrong rows (or raise) otherwise.
train_df = df.iloc[splits["train"]].reset_index(drop=True)
test_df  = df.iloc[splits["test"]].reset_index(drop=True)
val_df   = df.iloc[splits["val"]].reset_index(drop=True)

# Sanity check: no id may appear in more than one split.
train_codes = set(train_df["idcode1"]) | set(train_df["idcode2"])
test_codes = set(test_df["idcode1"]) | set(test_df["idcode2"])
val_codes = set(val_df["idcode1"]) | set(val_df["idcode2"])

# An empty intersection is falsy, so each assert fails as soon as two splits
# share at least one id.
assert not (train_codes & test_codes)
assert not (train_codes & val_codes)
assert not (test_codes & val_codes)

# Write each split to its own CSV file, without the index column.
# quoting=1 is the value of csv.QUOTE_ALL: every field is written quoted.
for split_frame, csv_path in (
    (train_df, "train.csv"),
    (test_df, "test.csv"),
    (val_df, "val.csv"),
):
    split_frame.to_csv(csv_path, index=False, quoting=1)

print(f"Tamaños finales: {len(train_df)}, {len(test_df)}, {len(val_df)}")


Tamaños finales: 907, 332, 299


In [10]:
import pandas as pd
import re

# Load "test.csv" and keep an independent copy of the rows whose "dataset"
# value contains "IR-Plag"; missing values count as non-matching.
df = pd.read_csv("test.csv")
is_ir_plag = df["dataset"].str.contains("IR-Plag", na=False)
ir_plag_df = df[is_ir_plag].copy()

# Función para extraer el nivel de plagio
def extract_level(idcode2):
    """Return the plagiarism level encoded in an ``idcode2`` value.

    Args:
        idcode2: Identifier to inspect, normally a string.

    Returns:
        int: 1-6 when the identifier contains ``-L1-`` ... ``-L6-``;
        0 when it contains ``-NP-`` (presumably "no plagiarism" — confirm
        against the dataset description); -1 when neither marker is present
        or when the value is not a string (e.g. NaN/None from an empty cell).
    """
    # Missing cells arrive as float NaN or None; re.search would raise
    # TypeError on them, so map them to the "unknown" sentinel instead.
    if not isinstance(idcode2, str):
        return -1

    # A level marker takes precedence over the -NP- marker.
    match = re.search(r"-L([1-6])-", idcode2)
    if match:
        return int(match.group(1))
    if "-NP-" in idcode2:
        return 0
    return -1

# Derive the plagiarism level of every row from its 'idcode2' value, then
# save the enriched table (quoting=1 is csv.QUOTE_ALL: all fields quoted).
ir_plag_df = ir_plag_df.assign(
    plagiarism_level=ir_plag_df["idcode2"].apply(extract_level)
)
ir_plag_df.to_csv("plagiarism_levels.csv", index=False, quoting=1)

print(f"Guardado 'plagiarism_levels.csv' con {len(ir_plag_df)} filas y columna 'plagiarism_level'")


Guardado 'plagiarism_levels.csv' con 135 filas y columna 'plagiarism_level'
