In [13]:
import pandas as pd
import random
from collections import defaultdict

# Load the original CSV; its row count is the denominator of the global
# summary percentages printed at the end of this cell.
df = pd.read_csv("dataset.csv")
total_rows = df.shape[0]

# Union-find (disjoint-set) helpers: `union` hangs one root under the other,
# `find` compresses the path it walks.
parent = {}
def find(x):
    """Return the representative (root) of the set that contains ``x``.

    An element that has not been seen before is registered in ``parent`` as
    its own root. After the call, every node on the path from ``x`` to the
    root points directly at the root (path compression).

    The walk is iterative: ``union`` does no rank/size balancing, so a run of
    unions can build an arbitrarily long parent chain, and a recursive walk
    over such a chain exceeds Python's recursion limit.
    """
    root = parent.setdefault(x, x)

    # Pass 1: climb until we reach a node that is its own parent.
    # The identity test keeps the walk finite for keys that do not compare
    # equal to themselves (e.g. a float NaN stored as its own parent).
    while True:
        up = parent[root]
        if up is root or up == root:
            break
        root = up

    # Pass 2: re-point every node on the path straight at the root.
    node = x
    while True:
        up = parent[node]
        if up is root or up == root:
            break
        parent[node] = root
        node = up

    return root

def union(a, b):
    """Merge the sets containing ``a`` and ``b``.

    The root of ``b``'s set is attached under the root of ``a``'s set; no
    rank or size heuristic is applied. Nothing happens when both elements
    already share a root.
    """
    root_a = find(a)
    root_b = find(b)
    if root_a == root_b:
        return
    parent[root_b] = root_a

# Accumulators for the row labels (and row counts) assigned to each split,
# shared across every dataset processed below.
SPLIT_NAMES = ("train", "test", "val")
global_splits = {name: [] for name in SPLIT_NAMES}
global_split_sizes = {name: 0 for name in SPLIT_NAMES}

print()
# Each value of the `dataset` column is split on its own, so the 60/20/20
# targets apply per dataset rather than to the file as a whole.
for ds_label in df["dataset"].unique():
    subset = df.loc[df["dataset"] == ds_label].copy()
    n_subset = subset.shape[0]
    if not n_subset:
        # Nothing matched this label (e.g. a NaN label never equals itself).
        continue

    # Start from an empty union-find and link the two codes of every pair, so
    # codes that ever appear together end up in the same component.
    parent.clear()
    for code_a, code_b in zip(subset["idcode1"], subset["idcode2"]):
        union(code_a, code_b)

    # Bucket this dataset's row labels by component, keyed by the component root.
    comp_rows = defaultdict(list)
    for row_label, code in zip(subset.index, subset["idcode1"]):
        comp_rows[find(code)].append(row_label)

    # Shuffle the components with a fixed seed so the split is reproducible.
    components = list(comp_rows.items())
    rng = random.Random(42)
    rng.shuffle(components)

    # 60/20/20 row targets for this dataset; val absorbs the rounding remainder.
    targets = {"train": int(n_subset * 0.60), "test": int(n_subset * 0.20)}
    targets["val"] = n_subset - targets["train"] - targets["test"]

    local_splits = {name: [] for name in SPLIT_NAMES}
    local_sizes = dict.fromkeys(SPLIT_NAMES, 0)

    # Greedy assignment: each whole component goes to the split with the
    # largest deficit (target - assigned). When every split is already over
    # target the least negative one wins; ties resolve in train, test, val order.
    # A component is never divided, so a dataset dominated by one large
    # component can end up far from the 60/20/20 targets.
    for _root, rows in components:
        remaining = {name: targets[name] - local_sizes[name] for name in SPLIT_NAMES}
        chosen = max(remaining, key=remaining.get)

        local_splits[chosen].extend(rows)
        local_sizes[chosen] += len(rows)

    # Fold this dataset's assignment into the global accumulators.
    for name in SPLIT_NAMES:
        global_splits[name].extend(local_splits[name])
        global_split_sizes[name] += local_sizes[name]

    # Per-dataset report: rows and share of this dataset in each split.
    pct = {name: (local_sizes[name] / n_subset) * 100 for name in SPLIT_NAMES}

    print(f"Dataset = {ds_label}")
    print(f"  Total pares en '{ds_label}': {n_subset}")
    print(f"    → train: {local_sizes['train']} filas ({pct['train']:.2f}%)")
    print(f"    → test : {local_sizes['test']} filas ({pct['test']:.2f}%)")
    print(f"    → val  : {local_sizes['val']} filas ({pct['val']:.2f}%)\n")

# Materialise each split as its own DataFrame with a fresh 0..n-1 index.
train_df, test_df, val_df = (
    df.loc[global_splits[name]].reset_index(drop=True)
    for name in ("train", "test", "val")
)

# Leakage check: gather every code that appears on either side of a pair in
# each split and require the three code sets to be pairwise disjoint.
# The union-find is reset for every dataset, so a code shared between two
# datasets is the case these assertions would catch.
train_codes = set(train_df["idcode1"]) | set(train_df["idcode2"])
test_codes = set(test_df["idcode1"]) | set(test_df["idcode2"])
val_codes = set(val_df["idcode1"]) | set(val_df["idcode2"])

assert not (train_codes & test_codes), "¡Error: overlap train/test!"
assert not (train_codes & val_codes), "¡Error: overlap train/val!"
assert not (test_codes & val_codes), "¡Error: overlap test/val!"

# Write one CSV per split. quoting=1 is the value of csv.QUOTE_ALL, i.e.
# every field is quoted.
for frame, path in ((train_df, "train.csv"), (test_df, "test.csv"), (val_df, "val.csv")):
    frame.to_csv(path, index=False, quoting=1)

# Global summary: size of each split and its share of all rows in the source CSV.
train_count, test_count, val_count = len(train_df), len(test_df), len(val_df)

pct_train = (train_count / total_rows) * 100
pct_test = (test_count / total_rows) * 100
pct_val = (val_count / total_rows) * 100

print("=== Resumen global sobre TODOS los datasets ===")
summary = (
    f"Tamaños finales: train={train_count} ({pct_train:.2f}%), "
    f"test={test_count} ({pct_test:.2f}%), val={val_count} ({pct_val:.2f}%)"
)
print(summary)



Dataset = IR-Plag
  Total pares en 'IR-Plag': 920
    → train: 522 filas (56.74%)
    → test : 134 filas (14.57%)
    → val  : 264 filas (28.70%)

Dataset = conplag_version_2
  Total pares en 'conplag_version_2': 1820
    → train: 1112 filas (61.10%)
    → test : 356 filas (19.56%)
    → val  : 352 filas (19.34%)

Dataset = FIRE14
  Total pares en 'FIRE14': 336
    → train: 334 filas (99.40%)
    → test : 0 filas (0.00%)
    → val  : 2 filas (0.60%)

=== Resumen global sobre TODOS los datasets ===
Tamaños finales: train=1968 (63.98%), test=490 (15.93%), val=618 (20.09%)


In [14]:
import pandas as pd
import re

# Sanity-check the `dataset` column of whatever frame `df` is bound to when
# this cell runs (the full dataset on a fresh top-to-bottom run).
dataset_col = df["dataset"]

print("=== Diagnóstico de la columna dataset` ===")
print("Valores únicos (hasta 50):")
print(dataset_col.dropna().unique()[:50], "\n")

exact_hits = dataset_col.str.contains("IR-Plag", na=False).sum()
print("Filas que contienen 'IR-Plag' (exacto):", exact_hits)

loose_hits = dataset_col.str.contains("IR-Plag", case=False, na=False).sum()
print("Filas que contienen 'IR-Plag' (case-insensitive):", loose_hits)

print("===========================================\n")

# NOTE(review): this rebinds `df` from the full dataset to the test split, so
# re-running this cell makes the diagnostics above describe test.csv instead.
# Kept as-is because later cells may rely on `df` being the test split — confirm
# before renaming.
df = pd.read_csv("test.csv")

# Keep only the rows whose dataset label contains "IR-Plag" (NaN labels excluded).
is_ir_plag = df["dataset"].str.contains("IR-Plag", na=False)
ir_plag_df = df[is_ir_plag].copy()

def extract_level(idcode):
    """Return the plagiarism level encoded in a single ``idcode`` value.

    Parameters
    ----------
    idcode : object
        Normally a string; missing values (NaN/None) and any other
        non-string value are accepted and treated as "no level found".

    Returns
    -------
    int
        1-6 when the string contains "-L1-" ... "-L6-" (this pattern is
        checked first), 0 when it contains the "-NP-" marker (presumably
        "non-plagiarised" — confirm against the dataset docs), and -1 when
        neither marker is present or the value is not a string.
    """
    # A column read from CSV can mix strings with NaN or numeric IDs; only
    # strings can carry a marker, and passing anything else to re.search
    # raises TypeError.
    if not isinstance(idcode, str):
        return -1

    match = re.search(r"-L([1-6])-", idcode)
    if match:
        return int(match.group(1))
    if "-NP-" in idcode:
        return 0
    return -1

def get_plagiarism_level(row):
    """Return the plagiarism level for one row.

    ``idcode2`` is tried first; ``idcode1`` is consulted only when
    ``idcode2`` yields no level (i.e. ``extract_level`` returned -1).
    A missing field is looked up as the empty string. Returns -1 when
    neither field carries a level.
    """
    for column in ("idcode2", "idcode1"):
        level = extract_level(row.get(column, ""))
        if level != -1:
            return level
    return -1

# Compute the level row by row and store it as a new column.
levels = ir_plag_df.apply(get_plagiarism_level, axis=1)
ir_plag_df["plagiarism_level"] = levels

# Write the annotated rows; quoting=1 is the value of csv.QUOTE_ALL
# (every field quoted).
ir_plag_df.to_csv("plagiarism_levels.csv", index=False, quoting=1)

n_saved = len(ir_plag_df)
print(f"Guardado 'plagiarism_levels.csv' con {n_saved} filas y columna 'plagiarism_level'")


=== Diagnóstico de la columna dataset` ===
Valores únicos (hasta 50):
['IR-Plag' 'conplag_version_2' 'FIRE14'] 

Filas que contienen 'IR-Plag' (exacto): 920
Filas que contienen 'IR-Plag' (case-insensitive): 920

Guardado 'plagiarism_levels.csv' con 134 filas y columna 'plagiarism_level'
