In [1]:
import os
import csv

# Root directory of the IR-Plag dataset; its sub-directories are walked below.
base_path = 'IR-Plag-Dataset'
# CSV file that receives the generated code pairs.
output_csv = 'dataset.csv'
# Accumulates one list per code pair; written to `output_csv` afterwards.
rows = []

def read_file(path):
    """Return the contents of ``path`` flattened onto a single line.

    The file is decoded as UTF-8 and undecodable bytes are dropped. Every
    newline in the decoded text is replaced by the two characters backslash
    and ``n``; any carriage return still present is removed.
    """
    with open(path, 'r', encoding='utf-8', errors='ignore') as handle:
        text = handle.read()
    # Text-mode reading already normalises line endings, so the carriage
    # return removal below acts only as a safeguard.
    escaped = text.replace('\n', '\\n')
    return escaped.replace('\r', '')

# Build one labelled (original, candidate) row for every submission found in
# every IR-Plag case directory.
dataset_name = 'IR-Plag'


def _first_source_file(directory):
    """Return the path of the first regular, non-hidden file in ``directory``.

    ``os.scandir`` yields entries in arbitrary order, so the candidates are
    sorted by name to make the choice reproducible. Sub-directories and
    dot-files are ignored. Returns ``None`` when no such file exists.
    """
    with os.scandir(directory) as entries:
        names = sorted(
            entry.name
            for entry in entries
            if entry.is_file() and not entry.name.startswith('.')
        )
    return os.path.join(directory, names[0]) if names else None


for case in sorted(os.listdir(base_path)):
    case_path = os.path.join(base_path, case)
    if not os.path.isdir(case_path):
        continue

    original_path = os.path.join(case_path, 'original')
    original_file = (
        _first_source_file(original_path) if os.path.isdir(original_path) else None
    )
    if original_file is None:
        # Without a reference program no pair can be built for this case.
        print(f'Skipping {case}: no original source file found')
        continue
    original_code = read_file(original_file)
    original_id = f'{case}-ORIG'

    # Independent solutions to the same task: label 0.
    nonplag_path = os.path.join(case_path, 'non-plagiarized')
    if os.path.isdir(nonplag_path):
        for folder in sorted(os.listdir(nonplag_path)):
            folder_path = os.path.join(nonplag_path, folder)
            if not os.path.isdir(folder_path):
                continue
            file_path = _first_source_file(folder_path)
            if file_path is None:
                continue
            code = read_file(file_path)
            file_id = f'{case}-NP-{folder}'
            rows.append([f'{original_id}_{file_id}', original_id, original_code, file_id, code, 0, dataset_name])

    # Plagiarised variants, grouped one directory level deeper: label 1.
    plag_path = os.path.join(case_path, 'plagiarized')
    if os.path.isdir(plag_path):
        for level in sorted(os.listdir(plag_path)):
            level_path = os.path.join(plag_path, level)
            if not os.path.isdir(level_path):
                # A stray file here would make os.listdir() below fail.
                continue
            for folder in sorted(os.listdir(level_path)):
                folder_path = os.path.join(level_path, folder)
                if not os.path.isdir(folder_path):
                    continue
                file_path = _first_source_file(folder_path)
                if file_path is None:
                    continue
                code = read_file(file_path)
                file_id = f'{case}-{level}-{folder}'
                rows.append([f'{original_id}_{file_id}', original_id, original_code, file_id, code, 1, dataset_name])

# Create (or overwrite) the CSV: header line first, then every collected row,
# with all fields quoted.
with open(output_csv, mode='w', encoding='utf-8', newline='') as f:
    writer = csv.writer(f, quoting=csv.QUOTE_ALL)
    writer.writerows([
        ['id', 'idcode1', 'code1', 'idcode2', 'code2', 'result', 'dataset'],
        *rows,
    ])


In [2]:
# Agregar conplag al CSV
import pandas as pd

conplag_base = 'conplag_version_2/versions'
conplag_code_dir = os.path.join(conplag_base, 'version_2')
conplag_labels = os.path.join(conplag_base, 'labels.csv')
dataset_name = 'conplag_version_2'

# Read the submission ids as text so that ids made only of digits keep their
# exact spelling (leading zeros, no float formatting) when they are turned
# into folder and file names below.
df = pd.read_csv(conplag_labels, dtype={'sub1': str, 'sub2': str})

# Rows produced by this cell are kept in their own list, so the write below
# never has to re-filter the shared `rows` list (which would pick up the rows
# of earlier runs of this cell as well).
conplag_rows = []
for sub1, sub2, verdict in zip(df['sub1'], df['sub2'], df['verdict']):
    codeid1 = str(sub1)
    codeid2 = str(sub2)
    pair_id = f'{codeid1}_{codeid2}'
    # Each pair lives in a folder named after both ids, one .java file per id.
    folder_path = os.path.join(conplag_code_dir, pair_id)

    file1_path = os.path.join(folder_path, f'{codeid1}.java')
    file2_path = os.path.join(folder_path, f'{codeid2}.java')

    # Pairs whose sources are not on disk are skipped.
    if not os.path.exists(file1_path) or not os.path.exists(file2_path):
        continue

    code1 = read_file(file1_path)
    code2 = read_file(file2_path)
    result = int(verdict)

    conplag_rows.append([pair_id, codeid1, code1, codeid2, code2, result, dataset_name])

# Keep the shared `rows` list in sync, replacing (not duplicating) any ConPlag
# rows left over from a previous run of this cell.
rows[:] = [r for r in rows if r[-1] != dataset_name] + conplag_rows

# Append mode: the file itself still grows on every run of this cell, so
# regenerate it from the first cell before re-running.
with open(output_csv, 'a', newline='', encoding='utf-8') as f:
    writer = csv.writer(f, quoting=csv.QUOTE_ALL)
    writer.writerows(conplag_rows)


In [3]:
import os
import csv
import random

# Root directory of the FIRE14 source-code training dataset.
base_path = 'fire14-source-code-training-dataset'
# Directory that is listed for *.java files below.
java_path = os.path.join(base_path, 'java')
# QREL file; the first two fields of each line are read as a positive pair.
qrel_path = os.path.join(base_path, 'SOCO14-java.qrel')
# Same CSV as the earlier cells; this cell appends to it.
output_csv = 'dataset.csv'
# Value written in the last CSV column for the rows of this cell.
dataset_name = 'FIRE14'

def read_code(file_path):
    """Load ``file_path`` as UTF-8 text and collapse it onto one line.

    Bytes that cannot be decoded are ignored. Line breaks are rewritten as a
    literal backslash followed by ``n``, and carriage returns are stripped.
    """
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as source:
        lines = source.read().split('\n')
    # Joining the pieces with the escaped marker is the same as replacing
    # every newline with it.
    one_line = '\\n'.join(lines)
    return one_line.replace('\r', '')

# Collect the names of every .java file available in the corpus directory.
all_files = [name for name in os.listdir(java_path) if name.endswith('.java')]
file_set = set(all_files)

# Read the positive pairs from the QREL file: the first two fields of each
# line name the two files, with the '.java' suffix added when it is missing.
positive_pairs = set()
with open(qrel_path, 'r', encoding='utf-8') as f:
    for line in f:
        parts = line.strip().split()
        if len(parts) < 2:
            continue
        f1, f2 = (p if p.endswith('.java') else p + '.java' for p in parts[:2])
        # Keep the pair only when both files exist in the corpus.
        if f1 in file_set and f2 in file_set:
            positive_pairs.add((f1, f2))

# Generate negative pairs: pairs of files that occur in the positive set but
# are not listed together as a positive pair in either orientation.
# Sorting the ids makes the candidate list independent of set iteration order.
positive_ids = sorted({f for pair in positive_pairs for f in pair})

# Enumerate every eligible unordered pair once (quadratic in the number of
# ids). Sampling from this list always terminates, unlike rejection sampling,
# and can never pick both (a, b) and (b, a).
candidate_negatives = [
    (a, b)
    for i, a in enumerate(positive_ids)
    for b in positive_ids[i + 1:]
    if (a, b) not in positive_pairs and (b, a) not in positive_pairs
]

# A dedicated, seeded generator keeps the sample reproducible without touching
# the global random state.
negative_rng = random.Random(42)
# Aim for as many negatives as positives, capped by what is available.
negatives = set(
    negative_rng.sample(
        candidate_negatives,
        min(len(positive_pairs), len(candidate_negatives)),
    )
)

# Append both kinds of pairs to the CSV: label 1 for positives, 0 for negatives.
with open(output_csv, 'a', newline='', encoding='utf-8') as f:
    writer = csv.writer(f, quoting=csv.QUOTE_ALL)

    for label, pairs in ((1, positive_pairs), (0, negatives)):
        # Sets of strings have no stable iteration order between interpreter
        # runs, so sort the pairs to get a reproducible row order.
        for f1, f2 in sorted(pairs):
            path1 = os.path.join(java_path, f1)
            path2 = os.path.join(java_path, f2)
            # Skip pairs whose files are no longer on disk.
            if not (os.path.exists(path1) and os.path.exists(path2)):
                continue
            code1 = read_code(path1)
            code2 = read_code(path2)
            writer.writerow([f'{f1}_{f2}', f1, code1, f2, code2, label, dataset_name])
