In [5]:
import pandas as pd
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re

In [61]:
# Matches, in priority order, Java string literals, char literals,
# single-line comments and multi-line comments. Literals are matched only so
# that comment markers appearing inside them are skipped over.
_JAVA_COMMENT_OR_LITERAL = re.compile(
    r'"(?:\\.|[^"\\\n])*"'     # string literal (kept)
    r"|'(?:\\.|[^'\\\n])*'"    # char literal (kept)
    r'|//[^\n]*'               # single-line comment (dropped)
    r'|/\*.*?\*/',             # multi-line comment (dropped)
    re.DOTALL,
)


def preprocess_java_code(code):
    """Strip comments from Java source and collapse all whitespace.

    Args:
        code: Java source code as a single string.

    Returns:
        The source with ``//`` and ``/* ... */`` comments removed, every run
        of whitespace (including line breaks) replaced by one space, and
        leading/trailing whitespace stripped.
    """
    # Comments are removed in a single pass together with literals, so that
    # "//" or "/*" inside a string/char literal (e.g. "http://...") is not
    # mistaken for a comment, and "//" inside a block comment cannot swallow
    # the closing "*/". Each comment becomes a space so neighbouring tokens
    # are not fused together.
    code = _JAVA_COMMENT_OR_LITERAL.sub(
        lambda match: ' ' if match.group(0).startswith('/') else match.group(0),
        code,
    )
    # Remove extra whitespace and line breaks
    code = re.sub(r'\s+', ' ', code).strip()
    return code

In [37]:
# Dataset path
base_path = os.getcwd()
codes_path = './Versions/version_2'
# Join with os.path.join: plain string concatenation would glue the working
# directory and the leading "." of codes_path together ("<cwd>./Versions/...").
base_path = os.path.realpath(os.path.join(base_path, codes_path))
# Empty placeholder; `df` is reassigned once the file pairs have been loaded.
df = pd.DataFrame()

In [63]:
# List of sub-folders in the original dataset.
# os.listdir returns entries in arbitrary order, so the names are sorted to
# keep the downstream row order reproducible across machines.
folder_group = sorted(
    name for name in os.listdir(codes_path)
    if os.path.isdir(os.path.join(codes_path, name))
)
print(folder_group[:5])

['0017d438_9852706b', '0017d438_ac180326', '0048a372_0adb1ee5', '00af3420_5449d33c', '00af3420_86102d81']


In [64]:
# Build one row per unordered pair of Java files found inside each folder.
data_list = []
for folder in folder_group:
    data_path = os.path.join(codes_path, folder)
    # os.listdir returns entries in arbitrary order: sort them so that which
    # file ends up as name_file_1 / name_file_2 is reproducible, and keep only
    # Java sources so stray entries are not treated as submissions.
    files = sorted(name for name in os.listdir(data_path) if name.endswith('.java'))

    # Read and preprocess every file once, instead of once per pair.
    cleaned_codes = []
    for file_name in files:
        with open(os.path.join(data_path, file_name), 'r', encoding='utf-8') as source_file:
            cleaned_codes.append(preprocess_java_code(source_file.read()))

    for i in range(len(files)):
        for j in range(i + 1, len(files)):
            data_list.append({
                'name_file_1': files[i],
                'java_code_1': cleaned_codes[i],
                'name_file_2': files[j],
                'java_code_2': cleaned_codes[j],
            })

# Convert the list of dicts into a DataFrame
df = pd.DataFrame(data_list)

In [22]:
def cosine_similarity(code1, code2):
    """Return the TF-IDF cosine similarity between two source-code strings.

    A fresh ``TfidfVectorizer`` is fitted on just the two given documents and
    the cosine similarity between their two TF-IDF rows is returned.

    Args:
        code1: First document (source code as a string).
        code2: Second document (source code as a string).

    Returns:
        The single similarity value taken from the 1x1 result matrix.
    """
    # This function has the same name as sklearn's ``cosine_similarity`` and
    # shadows it at module level. The library function is therefore imported
    # under a private alias; calling the bare name here would call this
    # function again instead of sklearn's.
    from sklearn.metrics.pairwise import cosine_similarity as _pairwise_cosine_similarity

    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform([code1, code2])
    similarity = _pairwise_cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])
    return similarity[0][0]

In [66]:
# Score every pair row-wise and store the result as a new column.
pair_scores = df.apply(
    lambda pair: cosine_similarity(pair['java_code_1'], pair['java_code_2']),
    axis=1,
)
df['similarity'] = pair_scores


In [67]:
# Structural summary of the pair dataset.
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line to the output.
df.info()
print(df.describe())
print(df.columns)
print(df.dtypes)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 911 entries, 0 to 910
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   name_file_1  911 non-null    object 
 1   java_code_1  911 non-null    object 
 2   name_file_2  911 non-null    object 
 3   java_code_2  911 non-null    object 
 4   similarity   911 non-null    float64
dtypes: float64(1), object(4)
memory usage: 35.7+ KB
None
       similarity
count  911.000000
mean     0.557237
std      0.193142
min      0.061245
25%      0.427955
50%      0.531331
75%      0.648113
max      1.000000
Index(['name_file_1', 'java_code_1', 'name_file_2', 'java_code_2',
       'similarity'],
      dtype='object')
name_file_1     object
java_code_1     object
name_file_2     object
java_code_2     object
similarity     float64
dtype: object


In [68]:
# Print summary statistics of the similarity column, rounded to 4 decimals.
similarity_scores = df['similarity']
print("Estadísticas de la similitud entre los códigos:")
for stat_label, stat_value in (
    ("Media: ", similarity_scores.mean()),
    ("Desviación estándar: ", similarity_scores.std()),
    ("Máximo: ", similarity_scores.max()),
    ("Mínimo: ", similarity_scores.min()),
    ("Mediana: ", similarity_scores.median()),
):
    print(stat_label, round(stat_value, 4))

Estadísticas de la similitud entre los códigos:
Media:  0.5572
Desviación estándar:  0.1931
Máximo:  1.0
Mínimo:  0.0612
Mediana:  0.5313


In [69]:
# Load the labels and rename the two submission columns so they match the
# file-name columns of the pair frame.
labels = pd.read_csv('./Versions/labels.csv').rename(
    columns={'sub1': 'name_file_1', 'sub2': 'name_file_2'}
)
labels

Unnamed: 0,name_file_1,name_file_2,problem,verdict
0,0fd5b95a,6490bbe8,19,0
1,464a03b8,ff1fc018,20,0
2,3e6def38,548ffb07,14,1
3,0b91922c,71a4f6d2,6,0
4,9291ca83,d6fb3b9e,15,0
...,...,...,...,...
906,11c2ab99,28c2d81a,8,1
907,550335a3,6d7d5dd7,4,0
908,558df7d4,d8654140,17,0
909,3088ca9c,6f393cfe,15,1


In [74]:
# Work on an independent (deep) copy so the original pair frame is left untouched.
df_temp = df.copy(deep=True)
df_temp

Unnamed: 0,name_file_1,java_code_1,name_file_2,java_code_2,similarity
0,0017d438.java,import java.io.BufferedReader; import java.io....,9852706b.java,import java.io.BufferedReader; import java.io....,0.755271
1,0017d438.java,import java.io.BufferedReader; import java.io....,ac180326.java,import java.io.BufferedReader; import java.io....,0.579874
2,0048a372.java,import java.io.*; import java.util.*; public c...,0adb1ee5.java,import java.util.*; import java.io.*; public c...,0.417523
3,00af3420.java,import java.util.*; import java.io.*; public c...,5449d33c.java,import java.io.*; import java.util.*; import j...,0.521792
4,00af3420.java,import java.util.*; import java.io.*; public c...,86102d81.java,import java.io.*; import java.util.*; public c...,0.578590
...,...,...,...,...,...
906,eea69e7f.java,import java.util.*; public class Solution { pu...,f6ca6fc8.java,import java.util.*; import java.io.*; public c...,0.494697
907,f229aa7f.java,import java.util.*; import java.io.*; import j...,fcc7e8fa.java,import java.io.OutputStream; import java.io.IO...,0.338990
908,f28b8cb4.java,import java.io.BufferedReader; import java.io....,ff3283cf.java,import java.util.*; import java.lang.*; import...,0.590541
909,fadc1365.java,import java.io.PrintWriter; import java.io.Buf...,fdd85afb.java,import java.util.*; public class Solution{ pub...,0.588751


In [75]:
# Strip the ".java" extension from both file-name columns.
# The pattern is an explicit, anchored regex: with a bare '.java' the result
# depends on the pandas version's `regex` default, and when treated as a regex
# the unescaped "." matches any character anywhere in the name.
for name_column in ('name_file_1', 'name_file_2'):
    df_temp[name_column] = df_temp[name_column].str.replace(r'\.java$', '', regex=True)

In [76]:
# Left-join the labels onto the pairs, matching on both file identifiers.
df_labels = df_temp.merge(
    labels,
    how='left',
    on=['name_file_1', 'name_file_2'],
)

In [77]:
# Display the merged frame.
df_labels

Unnamed: 0,name_file_1,java_code_1,name_file_2,java_code_2,similarity,problem,verdict
0,0017d438,import java.io.BufferedReader; import java.io....,9852706b,import java.io.BufferedReader; import java.io....,0.755271,9,1
1,0017d438,import java.io.BufferedReader; import java.io....,ac180326,import java.io.BufferedReader; import java.io....,0.579874,9,0
2,0048a372,import java.io.*; import java.util.*; public c...,0adb1ee5,import java.util.*; import java.io.*; public c...,0.417523,8,0
3,00af3420,import java.util.*; import java.io.*; public c...,5449d33c,import java.io.*; import java.util.*; import j...,0.521792,15,0
4,00af3420,import java.util.*; import java.io.*; public c...,86102d81,import java.io.*; import java.util.*; public c...,0.578590,15,0
...,...,...,...,...,...,...,...
906,eea69e7f,import java.util.*; public class Solution { pu...,f6ca6fc8,import java.util.*; import java.io.*; public c...,0.494697,19,0
907,f229aa7f,import java.util.*; import java.io.*; import j...,fcc7e8fa,import java.io.OutputStream; import java.io.IO...,0.338990,4,0
908,f28b8cb4,import java.io.BufferedReader; import java.io....,ff3283cf,import java.util.*; import java.lang.*; import...,0.590541,8,0
909,fadc1365,import java.io.PrintWriter; import java.io.Buf...,fdd85afb,import java.util.*; public class Solution{ pub...,0.588751,8,0


In [78]:
# Split the labeled pairs by verdict: 1 goes to the first frame, 0 to the second.
verdict_column = df_labels['verdict']
df_plagios_detectados = df_labels[verdict_column.eq(1)]
df_plagios_no_detectados = df_labels[verdict_column.eq(0)]

In [79]:
# Preview the first rows of the verdict == 1 subset.
df_plagios_detectados.head()

Unnamed: 0,name_file_1,java_code_1,name_file_2,java_code_2,similarity,problem,verdict
0,0017d438,import java.io.BufferedReader; import java.io....,9852706b,import java.io.BufferedReader; import java.io....,0.755271,9,1
18,034030f3,import java.util.*; import java.io.*; public c...,bf992c91,import java.util.Arrays; import java.util.Scan...,0.59919,19,1
21,04df7bb8,import java.math.BigInteger; import java.sql.A...,1ea771ea,import java.io.*; import java.util.*; public c...,0.492231,8,1
22,04df7bb8,import java.math.BigInteger; import java.sql.A...,85125ecb,import java.io.*; import java.util.Arrays; imp...,0.516916,8,1
23,04df7bb8,import java.math.BigInteger; import java.sql.A...,aaccc000,import java.io.*; import java.util.*; public c...,0.536817,8,1


In [80]:
# Write each verdict subset to its own CSV file, without the index column.
for verdict_frame, csv_name in (
    (df_plagios_detectados, 'codigos_plagio.csv'),
    (df_plagios_no_detectados, 'codigos_no_plagio.csv'),
):
    verdict_frame.to_csv(csv_name, index=False)

In [3]:
# Labeling data originating from the fire 14 dataset.
# Each tuple holds the file names of two Java files that are labeled as a
# plagiarized pair. A file may appear in several pairs.
plagiarized_pairs = [
    ("003.java", "004.java"),
    ("005.java", "006.java"),
    ("008.java", "010.java"),
    ("014.java", "021.java"),
    ("015.java", "023.java"),
    ("016.java", "024.java"),
    ("017.java", "022.java"),
    ("030.java", "032.java"),
    ("033.java", "034.java"),
    ("042.java", "044.java"),
    ("043.java", "251.java"),
    ("045.java", "047.java"),
    ("048.java", "051.java"),
    ("048.java", "183.java"),
    ("048.java", "257.java"),
    ("048.java", "258.java"),
    ("049.java", "050.java"),
    ("051.java", "183.java"),
    ("051.java", "257.java"),
    ("051.java", "258.java"),
    ("052.java", "053.java"),
    ("059.java", "159.java"),
    ("059.java", "183.java"),
    ("059.java", "250.java"),
    ("059.java", "258.java"),
    ("061.java", "216.java"),
    ("062.java", "064.java"),
    ("069.java", "070.java"),
    ("078.java", "079.java"),
    ("084.java", "085.java"),
    ("086.java", "087.java"),
    ("086.java", "155.java"),
    ("086.java", "222.java"),
    ("086.java", "242.java"),
    ("086.java", "243.java"),
    ("087.java", "155.java"),
    ("087.java", "222.java"),
    ("087.java", "242.java"),
    ("087.java", "243.java"),
    ("089.java", "090.java"),
    ("094.java", "098.java"),
    ("101.java", "212.java"),
    ("103.java", "105.java"),
    ("106.java", "111.java"),
    ("107.java", "108.java"),
    ("107.java", "112.java"),
    ("107.java", "113.java"),
    ("108.java", "112.java"),
    ("108.java", "113.java"),
    ("112.java", "113.java"),
    ("117.java", "119.java"),
    ("131.java", "133.java"),
    ("135.java", "174.java"),
    ("136.java", "173.java"),
    ("137.java", "171.java"),
    ("140.java", "142.java"),
    ("143.java", "145.java"),
    ("146.java", "147.java"),
    ("148.java", "150.java"),
    ("153.java", "155.java"),
    ("153.java", "222.java"),
    ("155.java", "222.java"),
    ("155.java", "243.java"),
    ("158.java", "161.java"),
    ("159.java", "250.java"),
    ("175.java", "180.java"),
    ("181.java", "182.java"),
    ("183.java", "185.java"),
    ("183.java", "258.java"),
    ("185.java", "258.java"),
    ("188.java", "190.java"),
    ("191.java", "193.java"),
    ("195.java", "218.java"),
    ("201.java", "209.java"),
    ("202.java", "208.java"),
    ("211.java", "216.java"),
    ("221.java", "224.java"),
    ("228.java", "230.java"),
    ("232.java", "233.java"),
    ("235.java", "237.java"),
    ("238.java", "240.java"),
    ("242.java", "243.java"),
    ("244.java", "246.java"),
    ("257.java", "258.java"),
]


In [6]:
# Folder holding the fire14 Java sources
java_dir = 'fire14-source-code-training-dataset/java/'

# Rows for the CSV file: [file 1 name, file 1 text, file 2 name, file 2 text, label]
data = []

# Load both files of every labeled pair and record them with label 1
for file1, file2 in plagiarized_pairs:
    path1 = os.path.join(java_dir, file1)
    path2 = os.path.join(java_dir, file2)
    with open(path1, 'r') as f1, open(path2, 'r') as f2:
        code1 = f1.read()
        code2 = f2.read()
        row = [file1, code1, file2, code2, 1]
        data.append(row)

In [7]:
# Turn the collected rows into a DataFrame and persist it as CSV (no index column)
fire_columns = ['name_file_1', 'code_java_1', 'name_file_2', 'code_java_2', 'label']
df_fire = pd.DataFrame(data, columns=fire_columns)
df_fire.to_csv('plagiarized_java_pairs.csv', index=False)

In [8]:
# Display the frame of plagiarized pairs.
df_fire

Unnamed: 0,name_file_1,code_java_1,name_file_2,code_java_2,label
0,003.java,import java.io.*;\nimport java.util.*;\nimport...,004.java,import java.io.*;\nimport java.util.*;\nimport...,1
1,005.java,\n\n\nimport java.io.*;\nimport java.util.*;\n...,006.java,\n\n\nimport java.io.*;\nimport java.util.*;\n...,1
2,008.java,\n\nimport java.io.*;\nimport java.*;\n\npubli...,010.java,\n\nimport java.io.*;\nimport java.*;\nimport ...,1
3,014.java,\n\n\n\nimport java.util.*;\nimport java.net.*...,021.java,\n\n\n\nimport java.util.*;\nimport java.net.*...,1
4,015.java,\n\n\n\npublic class HoldSharedData\n{\n pr...,023.java,\n\n\n\npublic class HoldSharedData\n{\n pr...,1
...,...,...,...,...,...
79,235.java,\t\n\n\nimport java.io.*;\nimport java.net.*;\...,237.java,\t\n\n\nimport java.io.*;\nimport java.net.*;\...,1
80,238.java,\nimport java.util.*;\nimport java.io.*;\nimpo...,240.java,\nimport java.util.*;\nimport java.io.*;\nimpo...,1
81,242.java,import java.net.*;\nimport java.io.*;\n\n publ...,243.java,import java.net.*;\nimport java.io.*;\n\n publ...,1
82,244.java,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,246.java,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,1


In [9]:
# Using alternative metrics
import Levenshtein as lev

def levenshtein_distance(code1, code2):
    """Return the Levenshtein distance between two strings.

    Thin wrapper that delegates to ``lev.distance``.
    """
    edit_distance = lev.distance(code1, code2)
    return edit_distance

In [13]:
# Compute the edit distance of every pair row-wise and store it as a column.
pair_distances = df_fire.apply(
    lambda pair: levenshtein_distance(pair['code_java_1'], pair['code_java_2']),
    axis=1,
)
df_fire['levenshtein_distance'] = pair_distances

distance_report_columns = ['levenshtein_distance', 'name_file_1', 'name_file_2', 'label']
print(df_fire[distance_report_columns])

    levenshtein_distance name_file_1 name_file_2  label
0                   1400    003.java    004.java      1
1                     11    005.java    006.java      1
2                   1097    008.java    010.java      1
3                     15    014.java    021.java      1
4                      5    015.java    023.java      1
..                   ...         ...         ...    ...
79                  3176    235.java    237.java      1
80                   573    238.java    240.java      1
81                   724    242.java    243.java      1
82                  2032    244.java    246.java      1
83                  1883    257.java    258.java      1

[84 rows x 4 columns]


In [16]:
# Summary statistics of the levenshtein_distance column
distances = df_fire['levenshtein_distance']
min_distance = distances.min()
max_distance = distances.max()
avg_distance = distances.mean()

# Cut-offs applied to the levenshtein_distance column
identical_threshold = 0
similar_threshold = 100
distant_threshold = 500  # defined here but not referenced by the masks below

# One boolean mask per category
is_identical = distances.le(identical_threshold)
is_similar = distances.gt(identical_threshold) & distances.le(similar_threshold)
is_distant = distances.gt(similar_threshold)

# Rows of df_fire falling into each category
identical = df_fire[is_identical]
similar = df_fire[is_similar]
distant = df_fire[is_distant]

# Report the statistics
print("Minimum levenshtein_distance:", min_distance)
print("Maximum levenshtein_distance:", max_distance)
print("Average levenshtein_distance:", avg_distance)

# Report how many pairs landed in each category
print("Identical pairs:", len(identical))
print("Similar pairs:", len(similar))
print("Distant pairs:", len(distant))

Minimum levenshtein_distance: 5
Maximum levenshtein_distance: 7209
Average levenshtein_distance: 1305.892857142857
Identical pairs: 0
Similar pairs: 14
Distant pairs: 70


In [18]:
def interpret_distance(distance, code1, code2):
    """Turn a Levenshtein distance into a coarse similarity label.

    The distance is divided by the mean length of the two strings and the
    resulting ratio is bucketed.

    Args:
        distance: Edit distance between ``code1`` and ``code2``.
        code1: First string.
        code2: Second string.

    Returns:
        "Identical" when ``distance`` is 0, "Very Similar" when the ratio is
        below 0.1, "Similar" when it is below 0.3, otherwise "Different".
    """
    mean_length = (len(code1) + len(code2)) / 2

    if distance == 0:
        return "Identical"

    relative_distance = distance / mean_length
    if relative_distance < 0.1:
        return "Very Similar"
    if relative_distance < 0.3:
        return "Similar"
    return "Different"

In [21]:
# Label every pair from its edit distance and the two source texts.
pair_labels = df_fire.apply(
    lambda pair: interpret_distance(
        pair['levenshtein_distance'], pair['code_java_1'], pair['code_java_2']
    ),
    axis=1,
)
df_fire['interpretation'] = pair_labels

interpretation_report_columns = ['levenshtein_distance', 'interpretation', 'name_file_2', 'name_file_1']
print(df_fire[interpretation_report_columns])

    levenshtein_distance interpretation name_file_2 name_file_1
0                   1400      Different    004.java    003.java
1                     11   Very Similar    006.java    005.java
2                   1097      Different    010.java    008.java
3                     15   Very Similar    021.java    014.java
4                      5   Very Similar    023.java    015.java
..                   ...            ...         ...         ...
79                  3176      Different    237.java    235.java
80                   573      Different    240.java    238.java
81                   724        Similar    243.java    242.java
82                  2032      Different    246.java    244.java
83                  1883      Different    258.java    257.java

[84 rows x 4 columns]


In [26]:
# Compute the cosine similarity of every pair row-wise and store it as a column.
pair_cosines = df_fire.apply(
    lambda pair: cosine_similarity(pair['code_java_1'], pair['code_java_2']),
    axis=1,
)
df_fire['cosine_similarity'] = pair_cosines

Generar un .csv con pares de código Java con label 0 (no plagio) a partir de métricas de similitud de código fuente.

In [41]:
# Display df_fire again, now including the metric columns added above.
df_fire


Unnamed: 0,name_file_1,code_java_1,name_file_2,code_java_2,label,levenshtein_distance,interpretation,cosine_similarity
0,003.java,import java.io.*;\nimport java.util.*;\nimport...,004.java,import java.io.*;\nimport java.util.*;\nimport...,1,1400,Different,0.793327
1,005.java,\n\n\nimport java.io.*;\nimport java.util.*;\n...,006.java,\n\n\nimport java.io.*;\nimport java.util.*;\n...,1,11,Very Similar,0.999912
2,008.java,\n\nimport java.io.*;\nimport java.*;\n\npubli...,010.java,\n\nimport java.io.*;\nimport java.*;\nimport ...,1,1097,Different,0.700584
3,014.java,\n\n\n\nimport java.util.*;\nimport java.net.*...,021.java,\n\n\n\nimport java.util.*;\nimport java.net.*...,1,15,Very Similar,0.999258
4,015.java,\n\n\n\npublic class HoldSharedData\n{\n pr...,023.java,\n\n\n\npublic class HoldSharedData\n{\n pr...,1,5,Very Similar,1.000000
...,...,...,...,...,...,...,...,...
79,235.java,\t\n\n\nimport java.io.*;\nimport java.net.*;\...,237.java,\t\n\n\nimport java.io.*;\nimport java.net.*;\...,1,3176,Different,0.445479
80,238.java,\nimport java.util.*;\nimport java.io.*;\nimpo...,240.java,\nimport java.util.*;\nimport java.io.*;\nimpo...,1,573,Different,0.877545
81,242.java,import java.net.*;\nimport java.io.*;\n\n publ...,243.java,import java.net.*;\nimport java.io.*;\n\n publ...,1,724,Similar,0.870929
82,244.java,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,246.java,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,1,2032,Different,0.778517


In [39]:
import itertools
import random

# List the Java files of the fire14 dataset. The names in `plagiarized_pairs`
# are files inside `java_dir`, so the candidate files must be listed from that
# same directory for the filter below to be meaningful. os.listdir returns
# entries in arbitrary order, hence the sort for a reproducible pair order.
all_files = sorted(f for f in os.listdir(java_dir) if f.endswith('.java'))

# Generate all possible unordered pairs of files
all_pairs = list(itertools.combinations(all_files, 2))

# Filter out pairs that are already labeled as plagiarized. Pairs are compared
# as unordered sets so that (a, b) and (b, a) count as the same pair, and the
# set makes each membership check constant-time.
labeled_pairs = {frozenset(pair) for pair in plagiarized_pairs}
non_plagiarized_pairs = [pair for pair in all_pairs if frozenset(pair) not in labeled_pairs]


In [40]:
# Thresholds for non-plagiarism: the loop that builds the label-0 dataset
# keeps a pair only when both conditions below hold.
LEVENSHTEIN_THRESHOLD = 1700  # keep a pair only if its Levenshtein distance is strictly greater than this
COSINE_SIMILARITY_THRESHOLD = 0.6  # keep a pair only if its cosine similarity is strictly lower than this

# List to store data for the CSV file
data = []

In [44]:
# Build the label-0 (non-plagiarized) rows from the candidate pairs.
random.seed(42)  # For reproducibility. NOTE(review): nothing in this cell draws random numbers, so no subsampling takes place yet.

# Start from an empty list so that re-running this cell does not append duplicate rows.
data = []
# File name -> source text, so each file is read from disk only once
# instead of once per pair it takes part in.
source_cache = {}

# Load each pair, compute both metrics, and keep the pair only when it is
# far apart under both of them.
for file1, file2 in non_plagiarized_pairs:
    for file_name in (file1, file2):
        if file_name not in source_cache:
            # Read from `java_dir`, the directory the labeled fire14 pairs are loaded from.
            with open(os.path.join(java_dir, file_name), 'r') as source_file:
                source_cache[file_name] = source_file.read()
    code1 = source_cache[file1]
    code2 = source_cache[file2]

    lev_distance = levenshtein_distance(code1, code2)
    cos_similarity_value = cosine_similarity(code1, code2)
    if lev_distance > LEVENSHTEIN_THRESHOLD and cos_similarity_value < COSINE_SIMILARITY_THRESHOLD:
        data.append([file1, code1, file2, code2, lev_distance, cos_similarity_value, 0])

# Create a DataFrame and save it as a CSV file
df = pd.DataFrame(data, columns=['name_file_1', 'code_java_1', 'name_file_2', 'code_java_2', 'levenshtein_distance', 'cosine_similarity', 'label'])
df.to_csv('non_plagiarized_java_pairs.csv', index=False)