In [8]:
import pandas as pd
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re

In [9]:
# Preprocessing function
# A single alternation scanned left to right, so whichever construct starts
# first wins: comment markers inside string/char literals are left alone, and
# a "//" inside a block comment cannot swallow that comment's closing "*/".
_JAVA_LITERAL_OR_COMMENT = re.compile(
    r"""
      (?P<literal> "(?:\\.|[^"\\\n])*"      # double-quoted string literal
                 | '(?:\\.|[^'\\\n])*' )    # single-quoted char literal
    | (?P<comment> //[^\n]*                 # single-line comment
                 | /\*.*?\*/ )              # multi-line comment
    """,
    re.VERBOSE | re.DOTALL,
)


def preprocess_java_code(code):
    """Strip comments from Java source and collapse all whitespace.

    Args:
        code (str): Raw Java source text.

    Returns:
        str: The source with ``//`` and ``/* ... */`` comments removed, every
        run of whitespace (including line breaks) replaced by one space, and
        leading/trailing whitespace stripped. String and char literals are
        kept, although whitespace runs inside them are collapsed as well.
    """
    def _keep_literal_drop_comment(match):
        # A comment becomes a single space rather than nothing, so the tokens
        # on either side of it are not glued together ("a/*x*/b" -> "a b").
        return match.group('literal') or ' '

    code = _JAVA_LITERAL_OR_COMMENT.sub(_keep_literal_drop_comment, code)

    # Remove extra whitespace and line breaks
    return re.sub(r'\s+', ' ', code).strip()

In [10]:
# Dataset path
base_path = os.getcwd()
codes_path = './Versions/version_2'
# Join the two parts with a real path separator. Plain string concatenation
# glued the leading "." of codes_path onto the last component of the working
# directory ("<cwd>./Versions/version_2"), which is not the dataset folder.
base_path = os.path.realpath(os.path.join(base_path, codes_path))
# Empty placeholder so that `df` is defined from this cell onwards.
df = pd.DataFrame()

In [13]:
# List of folders in the original dataset
# os.listdir returns entries in arbitrary order; sorting keeps the folder
# order (and every row order derived from it) identical on each run.
folder_group = sorted(
    name for name in os.listdir(codes_path)
    if os.path.isdir(os.path.join(codes_path, name))
)
print(folder_group[:5])

['0017d438_9852706b', '0017d438_ac180326', '0048a372_0adb1ee5', '00af3420_5449d33c', '00af3420_86102d81']


In [14]:
# Build one row per unordered pair of files found inside each folder.
data_list = []
for folder in folder_group:
    data_path = os.path.join(codes_path, folder)
    # os.listdir returns entries in arbitrary order and may include
    # sub-directories. Keep regular files only, sorted, so that which file of
    # a pair ends up as name_file_1 / name_file_2 is deterministic.
    files = sorted(
        name for name in os.listdir(data_path)
        if os.path.isfile(os.path.join(data_path, name))
    )

    # Read and preprocess every file exactly once, instead of once per pair
    # it takes part in.
    codes = []
    for name in files:
        with open(os.path.join(data_path, name), 'r', encoding='utf-8') as source_file:
            # Preprocess the Java code
            codes.append(preprocess_java_code(source_file.read()))

    # NOTE(review): the stored names are the directory entries exactly as
    # listed (extension included, if the files have one) -- confirm they match
    # the ids used in the labels file before merging on them.
    for i in range(len(files)):
        for j in range(i + 1, len(files)):
            data_list.append({'name_file_1': files[i], 'java_code_1': codes[i], 'name_file_2': files[j], 'java_code_2': codes[j]})

# Convert the list of dictionaries into a DataFrame
df = pd.DataFrame(data_list)

In [15]:
def calculate_similarity(code1, code2):
    """Return the TF-IDF cosine similarity between two source strings.

    A fresh ``TfidfVectorizer`` with default settings is fitted on just the
    two inputs, so the term weights depend only on this pair.

    Args:
        code1 (str): First (preprocessed) source text.
        code2 (str): Second (preprocessed) source text.

    Returns:
        float: Cosine similarity of the two TF-IDF vectors, or 0.0 when the
        vectorizer cannot extract a single term from either input.

    Raises:
        ValueError: Re-raised from the vectorizer for any failure other than
        an empty vocabulary.
    """
    vectorizer = TfidfVectorizer()
    try:
        tfidf_matrix = vectorizer.fit_transform([code1, code2])
    except ValueError as exc:
        # fit_transform refuses to build an empty vocabulary, which happens
        # when neither text yields a token (e.g. both are empty once comments
        # are stripped). Score that pair as 0.0 -- the same value a pair with
        # only one empty side gets -- instead of aborting the whole run.
        if 'empty vocabulary' not in str(exc):
            raise
        return 0.0
    similarity = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])
    # cosine_similarity returns a 1x1 array here; hand back a plain float.
    return float(similarity[0][0])

In [16]:
# Score every pair: pass the two code columns of each row to
# calculate_similarity and store the result in a new 'similarity' column.
def _pair_similarity(row):
    return calculate_similarity(row['java_code_1'], row['java_code_2'])


df['similarity'] = df.apply(_pair_similarity, axis=1)


In [23]:
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
df.info()
print(df.describe())
print(df.columns)
print(df.dtypes)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 911 entries, 0 to 910
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   name_file_1  911 non-null    object 
 1   java_code_1  911 non-null    object 
 2   name_file_2  911 non-null    object 
 3   java_code_2  911 non-null    object 
 4   similarity   911 non-null    float64
dtypes: float64(1), object(4)
memory usage: 35.7+ KB
None
       similarity
count  911.000000
mean     0.557237
std      0.193142
min      0.061245
25%      0.427955
50%      0.531331
75%      0.648113
max      1.000000
Index(['name_file_1', 'java_code_1', 'name_file_2', 'java_code_2',
       'similarity'],
      dtype='object')
name_file_1     object
java_code_1     object
name_file_2     object
java_code_2     object
similarity     float64
dtype: object


In [27]:
# Report the summary statistics of the similarity column, rounded to four
# decimals, one labelled line per statistic.
similarity_scores = df['similarity']
print("Estadísticas de la similitud entre los códigos:")
for stat_label, stat_value in (
    ("Media: ", similarity_scores.mean()),
    ("Desviación estándar: ", similarity_scores.std()),
    ("Máximo: ", similarity_scores.max()),
    ("Mínimo: ", similarity_scores.min()),
    ("Mediana: ", similarity_scores.median()),
):
    print(stat_label, round(stat_value, 4))

Estadísticas de la similitud entre los códigos:
Media:  0.5572
Desviación estándar:  0.1931
Máximo:  1.0
Mínimo:  0.0612
Mediana:  0.5313


In [35]:
# Load the labels CSV and rename its sub1/sub2 columns to name_file_1 /
# name_file_2, in a single chained expression.
labels = pd.read_csv('./Versions/labels.csv').rename(
    columns={'sub1': 'name_file_1', 'sub2': 'name_file_2'}
)
labels

Unnamed: 0,name_file_1,name_file_2,problem,verdict
0,0fd5b95a,6490bbe8,19,0
1,464a03b8,ff1fc018,20,0
2,3e6def38,548ffb07,14,1
3,0b91922c,71a4f6d2,6,0
4,9291ca83,d6fb3b9e,15,0
...,...,...,...,...
906,11c2ab99,28c2d81a,8,1
907,550335a3,6d7d5dd7,4,0
908,558df7d4,d8654140,17,0
909,3088ca9c,6f393cfe,15,1


In [36]:
# Deep copy of df kept under a separate name.
df_temp = df.copy(deep=True)

In [37]:
# Attach the label columns to every pair. This is a left join on the two file
# names, so pairs without a matching label row are kept and get NaN in the
# label columns.
df_labels = df.merge(
    labels,
    how='left',
    on=['name_file_1', 'name_file_2'],
)

In [38]:
# Split the labelled pairs by verdict. Rows whose verdict is neither 1 nor 0
# (NaN included) end up in neither frame.
verdict_column = df_labels['verdict']
df_plagios_detectados = df_labels[verdict_column.eq(1)]
df_plagios_no_detectados = df_labels[verdict_column.eq(0)]

In [30]:
# Preview the first five rows of the verdict == 1 frame.
# NOTE(review): this cell's execution count (30) is lower than that of the
# cells before it, and its saved output still shows the column names
# Name1/Code1/Similitud -- re-run the notebook top to bottom to refresh it.
df_plagios_detectados.head(n=5)

Unnamed: 0,Name1,Code1,Name2,Code2,Similitud,problem,verdict
0,0017d438,import java.io.BufferedReader;\nimport java.io...,9852706b,import java.io.BufferedReader;\nimport java.io...,0.659862,9,1
18,034030f3,import java.util.*;\nimport java.io.*;\n\npubl...,bf992c91,import java.util.Arrays;\nimport java.util.Sca...,0.59919,19,1
21,04df7bb8,import java.math.BigInteger;\n\t\t\t\t\t\t\t\t...,1ea771ea,import java.io.*;\nimport java.util.*;\n\n\npu...,0.512458,8,1
22,04df7bb8,import java.math.BigInteger;\n\t\t\t\t\t\t\t\t...,85125ecb,import java.io.*;\nimport java.util.Arrays;\ni...,0.521808,8,1
23,04df7bb8,import java.math.BigInteger;\n\t\t\t\t\t\t\t\t...,aaccc000,import java.io.*;\nimport java.util.*;\n\npubl...,0.553364,8,1


In [39]:
# Write each verdict subset to its own CSV file, without the index column.
for verdict_frame, csv_name in (
    (df_plagios_detectados, 'codigos_plagio.csv'),
    (df_plagios_no_detectados, 'codigos_no_plagio.csv'),
):
    verdict_frame.to_csv(csv_name, index=False)