In [1]:
import joblib
import pandas as pd
import numpy as np
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
# Convert the code snippets into integer sequences so they can be processed
def tokenizar(features, tokenizer=None):
    """Turn both Java code columns into sequences of integer token ids.

    Args:
        features: DataFrame (or mapping) with the columns ``'code_java_1'``
            and ``'code_java_2'``, each holding one code string per row.
        tokenizer: Optional, already fitted tokenizer exposing
            ``texts_to_sequences``. Pass the tokenizer used at training time
            when preparing inputs for a trained model so the token ids match
            the training vocabulary. When omitted, a new ``Tokenizer`` is
            created and fitted on the snippets of both columns.

    Returns:
        Tuple ``(sequences_code1, sequences_code2)``: for each column, the
        result of ``tokenizer.texts_to_sequences`` on that column's snippets.
    """
    codes_1 = list(features['code_java_1'])
    codes_2 = list(features['code_java_2'])

    if tokenizer is None:
        tokenizer = Tokenizer()
        # Fit on every snippet as its own document. Adding the two pandas
        # columns with `+` would concatenate the strings row by row, gluing
        # the last token of snippet 1 to the first token of snippet 2.
        tokenizer.fit_on_texts(codes_1 + codes_2)

    features_sequence_code1 = tokenizer.texts_to_sequences(codes_1)
    features_sequence_code2 = tokenizer.texts_to_sequences(codes_2)
    return features_sequence_code1, features_sequence_code2

In [3]:
# Make sure every sequence ends up with the same length
def padding(features1, features2, max_length=None):
    """Pad both groups of token-id sequences to one common length.

    Args:
        features1: Sequences of token ids for the first snippet of each pair.
        features2: Sequences of token ids for the second snippet of each pair.
        max_length: Target length passed to ``pad_sequences`` as ``maxlen``.
            Pass the length used at training time when preparing inputs for a
            trained model so the feature count matches what the model was
            fitted on. When omitted, the length of the longest sequence found
            in either group is used.

    Returns:
        Tuple ``(padded_code1, padded_code2)``: the results of
        ``pad_sequences`` for each group, both using the same ``maxlen``.
    """
    if max_length is None:
        # No fixed length requested: size everything to the longest sequence
        # seen across both groups.
        max_length = max(
            max(len(seq) for seq in features1),
            max(len(seq) for seq in features2),
        )

    features_sequence_code1 = pad_sequences(features1, maxlen=max_length)
    features_sequence_code2 = pad_sequences(features2, maxlen=max_length)

    return features_sequence_code1, features_sequence_code2

In [4]:
# Preprocessing function
# One pattern that recognises every Java construct able to contain comment
# markers: text blocks, string literals and char literals (kept verbatim) as
# well as the comments themselves (dropped).
_JAVA_LITERAL_OR_COMMENT = re.compile(
    r'"""(?:\\[\s\S]|[^\\])*?"""'   # text block
    r'|"(?:\\.|[^"\\\n])*"'         # string literal (single line)
    r"|'(?:\\.|[^'\\\n])*'"         # char literal
    r'|//[^\n]*'                    # single-line comment
    r'|/\*[\s\S]*?\*/'              # multi-line comment
)


def preprocess_java_code(code):
    """Remove comments from Java source and collapse its whitespace.

    Comments and literals are scanned in a single left-to-right pass, so a
    ``//`` or ``/*`` inside a string literal (for example a URL) is preserved,
    and a ``//`` inside a ``/* ... */`` comment no longer cuts that comment
    short.

    Args:
        code: Java source text.

    Returns:
        The source with ``//`` and ``/* */`` comments removed, every run of
        whitespace (including line breaks, also inside literals) replaced by a
        single space, and leading/trailing whitespace stripped.
    """
    def _drop_comment(match):
        token = match.group(0)
        # Both comment forms start with '/'; literals start with a quote.
        return '' if token.startswith('/') else token

    code = _JAVA_LITERAL_OR_COMMENT.sub(_drop_comment, code)
    # Remove extra whitespace and line breaks
    code = re.sub(r'\s+', ' ', code).strip()
    return code

In [5]:
def predict_plagiarism(code1, code2, tokenizer, max_length, model):
    """Run the model on one pair of code snippets and return its prediction.

    Args:
        code1: First code snippet as a string.
        code2: Second code snippet as a string.
        tokenizer: Fitted tokenizer exposing ``texts_to_sequences``.
        max_length: Length each token sequence is padded to (``maxlen`` of
            ``pad_sequences``).
        model: Estimator exposing ``predict``.
            NOTE(review): assumed to have been trained on the two padded
            sequences placed side by side (2 * max_length features) — confirm
            against the training code.

    Returns:
        The first element of ``model.predict`` for this pair.
    """
    # Tokenize each snippet as a one-element batch
    sequence_code1 = tokenizer.texts_to_sequences([code1])
    sequence_code2 = tokenizer.texts_to_sequences([code2])

    # Pad both sequences to the same fixed length
    sequence_code1 = pad_sequences(sequence_code1, maxlen=max_length)
    sequence_code2 = pad_sequences(sequence_code2, maxlen=max_length)

    # Place the two padded sequences side by side in one feature row. Using
    # `+` here would add the arrays element-wise and mix the token ids of
    # both snippets instead of concatenating them.
    features_concatenated = np.concatenate(
        (sequence_code1, sequence_code2), axis=1
    )

    # Run the prediction and return the result for this single pair
    prediction = model.predict(features_concatenated)
    return prediction[0]

In [6]:
# Load the saved model
# NOTE(review): joblib.load unpickles the file, and unpickling can execute
# arbitrary code — only load model files that come from a trusted source.
model = joblib.load('random_forest_model.pkl')

In [7]:
# Paths of the two Java source files that are read and compared below
path_code1 = "test.java"
path_code2 = "test copy.java"

In [8]:
# Read the first Java file. The encoding is passed explicitly so the result
# does not depend on the platform's default locale encoding.
# NOTE(review): assumes the source file is UTF-8 — confirm.
with open(path_code1, 'r', encoding='utf-8') as file:
    java_code1 = file.read()
print(java_code1)

public class Fibonacci {
    public static void main(String[] args) {
        int n = 10, t1 = 0, t2 = 1;
        System.out.print("First " + n + " terms: ");
        for (int i = 1; i <= n; ++i) {
            System.out.print(t1 + " + ");
            int sum = t1 + t2;
            t1 = t2;
            t2 = sum;
        }
    }
}



In [9]:
# Read the second Java file. The encoding is passed explicitly so the result
# does not depend on the platform's default locale encoding.
# NOTE(review): assumes the source file is UTF-8 — confirm.
with open(path_code2, 'r', encoding='utf-8') as file:
    java_code2 = file.read()
print(java_code2)

public class Fibonacci {
    public static void main(String[] args) {
        int n = 10, t1 = 0, t2 = 1;
        System.out.print("First " + n + " terms: ");
        for (int i = 1; i <= n; ++i) {
            System.out.print(t1 + " + ");
            int sum = t1 + t2;
            t1 = t2;
            t_2 = sum;
        }
    }
}



In [10]:
# Strip comments and normalise whitespace in both snippets before tokenising
java_code1, java_code2 = (
    preprocess_java_code(raw_code) for raw_code in (java_code1, java_code2)
)

In [11]:
# Build a one-row frame holding the pair of preprocessed snippets.
# pandas is already imported as `pd` in the notebook's first (import) cell,
# so it is not re-imported here.
data = {'code_java_1': [java_code1], 'code_java_2': [java_code2]}
df = pd.DataFrame(data)
df.head()

Unnamed: 0,code_java_1,code_java_2
0,public class Fibonacci { public static void ma...,public class Fibonacci { public static void ma...


In [12]:
# Select the two code columns and turn them into integer token sequences.
# NOTE(review): tokenizar() builds and fits a brand-new Tokenizer on just
# these snippets, so the token ids presumably differ from the vocabulary the
# loaded model was trained with — confirm and reuse the training tokenizer.
features = df.loc[:, ['code_java_1', 'code_java_2']]
(features_sequence_code1,
 features_sequence_code2) = tokenizar(features)

In [13]:
# Pad both sequence groups to one shared length.
# NOTE(review): padding() derives that length from the longest of these
# sequences only, which presumably differs from the length used when the
# model was trained — confirm.
features_code1, features_code2 = padding(features_sequence_code1, features_sequence_code2)

In [14]:
# Join the two padded matrices column-wise: one row per snippet pair, with the
# features of snippet 1 followed by the features of snippet 2.
# (Assumes both inputs are 2-D, for which hstack concatenates along axis 1.)
features_concatenated = np.hstack((features_code1, features_code2))

In [15]:
# Make predictions
# NOTE(review): the recorded run of this cell fails with "X has 84 features,
# but RandomForestClassifier is expecting 10398 features". The feature matrix
# is sized from the two snippets in this notebook rather than from the length
# used in training. Presumably the training-time tokenizer and padding length
# must be reused (predict_plagiarism takes both as parameters) — confirm
# against the training code.
predictions = model.predict(features_concatenated)

# Print the predictions
print(predictions)

ValueError: X has 84 features, but RandomForestClassifier is expecting 10398 features as input.