In [1]:
!pip install langchain transformers torch datasets

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec (from torch)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m34.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (179 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
from langchain.prompts import PromptTemplate
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
import torch
import numpy as np

In [4]:
# Prompt text describing the seven plagiarism levels (0-6). It has two
# placeholders, {original_code} and {submission_code}, for the pair to compare.
PLAGIARISM_PROMPT_TEXT = """
You are a code analysis expert specializing in plagiarism detection. Your task is to classify the plagiarism level between two code snippets provided below.

### Levels Description and Transformations:
Level 0 (Non-Plagiarized):

Completely unrelated functionality.
Example: The submission performs a different task, such as calculating a square instead of a sum.

Level 1 (Comment and Whitespace Modification):

Identical functionality but with minor superficial changes.
Example: Adding/removing comments or altering indentation.

Level 2 (Identifier Modification):

Changes limited to renaming variables or constants.
Example: Renaming int a to int x.

Level 3 (Component Declaration Relocation):

Reordering code components (e.g., changing the order of method definitions).
Example: Placing variable declarations at different positions.

Level 4 (Method Structure Change):

Encapsulation of statements into methods or restructuring logic.
Example: Moving logic into a helper method.

Level 5 (Program Statement Replacement):

Significant changes to the structure, such as replacing a loop with a conditional.
Example: Replacing a for loop with a while loop.

Level 6 (Logic Change):

Substantial logic transformations, such as replacing loops with recursion.
Example: Implementing the same functionality using recursive methods.

### Code Snippets:
**Original Code:**
{original_code}

**Submission Code:**
{submission_code}

### Response:
- Predicted Label: [Provide one label from the list above]
- Explanation: [Provide a clear and concise justification for your choice, highlighting the key differences or similarities observed.]
"""

# Wrap the text in a LangChain template that expects exactly the two code fields.
task_prompt = PromptTemplate(
    template=PLAGIARISM_PROMPT_TEXT,
    input_variables=["original_code", "submission_code"],
)


In [5]:
# Fine-tuned GraphCodeBERT: tokenizer and classifier are loaded from the same checkpoint id.
GRAPHCODEBERT_CHECKPOINT = "YoussefHassan/graphcodebert-plagiarism-detector"
graphcodebert_tokenizer = AutoTokenizer.from_pretrained(GRAPHCODEBERT_CHECKPOINT)
graphcodebert_model = AutoModelForSequenceClassification.from_pretrained(GRAPHCODEBERT_CHECKPOINT)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.22k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.56M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/957 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.11k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

In [6]:
# Fine-tuned CodeT5: tokenizer and classifier are loaded from the same checkpoint id.
CODET5_CHECKPOINT = "YoussefHassan/codet5-multiclass-plagiarism-detector"
codet5_tokenizer = AutoTokenizer.from_pretrained(CODET5_CHECKPOINT)
codet5_model = AutoModelForSequenceClassification.from_pretrained(CODET5_CHECKPOINT)

tokenizer_config.json:   0%|          | 0.00/21.0k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/511k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/294k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.29M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/15.5k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.93k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/894M [00:00<?, ?B/s]

In [7]:
# Fine-tuned UniXcoder: tokenizer and classifier are loaded from the same checkpoint id.
UNIXCODER_CHECKPOINT = "YoussefHassan/unixcoder-multiclass-plagiarism-detector"
unixcoder_tokenizer = AutoTokenizer.from_pretrained(UNIXCODER_CHECKPOINT)
unixcoder_model = AutoModelForSequenceClassification.from_pretrained(UNIXCODER_CHECKPOINT)

tokenizer_config.json:   0%|          | 0.00/1.24k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/835k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/444k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.58M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/957 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.11k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/504M [00:00<?, ?B/s]

In [8]:
import numpy as np

def softmax(logits):
    """Convert raw scores into probabilities along the last axis.

    Args:
        logits: Array-like of raw scores; the last axis indexes the classes.

    Returns:
        numpy.ndarray with the same shape as ``logits`` whose values along
        the last axis are non-negative and sum to 1.
    """
    logits = np.asarray(logits)
    # Shift each row by its OWN maximum. With a single global shift, a row that
    # lies far below the overall maximum underflows to all zeros and the
    # normalisation below becomes 0/0 = NaN.
    shifted = logits - np.max(logits, axis=-1, keepdims=True)
    exp_logits = np.exp(shifted)
    return exp_logits / exp_logits.sum(axis=-1, keepdims=True)

In [9]:
def _model_softmax(tokenizer, model, text, max_length=None):
    """Tokenize ``text``, run ``model`` on it and return softmax probabilities as a NumPy array."""
    encoded = tokenizer(
        text, return_tensors="pt", padding=True, truncation=True, max_length=max_length
    )
    # Inference only: no gradients are needed, so skip autograd bookkeeping.
    with torch.no_grad():
        logits = model(**encoded).logits
    return softmax(logits.cpu().numpy())


def ensemble_prediction(original_code, submission_code, max_length=None, verbose=True):
    """Classify a code pair by summing the softmax outputs of the three fine-tuned models.

    Args:
        original_code: Source text of the reference program.
        submission_code: Source text of the program being checked.
        max_length: Optional token limit passed to every tokenizer. ``None``
            (the default) keeps each tokenizer's own limit, as before.
            NOTE(review): the recorded run warns that at least one tokenizer
            has no predefined maximum length, so truncation does nothing
            there unless a value is supplied here.
        verbose: When True (the default) print each model's softmax vector.

    Returns:
        Tuple ``(predicted_label, combined_softmax)``. ``combined_softmax`` is
        the element-wise SUM of the three probability arrays (not an average),
        and ``predicted_label`` is the argmax of that sum over the flattened array.
    """
    # Every model receives the same text: both snippets joined by a single space.
    pair_text = original_code + " " + submission_code

    graphcodebert_softmax = _model_softmax(
        graphcodebert_tokenizer, graphcodebert_model, pair_text, max_length
    )
    codet5_softmax = _model_softmax(codet5_tokenizer, codet5_model, pair_text, max_length)
    unixcoder_softmax = _model_softmax(
        unixcoder_tokenizer, unixcoder_model, pair_text, max_length
    )

    # Ensemble: sum the SoftMax vectors and pick the highest-scoring class.
    combined_softmax = graphcodebert_softmax + codet5_softmax + unixcoder_softmax
    predicted_label = combined_softmax.argmax()

    if verbose:
        print("GraphCodeBERT SoftMax:", graphcodebert_softmax)
        print("CodeT5 SoftMax:", codet5_softmax)
        print("UniXcoder SoftMax:", unixcoder_softmax)

    return predicted_label, combined_softmax

In [10]:
def explain_decision(original_code, submission_code, predicted_label):
    """Build the fixed-format text report for an ensemble prediction.

    Args:
        original_code: Reference program text, embedded verbatim in the report.
        submission_code: Checked program text, embedded verbatim in the report.
        predicted_label: Label to report; rendered with an ``L`` prefix.

    Returns:
        A multi-line string that starts with a newline and ends with a newline
        followed by four spaces.
    """
    report_lines = [
        "",
        f"The predicted label is L{predicted_label}.",
        "This classification was determined by combining predictions from three fine-tuned models:",
        "1. GraphCodeBERT",
        "2. CodeT5",
        "3. UniXcoder",
        "",
        "The final decision was based on summing the confidence vectors (SoftMax probabilities) from all three models.",
        "Key factors considered:",
        f"- Original Code: {original_code}",
        f"- Submission Code: {submission_code}",
        "",
        "The ensemble approach helps improve accuracy by aggregating predictions from specialized models.",
        "    ",
    ]
    return "\n".join(report_lines)


In [11]:
# Example pair 1 (Java): cylinder area / volume calculator.
# The original does all of the arithmetic inline in main().
original_code ="""
import java.util.Scanner;

public class T2 {
	public static void main(String[] args) {
		Scanner input = new Scanner(System.in);

		// Enter radius of the cylinder
		System.out.print("Enter the radius and length of a cylinder: ");
		double radius = input.nextDouble();
		double length = input.nextDouble();

		double area = radius * radius * 3.14159;
		double volume = area * length;

		System.out.println("The area is " + area);
		System.out.println("The volume of the cylinder is " + volume);
	}

}

"""
# The submission renames the variables and moves the area and volume
# computations into two static helper methods (hitLuas and total).
submission_code = """
import java.util.*;

public class L4 {

    public static double hitLuas(double jari2) {
        return jari2 * jari2 * 3.14159;
    }

    public static double total(double luas, double panjang) {
        return luas * panjang;
    }

    public static void main(String[] args) {
        Scanner input = new Scanner(System.in);
        double jari2 = 0;
        double panjang = 0;
        System.out.print("Enter the radius and length of a cylinder: ");
        jari2 = input.nextDouble();
        panjang = input.nextDouble();
        double luas = hitLuas(jari2);
        double total = total(luas,panjang);

        System.out.print("The area is " + luas + " , ");
        System.out.println("The volume of the cylinder is " + total);
    }
}
"""
# Run the three-model ensemble on the example pair.
predicted_label, combined_softmax = ensemble_prediction(original_code, submission_code)

# Build the text report for the winning label.
explanation = explain_decision(original_code, submission_code, predicted_label)

# Print the label, the summed probability vector and the report, in that order.
for caption, value in (
    ("Predicted Label:", predicted_label),
    ("SoftMax Vector:", combined_softmax),
    ("Explanation:", explanation),
):
    print(caption, value)


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


GraphCodeBERT SoftMax: [[0.02834815 0.03109244 0.1637621  0.14860326 0.2866491  0.17903009
  0.1625149 ]]
CodeT5 SoftMax: [[0.00229845 0.12259317 0.14569628 0.24261717 0.26338577 0.10324603
  0.12016314]]
UniXcoder SoftMax: [[0.00362958 0.00623689 0.10409459 0.11542264 0.5499105  0.17150675
  0.04919904]]
Predicted Label: 4
SoftMax Vector: [[0.03427618 0.15992251 0.41355297 0.50664306 1.0999453  0.45378286
  0.33187708]]
Explanation: 
The predicted label is L4.
This classification was determined by combining predictions from three fine-tuned models:
1. GraphCodeBERT
2. CodeT5
3. UniXcoder

The final decision was based on summing the confidence vectors (SoftMax probabilities) from all three models.
Key factors considered:
- Original Code: 
import java.util.Scanner;

public class T2 {
	public static void main(String[] args) {
		Scanner input = new Scanner(System.in);

		// Enter radius of the cylinder
		System.out.print("Enter the radius and length of a cylinder: ");
		double radius = in

In [12]:
# Example pair 2 (Java): read ten integers, then print them in reverse order.
# The original keeps both loops inside main().
original_code ="""
public class T6 {
	public static void main(String[] args) {
		java.util.Scanner input = new java.util.Scanner(System.in);
		int[] num = new int[10];

		for (int i = 0; i < 10; i++) {
			// Read a number
			System.out.print("Read a number: ");

			num[i] = input.nextInt();
		}

		// Display the array
		for (int i = 9; i >= 0; i--) {
			System.out.println(num[i]);
		}
	}

}

"""
# The submission renames identifiers, uses a static Scanner field and moves
# the printing loop into a helper method (tampil). Note that its loop runs x
# from 10 down to 1, whereas the original iterates i from 9 down to 0.
submission_code = """
import java.util.Scanner;

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
/**
 *
 * @author FD8DBE9073AFCC0504CD6901E1091CAD
 */
public class Level6
{
    static Scanner sc = new Scanner(System.in);

    public static void tampil(int[]angka){
        for (int x = 10; x > 0; x--)
            System.out.println(angka[x]);
    }

    public static void main(String[] args)
    {
        int arrAngka[] = new int[10];

        for (int x = 0; x < arrAngka.length; x++)
        {
            System.out.print("Read a number: ");
            arrAngka[x] = sc.nextInt();
        }

        tampil(arrAngka);
    }
}
"""
# Run the three-model ensemble on the example pair.
predicted_label, combined_softmax = ensemble_prediction(original_code, submission_code)

# Build the text report for the winning label.
explanation = explain_decision(original_code, submission_code, predicted_label)

# Print the label, the summed probability vector and the report, in that order.
for caption, value in (
    ("Predicted Label:", predicted_label),
    ("SoftMax Vector:", combined_softmax),
    ("Explanation:", explanation),
):
    print(caption, value)


GraphCodeBERT SoftMax: [[0.02311264 0.02235926 0.12139399 0.08852939 0.20117028 0.19417673
  0.34925768]]
CodeT5 SoftMax: [[0.05663529 0.01206629 0.13320103 0.21416384 0.16262923 0.15804002
  0.26326436]]
UniXcoder SoftMax: [[0.04788574 0.00494075 0.06588882 0.24439141 0.45244747 0.095805
  0.08864086]]
Predicted Label: 4
SoftMax Vector: [[0.12763366 0.0393663  0.32048386 0.5470847  0.816247   0.44802174
  0.7011629 ]]
Explanation: 
The predicted label is L4.
This classification was determined by combining predictions from three fine-tuned models:
1. GraphCodeBERT
2. CodeT5
3. UniXcoder

The final decision was based on summing the confidence vectors (SoftMax probabilities) from all three models.
Key factors considered:
- Original Code: 
public class T6 {
	public static void main(String[] args) {
		java.util.Scanner input = new java.util.Scanner(System.in);
		int[] num = new int[10];

		for (int i = 0; i < 10; i++) {
			// Read a number
			System.out.print("Read a number: ");

			num[i]

In [19]:
import pandas as pd
from sklearn.metrics import accuracy_score

# Labelled evaluation set; the columns read below are OriginalCode,
# SubmissionCode and PlagiarismLevel.
test_data_path = '/content/Balanced_Test_Cases_V3.csv'
test_data = pd.read_csv(test_data_path)

# --- Inspect a single labelled pair in detail --------------------------------
EXAMPLE_ROW = 1  # positional index of the row shown below
test_case = test_data.iloc[EXAMPLE_ROW]
original_code = test_case['OriginalCode']
submission_code = test_case['SubmissionCode']
actual_label = test_case['PlagiarismLevel']

predicted_label, combined_softmax = ensemble_prediction(original_code, submission_code)

print("Original Code:")
print(original_code)
print("\nSubmission Code:")
print(submission_code)
print("\nActual Label (Mapped):", actual_label)
print("Predicted Label:", predicted_label)
print("SoftMax Output:", combined_softmax)

# --- Score every row ----------------------------------------------------------
# The comprehension keeps its loop variables local, so `original_code`,
# `submission_code` and `predicted_label` above still describe the inspected
# example once this cell has finished (the previous loop overwrote them with
# the last row of the file).
predictions = [
    ensemble_prediction(row_original, row_submission)[0]
    for row_original, row_submission in zip(
        test_data['OriginalCode'], test_data['SubmissionCode']
    )
]

accuracy = accuracy_score(test_data['PlagiarismLevel'], predictions)
print(f"\nTest Accuracy: {accuracy:.2f}")


GraphCodeBERT SoftMax: [[0.9769772  0.0038514  0.00298833 0.00265348 0.00375645 0.0044082
  0.00536482]]
CodeT5 SoftMax: [[0.69543654 0.02631724 0.04715106 0.05617344 0.03069691 0.05821155
  0.08601332]]
UniXcoder SoftMax: [[9.9422944e-01 8.1406877e-04 1.1318611e-03 5.6584424e-04 9.0755679e-04
  1.0958030e-03 1.2553957e-03]]
Original Code:

public class T5 {
	public static void main(String[] args) {
		System.out.print("Enter an integer: ");
		java.util.Scanner input = new java.util.Scanner(System.in);
		int number = input.nextInt();
		reverse(number);
	}

	public static void reverse(int number) {
		while (number != 0) {
			int remainder = number % 10;
			System.out.print(remainder);
			number = number / 10;
		}

		System.out.println();
	}

}


Submission Code:

import java.util.Scanner;

/**
 *
 * @author 92E0988C1682C76D4D307AA15EC8346E
 */
public class T05 {

    public static void main(String[] args) {
        Scanner sc = new Scanner(System.in);
        System.out.print("Enter an i