In [10]:
import json
import asyncio

import javalang

from data.dataset_factory import get_dataset_generator
from data.data_generators.schema import CodeSample
from preprocessing.comment_stripper import strip_java_comments
from preprocessing.embeddings import create_ast
from preprocessing.hier_split import hierarchical_split, hierarchical_split_2, linearize_ast_for_unixcoder
from preprocessing.java_ast_async import extract_classes_async, parse_and_extract_classes, flatten_ast, clean_chunk

from visualizer.serialize_ast import serialize_ast

In [11]:
# Build the 'sourcecodeplag' generator in 'plagiarized' mode, take the next
# sample it yields, and print the source held in its `code_a` field.
gen = get_dataset_generator(dataset_name='sourcecodeplag', mode='plagiarized')

pair: CodeSample = next(gen)
code_a = pair.code_a

print(code_a)

import java.util.Scanner;

public class T7 {
	public static void main(String[] args) {
		Scanner input = new Scanner(System.in);

		System.out.print("Enter a 4 by 4 matrix row by row: ");
		double[][] m = new double[4][4];

		for (int i = 0; i < 4; i++)
			for (int j = 0; j < 4; j++)
				m[i][j] = input.nextDouble();

		System.out.print("Sum of the elements in the major diagonal is " + sumMajorDiagonal(m));
	}

	public static double sumMajorDiagonal(double[][] m) {
		double sum = 0;

		for (int i = 0; i < m.length; i++)
			sum += m[i][i];

		return sum;
	}

}



In [12]:
# Run the hierarchical splitter on the sample source, then pretty-print the
# result as JSON. `serialize_ast` is passed as the json `default` hook, so it is
# called for any object the encoder cannot serialize on its own.
hierarchy = hierarchical_split_2(code_a)

hierarchy_json = json.dumps(hierarchy, default=serialize_ast, indent=2)
print(hierarchy_json)

{
  "method_name": "main",
  "parameters": [
    "args"
  ],
  "body": [
    {
      "type": "LocalVariableDeclaration",
      "code": "input = ClassCreator(arguments=[MemberReference(member=in, postfix_operators=[], prefix_operators=[], qualifier=System, selectors=[])], body=None, constructor_type_arguments=None, postfix_operators=[], prefix_operators=[], qualifier=None, selectors=[], type=ReferenceType(arguments=None, dimensions=None, name=Scanner, sub_type=None));",
      "ast_node": {
        "modifiers": "set()",
        "annotations": [],
        "type": {
          "name": "Scanner",
          "dimensions": [],
          "arguments": null,
          "sub_type": null
        },
        "declarators": [
          {
            "name": "input",
            "dimensions": [],
            "initializer": {
              "prefix_operators": [],
              "postfix_operators": [],
              "qualifier": null,
              "selectors": [],
              "type": {
                "

# Function chunking
Reduce each function into meaningful parts (chunks), each carrying its assigned chunk metadata (the AST, in this case).

In [13]:
# Flatten the hierarchy into chunks, then print every chunk after cleaning it,
# numbered from 1 and followed by a '---' separator line.
chunks = flatten_ast(hierarchy)
for chunk_number, raw_chunk in enumerate(chunks, start=1):
    cleaned_text = clean_chunk(raw_chunk)
    print(f"Chunk {chunk_number}:\n{cleaned_text}\n---")

Chunk 1:
input = ClassCreator(arguments=[in], , , , , , , type=ReferenceType(, , name=Scanner, ));
---
Chunk 2:
MethodInvocation(arguments=["Enter a 4 by 4 matrix row by row: "], member=print, , , System.out, , )
---
Chunk 3:
m = ArrayCreator(dimensions=[4, 4], , , , , , type=BasicType(, name=double));
---
Chunk 4:
for (ForControl(condition=BinaryOperation(operandl=i, operandr=4, operator=<), init=VariableDeclaration(, declarators=[VariableDeclarator(, initializer=0, name=i)], modifiers=set(), type=BasicType(dimensions=[], name=int)), update=[i])) { ... }
---
Chunk 5:
for (ForControl(condition=BinaryOperation(operandl=j, operandr=4, operator=<), init=VariableDeclaration(, declarators=[VariableDeclarator(, initializer=0, name=j)], modifiers=set(), type=BasicType(dimensions=[], name=int)), update=[j])) { ... }
---
Chunk 6:
Assignment(expressionl=m), ArraySelector(index=j)]), type==, value=MethodInvocation(arguments=[], member=nextDouble, , , input, , ))
---
Chunk 7:
MethodInvocation(argu