In [5]:
import javalang
import numpy as np
from scipy.spatial.distance import euclidean

# Function to calculate similarity between two sets of features
def calculate_similarity(features1, features2):
    # Convert feature dictionaries to vectors
    vector1 = np.array(list(features1.values()))
    vector2 = np.array(list(features2.values()))
    
    # Calculate Euclidean distance
    distance = euclidean(vector1, vector2)
    return distance

# Function to extract features from the AST
def extract_features(ast):
    features = {}
    
    # Example feature: number of nodes
    features['num_nodes'] = len(ast['children']) if 'children' in ast else 0
    
    # Example feature: depth of the tree
    features['depth'] = calculate_depth(ast)
    
    return features

def calculate_depth(ast, current_depth=0):
    if not ast['children']:
        return current_depth
    return max(calculate_depth(child, current_depth + 1) for child in ast['children'])

# Function to generate the AST
def generate_ast(java_code):
    tokens = javalang.tokenizer.tokenize(java_code)
    parser = javalang.parser.Parser(tokens)
    tree = parser.parse()
    return tree

# Function to normalize AST by converting identifiers to a standard form
def normalize_ast(node):
    if isinstance(node, javalang.ast.Node):
        normalized_node = {
            'type': type(node).__name__,
            'attributes': {},
            'children': []
        }
        for attr, value in node.__dict__.items():
            if isinstance(value, list):
                normalized_node['children'].extend([normalize_ast(child) for child in value if isinstance(child, javalang.ast.Node)])
            elif isinstance(value, javalang.ast.Node):
                normalized_node['children'].append(normalize_ast(value))
            else:
                # Normalize identifiers and literals
                if attr in ('name', 'value'):
                    normalized_node['attributes'][attr] = 'normalized'
                else:
                    normalized_node['attributes'][attr] = value
        return normalized_node
    return str(node)

# Example usage
java_code_example1 = """
public class HelloWorld {
    public static void main(String[] args) {
        System.out.println("Hello, World!");
    }
}
"""

java_code_example2 = """
public class HelloWorld {
    public static void main(String[] args) {
        System.out.println("Hello, Java!");
         System.out.println("Name is , Jarvis!");
    }
}
"""



In [6]:
ast1 = generate_ast(java_code_example1)
ast2 = generate_ast(java_code_example2)
ast1_normalized = normalize_ast(ast1)
ast2_normalized = normalize_ast(ast2)

print(ast1_normalized)
print(ast2_normalized)


features1 = extract_features(ast1_normalized)
features2 = extract_features(ast2_normalized)

print(features1)
print(features2)


similarity = calculate_similarity(features1, features2)
print(f"Similarity (Euclidean distance): {similarity}")

{'type': 'CompilationUnit', 'attributes': {'package': None}, 'children': [{'type': 'ClassDeclaration', 'attributes': {'modifiers': {'public'}, 'documentation': None, 'name': 'normalized', 'type_parameters': None, 'extends': None, 'implements': None, '_position': Position(line=2, column=8)}, 'children': [{'type': 'MethodDeclaration', 'attributes': {'documentation': None, 'modifiers': {'static', 'public'}, 'type_parameters': None, 'return_type': None, 'name': 'normalized', 'throws': None, '_position': Position(line=3, column=19)}, 'children': [{'type': 'FormalParameter', 'attributes': {'modifiers': set(), 'name': 'normalized', 'varargs': False, '_position': Position(line=3, column=29)}, 'children': [{'type': 'ReferenceType', 'attributes': {'name': 'normalized', 'arguments': None, 'sub_type': None}, 'children': []}]}, {'type': 'StatementExpression', 'attributes': {'label': None, '_position': Position(line=4, column=9)}, 'children': [{'type': 'MethodInvocation', 'attributes': {'qualifier':

In [12]:
import javalang
import numpy as np
from scipy.spatial.distance import euclidean
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction import DictVectorizer


# Prepare the data for classification
features = [features1, features2]
labels = [1, 0]  # Example labels: 1 for plagiarized, 0 for non-plagiarized

# Convert feature dictionaries to numerical vectors
vectorizer = DictVectorizer(sparse=False)
X = vectorizer.fit_transform(features)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.5, random_state=42)

# Train the classifier
classifier = RandomForestClassifier()
classifier.fit(X_train, y_train)

# Make predictions
y_pred = classifier.predict(X_test)

# Evaluate the classifier
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
print(classification_report(y_test, y_pred))


Accuracy: 0.0
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       1.0
           1       0.00      0.00      0.00       0.0

    accuracy                           0.00       1.0
   macro avg       0.00      0.00      0.00       1.0
weighted avg       0.00      0.00      0.00       1.0



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
