In [7]:
import pandas as pd
import numpy as np
import re
import astroid
import ast
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from radon.complexity import cc_visit_ast
from radon.metrics import h_visit
import matplotlib.pyplot as plt
import seaborn as sns

In [8]:
data = pd.read_csv("detection_sample_data.csv")
data = data[["Problem_ID", "Canonical_Solution", "GPT4_Solution"]]
data.columns = ["problem_id", "human", "gpt4"]
data = pd.melt(data, id_vars="problem_id", var_name="source", value_name="code")
data["cleaned_code"] = data["code"].apply(lambda row: (re.sub(r" *#.*\n", "\n", "\n".join([value for value in row[4:].replace("\n    ", "\n").splitlines() if len(value.lstrip())==0 or value.lstrip()[0] != "#"]))).lstrip())

In [9]:
def extract_features(data=data, code_column="cleaned_code", has_comments=False):
    data = data.copy()
    
    data["num_chars"] = data[code_column].str.len()
    data["num_lines"] = data[code_column].apply(lambda row: len(row.splitlines()))
    data["avg_line_length"] = data[code_column].apply(lambda row: np.mean([len(line) for line in row.splitlines() if len(line)>0]))
    data["max_line_length"] = data[code_column].apply(lambda row: max([len(line) for line in row.splitlines() if len(line)>0]))
    data["num_digits"] = (data[code_column].apply(lambda row: len([value for value in row if value.isdigit()])))
    data["num_empty_lines"] = data[code_column].str.count("\n\n")
    data["num_method_declarations"] = data[code_column].str.count("def")
    data["num_local_vars"] = data[code_column].apply(lambda code: len({node.targets[0].as_string() for node in astroid.parse(code).nodes_of_class(astroid.Assign)}))
    data["avg_name_length"] = data[code_column].apply(lambda code: np.mean(list({len(node.targets[0].as_string()) for node in astroid.parse(code).nodes_of_class(astroid.Assign)})))
    data["max_name_length"] = data[code_column].apply(lambda code: max(list({len(node.targets[0].as_string()) for node in astroid.parse(code).nodes_of_class(astroid.Assign)}), default=0))
    data["num_function_calls"] = data[code_column].apply(lambda code: len(list(astroid.parse(code).nodes_of_class(astroid.Call))))
    data["num_loops"] = data[code_column].apply(lambda code: len(list(astroid.parse(code).nodes_of_class(astroid.For))) + len(list(astroid.parse(code).nodes_of_class(astroid.While))))
    data["num_if_statements"] = data[code_column].apply(lambda code: len(list(astroid.parse(code).nodes_of_class(astroid.If))))
    data["num_return_statements"] = data[code_column].apply(lambda code: len(list(astroid.parse(code).nodes_of_class(astroid.Return))))
    data["num_exceptions_raised"] = data[code_column].apply(lambda code: len(list(astroid.parse(code).nodes_of_class(astroid.Raise))))
    data["num_list_comprehensions"] = data[code_column].apply(lambda code: len(list(astroid.parse(code).nodes_of_class(astroid.ListComp))))
    data["num_dict_comprehensions"] = data[code_column].apply(lambda code: len(list(astroid.parse(code).nodes_of_class(astroid.DictComp))))
    data["num_set_comprehensions"] = data[code_column].apply(lambda code: len(list(astroid.parse(code).nodes_of_class(astroid.SetComp))))
    data["num_imported_modules"] = data[code_column].apply(lambda code: len(list(astroid.parse(code).nodes_of_class(astroid.Import))))
    data["num_list_operations"] = data[code_column].apply(lambda code: len(list(astroid.parse(code).nodes_of_class(astroid.List))))
    data["num_dict_operations"] = data[code_column].apply(lambda code: len(list(astroid.parse(code).nodes_of_class(astroid.Dict))))
    data["num_set_operations"] = data[code_column].apply(lambda code: len(list(astroid.parse(code).nodes_of_class(astroid.Set))))
    data["num_lambda_functions"] = data[code_column].apply(lambda code: len(list(astroid.parse(code).nodes_of_class(astroid.Lambda))))
    data["num_generator_expressions"] = data[code_column].apply(lambda code: len(list(astroid.parse(code).nodes_of_class(astroid.GeneratorExp))))
    data["num_attributes_accessed"] = data[code_column].apply(lambda code: len(list(astroid.parse(code).nodes_of_class(astroid.Attribute))))
    data["cyclomatic_complexity"] = data[code_column].apply(lambda code: sum(item.complexity for item in cc_visit_ast(ast.parse(f"def temp():\n" + "\n".join(f"    {line}" for line in code.split("\n"))))))
    data["halstead_operators"] = data[code_column].apply(lambda code: h_visit(ast.parse(f"def temp():\n" + "\n".join(f"    {line}" for line in code.split("\n")))).total.h1)
    data["halstead_operands"] = data[code_column].apply(lambda code: h_visit(ast.parse(f"def temp():\n" + "\n".join(f"    {line}" for line in code.split("\n")))).total.h2)
    data["halstead_length"] = data[code_column].apply(lambda code: h_visit(ast.parse(f"def temp():\n" + "\n".join(f"    {line}" for line in code.split("\n")))).total.calculated_length)
    data["halstead_volume"] = data[code_column].apply(lambda code: h_visit(ast.parse(f"def temp():\n" + "\n".join(f"    {line}" for line in code.split("\n")))).total.volume)
    data["halstead_difficulty"] = data[code_column].apply(lambda code: h_visit(ast.parse(f"def temp():\n" + "\n".join(f"    {line}" for line in code.split("\n")))).total.difficulty)
    data["halstead_effort"] = data[code_column].apply(lambda code: h_visit(ast.parse(f"def temp():\n" + "\n".join(f"    {line}" for line in code.split("\n")))).total.effort)
    data["halstead_time"] = data[code_column].apply(lambda code: h_visit(ast.parse(f"def temp():\n" + "\n".join(f"    {line}" for line in code.split("\n")))).total.time)
    data["halstead_bugs"] = data[code_column].apply(lambda code: h_visit(ast.parse(f"def temp():\n" + "\n".join(f"    {line}" for line in code.split("\n")))).total.bugs)
    data = data.fillna(0)
    
    if has_comments:
        data["num_comments"] = data[code_column].str.count("#")
    
    return data

In [19]:
feature_data = extract_features(data, "cleaned_code").iloc[:, 4:]

  return _methods._mean(a, axis=axis, dtype=dtype,


In [11]:
vectorizer = CountVectorizer(token_pattern=r"(?u)\b\w+\b|\S", lowercase=True)
bag_of_words = vectorizer.fit_transform(list(data["cleaned_code"]))
bow_data = pd.DataFrame(bag_of_words.toarray(), columns=vectorizer.get_feature_names_out())

In [23]:
full_features = pd.concat([feature_data, bow_data], axis=1)

In [24]:
X_train, X_test, y_train, y_test = train_test_split(full_features.values, data["source"], test_size=0.3, random_state=0)

In [25]:
forest_model = RandomForestClassifier(random_state=0, n_estimators=160, max_features="sqrt")
forest_model.fit(X_train, y_train)
print("Accuracy on test set:", np.round(accuracy_score(y_test, forest_model.predict(X_test)), 3)*100, "%")

Accuracy on test set: 47.5 %


In [26]:
log_model = LogisticRegression(max_iter=5000)
log_model.fit(X_train, y_train)
print("Accuracy on test set:", np.round(accuracy_score(y_test, log_model.predict(X_test)), 3)*100, "%")

Accuracy on test set: 46.5 %
