In [1]:
import pandas as pd
import numpy as np
import re
import astroid
import ast
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from radon.complexity import cc_visit_ast
from radon.metrics import h_visit
import matplotlib.pyplot as plt
import seaborn as sns
import json
from imblearn.over_sampling import SMOTE
import warnings
warnings.filterwarnings("ignore", category=RuntimeWarning)

In [39]:
def format_code(data, keep_comments=False):
    if keep_comments:
        data["cleaned_code"] = data["code"].apply(lambda row: "\n".join([value for value in row[4:].replace("\n    ", "\n").splitlines()]))
    else:
        data["cleaned_code"] = data["code"].apply(lambda row: (re.sub(r" *#.*\n", "\n", "\n".join([value for value in row[4:].replace("\n    ", "\n").splitlines() if len(value.lstrip())==0 or value.lstrip()[0] != "#"]))).lstrip())
    return data

In [40]:
def extract_features(data, code_column="cleaned_code", has_comments=False):
    data = data.copy()
    data["num_chars"] = data[code_column].str.len()
    data["num_lines"] = data[code_column].apply(lambda row: len(row.splitlines()))
    data["avg_line_length"] = data[code_column].apply(lambda row: np.mean([len(line) for line in row.splitlines() if len(line)>0]))
    data["max_line_length"] = data[code_column].apply(lambda row: max([len(line) for line in row.splitlines() if len(line)>0]))
    data["num_digits"] = (data[code_column].apply(lambda row: len([value for value in row if value.isdigit()])))
    data["num_empty_lines"] = data[code_column].str.count("\n\n")
    data["num_whitespace"] = data["code"].str.count(" ")
    data["num_indents"] = data["code"].apply(lambda row: sum([len(line) - len(line.lstrip()) for line in row.splitlines()]))
    data["num_method_declarations"] = data[code_column].str.count("def")
    data["num_break"] = data[code_column].str.count("break")
    data["num_continue"] = data[code_column].str.count("continue")
    data["num_with"] = data[code_column].str.count("with")
    data["num_assert"] = data[code_column].str.count("assert")
    data["num_except"] = data[code_column].str.count("except")
    data["num_not"] = data[code_column].str.count("not")
    data["num_or"] = data[code_column].str.count("or")
    data["num_and"] = data[code_column].str.count("and")
    data["num_none"] = data[code_column].str.count("None")
    data["num_in"] = data[code_column].str.count(" in ")
    data["num_yield"] = data[code_column].str.count("yield")
    data["num_pass"] = data[code_column].str.count("pass")
    data["num_raise"] = data[code_column].str.count("raise")
    data["num_for"] = data[code_column].str.count("for")
    data["num_while"] = data[code_column].str.count("while")
    data["num_comparators"] = data[code_column].str.count("==") + data[code_column].str.count("<") + data[code_column].str.count(">") + data[code_column].str.count("!=")
    data["num_local_vars"] = data[code_column].apply(lambda code: len({node.targets[0].as_string() for node in astroid.parse(code).nodes_of_class(astroid.Assign)}))
    data["avg_name_length"] = data[code_column].apply(lambda code: np.mean(list({len(node.targets[0].as_string()) for node in astroid.parse(code).nodes_of_class(astroid.Assign)})))
    data["max_name_length"] = data[code_column].apply(lambda code: max(list({len(node.targets[0].as_string()) for node in astroid.parse(code).nodes_of_class(astroid.Assign)}), default=0))
    data["num_function_calls"] = data[code_column].apply(lambda code: len(list(astroid.parse(code).nodes_of_class(astroid.Call))))
    data["num_loops"] = data[code_column].apply(lambda code: len(list(astroid.parse(code).nodes_of_class(astroid.For))) + len(list(astroid.parse(code).nodes_of_class(astroid.While))))
    data["num_if_statements"] = data[code_column].apply(lambda code: len(list(astroid.parse(code).nodes_of_class(astroid.If))))
    data["num_return_statements"] = data[code_column].apply(lambda code: len(list(astroid.parse(code).nodes_of_class(astroid.Return))))
    data["num_exceptions_raised"] = data[code_column].apply(lambda code: len(list(astroid.parse(code).nodes_of_class(astroid.Raise))))
    data["num_list_comprehensions"] = data[code_column].apply(lambda code: len(list(astroid.parse(code).nodes_of_class(astroid.ListComp))))
    data["num_dict_comprehensions"] = data[code_column].apply(lambda code: len(list(astroid.parse(code).nodes_of_class(astroid.DictComp))))
    data["num_set_comprehensions"] = data[code_column].apply(lambda code: len(list(astroid.parse(code).nodes_of_class(astroid.SetComp))))
    data["num_imported_modules"] = data[code_column].apply(lambda code: len(list(astroid.parse(code).nodes_of_class(astroid.Import))))
    data["num_list_operations"] = data[code_column].apply(lambda code: len(list(astroid.parse(code).nodes_of_class(astroid.List))))
    data["num_dict_operations"] = data[code_column].apply(lambda code: len(list(astroid.parse(code).nodes_of_class(astroid.Dict))))
    data["num_set_operations"] = data[code_column].apply(lambda code: len(list(astroid.parse(code).nodes_of_class(astroid.Set))))
    data["num_lambda_functions"] = data[code_column].apply(lambda code: len(list(astroid.parse(code).nodes_of_class(astroid.Lambda))))
    data["num_generator_expressions"] = data[code_column].apply(lambda code: len(list(astroid.parse(code).nodes_of_class(astroid.GeneratorExp))))
    data["num_attributes_accessed"] = data[code_column].apply(lambda code: len(list(astroid.parse(code).nodes_of_class(astroid.Attribute))))
    data["cyclomatic_complexity"] = data[code_column].apply(lambda code: sum(item.complexity for item in cc_visit_ast(ast.parse(f"def temp():\n" + "\n".join(f"    {line}" for line in code.split("\n"))))))
    data["halstead_operators"] = data[code_column].apply(lambda code: h_visit(ast.parse(f"def temp():\n" + "\n".join(f"    {line}" for line in code.split("\n")))).total.h1)
    data["halstead_operands"] = data[code_column].apply(lambda code: h_visit(ast.parse(f"def temp():\n" + "\n".join(f"    {line}" for line in code.split("\n")))).total.h2)
    data["halstead_length"] = data[code_column].apply(lambda code: h_visit(ast.parse(f"def temp():\n" + "\n".join(f"    {line}" for line in code.split("\n")))).total.calculated_length)
    data["halstead_volume"] = data[code_column].apply(lambda code: h_visit(ast.parse(f"def temp():\n" + "\n".join(f"    {line}" for line in code.split("\n")))).total.volume)
    data["halstead_difficulty"] = data[code_column].apply(lambda code: h_visit(ast.parse(f"def temp():\n" + "\n".join(f"    {line}" for line in code.split("\n")))).total.difficulty)
    data["halstead_effort"] = data[code_column].apply(lambda code: h_visit(ast.parse(f"def temp():\n" + "\n".join(f"    {line}" for line in code.split("\n")))).total.effort)
    data["halstead_time"] = data[code_column].apply(lambda code: h_visit(ast.parse(f"def temp():\n" + "\n".join(f"    {line}" for line in code.split("\n")))).total.time)
    data["halstead_bugs"] = data[code_column].apply(lambda code: h_visit(ast.parse(f"def temp():\n" + "\n".join(f"    {line}" for line in code.split("\n")))).total.bugs)
    data = data.fillna(0)
    
    if has_comments:
        data["num_comments"] = data[code_column].str.count("#")
    
    return data

In [41]:
def create_bow(data, code_column="cleaned_code"):
    vectorizer = CountVectorizer(token_pattern=r"(?u)\b\w+\b|\S", lowercase=True)
    bag_of_words = vectorizer.fit_transform(list(data["cleaned_code"]))
    bow_data = pd.DataFrame(bag_of_words.toarray(), columns=vectorizer.get_feature_names_out())
    return bow_data

## Classify the sample data using a feature approach

### Features

In [42]:
sample_data = pd.read_csv("data/sample_data.csv")
sample_data = sample_data[["Problem_ID", "Canonical_Solution", "GPT4_Solution"]]
sample_data.columns = ["problem_id", "human", "gpt4"]
sample_data = pd.melt(sample_data, id_vars="problem_id", var_name="source", value_name="code")
sample_data_comments = sample_data.copy()

In [43]:
sample_data = format_code(sample_data)
sample_data = extract_features(sample_data)
sample_data.to_csv("data/sample_data_features.csv", index=False)

In [44]:
sample_data_comments = format_code(sample_data_comments, keep_comments=True)
sample_data_comments = extract_features(sample_data_comments, has_comments=True)

In [45]:
print("Number of features:", len(list(sample_data.columns[4:])))

Number of features: 52


In [46]:
X_train, X_test, y_train, y_test = train_test_split(sample_data[features].values, sample_data["source"], test_size=0.3, random_state=0)
forest_model = RandomForestClassifier(random_state=0)
forest_model.fit(X_train, y_train)
print("Accuracy on test set:", np.round(accuracy_score(y_test, forest_model.predict(X_test)), 3)*100, "%")

Accuracy on test set: 52.5 %


In [47]:
X_train, X_test, y_train, y_test = train_test_split(sample_data_comments[features].values, sample_data_comments["source"], test_size=0.3, random_state=0)
forest_model = RandomForestClassifier(random_state=0)
forest_model.fit(X_train, y_train)
print("Accuracy on test set:", np.round(accuracy_score(y_test, forest_model.predict(X_test)), 3)*100, "%")

Accuracy on test set: 59.599999999999994 %


### Bag of words

In [16]:
bow_data = create_bow(sample_data)

In [17]:
X_train, X_test, y_train, y_test = train_test_split(bow_data.values, sample_data["source"], test_size=0.3, random_state=0)

In [18]:
forest_model = RandomForestClassifier(random_state=0, n_estimators=160, max_features="sqrt")
forest_model.fit(X_train, y_train)
print("Accuracy on test set:", np.round(accuracy_score(y_test, forest_model.predict(X_test)), 3)*100, "%")

Accuracy on test set: 44.4 %


### Features and bag of words

In [19]:
sample_bow_data = pd.concat([sample_data, bow_data], axis=1)

In [20]:
X_train, X_test, y_train, y_test = train_test_split(sample_bow_data.iloc[:,4:].values, sample_bow_data["source"], test_size=0.3, random_state=0)

In [21]:
forest_model = RandomForestClassifier(random_state=0, n_estimators=160, max_features="sqrt")
forest_model.fit(X_train, y_train)
print("Accuracy on test set:", np.round(accuracy_score(y_test, forest_model.predict(X_test)), 3)*100, "%")

Accuracy on test set: 51.5 %


## Classify the larger dataset using a feature approach

In [15]:
big_data = []
with open("data/all_passed_results.jsonl", 'r') as file:
    for line in file:
        json_obj = json.loads(line)
        big_data.append(json_obj)
big_data = pd.DataFrame(big_data)

failed_parses = []
for i in range(len(big_data)):
    try:
        astroid.parse(big_data.loc[i, "code"])
    except:
        failed_parses.append(i)
        
big_data = big_data.drop(failed_parses, axis=0).reset_index(drop=True)
big_data["is_gpt"] = big_data["is_gpt"].replace({True:"gpt", False:"human"})

In [16]:
big_data["source"].value_counts()

APPS     69221
MBPPD     1493
HED        202
Name: source, dtype: int64

In [17]:
big_data["is_gpt"].value_counts(normalize=True)*100

human    96.353432
gpt       3.646568
Name: is_gpt, dtype: float64

In [18]:
big_data = extract_features(big_data, "code", has_comments=True)

In [21]:
big_features = list(big_data.columns[6:])

In [41]:
X_train, X_test, y_train, y_test = train_test_split(big_data[big_features].values, big_data["is_gpt"], test_size=0.3, random_state=0)
forest_model = RandomForestClassifier(random_state=0,  class_weight="balanced")
forest_model.fit(X_train, y_train)
print("Accuracy on test set:", np.round(accuracy_score(y_test, forest_model.predict(X_test)), 3)*100, "%")

Accuracy on test set: 96.39999999999999 %


In [42]:
# percentage of each source that was misclassified
# 93% of the gpt solutions were wrongly classified as human
misclassified = y_test != forest_model.predict(X_test)
big_data.loc[misclassified[misclassified].index, :]["is_gpt"].value_counts() / big_data.loc[y_test.index, :]["is_gpt"].value_counts()*100

gpt      94.310999
human     0.078110
Name: is_gpt, dtype: float64

## Classify the larger dataset with SMOTE using a feature approach

In [43]:
X_train, X_test, y_train, y_test = train_test_split(big_data[big_features].values, big_data["is_gpt"], test_size=0.3, random_state=0)

In [44]:
train_gpt = big_data[big_data["is_gpt"]!="human"].sample(frac=0.7, random_state=0)
test_gpt = big_data[big_data["is_gpt"]!="human"].drop(train_gpt.index)

test_human = big_data[big_data["is_gpt"]=="human"].sample(len(test_gpt), random_state=0)
train_human = big_data[big_data["is_gpt"]=="human"].drop(test_human.index)

train = pd.concat([train_gpt, train_human])
test = pd.concat([test_gpt, test_human])

X_train = train[big_features].values
X_test = test[big_features].values
y_train = train["is_gpt"]
y_test = test["is_gpt"]

In [45]:
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X_train, y_train)

In [46]:
y_res.value_counts()

gpt      67554
human    67554
Name: is_gpt, dtype: int64

In [47]:
forest_model = RandomForestClassifier(random_state=0)
forest_model.fit(X_res, y_res)
print("Accuracy on test set:", np.round(accuracy_score(y_test, forest_model.predict(X_test)), 3)*100, "%")

Accuracy on test set: 55.50000000000001 %


In [48]:
# 87% of the gpt solutions were misclassified
misclassified = y_test != forest_model.predict(X_test)
big_data.loc[misclassified[misclassified].index, :]["is_gpt"].value_counts() / big_data.loc[y_test.index, :]["is_gpt"].value_counts()*100

gpt      88.917526
human          NaN
Name: is_gpt, dtype: float64