In [2]:
import pandas as pd
import numpy as np
import re
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt
import seaborn as sns

In [84]:
data = pd.read_csv("detection_sample_data.csv")
data = data[["Problem_ID", "Canonical_Solution", "GPT4_Solution"]]
data.columns = ["problem_id", "human", "gpt4"]
data = pd.melt(data, id_vars="problem_id", var_name="source", value_name="code")

In [85]:
data["cleaned_code"] = data["code"].apply(lambda row: (re.sub(r" *#.*\n", "\n", "\n".join([value for value in row[4:].replace("\n    ", "\n").splitlines() if len(value.lstrip())==0 or value.lstrip()[0] != "#"]))).lstrip())

In [91]:
vectorizer = CountVectorizer(token_pattern=r"(?u)\b\w+\b|\S", lowercase=True)
bag_of_words = vectorizer.fit_transform(list(data["cleaned_code"]))
feature_data = pd.DataFrame(bag_of_words.toarray(), columns=vectorizer.get_feature_names_out())

In [92]:
feature_data

Unnamed: 0,!,"""",#,%,&,',(,),*,+,...,y,year,yes,z,zero,zfill,zip,{,|,}
0,1,0,0,0,0,0,3,3,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,6,6,6,0,1,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,5,5,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
323,0,0,0,0,0,0,2,2,0,1,...,0,0,0,0,0,0,0,0,0,0
324,0,0,0,0,0,0,5,5,0,2,...,0,0,0,0,0,0,0,0,0,0
325,0,0,0,0,0,2,7,7,0,0,...,0,0,0,0,0,0,0,0,0,0
326,0,2,0,0,0,0,3,3,0,0,...,0,0,0,0,0,0,0,0,0,0


In [93]:
X_train, X_test, y_train, y_test = train_test_split(feature_data.values, data["source"], test_size=0.3, random_state=0)

In [94]:
forest_model = RandomForestClassifier(random_state=0, n_estimators=160, max_features="sqrt")
forest_model.fit(X_train, y_train)
print("Accuracy on test set:", np.round(accuracy_score(y_test, forest_model.predict(X_test)), 3)*100, "%")

Accuracy on test set: 44.4 %


In [95]:
log_model = LogisticRegression(max_iter=5000)
log_model.fit(X_train, y_train)
print("Accuracy on test set:", np.round(accuracy_score(y_test, log_model.predict(X_test)), 3)*100, "%")

Accuracy on test set: 46.5 %
