In [1]:
import pandas as pd
import numpy as np
import scipy as sp

from tqdm import tqdm

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import seaborn as sns
import matplotlib.pyplot as plt

import bokeh
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import ColumnDataSource, CustomJS, Div, Select, Label, LegendItem
from bokeh.models.tools import TapTool
from bokeh.transform import factor_cmap, factor_mark
from bokeh.layouts import row, column

In [2]:
# load the extracted features, discarding the non-continuous "used_*" flag columns
flag_columns = ["used_boolean", "used_List", "used_Integer", "used_Point", "used_ArrayList", "used_StringBuilder"]
data = (
    pd.read_csv("processed_code_solutions/features_data.csv")
    .drop(columns=flag_columns)
)

# map the raw source identifiers to the names shown in the plot
source_display_names = {
    "bard": "Bard",
    "gpt3.5": "ChatGPT-3.5",
    "bing": "Bing",
    "gpt4": "ChatGPT-4",
    "student": "Human",
}
data["source"] = data["source"].replace(source_display_names)

# binary target for the classifier: 1 is human, 0 is AI
data["binary_source"] = (data["source"] == "Human").astype("int64")

In [3]:
# rename the columns to understandable names
# the assignment is positional, so the groups below must stay in the column order of `data`
metadata_names = ["name", "source", "style", "version", "code"]
size_names = [
    "Number of characters",
    "Number of lines",
    "Average line length",
    "Maximum line length",
]
count_names = [
    "Number of comments (scaled to length of code)",
    "Number of if statements (scaled to length of code)",
    "Number of for loops (scaled to length of code)",
    "Number of switch statements (scaled to length of code)",
    "Number of digits (scaled to length of code)",
    "Number of exceptions thrown (scaled to length of code)",
    "Number of empty lines (scaled to length of code)",
    "Number of print statements (scaled to length of code)",
    "Number of files",
    "Number of method declarations (scaled to length of code)",
    "Number of field variables declared (scaled to length of code)",
    "Number of local variables declared (scaled to length of code)",
    "Number of classes (scaled to length of code)",
    "Number of variables referenced (scaled to length of code)",
    "Number of method invocations (scaled to length of code)",
    "Number of imports (scaled to length of code)",
]
naming_names = [
    "Average variable name length",
    "Maximum variable name length",
    "Average comment length",
    "Maximum comment length",
]
data.columns = metadata_names + size_names + count_names + naming_names + ["binary_source"]

In [4]:
# process the data to be suitable for the visualisation

# fix the order of the sources so that rows (and the legend) follow it
data["source"] = pd.Categorical(data["source"], ["Bard", "ChatGPT-3.5", "Bing", "ChatGPT-4", "Human"])
data = data.sort_values("source").reset_index(drop=True)

# build the display name "<source>_<name>"; Bing rows additionally get "_<version>"
# regex=False: these are literal substitutions, independent of the pandas regex default
data["name"] = data["name"].str.replace("student", "", regex=False)
data["name"] = data["source"].astype(str) + "_" + data["name"]
is_bing = data["source"] == "Bing"
data.loc[is_bing, "name"] = data.loc[is_bing, "name"] + "_" + data.loc[is_bing, "version"]

# the code is later injected into a Div as HTML, so escape the markup characters first
# ("&" must be escaped before "<" and ">"), then turn newlines into line breaks
# NOTE(review): assumes the "code" column holds raw, not already HTML-escaped, source - confirm
data["code"] = (data["code"]
                .str.replace("&", "&amp;", regex=False)
                .str.replace("<", "&lt;", regex=False)
                .str.replace(">", "&gt;", regex=False)
                .str.replace("\n", "<br>", regex=False))

In [5]:
# order the selectable features by their importance in a forest trained on all of them
# (the feature columns are everything after the first five columns, except the final label column)
all_features = data.columns[5:-1].tolist()

full_split = train_test_split(data[all_features].values,
                              data["binary_source"],
                              test_size=0.3,
                              random_state=512)
X_train_full, X_test_full, y_train_full, y_test_full = full_split

full_forest = RandomForestClassifier(n_estimators=160,
                                     max_depth=10,
                                     max_features=1,
                                     random_state=0)
full_forest.fit(X_train_full, y_train_full)

# most important feature first
feature_importance = pd.Series(full_forest.feature_importances_, index=all_features)
all_features = feature_importance.sort_values(ascending=False).index.tolist()

# accuracy (in percent) of the full model on the held-out 30%
accuracy_score(y_test_full, full_forest.predict(X_test_full))*100

86.66666666666667

In [6]:
# additional data needed for the visualisation

# one (source, marker, colour) entry per source, in legend order
SOURCE_STYLES = [
    ("Bard", "plus", "#e5c949"),
    ("ChatGPT-3.5", "hex", "#e99675"),
    ("Bing", "triangle", "#95a3c3"),
    ("ChatGPT-4", "square", "#a2c865"),
    ("Human", "circle", "#db96c0"),
]
category, markers, colours = (list(column) for column in zip(*SOURCE_STYLES))

# decision-boundary colours keyed by the predicted class (0 is AI, 1 is human)
db_colours = {
    0: "#bae083",
    1: "#a2457d",
}

# features displayed when the plot is first shown
starting_x = "Number of lines"
starting_y = "Number of field variables declared (scaled to length of code)"

In [None]:
def decision_mesh_for_pair(x_feature, y_feature, labels):
    """Fit a random forest on two features and sample its predictions on a grid.

    The forest is trained on 60% of the rows of `data` restricted to the two
    given columns, then evaluated on an 80 x 85 grid spanning the observed
    range of both features.

    Returns a tuple (mesh_frame, accuracy_text): mesh_frame holds the grid
    coordinates, the predicted class and its colour in columns prefixed with
    "<x_feature>_<y_feature>_"; accuracy_text is the accuracy on the held-out
    40%, in percent, formatted with one decimal.
    """
    pair_key = f"{x_feature}_{y_feature}"
    points = data[[x_feature, y_feature]].values

    X_train, X_test, y_train, y_test = train_test_split(points,
                                                        labels,
                                                        test_size=0.4,
                                                        random_state=0)
    forest = RandomForestClassifier(n_estimators=180,
                                    max_depth=5,
                                    max_features="sqrt",
                                    random_state=0)
    forest.fit(X_train, y_train)

    # grid covering the observed range of both features
    x_ticks = np.linspace(points[:, 0].min(), points[:, 0].max(), 80)
    y_ticks = np.linspace(points[:, 1].min(), points[:, 1].max(), 85)
    grid_x, grid_y = np.meshgrid(x_ticks, y_ticks)
    flat_x, flat_y = grid_x.ravel(), grid_y.ravel()

    # predicted class at every grid position
    grid_pred = forest.predict(np.column_stack([flat_x, flat_y]))

    # positions, predictions and the matching colours in one frame
    mesh_frame = pd.DataFrame(np.column_stack([flat_x, flat_y, grid_pred]),
                              columns=[f"{pair_key}_xx", f"{pair_key}_yy", f"{pair_key}_pred"])
    mesh_frame[f"{pair_key}_colours"] = mesh_frame[f"{pair_key}_pred"].map(db_colours)

    accuracy_text = "{:.1f}".format(accuracy_score(y_test, forest.predict(X_test))*100)
    return mesh_frame, accuracy_text


# decision boundary meshes for every ordered pair of variables
all_meshes = []
# accuracy of the model on each pair of variables, keyed by "<var1>_<var2>"
all_accuracy = dict()

# the labels are the same for every pair
binary_labels = data["binary_source"].values

# every ordered pair is fitted separately - the order is relevant, as the mesh will be mirrored
for var1 in tqdm(all_features):
    for var2 in all_features:
        pair_mesh, pair_accuracy = decision_mesh_for_pair(var1, var2, binary_labels)
        all_meshes.append(pair_mesh)
        all_accuracy[f"{var1}_{var2}"] = pair_accuracy

# one wide frame: four columns per pair of variables
all_meshes = pd.concat(all_meshes, axis=1)

In [None]:
# persist the computed meshes and accuracies so the expensive cell above need not be re-run
meshes_csv_path = "processed_code_solutions/all_meshes.csv"
accuracy_npy_path = "processed_code_solutions/all_accuracy.npy"

all_meshes.to_csv(meshes_csv_path, index=False)
# all_accuracy is a dict, which np.save stores as a pickled object array
np.save(accuracy_npy_path, all_accuracy)

In [7]:
# reload the cached meshes and accuracies written by the previous cell
all_meshes = pd.read_csv("processed_code_solutions/all_meshes.csv")
# the accuracies were saved as a dict inside a 0-d object array, hence allow_pickle and .item()
# NOTE: unpickling can execute arbitrary code - only load a file this notebook produced itself
all_accuracy = np.load("processed_code_solutions/all_accuracy.npy", allow_pickle=True).item()

In [8]:
# Interactive Bokeh view: a scatter of all solutions drawn over the decision-boundary
# mesh of the currently selected pair of features, two drop-downs to change the pair,
# and a panel that shows the code of the point that was clicked.

# store the data in column data source format for Bokeh
cds_data = ColumnDataSource(data)
cds_mesh = ColumnDataSource(all_meshes)

# create the initial figure
# NOTE(review): plot_width/plot_height here and render_mode on the labels below look like
# Bokeh 2.x-era arguments - confirm against the installed Bokeh version before upgrading
plot = figure(plot_width=600, 
              plot_height=600, 
              tools=["tap", "save", "reset"],
              x_axis_label=starting_x, 
              y_axis_label=starting_y)
# static help texts placed around the plot in the final layout
click_desc = Div(text="""<div style='width: 600px; margin: 0 auto;'><p style='font-size: 15px; font-weight: bold; text-align: center;'>
                         Click on a point to see its associated code solution!</p></div>""")
db_desc = Div(text="<div style='width: 610px; font-size: 14px;'>The features are ordered by how well they classify the data into AI versus human written code.\
                                                  The accuracy of a classifier trained on the currently displayed features is shown under the legend.</div>")

# plot the decision boundaries
# the column names follow the "<x>_<y>_xx" / "_yy" / "_colours" scheme of all_meshes
mesh = plot.square(source=cds_mesh, 
                   x=f"{starting_x}_{starting_y}_xx", 
                   y=f"{starting_x}_{starting_y}_yy", 
                   fill_color=f"{starting_x}_{starting_y}_colours", 
                   size=6, 
                   line_alpha=0, 
                   fill_alpha=0.2, 
                   name="Mesh")
# presumably keeps the mesh from being restyled as "non-selected" when a point is tapped - confirm
mesh.nonselection_glyph = None

# plot the data points
# marker and colour are both derived from the "source" column, in the order of `category`
scatterpoints = plot.scatter(source=cds_data, 
                             x=starting_x, 
                             y=starting_y, 
                             size=12, 
                             line_width=0.5, 
                             line_color="#686768", 
                             legend_field="source", 
                             name="Points",
                             marker=factor_mark(field_name="source", 
                                                markers=markers, 
                                                factors=category),
                             color=factor_cmap(field_name="source", 
                                               palette=colours, 
                                               factors=category))

# create a label for the accuracy of the classifier being shown
# x/y are given in screen units, i.e. pixel positions rather than data coordinates
accuracy_label = Label(x=390, 
                       y=340, 
                       x_units="screen", 
                       y_units="screen", 
                       text_font_size="14px", 
                       text_font_style="bold", 
                       text_baseline="bottom",
                       text=all_accuracy[f"{starting_x}_{starting_y}"] + "% accuracy", 
                       render_mode="canvas")
plot.axis.axis_label_text_font_style = 'normal'

# key for the two mesh colours: db_colours[0] marks AI predictions, db_colours[1] human ones
db_label_1 = Label(x=385, 
                   y=385, 
                   x_units="screen", 
                   y_units="screen",
                   text="AI Predictions", 
                   render_mode="canvas",
                   border_line_alpha=0.2,
                   background_fill_alpha=0.4,
                   text_font_size="14px",
                   background_fill_color=db_colours[0])

db_label_2 = Label(x=385, 
                   y=365, 
                   x_units="screen", 
                   y_units="screen",
                   text="Human Predictions", 
                   render_mode="canvas",
                   border_line_alpha=0.2,
                   background_fill_alpha=0.2,
                   text_font_size="14px",
                   background_fill_color=db_colours[1])

# click to show the code text
# the callback runs in the browser whenever the selection of cds_data changes: with no
# selection it empties the Div, otherwise it shows the name and code of the first selected row
code_text = Div()
callback_click = CustomJS(args=dict(source=cds_data, 
                                    div=code_text), 
    code="""
    var index = source.selected.indices
    if (index.length == 0) {
        div.text = "";
        div.style = {};
    }
    else {
        div.text = source.data['name'][index[0]] + '<pre>' + source.data['code'][index[0]] + '</pre>';
        div.style = {'border': '1px solid black', 'padding': '10px'};
    }
    """)
cds_data.selected.js_on_change('indices', callback_click)

# select the x and y variables
# the options are listed in the order of all_features
select_x = Select(title="Choose the feature for the x-axis:", 
                  value=starting_x, 
                  options=all_features)
select_y = Select(title="Choose the feature for the y-axis:", 
                  value=starting_y, 
                  options=all_features)
# one shared callback for both drop-downs: it re-points the scatter and the mesh at the
# columns of the chosen pair, updates both axis labels and looks up the accuracy text
# under the key "<x>_<y>" in accuracy_dict
callback_select = CustomJS(args=dict(scatterpoints_renderer=scatterpoints, 
                                     mesh_renderer=mesh,
                                     x_select=select_x, 
                                     y_select=select_y, 
                                     accuracy_label=accuracy_label, 
                                     accuracy_dict=all_accuracy,
                                     xaxis=plot.xaxis[0], 
                                     yaxis=plot.yaxis[0]),             
    code="""
    scatterpoints_renderer.glyph.x = {field: x_select.value};
    scatterpoints_renderer.glyph.y = {field: y_select.value};
    
    xaxis.axis_label = x_select.value;
    yaxis.axis_label = y_select.value;
    
    var current_vars = x_select.value + "_" + y_select.value;
    var current_xx = current_vars + "_xx";
    var current_yy = current_vars + "_yy";
    var current_colour = current_vars + "_colours";
    mesh_renderer.glyph.x = {field: current_xx};
    mesh_renderer.glyph.y = {field: current_yy};
    mesh_renderer.glyph.fill_color = {field: current_colour};
    
    accuracy_label.text = accuracy_dict[current_vars] + "% accuracy";
    
""")

select_x.js_on_change('value', callback_select)
select_y.js_on_change('value', callback_select)

# layout of the whole plot
# left column: selectors, description, plot and click hint; right: the code panel
layout = row(column(select_x, select_y, db_desc, plot, click_desc), code_text)
# attach the three text labels to the figure
for label in [accuracy_label, db_label_1, db_label_2]:
    plot.add_layout(label)
# NOTE(review): output_notebook is imported but not called in the visible cells, so show()
# may write/open a standalone HTML page instead of rendering inline - confirm this is intended
show(layout)