In [1]:
import pandas as pd
import numpy as np
import scipy as sp
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_recall_curve

# Modelling with the embedding approach

In [2]:
# Load the embedding table and derive the binary target:
# rows whose source is "student" keep that label, every other source becomes "ai".
embed_data = pd.read_csv("ss21_processed_solutions/m0_embedding_data.csv")
embed_data["binary_source"] = embed_data["source"].where(embed_data["source"] == "student", "ai")

# Feature columns: everything from position 6 up to, but excluding, the last
# column (the "binary_source" target that was just appended).
embed_features = embed_data.columns[6:-1].tolist()

In [3]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train_embed, X_test_embed, y_train_embed, y_test_embed = train_test_split(
    embed_data[embed_features].to_numpy(),
    embed_data["binary_source"],
    test_size=0.3,
    random_state=0,
)

In [4]:
# Cross-validated grid search over the random-forest hyper-parameters.
embed_param_grid = {
    "n_estimators": np.arange(100, 200, 10),
    "max_depth": [10, 20, 30, None],
    # Use the float 1.0 (fraction = all features). The integer 1 is read by
    # scikit-learn as "consider exactly one feature per split", which does not
    # belong in a list of fractions next to 0.9 and 0.8.
    "max_features": [1.0, 0.9, 0.8, "sqrt", "log2"],
}
embed_model = GridSearchCV(RandomForestClassifier(random_state=0), param_grid=embed_param_grid)
embed_model.fit(X_train_embed, y_train_embed)

# Show the parameter combinations that tie for the best mean CV score.
embed_model_results = pd.DataFrame(embed_model.cv_results_)
embed_model_results[embed_model_results["rank_test_score"]==1].head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_max_features,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
17,2.619382,0.038713,0.014213,0.0004006626,10,0.9,170,"{'max_depth': 10, 'max_features': 0.9, 'n_esti...",0.857143,0.964286,0.928571,1.0,0.925926,0.935185,0.047472,1
18,2.795943,0.034647,0.015214,0.0004002095,10,0.9,180,"{'max_depth': 10, 'max_features': 0.9, 'n_esti...",0.857143,0.964286,0.928571,1.0,0.925926,0.935185,0.047472,1
19,2.981312,0.069427,0.016015,2.336015e-07,10,0.9,190,"{'max_depth': 10, 'max_features': 0.9, 'n_esti...",0.857143,0.964286,0.928571,1.0,0.925926,0.935185,0.047472,1
67,2.592957,0.021904,0.014213,0.0004009972,20,0.9,170,"{'max_depth': 20, 'max_features': 0.9, 'n_esti...",0.857143,0.964286,0.928571,1.0,0.925926,0.935185,0.047472,1
68,2.753094,0.042828,0.015216,0.0003995321,20,0.9,180,"{'max_depth': 20, 'max_features': 0.9, 'n_esti...",0.857143,0.964286,0.928571,1.0,0.925926,0.935185,0.047472,1


In [5]:
# Refit a single forest on the training split using one of the parameter
# combinations that tied for rank 1 in the grid search, then score it on the
# held-out test split.
embed_model = RandomForestClassifier(random_state=0, max_depth=10, max_features=0.9, n_estimators=190)
embed_model.fit(X_train_embed, y_train_embed)
test_accuracy = accuracy_score(y_test_embed, embed_model.predict(X_test_embed))
# Scale to percent *before* rounding: rounding first and multiplying afterwards
# can reintroduce floating-point noise (e.g. 46.400000000000006).
print("Accuracy on test set:", np.round(test_accuracy * 100, 1), "%")

Accuracy on test set: 96.7 %


In [6]:
# Boolean Series (indexed like y_test_embed) marking the test rows the model got wrong.
test_predictions = embed_model.predict(X_test_embed)
misclassified = y_test_embed != test_predictions

# Per original source: misclassified test rows as a percentage of all test rows
# of that source. Sources without any misclassification drop out as NaN.
wrong_per_source = embed_data.loc[misclassified[misclassified].index, "source"].value_counts()
total_per_source = embed_data.loc[y_test_embed.index, "source"].value_counts()
(wrong_per_source / total_per_source).dropna() * 100

student    6.451613
Name: source, dtype: float64

## Classifying each AI source when it is held out of training

In [7]:
# Leave-one-source-out check: for every non-student source, train on all other
# rows and measure how well the held-out source's rows are classified.
# "student" is excluded by name instead of relying on it being the last entry
# returned by unique().
ai_sources = [name for name in embed_data["source"].unique() if name != "student"]
for source in ai_sources:
    held_out = embed_data["source"] == source
    train_rows = embed_data[~held_out]
    test_rows = embed_data[held_out]

    # NOTE: this rebinds the notebook-level `embed_model` on every iteration.
    embed_model = RandomForestClassifier(random_state=0, max_depth=10, max_features=0.9, n_estimators=190)
    embed_model.fit(train_rows[embed_features], train_rows["binary_source"])

    source_accuracy = accuracy_score(test_rows["binary_source"], embed_model.predict(test_rows[embed_features]))
    # Scale to percent before rounding so the printed value carries no
    # floating-point noise (e.g. 46.4 instead of 46.400000000000006).
    print(f"Accuracy on classifying {source}:", np.round(source_accuracy * 100, 1), "%")

Accuracy on classifying gpt3.5: 0.0 %
Accuracy on classifying bing: 46.400000000000006 %
Accuracy on classifying bard: 0.0 %
Accuracy on classifying gpt4: 51.6 %


## Testing the model on the optimised embedded solutions

In [8]:
# Load the optimised solutions and derive the same binary label as for the
# training data: "student" stays, any other source value becomes "ai".
optimised_data = pd.read_csv("ss21_processed_solutions/optimised_embedding_data.csv")
optimised_data["binary_source"] = optimised_data["source"].where(optimised_data["source"] == "student", "ai")

In [9]:
# Final model: same hyper-parameters as the evaluated forest, but fitted on
# every row of embed_data (no train/test split).
full_model = RandomForestClassifier(
    n_estimators=190,
    max_depth=10,
    max_features=0.9,
    random_state=0,
)
full_model.fit(embed_data[embed_features].to_numpy(), embed_data["binary_source"])

RandomForestClassifier(max_depth=10, max_features=0.9, n_estimators=190,
                       random_state=0)

In [10]:
# predict_proba returns one column per class in the order of full_model.classes_.
# Label the columns from that attribute so a probability can never be attached
# to the wrong class; an unexpected class label raises a KeyError instead.
proba_labels = {"ai": "Probability AI", "student": "Probability Human"}
class_probabilities = pd.DataFrame(
    full_model.predict_proba(optimised_data[embed_features].values),
    columns=[proba_labels[label] for label in full_model.classes_],
)

# Put the probabilities next to the solution metadata (indexes aligned by
# resetting optimised_data to a fresh 0..n-1 index).
predictions = pd.concat([class_probabilities, optimised_data.reset_index(drop=True)], axis=1)

# Show the first 8 columns, ordered from least to most "human"-looking.
predictions.iloc[:,:8].sort_values("Probability Human")

Unnamed: 0,Probability AI,Probability Human,source,milestone,name,style,version,code
9,0.947368,0.052632,optimised,m0,13,,,package thkoeln.st.st2praktikum.exercise;\n\ni...
17,0.9,0.1,optimised,m0,20,,,package thkoeln.st.st2praktikum.exercise;\n\ni...
14,0.884211,0.115789,optimised,m0,18,,,package thkoeln.st.st2praktikum.exercise;\n\ni...
2,0.863158,0.136842,intervene,m0,natasha2,,,package thkoeln.st.st2praktikum.exercise;\n\ni...
19,0.821053,0.178947,optimised,m0,4,,,package thkoeln.st.st2praktikum.exercise;\n\ni...
6,0.773684,0.226316,optimised,m0,10,,,package thkoeln.st.st2praktikum.exercise;\n\ni...
8,0.768421,0.231579,optimised,m0,12,,,package thkoeln.st.st2praktikum.exercise;\n\ni...
16,0.757895,0.242105,optimised,m0,2,,,package thkoeln.st.st2praktikum.exercise;\n\ni...
10,0.721053,0.278947,optimised,m0,14,,,package thkoeln.st.st2praktikum.exercise;\n\ni...
21,0.7,0.3,optimised,m0,6,,,package thkoeln.st.st2praktikum.exercise;\n\ni...
