# Notebook to Train the Classifiers

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import time
from sklearn.inspection import permutation_importance
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
import logging
import csv

In [None]:
!pip install keras-tuner

### If training in Colab, mount your personal Google Drive first (uncomment the cell below)

In [None]:
#from google.colab import drive
#drive.mount('/content/drive')

Mounted at /content/drive


# List of Features per Feature Category
Two lists are created per feature category: one with only the traditional features, and one with the traditional plus the new features.

In [None]:
# Feature groups
# Every list holds the column names of one feature category. Where a category
# also has a "traditional" variant, that variant is the full list without the
# additional (new) features, in unchanged order.
PPL_based = ['ppl_max', 'ppl_mean']

Semantic_feat = ['sentiment_polarity', 'sentiment_subjectivity']

list_lookup_all = [
    'discourse_marker_count',
    'stop_word_count',
    'title_repetition_count',
    'title_repetition_relative',
    'personal_pronoun_relative',
    'personal_pronoun_count',
]
list_lookup_trad = [
    name for name in list_lookup_all
    if name not in ('discourse_marker_count', 'title_repetition_count', 'title_repetition_relative')
]

error_based = ['grammar_error_count', 'multi_blank_count']

readability = ['flesch_reading_ease', 'flesch_kincaid_grade_level']

AI_feedback = ['ai_feedback']

text_vector = ['sentence_bert', 'sentence_bert_dist', 'tfidf']

doc_and_corp = [
    'words_per_paragraph_mean',
    'words_per_paragraph_stdev',
    'sentences_per_paragraph_mean',
    'sentences_per_paragraph_stdev',
    'words_per_sentence_mean',
    'words_per_sentence_stdev',
    'unique_words_per_sentence_mean',
    'unique_words_per_sentence_stdev',
    'character_count',
    'words_count',
    'unique_words_count',
    'unique_words_relative',
    'sentence_count',
    'punctuation_count',
    'paragraph_count',
    'quotation_count',
    'pos_per_sentence_mean',
    'uppercase_letters_relative',
    'special_char_count',
]
doc_and_corp_trad = [
    name for name in doc_and_corp
    if name not in ('unique_words_per_sentence_stdev', 'quotation_count')
]

# Union of every category (traditional + new features), category by category.
all_f = [
    *PPL_based,
    *Semantic_feat,
    *list_lookup_all,
    *error_based,
    *readability,
    *AI_feedback,
    *text_vector,
    *doc_and_corp,
]

# Traditional features only; kept as an explicit list because its order does
# not follow the category order used for all_f.
all_trad = [
    'sentence_bert',
    'tfidf',
    'flesch_reading_ease',
    'flesch_kincaid_grade_level',
    'words_per_paragraph_mean',
    'words_per_paragraph_stdev',
    'sentences_per_paragraph_mean',
    'sentences_per_paragraph_stdev',
    'words_per_sentence_mean',
    'words_per_sentence_stdev',
    'unique_words_per_sentence_mean',
    'character_count',
    'words_count',
    'unique_words_count',
    'unique_words_relative',
    'sentence_count',
    'punctuation_count',
    'paragraph_count',
    'personal_pronoun_relative',
    'personal_pronoun_count',
    'pos_per_sentence_mean',
    'uppercase_letters_relative',
    'special_char_count',
    'stop_word_count',
    'sentiment_polarity',
]

# Training Functions

In [None]:
# Returns the train/validation/test arrays for one of the five folds

def get_datasets(df, fold):
    """Split ``df`` into feature arrays and binary labels for a single fold.

    Args:
        df: DataFrame holding the feature columns plus ``author`` and the five
            assignment columns ``Fold_1`` .. ``Fold_5``; each assignment column
            marks a row as ``"train"``, ``"val"`` or ``"test"``.
        fold: Number of the fold to extract (selects column ``Fold_<fold>``).

    Returns:
        Tuple ``(train_features, train_labels, val_features, val_labels,
        test_features, test_labels)``. Feature arrays contain every column
        except ``author`` and the fold columns; labels are 0 where ``author``
        equals ``"human"`` and 1 otherwise.
    """
    fold_column = "Fold_{}".format(fold)
    non_feature_columns = ['author', 'Fold_1', 'Fold_2', 'Fold_3', 'Fold_4','Fold_5']

    pieces = []
    for split_name in ("train", "val", "test"):
        subset = df.loc[df[fold_column] == split_name]

        feature_matrix = np.array(subset.drop(columns=non_feature_columns))
        # labels to numbers
        numeric_labels = np.where(subset['author'].to_numpy() == "human", 0, 1)

        pieces.extend((feature_matrix, numeric_labels))

    return tuple(pieces)

In [None]:
# Logging of the results

def log_results(modeltype,data,feature,fold,acc,f1,best_params ,time,
                log_path='drive/MyDrive/Models/model_trainings_log.csv'):
    """Append the result of one fold to the per-fold training log (CSV, ';'-separated).

    Args:
        modeltype: Identifier of the classifier that was trained.
        data: Identifier of the data set / AI-text source used for the run.
        feature: Feature description written to the log as-is.
        fold: Number of the fold the metrics belong to.
        acc: Test accuracy of the fold.
        f1: Test F1 score of the fold.
        best_params: Best hyperparameters found for the fold.
        time: Timestamp string identifying the run.
        log_path: CSV file to append to. Defaults to the log file on the
            mounted Drive; the containing directory must already exist.

    Raises:
        OSError: If ``log_path`` cannot be opened for appending.
    """
    row = [modeltype, data, feature, fold, acc, f1, best_params, time]

    # Append mode keeps the rows of earlier runs; newline='' is what the csv
    # module expects so it can control line endings itself.
    with open(log_path, mode='a', newline='') as csvfile:
        csv.writer(csvfile, delimiter=';').writerow(row)

    # Report what was actually written, once the write has succeeded.
    logging.info('Logged %s result for fold %s (acc=%s, f1=%s) to %s',
                 modeltype, fold, acc, f1, log_path)

def log_run_metrics(modeltype,data,feature,avg_acc, avg_f1, lang, source, time,
                    log_path='drive/MyDrive/Models/train_run_results.csv'):
    """Append the fold-averaged metrics of one run to the run summary (CSV, ';'-separated).

    Args:
        modeltype: Identifier of the classifier that was trained.
        data: Identifier of the data set / AI-text source used for the run.
        feature: Name of the feature set used for the run.
        avg_acc: Accuracy averaged over the folds.
        avg_f1: F1 score averaged over the folds.
        lang: Language identifier of the data.
        source: Identifier of the text source.
        time: Timestamp string identifying the run.
        log_path: CSV file to append to. Defaults to the summary file on the
            mounted Drive; the containing directory must already exist.

    Raises:
        OSError: If ``log_path`` cannot be opened for appending.
    """
    row = [modeltype, data, feature, avg_acc, avg_f1, lang, source, time]

    # Append mode keeps the rows of earlier runs; newline='' is what the csv
    # module expects so it can control line endings itself.
    with open(log_path, mode='a', newline='') as csvfile:
        csv.writer(csvfile, delimiter=';').writerow(row)

    # Report what was actually written, once the write has succeeded.
    logging.info('Logged %s run summary for %s / %s (avg_acc=%s, avg_f1=%s) to %s',
                 modeltype, data, feature, avg_acc, avg_f1, log_path)

In [None]:
# functions to train XGBoost, Random Forest, and the Neural Network

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, f1_score
import pandas as pd
from sklearn.model_selection import PredefinedSplit
from xgboost import XGBClassifier
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from keras.optimizers import Adam, SGD
import csv
import logging

import keras_tuner
from tensorflow import keras
from sklearn.preprocessing import StandardScaler
from datetime import datetime

def build_model(hp):
    """Build and compile a dense binary classifier for one tuner trial.

    Args:
        hp: Hyperparameter container handed in by the tuner. Two values are
            requested from it: ``num_layers`` (integer, 2 to 5) and ``units``
            (one of 16, 32, 64).

    Returns:
        A compiled ``keras.Sequential`` model: ``num_layers`` ReLU layers
        followed by a single sigmoid output unit, trained with binary
        cross-entropy and the Adam optimizer, reporting accuracy.
    """
    network = keras.Sequential()

    hidden_layer_count = hp.Int("num_layers", 2, 5)
    for _ in range(hidden_layer_count):
        # 'units' is requested under the same name for every layer, so
        # presumably all hidden layers share one width - TODO confirm.
        layer_width = hp.Choice('units', [16, 32,64])
        network.add(keras.layers.Dense(layer_width, activation='relu'))

    network.add(keras.layers.Dense(1, activation='sigmoid'))
    network.compile(loss='binary_crossentropy', metrics=['accuracy'], optimizer="adam")
    return network


def train_nn(X_train, y_train, X_val, y_val, X_test, y_test, fold, data, feature):
    """Tune the dense network with a grid search and score the best model on the test split.

    The features are cast to float32 and standardised with a scaler fitted on
    the training split only. The tuner searches the space defined by
    ``build_model`` (25 epochs per trial, at most 25 trials), optimising
    validation accuracy. The result is appended to the per-fold log under the
    model type ``"NN"``.

    Args:
        X_train, y_train: Training features and binary labels.
        X_val, y_val: Validation features and labels used to rank the trials.
        X_test, y_test: Held-out features and labels used for the final score.
        fold: Number of the fold, only used for logging.
        data: Data-set identifier, only used for logging.
        feature: Feature description, only used for logging.

    Returns:
        Tuple ``(test_accuracy, test_f1)``.
    """
    run_stamp = datetime.now().strftime("%y%m%d_%H%M%S")

    # Each call gets its own tuner directory, named after the start time.
    tuner = keras_tuner.GridSearch(
        build_model,
        objective='val_accuracy',
        max_trials=25,
        directory="./drive/MyDrive/Models/NN/{}/".format(run_stamp),
    )

    # normalize data (statistics come from the training split only)
    scaler = StandardScaler()
    X_train = scaler.fit_transform(np.asarray(X_train).astype('float32'))
    X_val = scaler.transform(np.asarray(X_val).astype('float32'))
    X_test = scaler.transform(np.asarray(X_test).astype('float32'))

    tuner.search(X_train, y_train, epochs=25, validation_data=(X_val, y_val))
    best_model = tuner.get_best_models()[0]

    # Sigmoid outputs above 0.5 count as the positive class.
    predicted_positive = best_model.predict(X_test) > 0.5

    # Calculate the scores on the test data
    accuracy_test = accuracy_score(y_test, predicted_positive)
    f1 = f1_score(y_test, predicted_positive)

    winning_hps = tuner.get_best_hyperparameters()[0]
    best_params = {key: winning_hps.get(key) for key in ("units", "num_layers")}

    # Print the best hyperparameters
    print("Best hyperparameters:", best_params)

    log_results("NN", data, feature, fold, accuracy_test, f1, best_params, run_stamp)

    print("Test accuracy:", accuracy_test)

    return accuracy_test, f1


def train_rf():
    """Provide an unfitted random forest and the hyperparameter grid to search.

    Despite the name, nothing is trained here; the caller runs the search.

    Returns:
        Tuple ``(classifier, param_grid)``: a ``RandomForestClassifier`` with
        ``random_state=42`` and a dict mapping parameter names to the
        candidate values.
    """
    # Candidate values tried by the grid search
    search_space = dict(
        n_estimators=[50, 100, 500, 1000],
        max_depth=[None, 5, 10],
        min_samples_split=[2, 5, 10],
        max_leaf_nodes=[3, 6, 9],
    )

    return RandomForestClassifier(random_state=42), search_space


def train_xgb():
    """Provide an unfitted XGBoost classifier and the hyperparameter grid to search.

    Despite the name, nothing is trained here; the caller runs the search.

    Returns:
        Tuple ``(classifier, param_grid)``: an ``XGBClassifier`` with
        ``random_state=42`` and a dict mapping parameter names to the
        candidate values.
    """
    # Candidate values tried by the grid search
    search_space = {
        'learning_rate': [0.01, 0.1, 1],
        'max_depth': [3, 5, 7],
        'n_estimators': [50, 250, 500, 750],
        'lambda': [1.25, 2],
        'alpha': [0.25, 0.75],
    }

    booster = XGBClassifier(random_state=42)
    return booster, search_space


def train_ml_approaches(X_train, y_train, X_val, y_val, X_test, y_test, fold, data, feature, model_type):
    """
    Tunes and evaluates a random forest ("rf") or XGBoost ("xgb") classifier
    for one fold using GridSearchCV with a fixed train/validation split.

    The training and validation samples are stacked into one matrix and a
    PredefinedSplit tells the grid search which rows form the validation set,
    so every hyperparameter combination is scored on exactly ``X_val``.

    Args:
        X_train, y_train: Training features and binary labels.
        X_val, y_val: Validation features and labels used to rank the grid.
        X_test, y_test: Held-out features and labels used for the final score.
        fold: Number of the fold, only used for logging.
        data: Data-set identifier, only used for logging.
        feature: Feature description, only used for logging.
        model_type: ``"rf"`` or ``"xgb"``.

    Returns:
        Tuple ``(test_accuracy, test_f1)``.

    Raises:
        ValueError: If ``model_type`` is neither ``"rf"`` nor ``"xgb"``.
    """
    # Fail early and clearly instead of hitting an unbound `classifier` later.
    if model_type == "rf":
        classifier, param_grid = train_rf()
    elif model_type == "xgb":
        classifier, param_grid = train_xgb()
    else:
        raise ValueError(
            "Unknown model_type {!r}; expected 'rf' or 'xgb'".format(model_type))

    time_now = datetime.now().strftime("%y%m%d_%H%M%S")

    # Same dtype for every split, so prediction sees what training saw.
    X_train = np.asarray(X_train).astype('float32')
    X_val = np.asarray(X_val).astype('float32')
    X_test = np.asarray(X_test).astype('float32')

    X = np.concatenate((X_train, X_val))
    y = np.concatenate((y_train, y_val)).astype('float32')

    # -1 = always in the training part, 0 = member of the single validation
    # fold. The first len(X_train) rows are training rows, every row after
    # that (starting at index len(X_train)) belongs to the validation set.
    val_fold = [-1] * len(X_train) + [0] * len(X_val)
    ps = PredefinedSplit(val_fold)

    # Create the GridSearchCV object
    grid_search = GridSearchCV(classifier, param_grid, cv=ps, scoring='accuracy',n_jobs=50)

    # Fit the grid search to the stacked train + validation data
    grid_search.fit(X, y)

    # Get the best model. NOTE: with GridSearchCV's default refit=True this
    # estimator is refitted on all of X (train + validation), so the
    # validation accuracy below is measured on data the model has seen.
    best_model = grid_search.best_estimator_

    # Print the best hyperparameters
    print("Best hyperparameters:", grid_search.best_params_)

    # Predict on the validation data
    y_pred_val = best_model.predict(X_val)

    # Calculate the accuracy score on the validation data
    accuracy_val = accuracy_score(y_val, y_pred_val)

    print("Validation accuracy:", accuracy_val)

    # Predict on the test data
    y_pred_test = best_model.predict(X_test)

    # Calculate the scores on the test data
    accuracy_test = accuracy_score(y_test, y_pred_test)
    f1 = f1_score(y_test, y_pred_test)

    log_results(model_type, data,
                feature, fold,
                accuracy_test, f1,
                grid_search.best_params_,
                time_now)

    print("Test accuracy:", accuracy_test)

    return accuracy_test, f1


In [None]:
# Run training for all 3 classifiers

def _expand_vector_column(frame, source_col, prefix):
    """Return ``frame`` plus one scalar column per component of ``source_col``, and the new column names."""
    # The first row decides how many components are extracted.
    width = len(frame[source_col].iloc[0])
    names = ["{}{}".format(prefix, k + 1) for k in range(width)]
    # Rows whose vector is shorter than the first one are padded with None.
    rows = [
        [vec[k] if k < len(vec) else None for k in range(width)]
        for vec in frame[source_col]
    ]
    # Build all columns at once instead of inserting them one by one, which
    # fragments the frame when the vectors have hundreds of components.
    expanded = pd.DataFrame(rows, columns=names, index=frame.index)
    return pd.concat([frame, expanded], axis=1), names


def define_and_exec_run(features_list, feature_name, gpt_data, wiki_path, gpt_path, data, lang):
    """Train and evaluate RF, XGBoost and the neural network on one feature set.

    Loads the human-written and the AI-generated feature frames, keeps only
    the AI rows whose ``source`` equals ``gpt_data``, expands the vector
    features (``tfidf``, ``sentence_bert``) into one column per component and
    runs the five folds for each of the three classifiers (rf, xgb, nn - in
    that order). The fold-averaged accuracy and F1 score of every classifier
    are appended to the run summary via ``log_run_metrics``.

    Args:
        features_list: Names of the feature columns to train on; not modified.
        feature_name: Short name of the feature set, written to the summary.
        gpt_data: Value of the ``source`` column selecting the AI texts; also
            selects the human TF-IDF column ``tfidf_<gpt_data>``.
        wiki_path: Pickle file with the human-written feature frame.
        gpt_path: Pickle file with the AI-generated feature frame.
        data: Identifier of the text source, written to the summary.
        lang: Language identifier, written to the summary.

    Raises:
        ValueError: If the AI frame has no rows for ``gpt_data``.
    """
    time_now = datetime.now().strftime("%y%m%d_%H%M%S")

    features = features_list.copy()

    # NOTE(security): unpickling can execute arbitrary code - only load
    # feature files that were produced by a trusted pipeline.
    wiki_features_df = pd.read_pickle(wiki_path)
    gpt_features_df = pd.read_pickle(gpt_path)

    # .copy() so the columns added below do not go to a slice of the loaded frame.
    gpt_features_df = gpt_features_df[gpt_features_df["source"] == gpt_data].copy()
    if gpt_features_df.empty:
        raise ValueError(
            "No AI-generated rows with source {!r} in {}".format(gpt_data, gpt_path))

    # if tf idf and/or sentence bert used, these lists contain the new column names
    tf_idf_col_names = []
    sent_bert_col_names = []

    if "tfidf" in features:
        # The human frame stores one TF-IDF vector per AI source; use the one
        # matching this run and discard all of the raw vector columns.
        wiki_features_df, _ = _expand_vector_column(
            wiki_features_df, "tfidf_{}".format(gpt_data), "tfidf")
        wiki_features_df = wiki_features_df.drop(
            columns=["tfidf_rephrase_base", "tfidf_rephrase_expert",
                     "tfidf_generated_base", "tfidf_generated_expert"])

        gpt_features_df, tf_idf_col_names = _expand_vector_column(
            gpt_features_df, "tfidf", "tfidf")
        gpt_features_df = gpt_features_df.drop(columns=["tfidf"])

        # drop initial feature
        features.remove("tfidf")

    df = pd.concat((wiki_features_df, gpt_features_df), ignore_index=True)

    if "sentence_bert" in features:
        df, sent_bert_col_names = _expand_vector_column(df, "sentence_bert", "sentence_bert")
        df = df.drop(columns=["sentence_bert"])

        # drop initial feature
        features.remove("sentence_bert")

    print(features)
    print(sent_bert_col_names)
    print(tf_idf_col_names)

    train_df = df[features
                  + ["author", "Fold_1", "Fold_2", "Fold_3", "Fold_4", "Fold_5"]
                  + sent_bert_col_names
                  + tf_idf_col_names]
    print(train_df.shape)

    # (label for the console, name for the summary log, per-fold trainer)
    trainers = (
        ("RF", "rf", lambda *fold_args: train_ml_approaches(*fold_args, "rf")),
        ("XGB", "xgb", lambda *fold_args: train_ml_approaches(*fold_args, "xgb")),
        ("NN", "nn", train_nn),
    )

    for label, model_name, train_fold in trainers:
        print("{} run for {} with name {} and features {}".format(label, gpt_data, feature_name, features))
        metrics = []
        for fold in range(1, 6):
            print("Iteration {}".format(fold))
            x_train, y_train, x_val, y_val, x_test, y_test = get_datasets(train_df, fold)
            print('Training Features Shape:', x_train.shape)

            acc, f1 = train_fold(x_train, y_train, x_val, y_val, x_test, y_test, fold, gpt_data, features)
            metrics.append([acc, f1])

        avg_acc, avg_f1 = np.average(metrics, axis=0)
        log_run_metrics(model_name, gpt_data, feature_name, avg_acc, avg_f1, lang, data, time_now)


In [None]:
wiki_path = "human_generated.pkl"
gpt_path = "ai_generated.pkl"
lang = "en"
data = "wiki"

# (feature list, run name) pairs evaluated for every AI-text source, in this order
combined_feature_sets = [
    (all_trad, "all_trad"),
    (all_f, "all_with_new"),
    (['sentence_bert', 'tfidf'], "text_vector_trad"),
    (text_vector, "text_vector_all"),
]

for gpt_data in ["generated_base", "rephrase_base", "generated_expert", "rephrase_expert"]:
    for run_features, run_name in combined_feature_sets:
        define_and_exec_run(run_features, run_name, gpt_data,
                            wiki_path=wiki_path, gpt_path=gpt_path, lang=lang, data=data)

Trial 12 Complete [00h 00m 03s]
val_accuracy: 0.8500000238418579

Best val_accuracy So Far: 0.8999999761581421
Total elapsed time: 00h 00m 53s
Best hyperparameters: {'units': 32, 'num_layers': 5}
Test accuracy: 0.75


In [None]:
wiki_path = "human_generated.pkl"
gpt_path = "ai_generated.pkl"
lang = "en"
data = "wiki"
gpt_data = "rephrase_expert"

# One run per feature category (traditional and extended variants), followed
# by the combined feature sets; executed in exactly this order.
category_feature_sets = [
    (PPL_based, "PPL_based"),
    (['sentiment_polarity'], "semantic_traditional"),
    (Semantic_feat, "semantic_all"),
    (list_lookup_trad, "list_lookup_trad"),
    (list_lookup_all, "list_lookup_all"),
    (doc_and_corp_trad, "doc_and_corp_traditional"),
    (doc_and_corp, "doc_and_corp_all"),
    (error_based, "error_based"),
    (readability, "readability_all"),
    (AI_feedback, "AI_feedback"),
    (['sentence_bert', 'tfidf'], "text_vector_trad"),
    (text_vector, "text_vector_all"),
    (all_trad, "all_trad"),
    (all_f, "all_with_new"),
]

for run_features, run_name in category_feature_sets:
    define_and_exec_run(run_features, run_name, gpt_data,
                        wiki_path=wiki_path, gpt_path=gpt_path, lang=lang, data=data)
