In [1]:
import keras.backend as K

print(K.backend())

torch


In [2]:
import nltk
import keras
import pandas as pd
import numpy as np
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding
from keras.layers import TextVectorization
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

In [3]:
"""
Download dataset SubtaskA.jsonl from 
https://github.com/mbzuai-nlp/M4GT-Bench.

Google drive from repo: https://drive.google.com/drive/folders/1hBgW6sgZfz1BK0lVdUu0bZ4HPKSpOMSY
Direct link to SubtaskA.jsonl: https://drive.google.com/file/d/1zwSfSKe4-0m2td_cP0Sl2LhKtksvtHlf/view
"""
import gdown, os
# DATA_PATH = "C:/Users/Admin/Downloads/SubtaskA.jsonl"
DATA_PATH = "./datasets/SubtaskA.jsonl"
if not os.path.exists(DATA_PATH):
    if not os.path.exists(os.path.dirname(DATA_PATH)):
        os.makedirs(os.path.dirname(DATA_PATH))
    gdown.download("https://drive.google.com/uc?id=1zwSfSKe4-0m2td_cP0Sl2LhKtksvtHlf", DATA_PATH, quiet=False)

# initialize dataframe
df = pd.read_json(DATA_PATH, lines=True)

In [4]:
print(df.source.value_counts())
print()
print(df.model.value_counts())

wikihow      36556
reddit       33999
arxiv        33998
wikipedia    31365
peerread     16891
Name: source, dtype: int64

human      65177
chatGPT    16892
gpt4       14344
davinci    14340
bloomz     14332
dolly      14046
cohere     13678
Name: model, dtype: int64


In [5]:
print(df[df.label == 0].model.value_counts())
print()
print(df[df.label == 1].model.value_counts())

human    65177
Name: model, dtype: int64

chatGPT    16892
gpt4       14344
davinci    14340
bloomz     14332
dolly      14046
cohere     13678
Name: model, dtype: int64


In [6]:
df[['text', 'label']]

Unnamed: 0,text,label
0,We consider a system of many polymers in solut...,1
1,We present a catalog of 66 YSOs in the Serpens...,1
2,Spectroscopic Observations of the Intermediate...,1
3,We present a new class of stochastic Lie group...,1
4,ALMA as the ideal probe of the solar chromosph...,1
...,...,...
152804,The main results presented in this dissertati...,0
152805,Fine-grained sketch-based image retrieval (FG...,0
152806,We present the derivation of the NNLO two-par...,0
152807,The principle of optimism in the face of unce...,0


In [7]:
"""
Pre-process dataframe.
"""
MAX_VOCAB = 10_000
MAX_LENGTH = 200

# init text vectorizer
vectorize_layer = TextVectorization(
    max_tokens=MAX_VOCAB,
    standardize='lower_and_strip_punctuation',
    split='whitespace',
    ngrams=None,
    output_mode='int',
    output_sequence_length=MAX_LENGTH,
    pad_to_max_tokens=False,
    vocabulary=None,
    idf_weights=None,
    sparse=False,
    ragged=False,
    encoding='utf-8',
    name=None,
)

# create vocabulary
vectorize_layer.adapt(df['text'])
vocab = vectorize_layer.get_vocabulary()

In [8]:
# vectorize text data (in subsets for memory constraints)
X = []
y = df['label']

subset_size = df.shape[0] // 100
for i in range(0, df.shape[0], subset_size):
    subset = df['text'][i : i + subset_size]
    X.append(vectorize_layer(subset).cpu())

X = np.vstack(X)
print(X.shape, y.shape)

(152809, 200) (152809,)


In [9]:
"""
LSTM model generator.
"""
EMBEDDING_DIM = 128
N_HIDDEN = 100
N_CLASSES = 2

import torch
class LSTMModel(torch.nn.Module):
    def __init__(self):
        super(LSTMModel, self).__init__()
        self.embeddings = torch.nn.Embedding(
            num_embeddings=MAX_VOCAB,
            embedding_dim=EMBEDDING_DIM,
        )
        self.lstm = torch.nn.LSTM(
            input_size=EMBEDDING_DIM,
            hidden_size=N_HIDDEN,
            num_layers=1,
            batch_first=True,
        )
        self.linear = torch.nn.Linear(N_HIDDEN, N_CLASSES)
        self.softmax = torch.nn.Softmax(dim=1)

    def forward(self, x):
        x = self.embeddings(x)
        x, _ = self.lstm(x)
        x = self.linear(x[:, -1, :])
        x = self.softmax(x)
        return x
    
    def save(self, model_path):
        torch.save(self.state_dict(), model_path)

    def load(self, model_path):
        self.load_state_dict(torch.load(model_path, weights_only=True))


In [10]:
from tqdm import tqdm

def evaluate(model, loader):
    model.eval()
    loss = acc = 0
    with torch.no_grad():
        for X_batch, y_batch in loader:
            y_pred = model(X_batch)
            loss += model.loss_fn(y_pred, y_batch).item()
            acc += (y_pred.argmax(1) == y_batch).sum().item()
    return loss / len(loader), acc / len(loader.dataset)

def train(model, train_loader, val_loader, epochs, save_best=False):
    t_loss = t_acc = v_loss = v_acc = best_v_loss = best_v_acc = 0
    def get_postfix():
        return f"train loss: {t_loss:.3f}, acc: {t_acc * 100:.2f}% \
| val loss: {v_loss:.3f}, acc: {v_acc * 100:.2f}% \
| best val loss: {best_v_loss:.3f}, acc: {best_v_acc * 100:.2f}%"

    best_v_loss = float('inf')
    for epoch in range(epochs):
        model.train()
        total_train_loss = total_train_correct = 0
        with tqdm(train_loader, desc=f"Epoch {epoch + 1}") as pbar:
            for i, (X_batch, y_batch) in enumerate(train_loader):
                model.optimizer.zero_grad()
                y_pred = model(X_batch)
                loss = model.loss_fn(y_pred, y_batch)
                loss.backward()
                model.optimizer.step()

                batch_loss, num_correct = loss.item(), (y_pred.argmax(1) == y_batch).sum().item()
                total_train_loss += batch_loss
                total_train_correct += num_correct

                num_samples = (i + 1) * len(X_batch)
                t_loss = total_train_loss / num_samples
                t_acc = total_train_correct / num_samples
                
                pbar.set_postfix_str(get_postfix())
                pbar.update()

            t_loss = total_train_loss / len(train_loader)
            t_acc = total_train_correct / len(train_loader.dataset)

            v_loss, v_acc = evaluate(model, val_loader)

            if save_best and v_loss < best_v_loss:
                best_v_loss = v_loss
                best_v_acc = v_acc
                model.save("best_model.pth")

            pbar.set_postfix_str(get_postfix())
            pbar.update()


    # load best model
    if save_best:
        model.load("best_model.pth")

In [11]:
# Train LSTM model
model = LSTMModel()
model.optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-7)
model.loss_fn = torch.nn.CrossEntropyLoss()

In [12]:
# Prep data and train model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

X_train = torch.tensor(X_train, dtype=torch.long)
X_val = torch.tensor(X_val, dtype=torch.long)
X_test = torch.tensor(X_test, dtype=torch.long)
y_train = torch.tensor(y_train.values, dtype=torch.long)
y_val = torch.tensor(y_val.values, dtype=torch.long)
y_test = torch.tensor(y_test.values, dtype=torch.long)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
X_train = X_train.to(device)
X_val = X_val.to(device)
X_test = X_test.to(device)
y_train = y_train.to(device)
y_val = y_val.to(device)
y_test = y_test.to(device)

BATCH_SIZE = 128
train_loader = torch.utils.data.DataLoader(torch.utils.data.TensorDataset(X_train, y_train), batch_size=BATCH_SIZE, shuffle=True)
val_loader = torch.utils.data.DataLoader(torch.utils.data.TensorDataset(X_val, y_val), batch_size=BATCH_SIZE, shuffle=False)
test_loader = torch.utils.data.DataLoader(torch.utils.data.TensorDataset(X_test, y_test), batch_size=BATCH_SIZE, shuffle=False)

train(model, train_loader, val_loader, epochs=20, save_best=True)
test_loss, test_acc = evaluate(model, test_loader)
print(f"Test loss: {test_loss:.3f}, acc: {test_acc * 100:.2f}%")

Epoch 1: 766it [00:18, 40.77it/s, train loss: 0.645, acc: 63.75% | val loss: 0.625, acc: 67.26% | best val loss: 0.625, acc: 67.26%]                         
Epoch 2: 766it [00:17, 42.71it/s, train loss: 0.580, acc: 71.82% | val loss: 0.562, acc: 73.78% | best val loss: 0.562, acc: 73.78%]                           
Epoch 3: 766it [00:19, 39.74it/s, train loss: 0.576, acc: 72.15% | val loss: 0.542, acc: 76.22% | best val loss: 0.542, acc: 76.22%]                           
Epoch 4: 766it [00:19, 39.34it/s, train loss: 0.515, acc: 79.14% | val loss: 0.501, acc: 80.43% | best val loss: 0.501, acc: 80.43%]                           
Epoch 5: 766it [00:19, 39.30it/s, train loss: 0.484, acc: 82.37% | val loss: 0.480, acc: 82.74% | best val loss: 0.480, acc: 82.74%]                           
Epoch 6: 766it [00:19, 39.27it/s, train loss: 0.466, acc: 84.38% | val loss: 0.472, acc: 83.59% | best val loss: 0.472, acc: 83.59%]                           
Epoch 7: 766it [00:19, 39.16it/s, train lo

Test loss: 0.427, acc: 88.36%


In [13]:
def predict(text: str):
    text = vectorize_layer([text]).cpu()
    text = torch.tensor(text).to(device)
    model.eval()
    y_pred = model(text)
    return y_pred.argmax(dim=1).item()

# Human text
print(predict("That's a reasonable hypothesis, and I tried that too. However the default resolvers are not returned but clearly they have to be there behind the scenes are the API wouldn't work. This is a bit of a flaky area. Wow -- even weirder. What you have to do is manually attach a resolver for each field on the type. So the default resolvers show up in the UI, and once you attach them you can export them. But again, those defaults had to be there originally. This is one of the more half-baked AWS services I've dealt with. It's good to work with the new ones ;)."))
print(predict("Although it is generally accepted that the internet has allowed us to connect with people all over the world, there are still those people who are not familiar with its basic functions, who don’t understand why it has become so commonplace, or what its true capabilities are."))

# AI generated text
print(predict("1. Eat a balanced and nutritious diet: Make sure your meals are inclusive of a variety of fruits and vegetables, lean protein, whole grains, and healthy fats. This helps to provide your body with the essential nutrients to function at its best and can help prevent chronic diseases. 2. Engage in regular physical activity: Exercise is crucial for maintaining strong bones, muscles, and cardiovascular health. Aim for at least 150 minutes of moderate aerobic exercise or 75 minutes of vigorous exercise each week. 3. Get enough sleep: Getting enough quality sleep is crucial for physical and mental well-being. It helps to regulate mood, improve cognitive function, and supports healthy growth and immune function. Aim for 7-9 hours of sleep each night."))

1
1
1


  text = torch.tensor(text).to(device)


In [14]:
from tqdm import tqdm


# Testing on essay data from https://www.kaggle.com/datasets/sunilthite/llm-detect-ai-generated-text-dataset
essay_data = pd.read_csv("datasets/Training_Essay_Data.csv")
# First column is the essay text, second column is the label, 1 if AI generated, 0 if human generated
print(essay_data.head())
# Predict on all essays
predictions = []
labels = []
for row in tqdm(essay_data.iterrows(), total=essay_data.shape[0]):
    text = row[1][0]
    label = row[1][1]
    labels.append(label)
    prediction = predict(text)
    predictions.append(prediction)
predictions = np.array(predictions)
labels = np.array(labels)
accuracy = np.mean(predictions == labels)
print("Accuracy: %.2f%%" % (accuracy * 100))

# Show distribution of predictions and labels
print("Predictions")
print(np.unique(predictions, return_counts=True))
print("Labels")
print(np.unique(labels, return_counts=True))

                                                text  generated
0  Car-free cities have become a subject of incre...          1
1  Car Free Cities  Car-free cities, a concept ga...          1
2    A Sustainable Urban Future  Car-free cities ...          1
3    Pioneering Sustainable Urban Living  In an e...          1
4    The Path to Sustainable Urban Living  In an ...          1


  text = torch.tensor(text).to(device)
100%|██████████| 29145/29145 [04:24<00:00, 110.28it/s]

Accuracy: 76.65%
Predictions
(array([0, 1]), array([13548, 15597], dtype=int64))
Labels
(array([0, 1]), array([17508, 11637], dtype=int64))



