# AI Generated Text Detection with DNN and TFIDF
Forked and modified from: https://www.kaggle.com/code/lonnieqin/ai-generated-text-detection-with-dnn-and-tfidf
- DNN and TFIDF fit only on train: 0.923
- Add spell correction for test + Fit TFIDF on train + test: 0.913
- Add spell correction for test + Fit TFIDF on test: 0.917

These results could vary due to the random elements of this notebook, but they may provide some insights.


## Imports

In [1]:
import numpy as np
from tqdm import tqdm
import tensorflow as tf
import pandas as pd
from tensorflow.keras.layers import TextVectorization
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.metrics import roc_auc_score
import os
import gc
import sys
from joblib import Parallel, delayed

2024-01-09 14:29:42.734329: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-01-09 14:29:42.860281: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-01-09 14:29:42.860348: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-01-09 14:29:42.876264: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-01-09 14:29:42.917351: I tensorflow/core/platform/cpu_feature_guar

In [2]:
# Run-mode switches and TFRecord generation settings.
DEBUG = False
IS_RERUN = False
PREPROCESSED_SAVE_DIR = "../input/230109_tfrecords_100000"
TFREC_CHUNK_SIZE = 5000  # rough size of the chunks
N_PROCESSES_TFREC_GEN = 4
os.makedirs(PREPROCESSED_SAVE_DIR, exist_ok=True)

# ROOT is the directory that contains the `llm-detect-ai-generated-text`
# dataset folder; later cells build every input path as f"{ROOT}/...".
if os.getenv("KAGGLE_IS_COMPETITION_RERUN"):
    # Must match the "/kaggle/input/..." location probed below, otherwise
    # f"{ROOT}/llm-detect-ai-generated-text/..." points at a missing directory.
    ROOT = "/kaggle/input"
    IS_RERUN = True
else:
    try:
        sub = pd.read_csv(
            "/kaggle/input/llm-detect-ai-generated-text/sample_submission.csv"
        )
        ROOT = "/kaggle/input"
    except FileNotFoundError:
        # Not on Kaggle: fall back to the local directory layout. Only a
        # missing file triggers the fallback; any other error is surfaced.
        sub = pd.read_csv("../input/llm-detect-ai-generated-text/sample_submission.csv")
        ROOT = "../input"
    # Outside a competition rerun, emit the sample submission as a placeholder.
    sub.to_csv("submission.csv", index=False)

# Outside a competition rerun (and when not debugging) stop here on purpose:
# the placeholder submission has been written and the rest is skipped.
if not IS_RERUN and not DEBUG:
    sys.exit()

SystemExit: 

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


## Configuration

In [4]:
class CFG:
    """Hyper-parameters and switches read by the modeling cells."""

    # Reproducibility / cross-validation layout.
    SEED = 7
    N_FOLDS = 5

    # Optimisation schedule; a single epoch is used when DEBUG is on.
    batch_size = 128
    epochs = 1 if DEBUG else 100
    EARLY_STOP_COUNT = 30

    # True: fit new models; False: load previously saved ones.
    is_training = True

## Importing files and feature engineering

In [5]:
# Merged training set; its target column is exposed as "label".
train = pd.read_csv(f"{ROOT}/merge_5fold_seed10_train.csv").rename(
    columns={"generated": "label"}
)
test = pd.read_csv(f"{ROOT}/llm-detect-ai-generated-text/test_essays.csv")

# Original competition training essays, shuffled reproducibly and re-indexed.
org_train = (
    pd.read_csv(f"{ROOT}/llm-detect-ai-generated-text/train_essays.csv")
    .sample(frac=1.0, random_state=CFG.SEED)
    .reset_index(drop=True)
)
org_train

Unnamed: 0,id,prompt_id,text,generated
0,3fdf1455,0,Fellow citizens cars are dying out. Every year...,0
1,6e5e4f34,1,"Forida senator, I argue to remove the Electora...",0
2,03c28f3e,1,"dear senator, I have come to a conclusion on h...",0
3,21313307,0,Cars are an essential part of life they get us...,0
4,59395fb5,1,"Dear, State senator Voting is a public thing t...",0
...,...,...,...,...
1373,29cc8b5f,1,I have been burdened with the fact that our na...,0
1374,624e199a,0,Our world has became a world that revolves aro...,0
1375,66e2c504,1,For years I've heard that voting time is the c...,0
1376,e38718b0,0,Do you think you would be able to live without...,0


In [6]:
# The vocabulary and IDF weights are learned from every available essay:
# the merged training set, the test essays and the original training essays.
merged_pd = pd.concat([train, test, org_train])

# 3,7 from https://www.kaggle.com/code/yongsukprasertsuk/ai-generated-text-mod-weight-add-more-data-0-928?scriptVersionId=151460374
# NOTE(review): per the Keras TextVectorization docs a tuple of ints selects
# exactly those n-gram sizes (3-grams and 7-grams), which differs from
# scikit-learn's inclusive `ngram_range=(3, 7)` — confirm this is intended.
vectorizer = TextVectorization(
    output_mode="tf-idf",
    ngrams=(3, 7),
    max_tokens=100000,
)
vectorizer.adapt(merged_pd["text"])

2024-01-09 14:29:52.438359: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1929] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 14298 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 4080, pci bus id: 0000:01:00.0, compute capability: 8.9


## TFRecords generation
The TF-IDF vectors are pre-computed and written to TFRecord files in order to maximize training speed.

In [9]:
def _bytes_feature(value):
    """Build a ``tf.train.Feature`` holding a BytesList.

    Args:
        value: A single bytes/str value, a scalar string EagerTensor, or an
            iterable of bytes values (passed through unchanged).

    Returns:
        A ``tf.train.Feature`` whose ``bytes_list`` contains the value(s).
    """
    if isinstance(value, type(tf.constant(0))):
        value = value.numpy()  # BytesList won't unpack a string from an EagerTensor.
    if isinstance(value, str):
        value = value.encode("utf-8")
    if isinstance(value, (bytes, bytearray)):
        # BytesList expects a sequence of bytes objects; a bare bytes value
        # would be iterated element-wise as ints and rejected.
        value = [bytes(value)]
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=value))


def _float_feature(value):
    """Wrap a single value in a ``tf.train.Feature`` with a one-element FloatList."""
    float_list = tf.train.FloatList(value=[value])
    return tf.train.Feature(float_list=float_list)


def _int64_feature(value):
    """Wrap a single value in a ``tf.train.Feature`` with a one-element Int64List."""
    int64_list = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=int64_list)


def serialize_example(text_vector, label):
    """Serialize one (vector, label) pair into a ``tf.train.Example`` byte string.

    Args:
        text_vector: Tensor (or array) holding the vectorized text of one essay.
        label: Integer class label of the essay.

    Returns:
        The serialized ``tf.train.Example`` as bytes. The "text_vector" feature
        holds the output of ``tf.io.serialize_tensor``; readers can restore it
        with ``tf.io.parse_tensor`` using the vector's dtype.
    """
    # A BytesList can only hold bytes, so the numeric vector is first encoded
    # as a serialized tensor. It is passed as a one-element list because
    # BytesList takes a sequence of bytes values.
    vector_bytes = tf.io.serialize_tensor(text_vector).numpy()
    feature = {
        "text_vector": _bytes_feature([vector_bytes]),
        # Plain Python int so the protobuf layer does not depend on numpy scalars.
        "label": _int64_feature(int(label)),
    }
    example_proto = tf.train.Example(features=tf.train.Features(feature=feature))
    return example_proto.SerializeToString()

In [10]:
# Parallel processing?
def write_tfrecord(process_ind, data, file_name, border_chunks):
    """Vectorize one chunk of ``data`` and write it to its own TFRecord file.

    Args:
        process_ind: Index of the chunk; rows
            ``border_chunks[process_ind]:border_chunks[process_ind + 1]`` are written.
        data: DataFrame with "text" and "label" columns.
        file_name: Prefix of the output file inside ``PREPROCESSED_SAVE_DIR``.
        border_chunks: Sequence of row positions delimiting the chunks.
    """
    target_path = f"{PREPROCESSED_SAVE_DIR}/{file_name}_{process_ind}.tfrec"
    with tf.io.TFRecordWriter(target_path) as writer:
        first_row = border_chunks[process_ind]
        last_row = border_chunks[process_ind + 1]

        # Uses the module-level `vectorizer` to turn the chunk's texts into vectors.
        text_vectors = vectorizer(data.iloc[first_row:last_row]["text"])
        labels = data.iloc[first_row:last_row]["label"].values

        # One serialized example per row of the chunk, in order.
        for offset, _ in enumerate(range(first_row, last_row)):
            writer.write(serialize_example(text_vectors[offset], labels[offset]))

        # Release the chunk's tensors before the next chunk is processed.
        del text_vectors, labels
        gc.collect()
        tf.keras.backend.clear_session()


# Write every dataset as a series of roughly TFREC_CHUNK_SIZE-row TFRecord files.
# write_tfrecord reads a "label" column, so:
#   - org_train's "generated" column is exposed under that name, and
#   - test (assumed to carry no label column) gets a -1 placeholder label.
# The test set is now written under its own "test" prefix; previously the
# third pass re-wrote org_train instead of test.
tfrecord_sources = [
    ("train", train),
    ("org_train", org_train.rename(columns={"generated": "label"})),
    ("test", test.assign(label=-1)),
]
for file_name, data in tfrecord_sources:
    if len(data) == 0:
        continue  # nothing to vectorize or write for an empty dataset
    n_train_chunks = max(1, round(len(data) / TFREC_CHUNK_SIZE))
    border_chunks = np.linspace(0, len(data), n_train_chunks + 1, dtype=int)
    for i in tqdm(range(0, n_train_chunks), desc=file_name):
        write_tfrecord(i, data=data, file_name=file_name, border_chunks=border_chunks)

# TODO: parallelize the chunk writing, e.g.
# Parallel(n_jobs=N_PROCESSES_TFREC_GEN)(
#    delayed(write_tfrecord_wrapper)(i) for i in range(0, n_train_chunks)
# )

100%|██████████| 13/13 [13:10<00:00, 60.79s/it]


## Modeling

In [None]:
def fbeta(y_true, y_pred, beta=1.0):
    """Soft F-beta score computed from summed label/prediction products.

    Args:
        y_true: Tensor of ground-truth labels.
        y_pred: Tensor of predictions, multiplied element-wise with the labels.
        beta: Weight of recall relative to precision.

    Returns:
        A scalar tensor; 0.0 whenever precision + recall is zero.
    """
    beta_sq = beta * beta

    positives = tf.reduce_sum(y_true)
    true_pos = tf.reduce_sum(y_true * y_pred)
    false_pos = tf.reduce_sum((1.0 - y_true) * y_pred)
    predicted_pos = true_pos + false_pos

    # Both ratios fall back to 0.0 when their denominator is zero.
    precision = tf.where(predicted_pos == 0.0, 0.0, true_pos / predicted_pos)
    recall = tf.where(positives == 0.0, 0.0, true_pos / positives)

    numerator = (1.0 + beta_sq) * (precision * recall)
    denominator = beta_sq * precision + recall
    return tf.where(
        precision + recall == 0,
        0.0,
        tf.divide(numerator, denominator),
    )


def inference(model, X_val):
    """Return a flat array of scores for ``X_val``.

    Models whose type name contains "keras" are scored with ``predict``;
    anything else is scored with the second column of ``predict_proba``.
    """
    is_keras_model = "keras" in str(type(model))
    if not is_keras_model:
        return model.predict_proba(X_val)[:, 1].reshape(-1)
    return model.predict(X_val, verbose=2).reshape(-1)


def evaluate_model(model, X_val, y_val):
    """Score ``model`` on the validation data and report its ROC AUC.

    Returns:
        A dict with the model under "model" and its ROC AUC under "auc".
    """
    scores = inference(model, X_val)
    auc = roc_auc_score(y_val, scores)
    print(f"AUC for {model}: {auc}")
    return dict(model=model, auc=auc)


def make_dataset(X, y, batch_size, mode):
    """Build a batched, prefetched ``tf.data.Dataset`` from in-memory data.

    Args:
        X: Inputs, sliced along the first axis.
        y: Targets aligned with ``X``.
        batch_size: Number of examples per batch.
        mode: "train" enables shuffling; any other value keeps the input order.

    Returns:
        A ``tf.data.Dataset`` yielding ``(X_batch, y_batch)`` pairs.
    """
    dataset = tf.data.Dataset.from_tensor_slices((X, y))
    # Cache BEFORE shuffling: a cache replays exactly the elements it recorded,
    # so caching after shuffle/batch would freeze the first epoch's order and
    # every later epoch would see identical batches.
    dataset = dataset.cache()
    if mode == "train":
        dataset = dataset.shuffle(batch_size * 4)
    dataset = dataset.batch(batch_size)
    return dataset.prefetch(tf.data.AUTOTUNE)


def get_model():
    """Build and compile the TF-IDF + dense-network classifier.

    The model takes raw strings, vectorizes them with the module-level
    ``vectorizer`` and outputs a single unnormalized logit per essay.

    Returns:
        A compiled ``keras.Model``.
    """
    inputs = keras.Input(shape=(), dtype=tf.string)
    x = vectorizer(inputs)
    print(x.shape)
    x = layers.Dense(32, activation="swish")(x)
    x = layers.Dropout(0.3)(x)
    x = layers.Dense(16, activation="swish")(x)
    x = layers.Dropout(0.3)(x)
    # No activation: the output is a logit, matching from_logits=True below.
    output = layers.Dense(1)(x)
    model = keras.Model(inputs, output, name="model")
    model.compile(
        optimizer=tf.keras.optimizers.Adam(4e-4),
        loss=tf.keras.losses.BinaryCrossentropy(
            from_logits=True,
            label_smoothing=0.2,
        ),
        metrics=[
            # NOTE(review): "accuracy" and fbeta still receive raw logits rather
            # than probabilities; their reported values should be read with care.
            "accuracy",
            # The model emits logits, so AUC must apply the sigmoid itself;
            # without from_logits=True it would treat logits as probabilities.
            keras.metrics.AUC(name="auc", from_logits=True),
            fbeta,
        ],
    )
    return model


def train_models(X_train, y_train, X_val, y_val, fold):
    """Fit (or load) the model for one fold and evaluate it on the validation data.

    When ``CFG.is_training`` is set, a fresh model is trained and checkpointed
    to ``model_{fold}.tf``; otherwise a saved model for that fold is loaded.

    Returns:
        A one-element list containing the dict produced by ``evaluate_model``.
    """
    model_path = f"model_{fold}.tf"

    if not CFG.is_training:
        model = keras.models.load_model(
            f"/kaggle/input/ai-generated-text-dnn-detector/model_{fold}.tf",
            custom_objects={"fbeta": fbeta},
        )
    else:
        model = get_model()
        train_ds = make_dataset(X_train, y_train, CFG.batch_size, "train")
        valid_ds = make_dataset(X_val, y_val, CFG.batch_size, "valid")

        # Learning-rate decay, best-model checkpointing and early stopping.
        lr_decay = keras.callbacks.ReduceLROnPlateau(
            patience=5, min_delta=1e-4, min_lr=1e-6
        )
        best_checkpoint = keras.callbacks.ModelCheckpoint(
            model_path, monitor="val_auc", mode="max", save_best_only=True
        )
        early_stopping = keras.callbacks.EarlyStopping(
            monitor="val_auc",
            patience=CFG.EARLY_STOP_COUNT,
            restore_best_weights=True,
        )
        model.fit(
            train_ds,
            epochs=CFG.epochs,
            validation_data=valid_ds,
            callbacks=[lr_decay, best_checkpoint, early_stopping],
        )

    return [evaluate_model(model, X_val, y_val)]

In [None]:
# Build one model instance purely to inspect its architecture.
model = get_model()
model.summary()
# The diagram is the cell's displayed result.
keras.utils.plot_model(model, show_shapes=True)

In [None]:
from sklearn.model_selection import StratifiedKFold

# The folds come from the precomputed `fold` column of `train`
# (no StratifiedKFold split is performed here).
preds = []
folds = train.fold.values

# The original competition essays are appended to every validation fold.
X_org_train = org_train["text"]
y_org_train = org_train["generated"]

for fold in range(CFG.N_FOLDS):
    in_val_fold = folds == fold

    X_train = train.loc[~in_val_fold, "text"]
    y_train = train.loc[~in_val_fold, "label"]
    X_val = np.hstack([train.loc[in_val_fold, "text"].values, X_org_train])
    y_val = np.hstack([train.loc[in_val_fold, "label"].values, y_org_train])

    # train_models returns a one-element list of {"model", "auc"} dicts.
    model = train_models(X_train, y_train, X_val, y_val, fold)
    preds.append(inference(model[0]["model"], test["text"].values))

    # Free the fold's model before the next fold is trained.
    del model
    gc.collect()
    tf.keras.backend.clear_session()

## Create Submission

In [None]:
# Average the per-fold test predictions into the final score.
test["generated"] = np.mean(preds, axis=0)

submission = test[["id", "generated"]]
submission.to_csv("submission.csv", index=False)
submission.head()