In [1]:
import os
import gc
import sys
from joblib import Parallel, delayed

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from tokenizers import (
    models,
    normalizers,
    pre_tokenizers,
    trainers,
    Tokenizer,
    SentencePieceBPETokenizer,
)

from datasets import Dataset
from tqdm.auto import tqdm
from transformers import PreTrainedTokenizerFast

sys.path.append(f"{os.getcwd()[:-10]}/main")
from utils import save_pickle

  from .autonotebook import tqdm as notebook_tqdm


## Method
- N-fold group-by (prompt-wise) split.
- Original train scripts are always included in the validation set.


## Parameters

In [2]:
EXTRA_INDS = []  # np.arange(0, 11)
N_FOLDS = 5
USE_FOLDS = 1
ROOT = "../input"
SEED = 10
LOWERCASE = False
VOCAB_SIZE = 30522
VECTORIZER_VOCAB_SAMPLE_RATIO = 0.05
DROPOUT = None  # None or 0 to 1
N_DATA_DUPL = 1
PROCESSED_PATH = f"{ROOT}/230109_on-the-fly_0.05_v2"
NORMALIZER = normalizers.NFC()  # {"nfc": NFC, "nfd": NFD, "nfkc": NFKC, "nfkd": NFKD}
os.makedirs(f"{PROCESSED_PATH}", exist_ok=True)  # splitted x vectorized

In [3]:
test = pd.read_csv(f"{ROOT}/llm-detect-ai-generated-text/test_essays.csv")
sub = pd.read_csv(f"{ROOT}/llm-detect-ai-generated-text/sample_submission.csv")
org_train = pd.read_csv(f"{ROOT}/llm-detect-ai-generated-text/train_essays.csv")
org_train = org_train.sample(frac=1.0, random_state=SEED)
org_train.reset_index(drop=True, inplace=True)
train = pd.read_csv(f"{ROOT}/daigt-v2-train-dataset/train_v2_drcat_02.csv", sep=",")
train = train.drop_duplicates(subset=["text"])
train = train.sample(frac=1.0, random_state=SEED)
train.reset_index(drop=True, inplace=True)

## Extra dataset
Load, compare columns of v2 & v3 then merge selected extra 

In [4]:
train_v2 = pd.read_csv(f"{ROOT}/daigt-v2-train-dataset/train_v2_drcat_02.csv", sep=",")
train_v3 = pd.read_csv(f"{ROOT}/daigt-v3-train-dataset/train_v3_drcat_02.csv", sep=",")
v2_src = np.unique(train_v2.source)
v3_src = np.unique(train_v3.source)
v3_exclusive = []
for src in v3_src:
    if src not in v2_src:
        v3_exclusive.append(src)
v3_exclusive = sorted(v3_exclusive)
print(v3_exclusive)
for ind in EXTRA_INDS:
    extra = train_v3.loc[train_v3.source == v3_exclusive[ind]]
    train = pd.concat([train, extra], axis=0)

['Intel-neural-chat-7b-v3-1_LLMEssays_v1', 'Mistral7B_CME_v7', 'llama_falcon_v3_falcon_180b', 'llama_falcon_v3_llama_70b', 'nima_gpt4', 'text-ada-001', 'text-babbage-001', 'text-curie-001', 'text-davinci-001', 'text-davinci-002', 'text-davinci-003']


Sample each source for vocab calculation for the tfidfvectorizer

In [5]:
list_source = np.unique(train["source"])
list_source
texts_sampled_for_vectorizer_vocab = pd.DataFrame()
texts_remains = pd.DataFrame()
for source in list_source:
    train_one_source = train.loc[train.source == source]
    n_samples = int(len(train_one_source) * VECTORIZER_VOCAB_SAMPLE_RATIO)
    remain_one_source = train_one_source.iloc[n_samples:]
    train_one_source = train_one_source.iloc[:n_samples]
    texts_remains = pd.concat([texts_remains, remain_one_source])
    texts_sampled_for_vectorizer_vocab = pd.concat(
        [texts_sampled_for_vectorizer_vocab, train_one_source]
    )
# Also add org_train
texts_sampled_for_vectorizer_vocab = pd.concat(
    [
        texts_sampled_for_vectorizer_vocab,
        org_train.iloc[: int(len(org_train) * VECTORIZER_VOCAB_SAMPLE_RATIO)],
    ]
)
len(texts_sampled_for_vectorizer_vocab)

2307

In [6]:
# daigt-v2
train.drop(["source", "RDizzl3_seven"], axis=1, inplace=True)
train.rename({"label": "generated"}, axis=1, inplace=True)
train.head(3)

Unnamed: 0,text,generated,prompt_name
0,"In recent years, there has been a growing move...",1,Car-free cities
1,Although in theory a student-designed project ...,0,Summer projects
2,What makes a person seek advice from multiple ...,0,Seeking multiple opinions


# Split

Make split arrays using hash map.

In [7]:
prompts = np.unique(train["prompt_name"])
prompts_split_arr = np.arange(0, len(prompts)) % N_FOLDS
np.random.seed(seed=SEED)
np.random.shuffle(prompts_split_arr)
prompts_dict = {prompts[i]: prompts_split_arr[i] for i in range(len(prompts))}
prompts_split_arr

array([3, 2, 2, 1, 3, 2, 0, 0, 1, 4, 1, 0, 4, 3, 4])

In [8]:
train["fold"] = train["prompt_name"].apply(lambda x: prompts_dict[x])
train.drop("prompt_name", axis=1, inplace=True)
print(train.head(3))
print(np.unique(train.fold.values, return_counts=True))
file_name = f"{ROOT}/merge_{N_FOLDS}fold_seed{SEED}_train.csv"
train.to_csv(f"{ROOT}/merge_{N_FOLDS}fold_seed{SEED}_train.csv", index=False)

                                                text  generated  fold
0  In recent years, there has been a growing move...          1     2
1  Although in theory a student-designed project ...          0     3
2  What makes a person seek advice from multiple ...          0     4
(array([0, 1, 2, 3, 4]), array([ 6009,  8253, 11270, 10151,  9185]))


## Tokenize splitted folds and save

In [9]:
def tokenize_texts(raw_tokenizer, texts):
    tokenizer = PreTrainedTokenizerFast(
        tokenizer_object=raw_tokenizer,
        unk_token="[UNK]",
        pad_token="[PAD]",
        cls_token="[CLS]",
        sep_token="[SEP]",
        mask_token="[MASK]",
    )
    tokenized_texts = []
    for text in tqdm(texts["text"].tolist()):
        tokenized_texts.append(tokenizer.tokenize(text))
    del raw_tokenizer, tokenizer
    gc.collect()
    return tokenized_texts


def sentence_piece_bpe_tokenizer(train, org_train, sampled, remains, test):
    raw_tokenizer = SentencePieceBPETokenizer()
    raw_tokenizer.normalizer = normalizers.Sequence(
        [NORMALIZER] + [normalizers.Lowercase()] if LOWERCASE else []
    )
    raw_tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
    merged_pd = pd.concat([train, org_train])
    dataset = Dataset.from_pandas(merged_pd[["text"]])

    def train_corp_iter():
        for i in range(0, len(dataset), 300):
            yield dataset[i : i + 300]["text"]

    raw_tokenizer.train_from_iterator(train_corp_iter())
    del dataset
    gc.collect()

    tokenized_texts_train = tokenize_texts(raw_tokenizer, train)
    tokenized_texts_org_train = tokenize_texts(raw_tokenizer, org_train)
    tokenized_texts_sampled = tokenize_texts(raw_tokenizer, sampled)
    tokenized_texts_remains = tokenize_texts(raw_tokenizer, remains)
    tokenized_texts_test = tokenize_texts(raw_tokenizer, test)
    return (
        tokenized_texts_train,
        tokenized_texts_org_train,
        tokenized_texts_sampled,
        tokenized_texts_remains,
        tokenized_texts_test,
    )


def bpe_tokenizer(train, org_train, sampled, remains, test):
    raw_tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))
    raw_tokenizer.normalizer = normalizers.Sequence(
        [NORMALIZER] + [normalizers.Lowercase()] if LOWERCASE else []
    )
    raw_tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
    special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]
    trainer = trainers.BpeTrainer(vocab_size=VOCAB_SIZE, special_tokens=special_tokens)
    merged_pd = pd.concat([train, org_train])
    dataset = Dataset.from_pandas(merged_pd[["text"]])

    def train_corp_iter():
        for i in range(0, len(dataset), 1000):
            yield dataset[i : i + 1000]["text"]

    raw_tokenizer.train_from_iterator(train_corp_iter(), trainer=trainer)
    del dataset
    gc.collect()

    tokenized_texts_train = tokenize_texts(raw_tokenizer, train)
    tokenized_texts_org_train = tokenize_texts(raw_tokenizer, org_train)
    tokenized_texts_sampled = tokenize_texts(raw_tokenizer, sampled)
    tokenized_texts_remains = tokenize_texts(raw_tokenizer, remains)
    tokenized_texts_test = tokenize_texts(raw_tokenizer, test)
    return (
        tokenized_texts_train,
        tokenized_texts_org_train,
        tokenized_texts_sampled,
        tokenized_texts_remains,
        tokenized_texts_test,
    )

In [10]:
def dummy(text):
    return text


def vectorizer_fit_sampled_vectorize_all(
    train_tokens, org_train_tokens, sampled_tokens, remain_tokens, test_tokens
):
    vectorizer = TfidfVectorizer(
        ngram_range=(3, 5),
        lowercase=False,
        sublinear_tf=True,
        analyzer="word",
        tokenizer=dummy,
        preprocessor=dummy,
        token_pattern=None,  # , strip_accents="unicode"
    )
    vectorizer.fit(sampled_tokens)
    vocab = vectorizer.vocabulary_

    vectorizer = TfidfVectorizer(
        ngram_range=(3, 5),
        lowercase=False,
        sublinear_tf=True,
        vocabulary=vocab,
        analyzer="word",
        tokenizer=dummy,
        preprocessor=dummy,
        token_pattern=None,  # , strip_accents="unicode"
    )
    vectorizer.fit(remain_tokens)
    tf_train = vectorizer.transform(train_tokens)
    tf_org_train = vectorizer.transform(org_train_tokens)
    tf_test = vectorizer.transform(test_tokens)
    del vectorizer, vocab
    gc.collect()
    return (tf_train, tf_org_train, tf_test)

In [11]:
def tokenizer_with_vectorizer(train, org_train, sampled, remains, test, option):
    train_y = train["generated"].values
    org_train_y = org_train["generated"].values

    if option == "sentence":
        (
            tokenized_texts_train,
            tokenized_texts_org_train,
            tokenized_texts_sampled,
            tokenized_texts_remains,
            tokenized_texts_test,
        ) = sentence_piece_bpe_tokenizer(train, org_train, sampled, remains, test)
    elif option == "bpe":
        (
            tokenized_texts_train,
            tokenized_texts_org_train,
            tokenized_texts_sampled,
            tokenized_texts_remains,
            tokenized_texts_test,
        ) = bpe_tokenizer(train, org_train, sampled, remains, test)

    tf_train, tf_org_train, tf_test = vectorizer_fit_sampled_vectorize_all(
        tokenized_texts_train,
        tokenized_texts_org_train,
        tokenized_texts_sampled,
        tokenized_texts_remains,
        tokenized_texts_test,
    )

    save_base = f"{PROCESSED_PATH}/{option}_seed{SEED}_"
    print(tf_train.shape)
    save_pickle(f"{save_base}train.pkl", [tf_train, train_y, train["fold"].values])
    save_pickle(f"{save_base}org_train.pkl", [tf_org_train, org_train_y])
    save_pickle(f"{save_base}test.pkl", tf_test)

    del (
        tokenized_texts_train,
        tokenized_texts_org_train,
        tokenized_texts_sampled,
        tokenized_texts_remains,
        tokenized_texts_test,
        tf_train,
        tf_org_train,
        tf_test,
    )
    gc.collect()


sampled = texts_sampled_for_vectorizer_vocab
remains = texts_remains
tokenizer_with_vectorizer(train, org_train, sampled, remains, test, "sentence")
tokenizer_with_vectorizer(train, org_train, sampled, remains, test, "bpe")
del (
    train,
    org_train,
    sampled,
    remains,
    test,
    texts_sampled_for_vectorizer_vocab,
    texts_remains,
)
gc.collect()






100%|██████████| 44868/44868 [00:20<00:00, 2153.66it/s]
100%|██████████| 1378/1378 [00:00<00:00, 1510.85it/s]
100%|██████████| 2307/2307 [00:01<00:00, 2134.64it/s]
100%|██████████| 42629/42629 [00:20<00:00, 2111.92it/s]
100%|██████████| 3/3 [00:00<00:00, 16644.06it/s]


(44868, 2233309)





100%|██████████| 44868/44868 [00:20<00:00, 2168.31it/s]
100%|██████████| 1378/1378 [00:00<00:00, 1544.55it/s]
100%|██████████| 2307/2307 [00:01<00:00, 2117.68it/s]
100%|██████████| 42629/42629 [00:20<00:00, 2117.69it/s]
100%|██████████| 3/3 [00:00<00:00, 17697.49it/s]


(44868, 2232727)


0