In [1]:
import os
import gc
import sys
from glob import glob
from joblib import Parallel, delayed

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_auc_score
from tokenizers import (
    models,
    normalizers,
    pre_tokenizers,
    trainers,
    Tokenizer,
    SentencePieceBPETokenizer,
)

from datasets import Dataset
from tqdm.auto import tqdm
from transformers import PreTrainedTokenizerFast

# Detect the runtime by probing for Kaggle's input mount, then put the
# directory holding the project's helper modules on the import path.
IS_KAGGLE = os.path.isdir("/kaggle/input")
if not IS_KAGGLE:
    # Local run: drop the last nine characters of the working directory and
    # append "/main" (presumably strips the notebook folder name — TODO confirm).
    sys.path.append(f"{os.getcwd()[:-9]}/main")
else:
    sys.path.append(
        "/kaggle/input/llm-detect-github/LLM-Detect-AI-Generated-Text-main/main"
    )
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier, sum_models
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from scipy import sparse
from joblib import Parallel, delayed

from utils import (
    save_pickle,
    weighted_average_preds,
    OptunaEarlyStoppingCallback,
    last_fold_changer,
    models_excluder,
    append_preds,
)
from modules import model_objectives
from params import select_param_type
import optuna

  from .autonotebook import tqdm as notebook_tqdm


# Data preprocess

In [2]:
# Notebook-wide configuration. Every constant below is read by later cells.
# DEBUG=True shrinks the Optuna budget (see the `if DEBUG:` block at the end of
# this cell) and makes the preprocessing cell skip tokenisation/vectorisation.
DEBUG = False
# Forwarded unchanged to model_objectives.
USE_PREV_ERROR = False

# preprocessing part
EXTRA_INDS = []  # np.arange(0, 11)
N_FOLDS = 11
# Passed to last_fold_changer when the training frame is loaded.
MODEL_FOR_ENSEMBLE = "gpt"
# Passed to models_excluder when the training frame is loaded.
MODELS_TO_EXCLUDE = ["davinci", "curie", "llama", "babbage", "ada", "palm"]
ROOT = "/kaggle/input" if IS_KAGGLE else "../input"
SAVE_ROOT = "/tmp" if IS_KAGGLE else "../input"
SEED = 99
LOWERCASE = False
VOCAB_SIZE = 30522
# Fraction of each source's rows used to build the TF-IDF vocabulary.
VECTORIZER_VOCAB_SAMPLE_RATIO = 0.01  # for test use all
# NOTE(review): the folder name mentions "0.05" and "10folds" while
# VECTORIZER_VOCAB_SAMPLE_RATIO is 0.01 and N_FOLDS is 11 — confirm the name is
# only a label and not expected to match the settings.
PROCESSED_PATH = f"{SAVE_ROOT}/230110_0.05_v2_10folds"
NORMALIZER = normalizers.NFC()  # {"nfc": NFC, "nfd": NFD, "nfkc": NFKC, "nfkd": NFKD}
os.makedirs(f"{PROCESSED_PATH}", exist_ok=True)  # split x vectorized

# model part
INPUT_TYPE = "sentence"  # sentence, bpe
MODEL = "LGBM"  # LGBM, XGB, CatBoost
# NOTE(review): LOWERCASE is already assigned above with the same value; this
# second assignment is redundant.
LOWERCASE = False
N_ESTIMATORS = 2000
# Trial budget / early-stop patience for the per-model hyper-parameter studies.
N_OPTUNA_TRIALS = 10
OPTUNA_EARLY_STOP_COUNT = 5
# Trial budget / early-stop patience for the ensemble-weight study.
N_OPTUNA_ENSEMBLE_TRIALS = 10000
OPTUNA_ENSEMBLE_EARLY_STOP_COUNT = 1000
# Starting hyper-parameters per model; each dict is enqueued as the first
# Optuna trial and later replaced by that study's best_params.
LGBM_SENTENCE_PARAMS =   {'verbosity': -1, 'random_state': 42, 'early_stopping_rounds': 100, 'metric': 'auc', 'learning_rate': 0.07286724587922155, 'colsample_bytree': 0.6797905694389136, 'colsample_bynode': 0.839733458908106, 'lambda_l1': 0.38796147682692317, 'lambda_l2': 0.29300104388528725, 'max_depth': 14, 'num_leaves': 81, 'min_data_in_leaf': 150, 'min_child_weight': 0.001616998311835574, 'min_child_samples': 2, 'max_bin': 1022, 'subsample': 0.9885898125913607, 'subsample_freq': 0}
SGD_SENTENCE_PARAMS = {'max_iter': 5132, 'loss': 'modified_huber', 'penalty': 'l2', 'alpha': 0.00029234336294386683, 'learning_rate': 'invscaling', 'eta0': 0.8611233723469085, 'early_stopping': True, 'n_iter_no_change': 100, 'warm_start': True, 'n_jobs': -1}
MULTINOMIALNB_SENTENCE_PARAMS = {"alpha": 0.02}
COMPLEMENTNB_SENTENCE_PARAMS = {'alpha': 0.004243397640210257, 'force_alpha': False, 'fit_prior': True, 'norm': False}
RIDGE_SENTENCE_PARAMS = {'max_iter': 8070, 'fit_intercept': True, 'alpha': 0.0028878876055973765}
PASSIVE_SENTENCE_PARAMS = {'C': 0.891141518158109, 'max_iter': 5052, 'fit_intercept': False, 'early_stopping': True, 'loss': 'hinge', 'warm_start': False, 'average': 6}

# Not referenced elsewhere in the visible notebook cells.
N_SPLIT_CATBOOST_TRAIN = 8
N_SPLIT_CATBOOST_GROUP = 2

if DEBUG:
    #N_ESTIMATORS = 10
    if IS_KAGGLE:
        # IPython shell escape: copies a preprocessed dataset into /tmp.
        # NOTE(review): this creates /tmp/230109_on-the-fly_0.05_v2, which is
        # not the directory PROCESSED_PATH points at — confirm which folder
        # DEBUG runs are expected to read.
        !cp -r /kaggle/input/sentencepiece-refined-preprocessed/230109_on-the-fly_0.05_v2 /tmp
    # Minimal Optuna budget for quick smoke runs.
    N_OPTUNA_TRIALS = 2
    OPTUNA_EARLY_STOP_COUNT = 1

In [3]:
# Fast save
# Outside the competition rerun, write the sample submission straight away so a
# valid submission.csv exists; on Kaggle (non-DEBUG) the notebook then stops.
IS_RERUN = bool(os.getenv("KAGGLE_IS_COMPETITION_RERUN"))
if not IS_RERUN:
    try:
        sub = pd.read_csv(
            "/kaggle/input/llm-detect-ai-generated-text/sample_submission.csv"
        )
    except FileNotFoundError:
        # Only a missing Kaggle file triggers the local fallback; any other
        # failure (permissions, parse errors, interrupts) now surfaces instead
        # of being silently retried.
        sub = pd.read_csv("../input/llm-detect-ai-generated-text/sample_submission.csv")
    sub.to_csv("submission.csv", index=False)
if (not DEBUG) and (not IS_RERUN) and (IS_KAGGLE):
    # Interactive/commit run on Kaggle: the placeholder submission is enough.
    sys.exit()

In [4]:
# Competition files: test essays, submission template and the original
# training essays (shuffled with the global seed, index rebuilt).
test = pd.read_csv(f"{ROOT}/llm-detect-ai-generated-text/test_essays.csv")
sub = pd.read_csv(f"{ROOT}/llm-detect-ai-generated-text/sample_submission.csv")
org_train = (
    pd.read_csv(f"{ROOT}/llm-detect-ai-generated-text/train_essays.csv")
    .sample(frac=1.0, random_state=SEED)
    .reset_index(drop=True)
)

# Extended training set: the file lives in a dataset folder on Kaggle and
# directly under ROOT locally.
if IS_KAGGLE:
    train_path = (
        f"{ROOT}/llm-daigt-5fold-split-seed7-train/20230112_model-wise-split_all-data.csv"
    )
else:
    train_path = f"{ROOT}/20230112_model-wise-split_all-data.csv"
train = pd.read_csv(train_path).drop_duplicates(subset=["text"])
train = last_fold_changer(train, MODEL_FOR_ENSEMBLE)
train = models_excluder(train, MODELS_TO_EXCLUDE)

# Show which sources ended up in the last fold.
print(np.unique(train.loc[train.fold == N_FOLDS - 1].model))
train = train.sample(frac=1.0, random_state=SEED).reset_index(drop=True)

['gpt' 'human']


Sample a fraction of each source to build the vocabulary for the TfidfVectorizer

In [5]:
# Split every source ("model" value) into a leading slice used to build the
# vectorizer vocabulary and the remaining rows. The slices are collected in
# lists and concatenated once, instead of growing a DataFrame inside the loop
# from an empty seed frame (quadratic copying, and it depends on pandas'
# deprecated handling of empty frames in concat).
list_source = np.unique(train["model"])
sampled_parts = []
remain_parts = []
for source in list_source:
    train_one_source = train.loc[train.model == source]
    n_samples = int(len(train_one_source) * VECTORIZER_VOCAB_SAMPLE_RATIO)
    sampled_parts.append(train_one_source.iloc[:n_samples])
    remain_parts.append(train_one_source.iloc[n_samples:])

# pd.concat rejects an empty list, so keep the original empty-frame result
# when there is no source at all.
texts_remains = pd.concat(remain_parts) if remain_parts else pd.DataFrame()

# Also add org_train (all of it) to the vocabulary sample.
texts_sampled_for_vectorizer_vocab = pd.concat(sampled_parts + [org_train])

# For submission: when the test frame has more than 3 rows, include all of it
# in the vocabulary sample as well.
if len(test) > 3:
    texts_sampled_for_vectorizer_vocab = pd.concat(
        [texts_sampled_for_vectorizer_vocab, test]
    )

len(texts_sampled_for_vectorizer_vocab)

1880

In [6]:
# partial_train_for_ensemble = train.loc[train["fold"] == N_FOLDS - 1]
# train = train.loc[train["fold"] != N_FOLDS - 1]
# N_FOLDS = N_FOLDS - 1

## Tokenize split folds and save

In [7]:
def tokenize_texts(raw_tokenizer, texts):
    """Tokenize the ``text`` column of ``texts`` with a trained tokenizer.

    Args:
        raw_tokenizer: trained ``tokenizers`` object, wrapped here in a
            ``PreTrainedTokenizerFast`` with BERT-style special tokens.
        texts: frame-like object whose ``"text"`` column holds the documents.

    Returns:
        list: one token list per document, in input order.
    """
    fast_tokenizer = PreTrainedTokenizerFast(
        tokenizer_object=raw_tokenizer,
        unk_token="[UNK]",
        pad_token="[PAD]",
        cls_token="[CLS]",
        sep_token="[SEP]",
        mask_token="[MASK]",
    )
    token_lists = [
        fast_tokenizer.tokenize(document)
        for document in tqdm(texts["text"].tolist())
    ]
    # Drop the local references before forcing a collection.
    del raw_tokenizer, fast_tokenizer
    gc.collect()
    return token_lists


def sentence_piece_bpe_tokenizer(train, org_train, sampled, remains, test):
    """Train a SentencePiece-BPE tokenizer and tokenize all five frames.

    The tokenizer is trained on the ``text`` column of ``train`` and
    ``org_train`` combined, then applied to every frame passed in.

    Args:
        train, org_train, sampled, remains, test: frames with a ``"text"``
            column.

    Returns:
        tuple: token lists for (train, org_train, sampled, remains, test).
    """
    raw_tokenizer = SentencePieceBPETokenizer()
    # The conditional must be parenthesised: without the parentheses the whole
    # list collapsed to [] whenever LOWERCASE was False, so NORMALIZER was
    # never applied.
    raw_tokenizer.normalizer = normalizers.Sequence(
        [NORMALIZER] + ([normalizers.Lowercase()] if LOWERCASE else [])
    )
    raw_tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
    merged_pd = pd.concat([train, org_train])
    dataset = Dataset.from_pandas(merged_pd[["text"]])

    def train_corp_iter():
        # Feed the trainer in batches of 300 documents.
        for i in range(0, len(dataset), 300):
            yield dataset[i : i + 300]["text"]

    raw_tokenizer.train_from_iterator(train_corp_iter())
    del dataset
    gc.collect()

    return tuple(
        tokenize_texts(raw_tokenizer, frame)
        for frame in (train, org_train, sampled, remains, test)
    )


def bpe_tokenizer(train, org_train, sampled, remains, test):
    """Train a byte-level BPE tokenizer and tokenize all five frames.

    The tokenizer is trained on the ``text`` column of ``train`` and
    ``org_train`` combined, with ``VOCAB_SIZE`` entries and BERT-style special
    tokens, then applied to every frame passed in.

    Args:
        train, org_train, sampled, remains, test: frames with a ``"text"``
            column.

    Returns:
        tuple: token lists for (train, org_train, sampled, remains, test).
    """
    raw_tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))
    # The conditional must be parenthesised: without the parentheses the whole
    # list collapsed to [] whenever LOWERCASE was False, so NORMALIZER was
    # never applied.
    raw_tokenizer.normalizer = normalizers.Sequence(
        [NORMALIZER] + ([normalizers.Lowercase()] if LOWERCASE else [])
    )
    raw_tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
    special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]
    trainer = trainers.BpeTrainer(vocab_size=VOCAB_SIZE, special_tokens=special_tokens)
    merged_pd = pd.concat([train, org_train])
    dataset = Dataset.from_pandas(merged_pd[["text"]])

    def train_corp_iter():
        # Feed the trainer in batches of 1000 documents.
        for i in range(0, len(dataset), 1000):
            yield dataset[i : i + 1000]["text"]

    raw_tokenizer.train_from_iterator(train_corp_iter(), trainer=trainer)
    del dataset
    gc.collect()

    return tuple(
        tokenize_texts(raw_tokenizer, frame)
        for frame in (train, org_train, sampled, remains, test)
    )

In [8]:
def dummy(text):
    """Identity function: hand back ``text`` untouched.

    Passed to ``TfidfVectorizer`` as both ``tokenizer`` and ``preprocessor``
    so that documents which are already token lists pass through as-is.
    """
    return text


def vectorizer_fit_sampled_vectorize_all(
    train_tokens, org_train_tokens, sampled_tokens, remain_tokens, test_tokens
):
    """Build TF-IDF matrices from pre-tokenized documents.

    A first vectorizer is fitted on ``sampled_tokens`` only to obtain the
    uni/bi-gram vocabulary. A second vectorizer, restricted to that vocabulary,
    is fitted on ``remain_tokens`` and used to transform the train, original
    train and test token lists.

    Returns:
        tuple: ``(tf_train, tf_org_train, tf_test)``.
    """
    # Settings shared by both vectorizers.
    shared_kwargs = dict(
        ngram_range=(1, 2),
        lowercase=False,
        sublinear_tf=True,
        analyzer="word",
        tokenizer=dummy,
        preprocessor=dummy,
        token_pattern=None,
        strip_accents="unicode",
        binary=True,
    )

    # Pass 1: vocabulary only.
    vocab = TfidfVectorizer(**shared_kwargs).fit(sampled_tokens).vocabulary_

    # Pass 2: fixed vocabulary, fitted on the remaining documents.
    fitted = TfidfVectorizer(vocabulary=vocab, **shared_kwargs)
    fitted.fit(remain_tokens)
    matrices = tuple(
        fitted.transform(tokens)
        for tokens in (train_tokens, org_train_tokens, test_tokens)
    )
    del fitted
    gc.collect()
    return matrices

In [9]:
def tokenizer_with_vectorizer(train, org_train, sampled, remains, test, option):
    """Tokenize, vectorize and pickle the train / org_train / test features.

    Args:
        train: frame with ``text``, ``generated`` and ``fold`` columns.
        org_train: frame with ``text`` and ``generated`` columns.
        sampled: frame whose texts define the vectorizer vocabulary.
        remains: frame whose texts the fixed-vocabulary vectorizer is fitted on.
        test: frame with a ``text`` column.
        option: ``"sentence"`` (SentencePiece BPE) or ``"bpe"`` (byte-level BPE).

    Raises:
        ValueError: if ``option`` is not one of the supported tokenizers
            (previously this surfaced later as an UnboundLocalError).

    Side effects:
        Writes ``{option}_seed{SEED}_train.pkl``, ``..._org_train.pkl`` and
        ``..._test.pkl`` under ``PROCESSED_PATH`` and prints the train matrix
        shape.
    """
    # Validate before any expensive work is started.
    if option == "sentence":
        tokenize = sentence_piece_bpe_tokenizer
    elif option == "bpe":
        tokenize = bpe_tokenizer
    else:
        raise ValueError(
            f"Unknown tokenizer option {option!r}; expected 'sentence' or 'bpe'."
        )

    train_y = train["generated"].values
    org_train_y = org_train["generated"].values

    # Token lists in the order (train, org_train, sampled, remains, test),
    # which is also the argument order of the vectorizer helper.
    tokenized = tokenize(train, org_train, sampled, remains, test)
    tf_train, tf_org_train, tf_test = vectorizer_fit_sampled_vectorize_all(*tokenized)

    save_base = f"{PROCESSED_PATH}/{option}_seed{SEED}_"
    print(tf_train.shape)
    save_pickle(f"{save_base}train.pkl", [tf_train, train_y, train["fold"].values])
    save_pickle(f"{save_base}org_train.pkl", [tf_org_train, org_train_y])
    save_pickle(f"{save_base}test.pkl", tf_test)

    # Release the large intermediates before returning.
    del tokenized, tf_train, tf_org_train, tf_test
    gc.collect()


# Full runs build and pickle the features here; DEBUG runs skip this step.
if not DEBUG:
    sampled = texts_sampled_for_vectorizer_vocab
    remains = texts_remains
    tokenizer_with_vectorizer(train, org_train, sampled, remains, test, "sentence")
    # tokenizer_with_vectorizer(train, org_train, sampled, remains, test, "bpe")
    # Free the raw frames before model fitting. texts_remains must be dropped
    # too: deleting only its alias `remains` left the frame referenced, so
    # gc.collect() could not reclaim it.
    del train, org_train, sampled, remains, test
    del texts_sampled_for_vectorizer_vocab, texts_remains
    gc.collect()






100%|██████████| 50481/50481 [00:24<00:00, 2034.45it/s]
100%|██████████| 1378/1378 [00:00<00:00, 1571.46it/s]
100%|██████████| 1880/1880 [00:01<00:00, 1685.89it/s]
100%|██████████| 49979/49979 [00:25<00:00, 1970.40it/s]
100%|██████████| 3/3 [00:00<00:00, 18641.35it/s]


(50481, 245614)


# Fit & inference

## Optuna fit

In [10]:
def optuna_optimization(
    opt_mode,
    model,
    n_folds,
    data_path,
    input_type,
    first_trial_param,
    seed,
    early_count,
):
    """Run a maximizing Optuna study for one model and return the study.

    Args:
        opt_mode, model, n_folds, data_path, input_type, seed: forwarded to
            ``model_objectives``.
        first_trial_param: parameter dict enqueued as the first trial, or
            ``None`` to start directly from the sampler.
        seed: also seeds the TPE sampler.
        early_count: ``early_stop_count`` for ``OptunaEarlyStoppingCallback``.

    Returns:
        The finished ``optuna`` study (``N_OPTUNA_TRIALS`` trials at most).
    """
    stop_callback = OptunaEarlyStoppingCallback(early_stop_count=early_count)

    study = optuna.create_study(
        direction="maximize",
        study_name="Classifier",
        sampler=optuna.samplers.TPESampler(seed=seed),
    )
    if first_trial_param is not None:
        study.enqueue_trial(first_trial_param)

    def objective(trial):
        # Search space for this model type, looked up on every trial.
        search_space = select_param_type(model)
        return model_objectives(
            trial,
            N_ESTIMATORS,
            opt_mode,
            model,
            n_folds,
            data_path,
            input_type,
            seed,
            search_space,
            USE_PREV_ERROR,
            DEBUG,
        )

    study.optimize(
        objective,
        n_trials=N_OPTUNA_TRIALS,
        show_progress_bar=True,
        callbacks=[stop_callback],
    )
    print(f"Best auc: {study.best_value}")
    return study


def optuna_ensemble_optimization(seed, early_count, pred_for_ensembles, y_for_ensemble):
    """Search ensemble weights that maximize ROC AUC on held-out predictions.

    One weight in ``[-1, 1]`` is suggested per entry of ``pred_for_ensembles``
    (trial parameters ``weight_0`` … ``weight_{n-1}``). The weighted average is
    clipped to ``[0, 1]`` before scoring.

    Args:
        seed: seed for the TPE sampler.
        early_count: ``early_stop_count`` for ``OptunaEarlyStoppingCallback``.
        pred_for_ensembles: per-model predictions on the ensemble hold-out.
        y_for_ensemble: labels matching those predictions.

    Returns:
        The finished ``optuna`` study (``N_OPTUNA_ENSEMBLE_TRIALS`` trials at
        most).
    """
    early = OptunaEarlyStoppingCallback(early_stop_count=early_count)

    sampler = optuna.samplers.TPESampler(seed=seed)
    study = optuna.create_study(
        direction="maximize", study_name="Classifier", sampler=sampler
    )
    n_preds = len(pred_for_ensembles)

    def ensemble_objective(trial):
        # Plain floats, one per model. A stray trailing comma used to wrap each
        # weight in a 1-tuple, so the objective scored tuple weights while the
        # final blend is computed from the float values in best_params.
        weights = [
            trial.suggest_float(f"weight_{i}", -1.0, 1.0) for i in range(n_preds)
        ]
        pred = np.clip(weighted_average_preds(pred_for_ensembles, weights), 0, 1)
        return roc_auc_score(y_for_ensemble, pred)

    study.optimize(
        ensemble_objective,
        n_trials=N_OPTUNA_ENSEMBLE_TRIALS,
        show_progress_bar=False,
        callbacks=[early],
    )
    print(f"Best auc: {study.best_value}")
    return study

# DEBUG runs cap SGD at 100 iterations.
SGD_SENTENCE_PARAMS["max_iter"] = 100 if DEBUG else SGD_SENTENCE_PARAMS["max_iter"]

# Arguments shared by every per-model study; "model" and "first_trial_param"
# are overwritten for each model below.
optuna_opt_args = {
    "opt_mode": "coarse",
    "model": "LGBM",
    "n_folds": N_FOLDS,
    "data_path": PROCESSED_PATH,
    "input_type": "sentence",
    "first_trial_param": None,
    "seed": SEED,
    "early_count": OPTUNA_EARLY_STOP_COUNT,
}


def _tune_best_params(model_name, first_trial_param):
    """Run one study for ``model_name`` seeded with ``first_trial_param``.

    Updates ``optuna_opt_args`` in place, runs ``optuna_optimization`` and
    returns the study's ``best_params`` after releasing the study.
    """
    optuna_opt_args["first_trial_param"] = first_trial_param
    optuna_opt_args["model"] = model_name
    study = optuna_optimization(**optuna_opt_args)
    best_params = study.best_params
    del study
    gc.collect()
    return best_params


# Each seed dict is replaced by the best parameters its study found.
# NOTE(review): best_params comes from the study, not from the seed dict —
# confirm that fixed keys in the seeds (e.g. 'verbosity') are restored
# downstream if they are still needed.
LGBM_SENTENCE_PARAMS = _tune_best_params("LGBM", LGBM_SENTENCE_PARAMS)
SGD_SENTENCE_PARAMS = _tune_best_params("SGD", SGD_SENTENCE_PARAMS)
MULTINOMIALNB_SENTENCE_PARAMS = _tune_best_params(
    "multinomialNB", MULTINOMIALNB_SENTENCE_PARAMS
)
COMPLEMENTNB_SENTENCE_PARAMS = _tune_best_params(
    "complementNB", COMPLEMENTNB_SENTENCE_PARAMS
)
RIDGE_SENTENCE_PARAMS = _tune_best_params("ridge", RIDGE_SENTENCE_PARAMS)
PASSIVE_SENTENCE_PARAMS = _tune_best_params(
    "passiveAggresive", PASSIVE_SENTENCE_PARAMS
)

## Inference

Don't forget to clip ensemble

In [11]:
def pred_and_append_it(
    model_obj_args, model_name, model_params, pred_tests, pred_for_ensembles
):
    """Run ``model_objectives`` for one model and collect its predictions.

    Args:
        model_obj_args: base keyword arguments for ``model_objectives``. The
            dict is not modified; the model name and parameters are applied to
            a per-call copy, so one model's settings cannot leak into a later
            use of the shared dict.
        model_name: value for the ``model_name`` keyword.
        model_params: value for the ``model_params`` keyword.
        pred_tests: collection of test predictions gathered so far.
        pred_for_ensembles: collection of ensemble hold-out predictions
            gathered so far.

    Returns:
        tuple: ``(pred_tests, pred_for_ensembles, y_for_ensemble)`` with this
        model's predictions added through ``append_preds``.
    """
    call_args = {
        **model_obj_args,
        "model_name": model_name,
        "model_params": model_params,
    }
    pred_test, pred_for_ensemble, y_for_ensemble = model_objectives(**call_args)
    pred_tests = append_preds(pred_tests, pred_test)
    pred_for_ensembles = append_preds(pred_for_ensembles, pred_for_ensemble)
    return pred_tests, pred_for_ensembles, y_for_ensemble

In [12]:
fit_method = "infer_one_fold" if DEBUG else "infer_all_folds"
# SGD_SENTENCE_PARAMS["max_iter"] = 100 if DEBUG else SGD_SENTENCE_PARAMS["max_iter"]
# NOTE(review): this unconditional override makes the DEBUG-dependent choice
# above dead code — confirm "infer_one_fold" is intended for full runs.
fit_method = "infer_one_fold"

# Keyword arguments shared by every model_objectives call; model_name and
# model_params are supplied per model by pred_and_append_it.
model_obj_args = {
    "trial": None,
    "n_estimators": N_ESTIMATORS,
    "opt_mode": fit_method,
    "model_name": "LGBM",
    "n_folds": N_FOLDS,
    "data_path": PROCESSED_PATH,
    "input_type": "sentence",
    "seed": SEED,
    "model_params": LGBM_SENTENCE_PARAMS,
    "use_prev_error": USE_PREV_ERROR,
    "debug": DEBUG,
}

pred_tests = []
pred_for_ensembles = []

# Fit/infer each model in a fixed order; the position of a model here is the
# index of its ensemble weight later on.
for model_name, model_params in (
    ("LGBM", LGBM_SENTENCE_PARAMS),
    ("SGD", SGD_SENTENCE_PARAMS),
    ("multinomialNB", MULTINOMIALNB_SENTENCE_PARAMS),
    ("complementNB", COMPLEMENTNB_SENTENCE_PARAMS),
    ("ridge", RIDGE_SENTENCE_PARAMS),
    ("passiveAggresive", PASSIVE_SENTENCE_PARAMS),
):
    pred_tests, pred_for_ensembles, y_for_ensemble = pred_and_append_it(
        model_obj_args, model_name, model_params, pred_tests, pred_for_ensembles
    )





## Ensemble param optimization
Optuna optimization of the weights of the preds

In [13]:
# Silence Optuna's per-trial logging for the long weight search.
optuna.logging.disable_default_handler()
ensemble_study = optuna_ensemble_optimization(
    SEED,
    OPTUNA_ENSEMBLE_EARLY_STOP_COUNT,
    pred_for_ensembles,
    y_for_ensemble,
)
print(ensemble_study.best_params)

# Read the weights by explicit index so weight_i always pairs with the i-th
# model's predictions, independent of dict ordering.
model_weights = [
    ensemble_study.best_params[f"weight_{i}"]
    for i in range(len(pred_for_ensembles))
]

# Clip exactly as the ensemble objective does, so the submitted scores are
# produced the same way as the ones the weights were optimized for.
sentence_pred = np.clip(weighted_average_preds(pred_tests, model_weights), 0, 1)

Optuna study early stopped by early_stop_count
Best auc: 0.9992349274473428
{'weight_0': -0.11483735183271758, 'weight_1': -0.8917005018676765, 'weight_2': 0.32939724923805924, 'weight_3': -0.32093536935194145, 'weight_4': -0.09556739060888797, 'weight_5': -0.3984398774889969}


In [14]:
# The sentence-level ensemble is the final prediction; write it into the
# submission template and save it.
final_pred = sentence_pred
sub = sub.assign(generated=final_pred)
sub.to_csv("submission.csv", index=False)
sub

Unnamed: 0,id,generated
0,0000aaaa,0.257108
1,1111bbbb,0.256556
2,2222cccc,0.269426
