In [1]:
import sys
import os
import gc

import pandas as pd
from sklearn.model_selection import StratifiedKFold
import numpy as np
from sklearn.metrics import roc_auc_score

from sklearn.feature_extraction.text import TfidfVectorizer
import optuna

from datasets import Dataset
from tqdm.auto import tqdm
from transformers import PreTrainedTokenizerFast
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.svm import SVC

from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import VotingClassifier

from optuna.integration.wandb import WeightsAndBiasesCallback
import wandb
import optuna

sys.path.append(f"{os.getcwd()[:-10]}/main")
from utils import load_pickle, OptunaEarlyStoppingCallback
from modules import model_objectives
from params import select_param_type

# SECURITY: never commit a W&B API key to the notebook. The key is read from the
# WANDB_API_KEY environment variable; when it is unset, `key=None` lets wandb fall
# back to its own credential lookup (e.g. a previous `wandb login`).
# NOTE(review): the key that used to be hard-coded on this line is exposed in the
# file history and should be revoked/rotated in the W&B account settings.
wandb.login(key=os.environ.get("WANDB_API_KEY"))

  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mjunseonglee[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/junseonglee/.netrc


True

# Parameters

In [2]:
# --- Experiment identity ---------------------------------------------------
MODEL = "LGBM"  # LGBM, XGB, CatBoost
INPUT_TYPE = "sentence"  # sentence, bpe
N_FOLDS = 11
SEED = 7

# --- Data location / preprocessing -----------------------------------------
ROOT = "../input"
PROCESSED_PATH = f"{ROOT}/230113_model-wise-split_all-data"
# LOWERCASE and VOCAB_SIZE are not referenced by the cells shown in this notebook.
LOWERCASE = False
VOCAB_SIZE = 30_522

# --- Training / search budget ----------------------------------------------
N_ESTIMATORS = 2_000  # forwarded to `model_objectives`
N_OPTUNA_TRIALS = 3_000  # `n_trials` of `study.optimize`
OPTUNA_EARLY_STOP_COUNT = 50  # `early_stop_count` of OptunaEarlyStoppingCallback

# Settings attached to the W&B run as its `config`.
configs = dict(
    model=MODEL,
    N_FOLDS=N_FOLDS,
    INPUT_TYPE=INPUT_TYPE,
    N_OPTUNA_ITERATIONs=N_OPTUNA_TRIALS,
    OPTUNA_EARLY_STOP_COUNT=OPTUNA_EARLY_STOP_COUNT,
)

In [3]:
def select_model_objectives(
    trial,
    opt_mode="coarse",
    model=MODEL,
    n_folds=N_FOLDS,
    data_path=PROCESSED_PATH,
    input_type=INPUT_TYPE,
    seed=SEED,
):
    """Evaluate one Optuna trial through the shared `model_objectives` routine.

    The parameter definition for `model` is looked up with `select_param_type`
    and forwarded, together with the module-level `N_ESTIMATORS` and the
    arguments below, to `model_objectives`.

    Args:
        trial: Trial object supplied by `study.optimize`.
        opt_mode: Search mode label passed through unchanged ("coarse" or
            "finetune" in this notebook).
        model: Model name; also the key used for `select_param_type`.
        n_folds: Fold count passed through to `model_objectives`.
        data_path: Directory of the preprocessed data.
        input_type: Input representation name (e.g. "sentence", "bpe").
        seed: Seed passed through to `model_objectives`.

    Returns:
        Whatever `model_objectives` returns (presumably the score the study
        maximises -- confirm in `modules.model_objectives`).

    Note:
        The defaults are bound from the config constants when this cell runs;
        re-run the cell after editing the configuration.
    """
    search_space = select_param_type(model)

    # `model_objectives` is called positionally, so this order is significant.
    objective_args = (
        trial,
        N_ESTIMATORS,
        opt_mode,
        model,
        n_folds,
        data_path,
        input_type,
        seed,
        search_space,
    )
    return model_objectives(*objective_args)


def select_model_objectives_coarse(trial):
    """Single-argument Optuna objective that scores `trial` with opt_mode="coarse"."""
    coarse_result = select_model_objectives(trial, opt_mode="coarse")
    return coarse_result


def select_model_objectives_finetuning(trial):
    """Single-argument Optuna objective that scores `trial` with opt_mode="finetune"."""
    finetune_result = select_model_objectives(trial, opt_mode="finetune")
    return finetune_result

In [4]:
def optuna_optimization(opt_mode="coarse", first_trial_param=None):
    """Run an Optuna study for the configured model and log it to Weights & Biases.

    Args:
        opt_mode: "coarse" selects `select_model_objectives_coarse`; any other
            value selects `select_model_objectives_finetuning`. The value is
            also embedded in the W&B run name.
        first_trial_param: Optional parameter dict enqueued so that it is
            evaluated as the study's first trial.

    Returns:
        The `optuna.Study` after optimisation has stopped.

    Note:
        The W&B run is always closed with `wandb.finish()`, including when the
        search raises or is interrupted (e.g. the kernel is stopped during a
        long run), so no run is left open in the kernel. The exception itself
        is still propagated to the caller.
    """
    wandb_kwargs = {
        "project": "Detect AI-Generated Text",
        "group": f"{MODEL}_params",
        "name": f"default_{INPUT_TYPE}_{opt_mode}",
        "config": configs,
    }
    # The callback is created first; everything after it runs under
    # try/finally so the W&B run it is associated with gets finished.
    wandbc = WeightsAndBiasesCallback(wandb_kwargs=wandb_kwargs)
    try:
        early = OptunaEarlyStoppingCallback(early_stop_count=OPTUNA_EARLY_STOP_COUNT)

        study = optuna.create_study(direction="maximize", study_name="Classifier")
        if first_trial_param is not None:
            study.enqueue_trial(first_trial_param)

        objective = (
            select_model_objectives_coarse
            if opt_mode == "coarse"
            else select_model_objectives_finetuning
        )
        study.optimize(
            objective,
            n_trials=N_OPTUNA_TRIALS,
            show_progress_bar=True,
            callbacks=[wandbc, early],
        )
    finally:
        wandb.finish()
    return study

In [None]:
# Launch the hyper-parameter search (requests up to N_OPTUNA_TRIALS trials).
# Coarse optimization with only 1 fold (currently disabled):
# coarse_study = optuna_optimization("coarse")
# Finetuning optimization with all N_FOLDS; the returned study is kept in
# `finetune_study` for later inspection.
finetune_study = optuna_optimization("finetune")

In [14]:
# Load the fold-0 training split for ad-hoc inspection. Judging by the outputs
# recorded further down, `tf_train` is a sparse feature matrix and `y_train`
# holds the matching labels.
# NOTE(review): `load_pickle` is a project helper; if it wraps `pickle.load`,
# only point it at trusted files.
tf_train, y_train = load_pickle(
    f"{PROCESSED_PATH}/{INPUT_TYPE}_seed{SEED}_fold{0}_train.pkl"
)

In [19]:
# Shape of the label vector. The cell previously ended in a dangling `y_train.`
# (a SyntaxError on re-run); the recorded output `(33598,)` is the shape tuple.
y_train.shape

(33598,)

In [20]:
# Draw a reproducible random subsample of the training rows.
TRAIN_SUBSAMPLE_FRACTION = 0.8  # share of rows kept

np.random.seed(0)  # fixed seed so the same rows are selected on every run
shuffled_rows = np.arange(len(y_train))
np.random.shuffle(shuffled_rows)

# `sampler` holds the selected row indices (the first 80% of the shuffled order).
sampler = shuffled_rows[: int(len(shuffled_rows) * TRAIN_SUBSAMPLE_FRACTION)]

# Keep both subsets under explicit names; the features subset used to be
# computed as a bare expression in the middle of the cell and thrown away.
tf_train_sample = tf_train[sampler]
y_train_sample = np.array(y_train)[sampler]
y_train_sample

array([1, 0, 0, ..., 0, 0, 0])

In [16]:
# Sanity check: select rows of the feature matrix with a list of indices
# (the recorded output is a 3-row CSR sparse matrix).
tf_train[[0, 4, 56]]

<3x6487706 sparse matrix of type '<class 'numpy.float64'>'
	with 837 stored elements in Compressed Sparse Row format>

In [17]:
# Number of training labels (recorded output: 33598).
len(y_train)

33598