In [1]:
import numpy as np
import pandas as pd
import pickle
import random
import copy
import gc

from tokenizers import decoders, models, normalizers, pre_tokenizers, processors, trainers, Tokenizer

from transformers import PreTrainedTokenizerFast

from datasets import Dataset
from tqdm.auto import tqdm

import transformers
import os
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import torch
from transformers import AutoTokenizer

from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.decomposition import TruncatedSVD, NMF, LatentDirichletAllocation

from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier, StackingClassifier, ExtraTreesClassifier, RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier



In [2]:
# Competition inputs: essays to score, the submission template, and the
# competition's own train_essays.csv.
test = pd.read_csv('/kaggle/input/llm-detect-ai-generated-text/test_essays.csv')
submission = pd.read_csv('/kaggle/input/llm-detect-ai-generated-text/sample_submission.csv')
origin = pd.read_csv('/kaggle/input/llm-detect-ai-generated-text/train_essays.csv')

# Optional row filter, currently disabled:
# train = train[train["RDizzl3_seven"] == True]

# Training corpus (daigt-v2-train-dataset): read it, keep one row per distinct
# essay text, and renumber the surviving rows from 0.
train = (
    pd.read_csv('/kaggle/input/daigt-v2-train-dataset/train_v2_drcat_02.csv', sep=',')
    .drop_duplicates(subset=['text'])
    .reset_index(drop=True)
)

In [3]:
# Shuffle the training rows once.  The StratifiedKFold splitters used later are
# created with shuffle=False, so this permutation decides which rows land in
# which fold.  A fixed seed keeps that assignment identical from run to run,
# and DataFrame.sample reorders the frame in a single step instead of
# rebuilding it from one Series per row.
train = train.sample(frac=1, random_state=42).reset_index(drop=True)

In [4]:
# Test essays followed by training essays, one row per distinct text,
# re-indexed from 0.
data_df = (
    pd.concat([test.copy(), train.copy()])
    .drop_duplicates(subset=['text'])
    .reset_index(drop=True)
)

class cfg:
    """Notebook-wide settings for tokenization and TF-IDF vectorization."""
    # Lower-case text before tokenizing; also forwarded to TfidfVectorizer(lowercase=...).
    LOWERCASE = False
    # Vocabulary size requested from the BPE trainer.
    # NOTE(review): 30522 in the trailing comment looks like an earlier setting — confirm.
    VOCAB_SIZE = 300000 # 30522  # len(tokenizer)

def train_iter(dataset, batch_size=1000):
    """Yield the ``'text'`` field of ``dataset`` in consecutive batches.

    Parameters
    ----------
    dataset
        Any sized object for which ``dataset[start:stop]['text']`` is valid.
    batch_size : int, default 1000
        Number of rows per yielded batch; the final batch may be shorter.

    Yields
    ------
    Whatever ``dataset[start:stop]['text']`` evaluates to for each batch.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1 (raised when iteration starts).
    """
    # A negative step would make range() empty and silently yield nothing.
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")
    for start in range(0, len(dataset), batch_size):
        yield dataset[start:start + batch_size]['text']
        
# Tokens handed to the BPE trainer as special tokens; the trailing comment keeps an alternative ordering for reference.
special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]  # "[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"

# Dataset holding only the 'text' column of data_df; it is the corpus fed to train_iter for tokenizer training.
dataset = Dataset.from_pandas(data_df[['text']])  # test[["text"]]

# with open("/kaggle/input/daigt-vocav/tokenized_vocab_all.pkl", "rb") as tf:
#     vocab_t0 = pickle.load(tf)

In [5]:
# BPE tokenizer trained from scratch on the essays in `dataset`.
raw_tokenizer = Tokenizer(
    models.BPE(unk_token = "[UNK]")  # vocab = vocab_t0, merges = [],
)

# Always apply NFC Unicode normalisation and add lower-casing only when
# configured.  The inner parentheses are required: a conditional expression
# binds more loosely than `+`, so without them the whole sum became the "true"
# branch and LOWERCASE = False produced an empty normalizer list (no NFC).
raw_tokenizer.normalizer = normalizers.Sequence(
    [normalizers.NFC()] + ([normalizers.Lowercase()] if cfg.LOWERCASE else [])
)

raw_tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()

trainer = trainers.BpeTrainer(vocab_size=cfg.VOCAB_SIZE, special_tokens=special_tokens)

# Learn the merges from the text batches produced by train_iter.
raw_tokenizer.train_from_iterator(train_iter(dataset), trainer=trainer)

# Wrap the trained tokenizer in the transformers fast-tokenizer interface.
tokenizer = PreTrainedTokenizerFast(  # it might be better to use a pretrained tokenizer instead
    tokenizer_object = raw_tokenizer,
    unk_token = "[UNK]",
    pad_token = "[PAD]",
    cls_token = "[CLS]",
    sep_token = "[SEP]",
    mask_token = "[MASK]",
)






In [6]:
# Token lists for every essay, test set first and then the training set,
# each pass wrapped in its own progress bar.
tokenized_test = [tokenizer.tokenize(essay) for essay in tqdm(test['text'].tolist())]
tokenized_train = [tokenizer.tokenize(essay) for essay in tqdm(train['text'].tolist())]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/44868 [00:00<?, ?it/s]

In [7]:
def dummy(text):
    """Return ``text`` unchanged.

    Supplied to TfidfVectorizer as both ``tokenizer`` and ``preprocessor`` so
    that the already-tokenized essays are used as they are.
    """
    return text
# Keyword arguments common to both TF-IDF passes: 3- to 5-grams over the
# pre-tokenized essays, with `dummy` standing in for the preprocessing and
# tokenizing steps.
tfidf_options = dict(
    ngram_range = (3,5),
    lowercase = cfg.LOWERCASE,
    sublinear_tf = True,
    analyzer = 'word',
    tokenizer = dummy,
    preprocessor = dummy,
    token_pattern = None,
    strip_accents = 'unicode',
)

# Pass 1: collect the n-gram vocabulary that occurs in the test essays.
vocab = TfidfVectorizer(**tfidf_options).fit(tokenized_test).vocabulary_

#print(len(vocab))

# Pass 2: fit on the training essays restricted to that test vocabulary, then
# project both sets into the same feature space.
vectorizer = TfidfVectorizer(vocabulary = vocab, **tfidf_options)

tf_train = vectorizer.fit_transform(tokenized_train)
tf_test = vectorizer.transform(tokenized_test)

In [8]:
use_GPC = False  # set True to add Gaussian-process out-of-fold features to the stack

### GPC-Training
# Runs only when enabled and when the test set has more than 5 essays.
# Produces goof_train (out-of-fold probabilities for the training rows) and
# goof_test (fold-averaged probabilities for the test rows).
if use_GPC and len(test.text.values) > 5:
    batch = 2000   # rows per Gaussian-process model
    n_splits = 5

    # Fit ONE SVD projection on the whole training TF-IDF matrix and reuse it
    # for the test matrix.  Re-fitting per 2000-row batch (as before) put every
    # batch into a different latent basis, and the test rows were projected
    # with whichever batch happened to be fitted last.
    svd = TruncatedSVD(n_components=100, n_iter=7, random_state=42)
    svd_train = svd.fit_transform(tf_train)
    svd_test = svd.transform(tf_test)

    skf = StratifiedKFold(
        n_splits = n_splits,
        shuffle = False
    )
    y = train['label'].values

    goof_train = np.zeros(svd_train.shape[0])
    goof_test_skf = np.zeros((n_splits, svd_test.shape[0]))

    for i, (tr_idx, val_idx) in enumerate(skf.split(svd_train, y)):

        X_train, X_valid = svd_train[tr_idx], svd_train[val_idx]
        y_train = y[tr_idx]

        # estimator.fit() starts from scratch on every call, so calling it once
        # per batch on the same object kept only the last batch.  Train one
        # model per batch instead and average their predicted probabilities.
        valid_proba = np.zeros(X_valid.shape[0])
        test_proba = np.zeros(svd_test.shape[0])
        n_models = 0

        for bs in tqdm(range(0, len(X_train), batch)):
            y_bs = y_train[bs:bs + batch]
            # A classifier cannot be trained on a batch holding a single class.
            if np.unique(y_bs).size < 2:
                continue
            gpc_bs = GaussianProcessClassifier()
            gpc_bs.fit(X_train[bs:bs + batch], y_bs)
            valid_proba += gpc_bs.predict_proba(X_valid)[:, 1]
            test_proba += gpc_bs.predict_proba(svd_test)[:, 1]
            n_models += 1

        if n_models == 0:
            raise ValueError("no training batch contained both classes")

        goof_train[val_idx] = valid_proba / n_models
        goof_test_skf[i, :] = test_proba / n_models

    # Average the per-fold test predictions.
    goof_test = goof_test_skf.mean(axis=0)

    del svd, svd_train, svd_test, goof_test_skf
    gc.collect()

In [9]:
# Drop the fitted TF-IDF vectorizer and run a garbage-collection pass; the
# tf_train / tf_test matrices built from it are kept.
del vectorizer
gc.collect()

21

In [10]:
model_checkpoint = "/kaggle/input/detect-llm-models/distilroberta-finetuned_v5/checkpoint-13542"
# NOTE: this rebinds `tokenizer`, replacing the BPE tokenizer built earlier.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)


def preprocess_function(examples):
    """Tokenize a batch of essays, truncating to 512 tokens and padding within the batch."""
    return tokenizer(examples['text'], max_length = 512 , padding=True, truncation=True)


def _class0_probability(logits):
    """Return column 0 of the row-wise softmax of ``logits``.

    The row maximum is subtracted before exponentiating.  This leaves the
    softmax unchanged mathematically but keeps ``exp`` from overflowing on
    large logits.

    NOTE(review): column 0 is taken here while the other stacked features use
    column 1 of ``predict_proba`` — presumably this matches the checkpoint's
    label order; confirm.
    """
    shifted = logits - np.max(logits, axis=-1, keepdims=True)
    exp_shifted = np.exp(shifted)
    return (exp_shifted / np.sum(exp_shifted, axis=-1, keepdims=True))[:, 0]


num_labels = 2
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Move the model to the GPU when one is available
model.to(device);
trainer = Trainer(
    model,
    tokenizer=tokenizer,
)

## Transformer predictions on the training essays (used as a stacking feature)
train_ds = Dataset.from_pandas(train[['text']])
train_ds_enc = train_ds.map(preprocess_function, batched=True)
train_preds = trainer.predict(train_ds_enc)
logits_tr = train_preds.predictions
probs_tr = _class0_probability(logits_tr)


## Transformer predictions on the test essays
#test = pd.read_csv('/kaggle/input/llm-detect-ai-generated-text/test_essays.csv')
test_ds = Dataset.from_pandas(test)
test_ds_enc = test_ds.map(preprocess_function, batched=True)
test_preds = trainer.predict(test_ds_enc)
logits = test_preds.predictions
probs = _class0_probability(logits)

  0%|          | 0/45 [00:00<?, ?ba/s]

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


  0%|          | 0/1 [00:00<?, ?ba/s]

In [11]:
# Candidate estimators.  Not all of them are used by the stacking cell.
# The scikit-learn estimators that draw random numbers (SGD, ExtraTrees,
# RandomForest, saga LogisticRegression) get a fixed random_state so that
# repeated runs produce the same models.

bayes_model = MultinomialNB(alpha=0.02)  # + GaussianNB + CategoricalNB

SGD_model = SGDClassifier(
    max_iter = 8000,
    tol = 1e-4,
    loss = "modified_huber",
    random_state = 42
)

kNN_model = KNeighborsClassifier(
    n_neighbors = 10,
    metric = 'cosine'
)

# LightGBM hyper-parameters.
p6 = {'n_iter': 2500,'verbose': -1,'objective': 'cross_entropy','metric': 'auc',
    'learning_rate': 0.00581909898961407, 'colsample_bytree': 0.78,
    'colsample_bynode': 0.8, 'lambda_l1': 4.562963348932286, 
    'lambda_l2': 2.97485, 'min_data_in_leaf': 115, 'max_depth': 23, 'max_bin': 898}
LGBM_model = LGBMClassifier(**p6)  # 20min

Cat_model = CatBoostClassifier(iterations=2000,
                           verbose=0,
                           l2_leaf_reg=6.6591278779517808,
                           learning_rate=0.005599066836106983,
                           subsample = 0.4,
                           allow_const_label=True,
                           loss_function = 'CrossEntropy')  # 6h

ETR_model = ExtraTreesClassifier(
    n_estimators=100,
    criterion='gini',
    random_state=42
)

RF_model = RandomForestClassifier(criterion='entropy', random_state=42)

GPC_model = GaussianProcessClassifier()

LR_model = LogisticRegression(
    penalty = "elasticnet",
    solver = "saga",
    max_iter = 500,
    l1_ratio = 0.5,
    random_state = 42
)

XGB_model = XGBClassifier(
    objective = 'binary:logistic', 
    eval_metric = 'auc',
    eta = 0.01,
)

In [12]:
def _oof_features(est, X_tr, y_tr, X_te, cv):
    """Build one stacking feature from ``est`` by cross-validation.

    For every fold of ``cv`` a fresh copy of ``est`` is fitted on the training
    part, then used to predict the held-out part of ``X_tr`` and all of
    ``X_te``.

    Parameters
    ----------
    est : estimator exposing ``fit`` and ``predict_proba``
    X_tr, X_te : row-indexable feature matrices for train and test
    y_tr : array of training labels
    cv : splitter exposing ``split`` and ``get_n_splits``

    Returns
    -------
    (oof_tr, te_mean) : out-of-fold column-1 probabilities for the training
        rows, and column-1 test probabilities averaged over the folds.
    """
    n_folds = cv.get_n_splits()
    oof_tr = np.zeros(X_tr.shape[0])
    te_by_fold = np.zeros((n_folds, X_te.shape[0]))

    for i, (tr_idx, val_idx) in enumerate(cv.split(X_tr, y_tr)):
        print(f'[CV : {est}] {i+1}/{n_folds}')
        # Deep copy so the estimator object passed in is never fitted itself.
        est_cv = copy.deepcopy(est)
        est_cv.fit(X_tr[tr_idx], y_tr[tr_idx])

        oof_tr[val_idx] = est_cv.predict_proba(X_tr[val_idx])[:,1]
        te_by_fold[i, :] = est_cv.predict_proba(X_te)[:,1]

    return oof_tr, te_by_fold.mean(axis=0)


if len(test.text.values) <= 5:
    # With 5 or fewer test essays, write the sample submission unchanged.
    submission.to_csv("submission.csv", index=False)

else:
    ### 1.Predict estimators
    estimators = [
        SGD_model,
        bayes_model,
        LGBM_model,
        #Cat_model,
        #ETR_model,
        RF_model,
    ]

    skf = StratifiedKFold(
        n_splits = 5,
        shuffle = False
    )

    y = train['label'].values
    fin_train = []
    fin_test = []

    # One stacking column per base estimator.
    for est in estimators:
        oof_train, oof_test = _oof_features(est, tf_train, y, tf_test, skf)
        fin_train.append(oof_train)
        fin_test.append(oof_test)

    # Optional Gaussian-process column, then the transformer column.
    if use_GPC:
        fin_train.append(goof_train)
        fin_test.append(goof_test)
    fin_train.append(probs_tr)
    fin_test.append(probs)

    final_train = np.stack(fin_train, axis=1)
    final_test = np.stack(fin_test, axis=1)


    ### 2.Stacking
    #Stack_model = LGBM_model # XGBClassifier(**fin)
    Stack_model = VotingClassifier(
        estimators = [
            ("lgb", LGBM_model),
            ("lr", LR_model),
        ],
        weights = [0.7, 0.3],
        voting = 'soft',
        n_jobs = -1
    )

    Stack_model.fit(final_train, y)

    final_preds = Stack_model.predict_proba(final_test)[:,1]

    # Keep full float precision: casting to float16 merges nearby scores into
    # ties before they are written out.
    submission['generated'] = final_preds
    submission.to_csv("submission.csv", index=False)

# Kept at top level: a bare expression inside the if/else suite is never
# rendered as the cell's output.
submission