In [9]:
import sys
import gc
import pandas as pd
import numpy as np

from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import roc_auc_score
from lightgbm import LGBMClassifier, early_stopping
from catboost import CatBoostClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer
)

from datasets import Dataset
from tqdm.auto import tqdm
from transformers import PreTrainedTokenizerFast

from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import VotingClassifier

In [2]:
test = pd.read_csv("fake_test_essays.csv")
sub = pd.read_csv("sample_submission.csv")
org_train = pd.read_csv("train_essays.csv")
train = pd.read_csv("train_v2_drcat_02.csv", sep=",")

In [3]:
class CFG:
    IS_TRAIN_ON_FULL = True
    HALF_TRAIN_SAMPLE = 22500
    RANDOM_STATE = 42
    LOWER_CASE = False
    VOCAB_SIZE = 30522

In [4]:
if CFG.IS_TRAIN_ON_FULL:
    print("-----Using full training data-----")
    train = train.drop_duplicates(subset=["text"])
    train = train.sample(len(train))
    print("The shape of training dataset is:", train.shape)
    train.reset_index(drop=True, inplace=True)
    print(train.head())
else:
    print("-----Using partial training data-----")
    train = train.drop_duplicates(subset=["text"])
    train_label_0 = train[train["label"] == 0]
    train_label_1 = train[train["label"] == 1]
    train_label_0 = train_label_0.sample(
        CFG.HALF_TRAIN_SAMPLE, random_state=CFG.RANDOM_STATE
    )
    train_label_1 = train_label_1.sample(
        CFG.HALF_TRAIN_SAMPLE, random_state=CFG.RANDOM_STATE
    )
    train = pd.concat([train_label_0, train_label_1])
    train = train.sample(len(train))
    print("The shape of training dataset is:", train.shape)
    train.reset_index(drop=True, inplace=True)
    print(train.head())

-----Using full training data-----
The shape of training dataset is: (44868, 5)
                                                text  label  \
0  Have you ever woken up in the morning and wish...      0   
1  In the article "Making Mona Lisa Smile," the a...      0   
2  The proliferation of high school programs that...      1   
3  Dear Principle,\n\nIn my opinion I choose poli...      0   
4  Participating in New Adventures  \n\nThere are...      1   

                     prompt_name             source  RDizzl3_seven  
0              Distance learning    persuade_corpus          False  
1    Facial action coding system    persuade_corpus           True  
2              Distance learning      chat_gpt_moth          False  
3          Cell phones at school    persuade_corpus          False  
4  "A Cowboy Who Rode the Waves"  darragh_claude_v6           True  


### 使用bpe_trainer

In [5]:
raw_tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))
raw_tokenizer.normalizer = normalizers.Sequence(
    [normalizers.NFC()] + [normalizers.Lowercase()] if CFG.LOWER_CASE else []
)
raw_tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()

special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]
trainer = trainers.BpeTrainer(
    vocab_size=CFG.VOCAB_SIZE,
    special_tokens=special_tokens
)

dataset = Dataset.from_pandas(test[["text"]])


def train_corpus():
    for i in tqdm(range(0, len(dataset), 100)):
        yield dataset[i:i + 100]["text"]


raw_tokenizer.train_from_iterator(train_corpus(), trainer=trainer)
tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=raw_tokenizer,
    unk_token="[UNK]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    mask_token="[MASK]",
)

tokenized_texts_test = []
for text in tqdm(test["text"].tolist()):
    tokenized_texts_test.append(tokenizer.tokenize(text))

tokenized_texts_train = []
for text in tqdm(train["text"].tolist()):
    tokenized_texts_train.append(tokenizer.tokenize(text))

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/44868 [00:00<?, ?it/s]

In [6]:
def dummy(text):
    return text


vectorizer = TfidfVectorizer(
    ngram_range=(3, 5),
    lowercase=False,
    sublinear_tf=True,
    analyzer="word",
    tokenizer=dummy,
    preprocessor=dummy,
    token_pattern=None,
    strip_accents="unicode"
)

vectorizer.fit(tokenized_texts_test)
vocab = vectorizer.vocabulary_
print(len(vocab))

vectorizer = TfidfVectorizer(
    ngram_range=(3, 5),
    lowercase=False,
    sublinear_tf=True,
    vocabulary=vocab,
    analyzer="word",
    tokenizer=dummy,
    preprocessor=dummy,
    token_pattern=None,
    strip_accents="unicode"
)

X_train = vectorizer.fit_transform(tokenized_texts_train)
y_train = train["label"].values
X_test = vectorizer.transform(tokenized_texts_test)

del vectorizer
gc.collect()

2590


21

In [7]:
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.1, random_state=42
)

In [13]:
if len(test["text"].values) <= 5:
    sub.to_csv("submission.csv", index=False)

else:
    clf = MultinomialNB(alpha=0.02)

    sgd_model = SGDClassifier(
        max_iter=8000,
        tol=1e-4,
        loss="modified_huber",
        early_stopping=True,
        validation_fraction=0.1,
        n_iter_no_change=100
    )

    lgbm_params = {
        "n_iter": 4000,
        "verbose": 1,
        "objective": "cross_entropy",
        "metric": "auc",
        "learning_rate": 0.0015
    }
    lgb = LGBMClassifier(**lgbm_params)
    lgb.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        callbacks=[early_stopping(50)]
    )

    cat = CatBoostClassifier(
        iterations=4000,
        verbose=0,
        subsample=0.4,
        allow_const_label=True,
        loss_function="CrossEntropy"
    )
    cat.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        early_stopping_rounds=100
    )

    weights = [0.1, 0.3, 0.3, 0.3]
    ensemble = VotingClassifier(
        estimators=[
            ("mnb", clf),
            ("sgd", sgd_model),
            ("lgb", lgb),
            ("cat", cat)
        ],
        weights=weights,
        voting="soft",
        n_jobs=-1
    )

    ensemble.fit(X_train, y_train)

    gc.collect()
    final_preds = ensemble.predict_proba(X_test)[:, 1]
    print(final_preds)
    sub["generated"] = final_preds
    sub.to_csv("submission.csv", index=False)
    sub



[LightGBM] [Info] [cross_entropy:Init]: (objective) labels passed interval [0, 1] check
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001465 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 100
[LightGBM] [Info] Number of data points in the train set: 40381, number of used features: 18
[LightGBM] [Info] [cross_entropy:BoostFromScore]: pavg = 0.390109 -> initscore = -0.446853
[LightGBM] [Info] Start training from score -0.446853
Training until validation scores don't improve for 1 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 0.509946
[0.64029319 0.62358555 0.63206514 0.65514206 0.63335561 0.62087081
 0.65233232 0.6480083  0.62852417 0.6466322  0.64972795 0.6374582
 0.61236051 0.62739167 0.62761491 0.65181104 0.62887691 0.62380708
 0.65513836 0.62633624 0.61970155 0.66443236 0.62929424 0.60579512
 0.61167191 0.61925762 0.

ValueError: Length of values (100) does not match length of index (3)