In [1]:
import sys
import gc
import pandas as pd
import numpy as np

from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import roc_auc_score
from lightgbm import LGBMClassifier, early_stopping
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import hstack

from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer
)

from datasets import Dataset
from tqdm.auto import tqdm
from transformers import PreTrainedTokenizerFast

from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.ensemble import VotingClassifier

import warnings
warnings.filterwarnings("ignore")

In [2]:
test = pd.read_csv("fake_test_essays.csv")
sub = pd.read_csv("sample_submission.csv")
org_train = pd.read_csv("train_essays.csv")
train = pd.read_csv("train_v2_drcat_02.csv", sep=",")
train = train.dropna()
print(train.shape)

(44868, 5)


In [3]:
class CFG:
    IS_TRAIN_ON_FULL = True
    HALF_TRAIN_SAMPLE = 22500
    RANDOM_STATE = 42
    LOWER_CASE = False
    VOCAB_SIZE = 32000

In [4]:
if CFG.IS_TRAIN_ON_FULL:
    print("-----Using full training data-----")
    train = train.drop_duplicates(subset=["text"])
    train = train.sample(len(train))
    print("The shape of training dataset is:", train.shape)
    train.reset_index(drop=True, inplace=True)
    print(train.head())
else:
    print("-----Using partial training data-----")
    train = train.drop_duplicates(subset=["text"])
    train_label_0 = train[train["label"] == 0]
    train_label_1 = train[train["label"] == 1]
    train_label_0 = train_label_0.sample(
        CFG.HALF_TRAIN_SAMPLE, random_state=CFG.RANDOM_STATE
    )
    train_label_1 = train_label_1.sample(
        CFG.HALF_TRAIN_SAMPLE, random_state=CFG.RANDOM_STATE
    )
    train = pd.concat([train_label_0, train_label_1])
    train = train.sample(len(train))
    print("The shape of training dataset is:", train.shape)
    train.reset_index(drop=True, inplace=True)
    print(train.head())

-----Using full training data-----
The shape of training dataset is: (44868, 5)
                                                text  label  \
0  My argument about Lukes point of view is that ...      0   
1  Dear Principal,\n\nI'm writing to express my t...      1   
2  Dear principle.\n\nI strongly think that you s...      0   
3  Winston Churchill is a well-known figure in hi...      1   
4  With the growing popularity of social media pl...      1   

                             prompt_name                source  RDizzl3_seven  
0          "A Cowboy Who Rode the Waves"       persuade_corpus           True  
1  Grades for extracurricular activities           llama2_chat          False  
2                  Cell phones at school       persuade_corpus          False  
3              Seeking multiple opinions  mistral7binstruct_v1          False  
4                     Phones and driving        falcon_180b_v1          False  


### 使用bpe_trainer

In [5]:
raw_tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))
raw_tokenizer.normalizer = normalizers.Sequence(
    [normalizers.NFC()] + [normalizers.Lowercase()] if CFG.LOWER_CASE else []
)
raw_tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()

special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]
trainer = trainers.BpeTrainer(
    vocab_size=CFG.VOCAB_SIZE,
    special_tokens=special_tokens
)

dataset = Dataset.from_pandas(test[["text"]])


def train_corpus():
    for i in tqdm(range(0, len(dataset), 100)):
        yield dataset[i:i + 100]["text"]


raw_tokenizer.train_from_iterator(train_corpus(), trainer=trainer)
tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=raw_tokenizer,
    unk_token="[UNK]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    mask_token="[MASK]",
)

tokenized_texts_test = []
for text in tqdm(test["text"].tolist()):
    tokenized_texts_test.append(tokenizer.tokenize(text))

tokenized_texts_train = []
for text in tqdm(train["text"].tolist()):
    tokenized_texts_train.append(tokenizer.tokenize(text))

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/44868 [00:00<?, ?it/s]

### 训练第一种vectorizer，其中min_df=2

In [6]:
def dummy(text):
    return text


vectorizer = TfidfVectorizer(
    ngram_range=(3, 5),
    lowercase=False,
    sublinear_tf=True,
    analyzer="word",
    tokenizer=dummy,
    preprocessor=dummy,
    token_pattern=None,
    strip_accents="unicode",
    min_df=2
)

vectorizer.fit(tokenized_texts_test)
vocab = vectorizer.vocabulary_
print(len(vocab))

vectorizer = TfidfVectorizer(
    ngram_range=(3, 5),
    lowercase=False,
    sublinear_tf=True,
    vocabulary=vocab,
    analyzer="word",
    tokenizer=dummy,
    preprocessor=dummy,
    token_pattern=None,
    strip_accents="unicode"
)

X_train = vectorizer.fit_transform(tokenized_texts_train)
# print(sys.getsizeof(X_train) / 1024 ** 2, "MB")
# X_train = X_train.astype("float16")
# print(sys.getsizeof(X_train) / 1024 ** 2, "MB")
y_train = train["label"].values
# y_train = y_train.astype("float16")
X_test = vectorizer.transform(tokenized_texts_test)
# X_test = X_test.astype("float16")
print("The shape of X_train is:", X_train.shape)
print("The shape of y_train is:", y_train.shape)
print("The shape of X_test is:", X_test.shape)

del vectorizer
gc.collect()

# num_features = X_train.shape[1]

# # 使用卡方检验选择特征
# k = int(num_features / 4)
# chi2_selector = SelectKBest(chi2, k=k)
# X_train_chi2_selected = chi2_selector.fit_transform(X_train, y_train)
# print(sys.getsizeof(X_train_chi2_selected) / 1024 ** 2, "MB")
# X_test_chi2_selected = chi2_selector.transform(X_test)

# X_train = X_train_chi2_selected
# X_test = X_test_chi2_selected
# print("The shape of X_train is:", X_train.shape)
# print("The shape of y_train is:", y_train.shape)
# print("The shape of X_test is:", X_test.shape)

# # 使用SVD进行降维
# n_components = int(num_features / 4)
# svd = TruncatedSVD(n_components=n_components)
# X_train_svd = svd.fit_transform(X_train)
# X_test_svd = svd.transform(X_test)

# X_train = hstack([X_train_chi2_selected, X_train_svd])
# X_train = X_train.toarray()
# X_test = hstack([X_test_chi2_selected, X_test_svd])
# X_test = X_test.toarray()
# print("The shape of X_train is:", X_train.shape)
# print("The shape of y_train is:", y_train.shape)
# print("The shape of X_test is:", X_test.shape)

1300
The shape of X_train is: (44868, 1300)
The shape of y_train is: (44868,)
The shape of X_test is: (100, 1300)


21

In [7]:
if len(test.text.values) <= 2:
    sub.to_csv("submission.csv", index=False)
else:
    mnb = MultinomialNB(alpha=0.0225)

    # gnb = GaussianNB()

    sgd_model = SGDClassifier(max_iter=9000, tol=1e-4, loss="modified_huber")

    weights = [0.10, 0.31]

    ensemble = VotingClassifier(
        estimators=[
            ("mnb", mnb),
            # ("gnb", gnb),
            ("sgd", sgd_model)
        ],
        weights=weights,
        voting="soft",
        n_jobs=-1
    )
    ensemble.fit(X_train, y_train)
    gc.collect()
    final_preds1 = ensemble.predict_proba(X_test)[:, 1]
    print(final_preds1)

[0.54291002 0.55637294 0.6347229  0.59797108 0.527079   0.63198155
 0.62015758 0.57555722 0.5988353  0.57504675 0.57388124 0.58702718
 0.62123113 0.61658833 0.59907856 0.54438224 0.60998554 0.63955693
 0.60674103 0.63735445 0.670477   0.61250974 0.5922073  0.70250016
 0.59531311 0.61463299 0.61950628 0.63628117 0.62172642 0.64291536
 0.61540233 0.59996019 0.63624397 0.58798119 0.53836085 0.58111776
 0.59803494 0.61696371 0.57783281 0.62621101 0.68290536 0.66835786
 0.65123665 0.62247822 0.58187826 0.66612308 0.65301556 0.56523692
 0.60966133 0.55008195 0.63267952 0.57593313 0.70959765 0.66357961
 0.6112364  0.61158293 0.63166446 0.62714442 0.67722951 0.63997403
 0.61324349 0.62620266 0.52328662 0.50371779 0.59142339 0.65735504
 0.61187449 0.621005   0.56201223 0.6237869  0.62712722 0.62649503
 0.61058237 0.66892744 0.60874938 0.65104842 0.61402145 0.5523276
 0.59716871 0.54619571 0.63909745 0.68620415 0.5933087  0.63483033
 0.62877053 0.61708232 0.57919023 0.60577338 0.62802357 0.65638

### 训练第二种tokenizer，不使用min_df

In [8]:
def dummy(text):
    return text


vectorizer3 = TfidfVectorizer(
    ngram_range=(3, 3),
    lowercase=False,
    sublinear_tf=True,
    analyzer="word",
    tokenizer=dummy,
    preprocessor=dummy,
    token_pattern=None,
    strip_accents="unicode"
)

vectorizer3.fit(tokenized_texts_test)
vocab = vectorizer3.vocabulary_
print(len(vocab))

vectorizer3 = TfidfVectorizer(
    ngram_range=(3, 3),
    lowercase=False,
    sublinear_tf=True,
    vocabulary=vocab,
    analyzer="word",
    tokenizer=dummy,
    preprocessor=dummy,
    token_pattern=None,
    strip_accents="unicode"
)

X_train3 = vectorizer3.fit_transform(tokenized_texts_train)
y_train3 = train["label"].values
X_test3 = vectorizer3.transform(tokenized_texts_test)
print("The shape of X_train3 is:", X_train3.shape)
print("The shape of y_train3 is:", y_train3.shape)
print("The shape of X_test3 is:", X_test3.shape)

del vectorizer3
gc.collect()

num_features = X_train3.shape[1]
k = int(num_features / 2)
chi2_selector = SelectKBest(chi2, k=k)
X_train3 = chi2_selector.fit_transform(X_train3, y_train3)
X_test3 = chi2_selector.transform(X_test3)
print("The shape of X_train3 is:", X_train3.shape)
print("The shape of y_train3 is:", y_train3.shape)
print("The shape of X_test3 is:", X_test3.shape)

632
The shape of X_train3 is: (44868, 632)
The shape of y_train3 is: (44868,)
The shape of X_test3 is: (100, 632)
The shape of X_train3 is: (44868, 316)
The shape of y_train3 is: (44868,)
The shape of X_test3 is: (100, 316)


In [9]:
def dummy(text):
    return text


vectorizer4 = TfidfVectorizer(
    ngram_range=(4, 4),
    lowercase=False,
    sublinear_tf=True,
    analyzer="word",
    tokenizer=dummy,
    preprocessor=dummy,
    token_pattern=None,
    strip_accents="unicode"
)

vectorizer4.fit(tokenized_texts_test)
vocab = vectorizer4.vocabulary_
print(len(vocab))

vectorizer4 = TfidfVectorizer(
    ngram_range=(4, 4),
    lowercase=False,
    sublinear_tf=True,
    vocabulary=vocab,
    analyzer="word",
    tokenizer=dummy,
    preprocessor=dummy,
    token_pattern=None,
    strip_accents="unicode"
)

X_train4 = vectorizer4.fit_transform(tokenized_texts_train)
y_train4 = train["label"].values
X_test4 = vectorizer4.transform(tokenized_texts_test)
print("The shape of X_train4 is:", X_train4.shape)
print("The shape of y_train4 is:", y_train4.shape)
print("The shape of X_test4 is:", X_test4.shape)

del vectorizer4
gc.collect()

num_features = X_train4.shape[1]
k = int(num_features / 2)
chi2_selector = SelectKBest(chi2, k=k)
X_train4 = chi2_selector.fit_transform(X_train4, y_train4)
X_test4 = chi2_selector.transform(X_test4)
print("The shape of X_train4 is:", X_train4.shape)
print("The shape of y_train4 is:", y_train4.shape)
print("The shape of X_test4 is:", X_test4.shape)

910
The shape of X_train4 is: (44868, 910)
The shape of y_train4 is: (44868,)
The shape of X_test4 is: (100, 910)
The shape of X_train4 is: (44868, 455)
The shape of y_train4 is: (44868,)
The shape of X_test4 is: (100, 455)


In [10]:
def dummy(text):
    return text


vectorizer5 = TfidfVectorizer(
    ngram_range=(5, 5),
    lowercase=False,
    sublinear_tf=True,
    analyzer="word",
    tokenizer=dummy,
    preprocessor=dummy,
    token_pattern=None,
    strip_accents="unicode"
)

vectorizer5.fit(tokenized_texts_test)
vocab = vectorizer5.vocabulary_
print(len(vocab))

vectorizer5 = TfidfVectorizer(
    ngram_range=(5, 5),
    lowercase=False,
    sublinear_tf=True,
    vocabulary=vocab,
    analyzer="word",
    tokenizer=dummy,
    preprocessor=dummy,
    token_pattern=None,
    strip_accents="unicode"
)

X_train5 = vectorizer5.fit_transform(tokenized_texts_train)
y_train5 = train["label"].values
X_test5 = vectorizer5.transform(tokenized_texts_test)
print("The shape of X_train5 is:", X_train5.shape)
print("The shape of y_train5 is:", y_train5.shape)
print("The shape of X_test5 is:", X_test5.shape)

del vectorizer5
gc.collect()

num_features = X_train5.shape[1]
k = int(num_features / 2)
chi2_selector = SelectKBest(chi2, k=k)
X_train5 = chi2_selector.fit_transform(X_train5, y_train5)
X_test5 = chi2_selector.transform(X_test5)
print("The shape of X_train5 is:", X_train5.shape)
print("The shape of y_train5 is:", y_train5.shape)
print("The shape of X_test5 is:", X_test5.shape)

1106
The shape of X_train5 is: (44868, 1106)
The shape of y_train5 is: (44868,)
The shape of X_test5 is: (100, 1106)
The shape of X_train5 is: (44868, 553)
The shape of y_train5 is: (44868,)
The shape of X_test5 is: (100, 553)


In [11]:
X_train = hstack([X_train3, X_train4, X_train5])
X_test = hstack([X_test3, X_test4, X_test5])
y_train = train["label"].values
print("The shape of X_train is:", X_train.shape)
print("The shape of y_train is:", y_train.shape)
print("The shape of X_test is:", X_test.shape)

The shape of X_train is: (44868, 1324)
The shape of y_train is: (44868,)
The shape of X_test is: (100, 1324)


In [14]:
if len(test.text.values) <= 2:
    sub.to_csv("submission.csv", index=False)
else:
    lgb_params = {
        "n_iter": 3000,
        "verbose": 1,
        "objective": "cross_entropy",
        "metric": "auc",
        "learning_rate": 0.0056,
        "colsample_bytree": 0.7,
        "colsample_bynode": 0.8
    }
    lgb = LGBMClassifier(**lgb_params)

    cat = CatBoostClassifier(
        iterations=3000,
        verbose=1,
        learning_rate=0.0056,
        subsample=0.4,
        allow_const_label=True,
        loss_function="CrossEntropy"
    )

    xgb_params = {
        "n_estimators": 3000,
        "verbosity": 1,
        "objective": "binary:logistic",
        "eval_metric": "auc",
        "learning_rate": 0.005,
        "colsample_bytree": 0.6,
        "random_state": 42
    }
    xgb = XGBClassifier(**xgb_params)

    weights = [0.28, 0.67]

    ensemble = VotingClassifier(
        estimators=[
            ("lgb", lgb),
            ("cat", cat)
        ],
        weights=weights,
        voting="soft",
        n_jobs=-1
    )
    ensemble.fit(X_train, y_train)
    gc.collect()
    final_preds2 = ensemble.predict_proba(X_test)[:, 1]
    print(final_preds2)

[0.56038317 0.55045213 0.56038317 0.56038317 0.56038317 0.56038317
 0.55822735 0.56038317 0.55822735 0.55822735 0.55045213 0.56038317
 0.56038317 0.56090667 0.55822735 0.56038317 0.56038317 0.56038317
 0.56038317 0.56038317 0.55077559 0.56038317 0.56038317 0.56376587
 0.56090667 0.56038317 0.56038317 0.56038317 0.56038317 0.56038317
 0.56038317 0.56038317 0.56038317 0.55822735 0.56038317 0.56038317
 0.56038317 0.55822735 0.56038317 0.56038317 0.56090667 0.56376587
 0.56038317 0.55045213 0.56038317 0.55045213 0.56038317 0.56038317
 0.56038317 0.56038317 0.56038317 0.56090667 0.56376587 0.56376587
 0.56090667 0.56038317 0.56376587 0.55822735 0.56376587 0.56038317
 0.56038317 0.56038317 0.55822735 0.56038317 0.55441531 0.56376587
 0.55822735 0.56038317 0.56038317 0.56038317 0.56038317 0.56038317
 0.56038317 0.55045213 0.56038317 0.56376587 0.55045213 0.56038317
 0.55822735 0.56038317 0.56038317 0.54051749 0.56038317 0.56038317
 0.55822735 0.56038317 0.56038317 0.56038317 0.56038317 0.5507

In [13]:
sub["generated"] = final_preds1 * 0.30 + final_preds2 * 0.70
sub.to_csv("submission.csv", index=False)
sub

ValueError: Length of values (100) does not match length of index (3)