In [1]:
import sys
import gc
import pandas as pd
import numpy as np

from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import roc_auc_score
from lightgbm import LGBMClassifier, early_stopping
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import hstack

from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer
)

from datasets import Dataset
from tqdm.auto import tqdm
from transformers import PreTrainedTokenizerFast

from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import VotingClassifier

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Read the three plain CSV inputs in one pass (same order as before).
test, sub, org_train = (
    pd.read_csv(csv_path)
    for csv_path in ("test_essays.csv", "sample_submission.csv", "train_essays.csv")
)

# Training corpus: rows containing any missing value are dropped right after
# loading.
train = pd.read_csv("train_v2_drcat_02.csv", sep=",").dropna()
print(train.shape)

(44868, 5)


In [3]:
class CFG:
    """Notebook-wide settings read by the data-preparation and tokenizer cells."""

    # True: train on every de-duplicated row; False: train on a balanced subset.
    IS_TRAIN_ON_FULL = True
    # Rows drawn per label when the balanced subset is used.
    HALF_TRAIN_SAMPLE = 22_500
    # Seed passed to the per-label sampling.
    RANDOM_STATE = 42
    # When True, a lowercase normalizer is added to the BPE tokenizer.
    LOWER_CASE = False
    # Target vocabulary size handed to the BPE trainer.
    VOCAB_SIZE = 32_000

In [4]:
# Select the training rows: either every de-duplicated essay, or a balanced
# subset with CFG.HALF_TRAIN_SAMPLE rows per label.
if CFG.IS_TRAIN_ON_FULL:
    print("-----Using full training data-----")
    train = train.drop_duplicates(subset=["text"])
else:
    print("-----Using partial training data-----")
    train = train.drop_duplicates(subset=["text"])
    # Draw the same number of rows from each label with a fixed seed.
    # DataFrame.sample raises ValueError if a label has fewer rows than
    # requested; that is left as a loud failure on purpose.
    train = pd.concat(
        [
            train[train["label"] == label].sample(
                CFG.HALF_TRAIN_SAMPLE, random_state=CFG.RANDOM_STATE
            )
            for label in (0, 1)
        ]
    )

# Shuffle the rows. The seed makes the row order (and therefore everything
# trained on it) reproducible across kernel restarts; the original shuffle was
# unseeded.
train = train.sample(len(train), random_state=CFG.RANDOM_STATE)
print("The shape of training dataset is:", train.shape)
train = train.reset_index(drop=True)
print(train.head())

-----Using full training data-----
The shape of training dataset is: (44868, 5)
                                                text  label  \
0  When making an important choice, do people eve...      0   
1  Does your school have spy technology? Do you w...      0   
2  Did you know some schools require students to ...      0   
3  Praising students when their work has not yet ...      1   
4  Ummm......Okay, so, uh, Fortnite......I mean, ...      1   

                     prompt_name           source  RDizzl3_seven  
0      Seeking multiple opinions  persuade_corpus          False  
1    Facial action coding system  persuade_corpus           True  
2                Summer projects  persuade_corpus          False  
3                Summer projects    chat_gpt_moth          False  
4  "A Cowboy Who Rode the Waves"      llama2_chat           True  


### 使用bpe_trainer

In [5]:
# Build a BPE tokenizer from scratch with byte-level pre-tokenization.
raw_tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))

# NFC is always applied; the lowercase step is appended only when
# CFG.LOWER_CASE is set. The parentheses are required: without them the
# conditional expression spans the whole concatenation, so LOWER_CASE=False
# produced an empty normalizer list and silently dropped NFC as well.
raw_tokenizer.normalizer = normalizers.Sequence(
    [normalizers.NFC()] + ([normalizers.Lowercase()] if CFG.LOWER_CASE else [])
)
raw_tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()

special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]
trainer = trainers.BpeTrainer(
    vocab_size=CFG.VOCAB_SIZE,
    special_tokens=special_tokens
)

# The tokenizer is trained on the test essays only.
dataset = Dataset.from_pandas(test[["text"]])


def train_corpus():
    """Yield the `text` column of `dataset` in consecutive batches of up to 100 rows."""
    for i in tqdm(range(0, len(dataset), 100)):
        yield dataset[i:i + 100]["text"]


raw_tokenizer.train_from_iterator(train_corpus(), trainer=trainer)

# Wrap the trained tokenizer so it exposes the transformers tokenizer API.
tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=raw_tokenizer,
    unk_token="[UNK]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    mask_token="[MASK]",
)

# Tokenize both corpora; each document becomes a list of token strings.
tokenized_texts_test = [
    tokenizer.tokenize(text) for text in tqdm(test["text"].tolist())
]
tokenized_texts_train = [
    tokenizer.tokenize(text) for text in tqdm(train["text"].tolist())
]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/44868 [00:00<?, ?it/s]

### 训练第一种vectorizer（原计划使用min_df=2，但下方代码中该参数已被注释掉，实际并未生效）

In [6]:
def dummy(text):
    """Return the input unchanged.

    Passed as both `tokenizer` and `preprocessor` so that TfidfVectorizer
    consumes the already-tokenized documents as they are.
    """
    return text


# Settings shared by the two vectorizers below, kept in one place so the
# vocabulary pass and the final pass cannot drift apart.
tfidf_kwargs = dict(
    ngram_range=(3, 5),
    lowercase=False,
    sublinear_tf=True,
    analyzer="word",
    tokenizer=dummy,
    preprocessor=dummy,
    token_pattern=None,
    strip_accents="unicode",
    # min_df=2  (currently disabled)
)

# Pass 1: collect the n-gram vocabulary from the test documents only.
vectorizer = TfidfVectorizer(**tfidf_kwargs)
vectorizer.fit(tokenized_texts_test)
vocab = vectorizer.vocabulary_
print(len(vocab))

# Pass 2: restrict the features to that vocabulary, fit on the training
# documents, and transform both sets.
vectorizer = TfidfVectorizer(vocabulary=vocab, **tfidf_kwargs)

X_train = vectorizer.fit_transform(tokenized_texts_train)
y_train = train["label"].values
X_test = vectorizer.transform(tokenized_texts_test)
print("The shape of X_train is:", X_train.shape)
print("The shape of y_train is:", y_train.shape)
print("The shape of X_test is:", X_test.shape)

del vectorizer
gc.collect()

num_features = X_train.shape[1]

# Select features with the chi-squared test.
# Keep a quarter of the columns, but never fewer than one: with fewer than
# four features the plain quotient is 0.
k = max(1, num_features // 4)
chi2_selector = SelectKBest(chi2, k=k)
X_train_chi2_selected = chi2_selector.fit_transform(X_train, y_train)
X_test_chi2_selected = chi2_selector.transform(X_test)

# Reduce dimensionality with SVD.
# Same floor of one as above (TruncatedSVD rejects n_components=0); the seed
# makes the decomposition reproducible between runs.
n_components = max(1, num_features // 4)
svd = TruncatedSVD(n_components=n_components, random_state=CFG.RANDOM_STATE)
X_train_svd = svd.fit_transform(X_train)
X_test_svd = svd.transform(X_test)

# Final feature matrix: chi2-selected TF-IDF columns followed by SVD components.
X_train = hstack([X_train_chi2_selected, X_train_svd])
X_test = hstack([X_test_chi2_selected, X_test_svd])
print("The shape of X_train is:", X_train.shape)
print("The shape of y_train is:", y_train.shape)
print("The shape of X_test is:", X_test.shape)

9
The shape of X_train is: (44868, 9)
The shape of y_train is: (44868,)
The shape of X_test is: (3, 9)
The shape of X_train is: (44868, 4)
The shape of y_train is: (44868,)
The shape of X_test is: (3, 4)


  self.explained_variance_ratio_ = exp_var / full_var


In [7]:
# With a test set of two rows or fewer, write the sample submission unchanged
# and skip training.
if len(test.text.values) <= 2:
    sub.to_csv("submission.csv", index=False)
else:
    # NOTE(review): MultinomialNB rejects negative feature values, and X_train
    # includes TruncatedSVD columns, which are not guaranteed to be
    # non-negative — confirm this fits on the real test data.
    clf = MultinomialNB(alpha=0.0225)

    # Seeded so repeated runs give the same linear model; the original left
    # the shuffling of SGD unseeded.
    sgd_model = SGDClassifier(
        max_iter=9000,
        tol=1e-4,
        loss="modified_huber",
        random_state=CFG.RANDOM_STATE
    )

    # Soft-voting weights, in the same order as `estimators` below.
    weights = [0.10, 0.31]

    ensemble = VotingClassifier(
        estimators=[
            ("mnb", clf),
            ("sgd", sgd_model)
        ],
        weights=weights,
        voting="soft",
        n_jobs=-1
    )
    ensemble.fit(X_train, y_train)
    gc.collect()
    # Column 1 of predict_proba: probability of the class labelled 1.
    final_preds1 = ensemble.predict_proba(X_test)[:, 1]
    print(final_preds1)

[0.3829372 0.3829372 0.3829372]


### 训练第二种vectorizer，不使用min_df

In [8]:
def dummy(text):
    """Return the input unchanged.

    Passed as both `tokenizer` and `preprocessor` so that TfidfVectorizer
    consumes the already-tokenized documents as they are.
    """
    return text


# Settings shared by the two vectorizers below, kept in one place so the
# vocabulary pass and the final pass cannot drift apart.
tfidf_kwargs = dict(
    ngram_range=(3, 5),
    lowercase=False,
    sublinear_tf=True,
    analyzer="word",
    tokenizer=dummy,
    preprocessor=dummy,
    token_pattern=None,
    strip_accents="unicode",
)

# Pass 1: collect the n-gram vocabulary from the test documents only.
vectorizer = TfidfVectorizer(**tfidf_kwargs)
vectorizer.fit(tokenized_texts_test)
vocab = vectorizer.vocabulary_
print(len(vocab))

# Pass 2: restrict the features to that vocabulary, fit on the training
# documents, and transform both sets.
vectorizer = TfidfVectorizer(vocabulary=vocab, **tfidf_kwargs)

X_train = vectorizer.fit_transform(tokenized_texts_train)
y_train = train["label"].values
X_test = vectorizer.transform(tokenized_texts_test)
print("The shape of X_train is:", X_train.shape)
print("The shape of y_train is:", y_train.shape)
print("The shape of X_test is:", X_test.shape)

del vectorizer
gc.collect()

num_features = X_train.shape[1]

# Select features with the chi-squared test.
# Keep a quarter of the columns, but never fewer than one: with fewer than
# four features the plain quotient is 0.
k = max(1, num_features // 4)
chi2_selector = SelectKBest(chi2, k=k)
X_train_chi2_selected = chi2_selector.fit_transform(X_train, y_train)
X_test_chi2_selected = chi2_selector.transform(X_test)

# Reduce dimensionality with SVD.
# Same floor of one as above (TruncatedSVD rejects n_components=0); the seed
# makes the decomposition reproducible between runs.
n_components = max(1, num_features // 4)
svd = TruncatedSVD(n_components=n_components, random_state=CFG.RANDOM_STATE)
X_train_svd = svd.fit_transform(X_train)
X_test_svd = svd.transform(X_test)

# Final feature matrix: chi2-selected TF-IDF columns followed by SVD components.
X_train = hstack([X_train_chi2_selected, X_train_svd])
X_test = hstack([X_test_chi2_selected, X_test_svd])
print("The shape of X_train is:", X_train.shape)
print("The shape of y_train is:", y_train.shape)
print("The shape of X_test is:", X_test.shape)

9


KeyboardInterrupt: 

In [None]:
# With a test set of two rows or fewer, write the sample submission unchanged
# and skip training.
if len(test.text.values) <= 2:
    sub.to_csv("submission.csv", index=False)
else:
    lgb_params = {
        "n_iter": 3000,
        "verbose": -1,
        "objective": "cross_entropy",
        "metric": "auc",
        "learning_rate": 0.0056,
        "colsample_bytree": 0.7,
        "colsample_bynode": 0.8,
        # Seeded for reproducibility, matching the XGBoost config below.
        "random_state": CFG.RANDOM_STATE
    }
    lgb = LGBMClassifier(**lgb_params)

    cat = CatBoostClassifier(
        iterations=3000,
        verbose=0,
        learning_rate=0.0056,
        subsample=0.4,
        allow_const_label=True,
        loss_function="CrossEntropy",
        # Seeded for reproducibility, matching the XGBoost config below.
        random_seed=CFG.RANDOM_STATE
    )

    # NOTE(review): `xgb` is constructed but is not one of the ensemble's
    # estimators below — confirm whether it should be added (with a third
    # weight) or dropped.
    xgb_params = {
        "n_estimators": 2500,
        "verbosity": 1,
        "objective": "binary:logistic",
        "eval_metric": "auc",
        "learning_rate": 0.005,
        "colsample_bytree": 0.6,
        "random_state": 42
    }
    xgb = XGBClassifier(**xgb_params)

    # Soft-voting weights, in the same order as `estimators` below.
    weights = [0.28, 0.67]

    ensemble = VotingClassifier(
        estimators=[
            ("lgb", lgb),
            ("cat", cat)
        ],
        weights=weights,
        voting="soft",
        n_jobs=-1
    )
    ensemble.fit(X_train, y_train)
    gc.collect()
    # Column 1 of predict_proba: probability of the class labelled 1.
    final_preds2 = ensemble.predict_proba(X_test)[:, 1]
    print(final_preds2)

In [None]:
# final_preds1 / final_preds2 are only assigned by the model cells when the
# test set has more than two rows. For smaller test sets those cells have
# already written the sample submission unchanged, so there is nothing to blend
# and touching the predictions would raise NameError.
if len(test.text.values) > 2:
    # Weighted blend of the two ensembles' class-1 probabilities.
    sub["generated"] = final_preds1 * 0.30 + final_preds2 * 0.70
    sub.to_csv("submission.csv", index=False)
sub