In [1]:
import sys
import gc
import pandas as pd
import numpy as np

from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import roc_auc_score
from lightgbm import LGBMClassifier, early_stopping
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer
)

from datasets import Dataset
from tqdm.auto import tqdm
from transformers import PreTrainedTokenizerFast

from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier



In [2]:
test = pd.read_csv("/kaggle/input/llm-detect-ai-generated-text/test_essays.csv")
sub = pd.read_csv("/kaggle/input/llm-detect-ai-generated-text/sample_submission.csv")
org_train = pd.read_csv("/kaggle/input/llm-detect-ai-generated-text/train_essays.csv")
train = pd.read_csv("/kaggle/input/daigt-datasets/filtered_v2_gemini_magic.csv", sep=",")
train = train.dropna()
print(train.shape)

(42487, 3)


In [3]:
class CFG:
    IS_TRAIN_ON_FULL = True
    HALF_TRAIN_SAMPLE = 22500
    RANDOM_STATE = 42
    LOWER_CASE = False
    VOCAB_SIZE = 30522

In [4]:
if CFG.IS_TRAIN_ON_FULL:
    print("-----Using full training data-----")
    train = train.drop_duplicates(subset=["text"])
    train = train.sample(len(train))
    print("The shape of training dataset is:", train.shape)
    train.reset_index(drop=True, inplace=True)
    print(train.head())
else:
    print("-----Using partial training data-----")
    train = train.drop_duplicates(subset=["text"])
    train_label_0 = train[train["label"] == 0]
    train_label_1 = train[train["label"] == 1]
    train_label_0 = train_label_0.sample(
        CFG.HALF_TRAIN_SAMPLE, random_state=CFG.RANDOM_STATE
    )
    train_label_1 = train_label_1.sample(
        CFG.HALF_TRAIN_SAMPLE, random_state=CFG.RANDOM_STATE
    )
    train = pd.concat([train_label_0, train_label_1])
    train = train.sample(len(train))
    print("The shape of training dataset is:", train.shape)
    train.reset_index(drop=True, inplace=True)
    print(train.head())

-----Using full training data-----
The shape of training dataset is: (42487, 3)
                                                text  label  \
0   Do you believe that schools should increase t...      1   
1  When I first heard about the Seagoing Cowboys ...      1   
2  When seeking advice, people often ask more tha...      0   
3  Unmasking the Face on Mars\n\nIf you ever saw ...      0   
4  In my opinion as a scientist at NASA, the "Fac...      1   

                     prompt_name  
0              Distance learning  
1  "A Cowboy Who Rode the Waves"  
2      Seeking multiple opinions  
3               The Face on Mars  
4               The Face on Mars  


### 使用bpe_trainer

In [5]:
raw_tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))
raw_tokenizer.normalizer = normalizers.Sequence(
    [normalizers.NFC()] + [normalizers.Lowercase()] if CFG.LOWER_CASE else []
)
raw_tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()

special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]
trainer = trainers.BpeTrainer(
    vocab_size=CFG.VOCAB_SIZE,
    special_tokens=special_tokens
)

dataset = Dataset.from_pandas(test[["text"]])


def train_corpus():
    for i in tqdm(range(0, len(dataset), 100)):
        yield dataset[i:i + 100]["text"]


raw_tokenizer.train_from_iterator(train_corpus(), trainer=trainer)
tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=raw_tokenizer,
    unk_token="[UNK]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    mask_token="[MASK]",
)

tokenized_texts_test = []
for text in tqdm(test["text"].tolist()):
    tokenized_texts_test.append(tokenizer.tokenize(text))

tokenized_texts_train = []
for text in tqdm(train["text"].tolist()):
    tokenized_texts_train.append(tokenizer.tokenize(text))

  if _pandas_api.is_sparse(col):


  0%|          | 0/1 [00:00<?, ?it/s]






  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/42487 [00:00<?, ?it/s]

In [6]:
def dummy(text):
    return text


vectorizer = TfidfVectorizer(
    ngram_range=(3, 5),
    lowercase=False,
    sublinear_tf=True,
    analyzer="word",
    tokenizer=dummy,
    preprocessor=dummy,
    token_pattern=None,
    strip_accents="unicode",
    min_df=2
)

vectorizer.fit(tokenized_texts_test)
vocab = vectorizer.vocabulary_
print(len(vocab))

vectorizer = TfidfVectorizer(
    ngram_range=(3, 5),
    lowercase=False,
    sublinear_tf=True,
    vocabulary=vocab,
    analyzer="word",
    tokenizer=dummy,
    preprocessor=dummy,
    token_pattern=None,
    strip_accents="unicode"
)

X_train = vectorizer.fit_transform(tokenized_texts_train)
y_train = train["label"].values
X_test = vectorizer.transform(tokenized_texts_test)
print("The shape of X_train is:", X_train.shape)
print("The shape of y_train is:", y_train.shape)
print("The shape of X_test is:", X_test.shape)

del vectorizer
gc.collect()

ValueError: After pruning, no terms remain. Try a lower min_df or a higher max_df.

In [None]:
if len(test.text.values) <= 2:
    sub.to_csv('submission.csv', index=False)
else:
    clf = MultinomialNB(alpha=0.0225)
    
    sgd_model = SGDClassifier(max_iter=9000, tol=1e-4, loss="modified_huber")

    weights = [0.50, 0.50]
 
    ensemble = VotingClassifier(estimators=[('mnb',clf),
                                            ('sgd', sgd_model)],
                                weights=weights, voting='soft', n_jobs=-1)
    ensemble.fit(X_train, y_train)
    gc.collect()
    final_preds1 = ensemble.predict_proba(X_test)[:,1]
    print(final_preds1)

In [None]:
del X_train
del y_train
del X_test
gc.collect()

In [None]:
train = pd.read_csv("/kaggle/input/daigt-v2-train-dataset/train_v2_drcat_02.csv", sep=",")
train = train.dropna()
train = train.drop_duplicates(subset=["text"])
train.reset_index(drop=True, inplace=True)
print(train.shape)

In [None]:
raw_tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))
raw_tokenizer.normalizer = normalizers.Sequence(
    [normalizers.NFC()] + [normalizers.Lowercase()] if CFG.LOWER_CASE else []
)
raw_tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()

special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]
trainer = trainers.BpeTrainer(
    vocab_size=CFG.VOCAB_SIZE,
    special_tokens=special_tokens
)

dataset = Dataset.from_pandas(test[["text"]])


def train_corpus():
    for i in tqdm(range(0, len(dataset), 100)):
        yield dataset[i:i + 100]["text"]


raw_tokenizer.train_from_iterator(train_corpus(), trainer=trainer)
tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=raw_tokenizer,
    unk_token="[UNK]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    mask_token="[MASK]",
)

tokenized_texts_test = []
for text in tqdm(test["text"].tolist()):
    tokenized_texts_test.append(tokenizer.tokenize(text))

tokenized_texts_train = []
for text in tqdm(train["text"].tolist()):
    tokenized_texts_train.append(tokenizer.tokenize(text))

In [None]:
def dummy(text):
    return text


vectorizer = TfidfVectorizer(
    ngram_range=(3, 5),
    lowercase=False,
    sublinear_tf=True,
    analyzer="word",
    tokenizer=dummy,
    preprocessor=dummy,
    token_pattern=None,
    strip_accents="unicode"
)

vectorizer.fit(tokenized_texts_test)
vocab = vectorizer.vocabulary_
print(len(vocab))

vectorizer = TfidfVectorizer(
    ngram_range=(3, 5),
    lowercase=False,
    sublinear_tf=True,
    vocabulary=vocab,
    analyzer="word",
    tokenizer=dummy,
    preprocessor=dummy,
    token_pattern=None,
    strip_accents="unicode"
)

X_train = vectorizer.fit_transform(tokenized_texts_train)
y_train = train["label"].values
X_test = vectorizer.transform(tokenized_texts_test)
print("The shape of X_train is:", X_train.shape)
print("The shape of y_train is:", y_train.shape)
print("The shape of X_test is:", X_test.shape)

del vectorizer
gc.collect()

In [None]:
if len(test.text.values) <= 2:
    sub.to_csv('submission.csv', index=False)
else:
    p6={
        'n_iter': 6000, 'verbose': -1, 
        'objective': 'cross_entropy', 'metric': 'auc',
        'learning_rate': 0.0056, 'colsample_bytree': 0.78,
        'colsample_bynode': 0.8
    }
    lgb=LGBMClassifier(**p6)
    
    rf_params = {
        "n_estimators": 1500
    }
    rf = RandomForestClassifier(**rf_params)
    
    mlp_params = {
        "hidden_layer_sizes": (8, ),
        "activation": "relu",
        "solver": "adam",
        "alpha": 0.001,
        "learning_rate": "adaptive",
        "max_iter": 100,
        "random_state": 42,
        "verbose": True,
        "early_stopping": True,
        "validation_fraction": 0.1,
        "n_iter_no_change": 10,
        "tol": 0.0001
    }
    mlp = MLPClassifier(**mlp_params)
    
    cat=CatBoostClassifier(
        iterations=1200,
        verbose=0,
        learning_rate=0.02,
        subsample=0.40,
        allow_const_label=True,
        loss_function='CrossEntropy'
    )

    weights = [20.0, 8.0]
 
    ensemble = VotingClassifier(
        estimators=[
#             ('lgb', lgb),
            ('cat', cat),
            ('rf', rf)
#             ('mlp', mlp)
        ],
        weights=weights, 
        voting='soft', 
        n_jobs=-1
    )
    ensemble.fit(X_train, y_train)
    gc.collect()
    final_preds2 = ensemble.predict_proba(X_test)[:,1]
    print(final_preds2)

In [None]:
sub['generated'] = final_preds1 * 0.20 + final_preds2 * 0.80
sub.to_csv('submission.csv', index=False)
sub