# Meta Classifier

The idea is to use another model to fit the predictions instead of a simple voting classifier.

In [26]:
import sys
import gc
from nltk.tokenize import sent_tokenize, word_tokenize
import pandas as pd
from sklearn.model_selection import KFold, StratifiedKFold
import numpy as np
from sklearn.metrics import roc_auc_score
import numpy as np
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
import warnings
warnings.filterwarnings("ignore")
from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
)
from tqdm.notebook import tqdm
from scipy.sparse import hstack, csr_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from datasets import Dataset
from tqdm.auto import tqdm
from transformers import PreTrainedTokenizerFast
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import VotingClassifier

LGBM Parameters taken from : https://www.kaggle.com/code/siddhvr/llm-daigt-sub

In [2]:
test = pd.read_csv('/kaggle/input/llm-detect-ai-generated-text/test_essays.csv')
sub = pd.read_csv('/kaggle/input/llm-detect-ai-generated-text/sample_submission.csv')
org_train = pd.read_csv('/kaggle/input/llm-detect-ai-generated-text/train_essays.csv')
train = pd.read_csv("/kaggle/input/daigt-v2-train-dataset/train_v2_drcat_02.csv", sep=',')

In [3]:
train = train.drop_duplicates(subset=['text'])
train.reset_index(drop=True, inplace=True)

In [4]:
LOWERCASE = False
VOCAB_SIZE = 15000000
OFFLINE = False

In [5]:
# Creating Byte-Pair Encoding tokenizer
raw_tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))
raw_tokenizer.normalizer = normalizers.Sequence([normalizers.NFC()] + [normalizers.Lowercase()] if LOWERCASE else [])
raw_tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]
trainer = trainers.BpeTrainer(vocab_size=VOCAB_SIZE, special_tokens=special_tokens)
dataset = Dataset.from_pandas(test[['text']])
def train_corp_iter(): 
    for i in range(0, len(dataset), 1000):
        yield dataset[i : i + 1000]["text"]
raw_tokenizer.train_from_iterator(train_corp_iter(), trainer=trainer)
tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=raw_tokenizer,
    unk_token="[UNK]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    mask_token="[MASK]",
)
tokenized_texts_test = []

for text in tqdm(test['text'].tolist()):
    tokenized_texts_test.append(tokenizer.tokenize(text))

tokenized_texts_train = []

for text in tqdm(train['text'].tolist()):
    tokenized_texts_train.append(tokenizer.tokenize(text))






  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/44868 [00:00<?, ?it/s]

In [6]:
tokenized_texts_test[1]

['ĠBbb', 'Ġccc', 'Ġddd', '.']

In [7]:
def dummy(text):
    return text

In [8]:
%%time
vectorizer = TfidfVectorizer(
    ngram_range=(3, 5), 
    lowercase=False, 
    sublinear_tf=True, 
    analyzer='word',
    tokenizer=dummy,
    preprocessor=dummy,
    token_pattern=None, 
    strip_accents='unicode')

vectorizer.fit(tokenized_texts_test)

# Getting vocab
vocab = vectorizer.vocabulary_

print(vocab)

vectorizer = TfidfVectorizer(
    ngram_range=(3, 5), 
    lowercase=False, 
    sublinear_tf=True, 
    vocabulary=vocab,
    analyzer='word',
    tokenizer=dummy,
    preprocessor=dummy,
    token_pattern=None, 
    strip_accents='unicode'
)

tf_train = vectorizer.fit_transform(tokenized_texts_train)
tf_test = vectorizer.transform(tokenized_texts_test)

del vectorizer
gc.collect()

{'ĠAaa Ġbbb Ġccc': 0, 'Ġbbb Ġccc .': 6, 'ĠAaa Ġbbb Ġccc .': 1, 'ĠBbb Ġccc Ġddd': 2, 'Ġccc Ġddd .': 7, 'ĠBbb Ġccc Ġddd .': 3, 'ĠCCC Ġddd Ġeee': 4, 'Ġddd Ġeee .': 8, 'ĠCCC Ġddd Ġeee .': 5}
CPU times: user 4min 6s, sys: 153 ms, total: 4min 6s
Wall time: 4min 6s


51

In [9]:
y_train = train['label'].values

# Cohesion Features V2

In [10]:
# def word_diversity(text):
#     tk = word_tokenize(text)
#     return len(set(tk)) / len(tk)


# def sentence_length(text):
#     tk = sent_tokenize(text)
#     return [len(s) for s in tk]


# def generate_cohesion_features(data):
#     features = []
#     data['word_diversity'] = data['text'].apply(lambda x: word_diversity(x))
#     data['sentence_len'] = data['text'].apply(lambda x: sentence_length(x))
#     data['sentence_cnt'] = data['sentence_len'].apply(lambda x: len(x))
#     features.extend(['word_diversity', 'sentence_cnt'])
    
#     ops = [('mean', np.mean), ('max', np.max), ('min', np.min), ('median', np.median)]
#     for op, func in ops:
#         col = f'sentence_{op}'
#         data[col] = data['sentence_len'].apply(lambda x: func(x))
#         features.append(col)
#     return data[features]

In [11]:
# %%time
# train_co = generate_cohesion_features(train)
# test_co = generate_cohesion_features(test)

# print('train shape', train_co.shape, train_co.sample(3))

In [12]:
# if OFFLINE:
#     print("Training offline")
#     skf = StratifiedKFold(3)
#     scores = dict()

#     for i, (train_idx, val_idx) in enumerate(skf.split(train_co, y_train)):
#         train_X_, train_y_ = train_co.iloc[train_idx], y_train[train_idx]
#         val_X_, val_y_ = train_co.iloc[val_idx], y_train[val_idx]
#         print(f"Fold {i+1}: train shape {train_X_.shape} | val shape {val_X_.shape}")

#         clf = LGBMClassifier()
#         clf.fit(train_X_, train_y_)
#         pred_ = clf.predict_proba(val_X_)[:, 1]

#         auc = roc_auc_score(val_y_, pred_)
#         scores[f"fold_{i+1}"] = auc
#         print(f"Fold {i+1} auc {auc}")

#     mean_auc = np.mean(list(scores.values()))
#     print("AUC on training set is", mean_auc)
# else:
#     print("Training full online")
#     clf = LGBMClassifier()
#     clf.fit(train_co, y_train)
#     pred_cohesion = clf.predict_proba(test_co)[:, 1]
#     print(pred_cohesion)

## Filter Train data Based on Test Stats

In [13]:
# # Filter by word diversity and sentence count
# wd_min = test_co['word_diversity'].min()
# wd_max = test_co['word_diversity'].max()
# sc_min = test_co['sentence_cnt'].min()
# sc_max = test_co['sentence_cnt'].max()

# cond_ = (wd_min <= train_co['word_diversity']) & (train_co['word_diversity'] <= wd_max) & \
#         (sc_min <= train_co['sentence_cnt']) & (train_co['sentence_cnt'] <= sc_max)

# idx = train_co[cond_].index
# tf_train = tf_train[idx]
# print("Filtered training set", tf_train.shape)

## Horizontal stack to TF data

In [14]:
# if not OFFLINE:
#     print("Stack cohesion features")
# #     scaler = StandardScaler()
# #     scaler = MinMaxScaler()
# #     test_co_scaled = scaler.fit_transform(test_co)
# #     train_co_scaled = scaler.transform(train_co)
    
#     tf_train_en = hstack([tf_train, train_co[['word_diversity']]])
#     tf_test_en = hstack([tf_test, test_co[['word_diversity']]])
#     print("tf train shape", tf_train_en.shape)

# Embedding Features V3

In [15]:
from transformers import DebertaV2Tokenizer, DebertaV2Model
import torch

In [19]:
model_checkpoint = "/kaggle/input/huggingfacedebertav3variants/deberta-v3-xsmall"

# Load pre-trained model tokenizer and model
tokenizer = DebertaV2Tokenizer.from_pretrained(model_checkpoint)
model = DebertaV2Model.from_pretrained(model_checkpoint)

model.eval()
model.to('cuda')

DebertaV2Model(
  (embeddings): DebertaV2Embeddings(
    (word_embeddings): Embedding(128100, 384, padding_idx=0)
    (LayerNorm): LayerNorm((384,), eps=1e-07, elementwise_affine=True)
    (dropout): StableDropout()
  )
  (encoder): DebertaV2Encoder(
    (layer): ModuleList(
      (0-11): 12 x DebertaV2Layer(
        (attention): DebertaV2Attention(
          (self): DisentangledSelfAttention(
            (query_proj): Linear(in_features=384, out_features=384, bias=True)
            (key_proj): Linear(in_features=384, out_features=384, bias=True)
            (value_proj): Linear(in_features=384, out_features=384, bias=True)
            (pos_dropout): StableDropout()
            (dropout): StableDropout()
          )
          (output): DebertaV2SelfOutput(
            (dense): Linear(in_features=384, out_features=384, bias=True)
            (LayerNorm): LayerNorm((384,), eps=1e-07, elementwise_affine=True)
            (dropout): StableDropout()
          )
        )
        (intermedia

In [27]:
def get_deberta_embeddings(batch_texts):
    # Process a batch of texts and return their embeddings
    inputs = tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True, max_length=512).to('cuda')
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1).cpu().numpy()
    return embeddings


def get_embeddings(data, batch_size=32):
    embeddings = []
    for i in tqdm(range(0, len(data['text']), batch_size), desc="Generating embeddings"):
        batch_texts = data['text'][i:i+batch_size].tolist()
        batch_embeddings = get_deberta_embeddings(batch_texts)
        embeddings.extend(batch_embeddings)

    return np.vstack(embeddings)
    

res = get_deberta_embeddings(train['text'].iloc[0])
print(len(train['text'].iloc[0].split(' ')), res.shape)

378 (1, 384)


In [25]:
%%time

train_embeddings = get_embeddings(train, batch_size=100)
print("Train embeddings done")
test_embeddings = get_embeddings(test, batch_size=100)
print("Test embeddings done")

bert_train_sparse = csr_matrix(train_embeddings)
bert_test_sparse = csr_matrix(test_embeddings)

# Concatenate deBERTa embeddings with TF-IDF features
tf_train = hstack([tf_train, bert_train_sparse])
tf_test = hstack([tf_test, bert_test_sparse])

print("Train shape", tf_train.shape)

Generating embeddings:   0%|          | 0/449 [00:00<?, ?it/s]

Train embeddings done


Generating embeddings:   0%|          | 0/1 [00:00<?, ?it/s]

Test embeddings done


NameError: name 'sparse' is not defined

In [None]:
# clf_nb = MultinomialNB(alpha=0.02)
# clf_sgd = SGDClassifier(max_iter=8000, tol=1e-4, loss="modified_huber") 

# p6 = {'n_iter': 1500, 'verbose': -1,'objective': 'binary',
#       'metric': 'auc','learning_rate': 0.05073909898961407, 
#       'colsample_bytree': 0.726023996436955, 'colsample_bynode': 0.5803681307354022, 
#       'lambda_l1': 8.562963348932286, 'lambda_l2': 4.893256185259296, 
#       'min_data_in_leaf': 115, 'max_depth': 23, 'max_bin': 898}
# clf_lgb = LGBMClassifier(**p6)
# clf_cat = CatBoostClassifier(
#     iterations=1000,
#     verbose=0,
#     l2_leaf_reg=6.6591278779517808,
#     learning_rate=0.005689066836106983,
#     allow_const_label=True)


# class MetaClassifier:
#     def __init__(self, clfs):
#         self.clfs = clfs
#         self.meta_clf = LogisticRegression(penalty='l2', solver='liblinear', C=0.5)
            
#     def fit(self, tf_train, y_train):
#         for clf in self.clfs:
#             clf.fit(tf_train, y_train)
#         print("Individual model fitting done")
        
#         meta_feats = self._stack_preds(tf_train)
#         self.meta_clf.fit(meta_feats, y_train)
#         print("Meta model fitting done")
    
#     def predict_proba(self, tf_test):
#         meta_feats = self._stack_preds(tf_test)
#         return self.meta_clf.predict_proba(meta_feats)
    
#     def _stack_preds(self, data):
#         preds = []
#         for clf in self.clfs:
#             pred = clf.predict_proba(data)[:, 1]
#             preds.append(pred)
#         return np.column_stack(preds)


    
# clfs = [clf_nb, clf_sgd, clf_lgb, clf_cat]
# meta_clf = MetaClassifier(clfs)

# meta_clf.fit(tf_train, y_train)
# final_preds = meta_clf.predict_proba(tf_test)[:, 1]
# sub['generated'] = final_preds * 0.9 + pred_cohesion * 0.1
# sub.to_csv('submission.csv', index=False)
# sub

In [31]:
if len(test.text.values) <= 5:
    sub.to_csv('submission.csv', index=False)
else:
#     clf = MultinomialNB(alpha=0.02)
    clf = LogisticRegression(solver='liblinear', C=0.5)
    sgd_model = SGDClassifier(max_iter=8000, tol=1e-4, loss="modified_huber") 
    p6 = {'n_iter': 1500,'verbose': -1,'objective': 'binary',
          'metric': 'auc','learning_rate': 0.05073909898961407, 
          'colsample_bytree': 0.726023996436955, 'colsample_bynode': 0.5803681307354022, 
          'lambda_l1': 8.562963348932286, 'lambda_l2': 4.893256185259296, 
          'min_data_in_leaf': 115, 'max_depth': 23, 'max_bin': 254, "device": "gpu"}
    lgb = LGBMClassifier(**p6)
    cat = CatBoostClassifier(
        iterations=1000,
        verbose=0,
        l2_leaf_reg=6.6591278779517808,
        learning_rate=0.005689066836106983,
        allow_const_label=True,
        task_type='GPU'
    )
    weights = [0.07, 0.31, 0.31, 0.31]
 
    ensemble = VotingClassifier(estimators=[('mnb',clf),
                                            ('sgd', sgd_model),
                                            ('lgb',lgb), 
                                            ('cat', cat)
                                           ],
                                weights=weights, voting='soft', n_jobs=-1)
    ensemble.fit(tf_train, y_train)
    gc.collect()
    final_preds = ensemble.predict_proba(tf_test)[:,1]
    # update with cohesion weight
    sub['generated'] = final_preds # * 0.98 + pred_cohesion * 0.02
    sub.to_csv('submission.csv', index=False)
    print(sub)

LGB done


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

         id  generated
0  0000aaaa   0.029780
1  1111bbbb   0.354078
2  2222cccc   0.365634
