In [1]:
import sys
import gc

import pandas as pd
import numpy as np

from sklearn.metrics import roc_auc_score

import lightgbm as lgb
from lightgbm import LGBMClassifier

from catboost import CatBoostClassifier

from sklearn.feature_extraction.text import TfidfVectorizer

from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
)

from datasets import Dataset
from tqdm.auto import tqdm
from transformers import PreTrainedTokenizerFast

from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import VotingClassifier

**Explanation:**
The next cell reads the input CSV files into pandas DataFrames: `test_essays.csv` is loaded into `test`, `sample_submission.csv` into `sub`, and the spell-checked DAIGT v2 training file (`daigt-v2_spellcheck.csv`) into `train`.

In [2]:
# Competition files: the test essays and the sample submission.
test = pd.read_csv("/kaggle/input/llm-detect-ai-generated-text/test_essays.csv")
sub = pd.read_csv("/kaggle/input/llm-detect-ai-generated-text/sample_submission.csv")

# Training data: the spell-checked DAIGT v2 file is read here; the original
# /kaggle/input/daigt-v2-train-dataset/train_v2_drcat_02.csv is not used.
train = pd.read_csv("/kaggle/input/daigt-v2-spellcheck/daigt-v2_spellcheck.csv")

In [3]:
# Keep every label-1 row, but label-0 rows only when RDizzl3_seven == 1.
keep_label0 = train['label'].eq(0) & train['RDizzl3_seven'].eq(1)
keep_label1 = train['label'].eq(1)
# reset_index() is called without drop=True, so the previous index is kept as
# an extra "index" column in the result.
train = train.loc[keep_label0 | keep_label1].reset_index()

In [4]:
# Drop rows whose `text` is repeated, then renumber the rows from 0.
train = train.drop_duplicates(subset=['text']).reset_index(drop=True)

In [5]:
# (rows, columns) of `train` after the filtering and de-duplication cells above.
train.shape

(31747, 6)

In [6]:
# Number of rows in `train` for each value of the `label` column.
train.label.value_counts()

label
1    17497
0    14250
Name: count, dtype: int64

The next cell removes every row of `train` whose `prompt_name` is in `excluded_prompt_name_list`, then drops rows with duplicate `text` values and resets the DataFrame index so that it is contiguous again.

In [7]:
# Prompts whose essays are removed from the training data.
excluded_prompt_name_list = ['Distance learning','Grades for extracurricular activities','Summer projects']

# Filter out the excluded prompts, de-duplicate on the essay text and renumber
# the remaining rows from 0.
is_excluded = train['prompt_name'].isin(excluded_prompt_name_list)
train = (
    train[~is_excluded]
    .drop_duplicates(subset=['text'])
    .reset_index(drop=True)
)

The next cells import `defaultdict`, the `re` regular-expression module, and the search and edit-cost classes from the `leven_search` (Levenshtein search) library, and then load a pickled `lev_search` object from disk. These are used by the spelling-correction function defined afterwards.

In [8]:
from collections import defaultdict
import re
from leven_search import LevenSearch, EditCost, EditCostConfig, GranularEditCostConfig

In [9]:
import pickle
# SECURITY NOTE(review): pickle.load can execute arbitrary code while
# deserializing. Only load this file if the dataset that provides it is trusted.
with open('/kaggle/usr/lib/install_levenshtein_search_library/leven_search.pkl', 'rb') as file:
    # The unpickled object is bound to the module-level name `lev_search`.
    lev_search = pickle.load(file)

In [10]:
def sentence_correcter(text):
    """Return `text` with likely misspellings replaced, using the global `lev_search`.

    The text is split into tokens with the regex ``\\b\\w+\\b|[.,\\s]``; any
    character that regex does not match (anything other than word characters,
    '.', ',' and whitespace) is therefore absent from the returned string.

    Pass 1: every token longer than two characters that has no distance-0 match
    in `lev_search` but has at least one distance-1 match is replaced by the
    first distance-1 match.

    Pass 2: the first listed edit ("from -> to") of every distance-1 candidate is
    tallied. If the most frequent edit occurs more often than 6% of the number
    of whitespace-separated words in `text`, each token corrected in pass 1 is
    searched again with an edit-cost configuration in which that edit costs 1
    and every other edit costs 10 (max_distance=9); when that search returns
    something, its first match overrides the pass-1 correction.

    Args:
        text: the string to correct.

    Returns:
        The corrected tokens concatenated back into one string.
    """

    def _matches(word, **search_kwargs):
        # The search result keeps its matches in a `words` mapping attribute.
        found = lev_search.find_dist(word, **search_kwargs)
        return list(found.__dict__['words'].values())

    tokens = re.findall(r'\b\w+\b|[.,\s]', text)

    corrections = {}   # token -> replacement (the token itself when unchanged)
    misspelled = []    # (token, distance-1 candidates), one entry per occurrence
    for token in tokens:
        replacement = token
        if len(token) > 2 and not _matches(token, max_distance=0):
            candidates = _matches(token, max_distance=1)
            if candidates:
                replacement = candidates[0].word
                misspelled.append((token, candidates))
        corrections[token] = replacement

    # Tally the first edit of every distance-1 candidate.
    edit_counts = defaultdict(int)
    for _, candidates in misspelled:
        for candidate in candidates:
            if not candidate.updates:
                # Nothing to tally; avoids an IndexError on an empty update list.
                continue
            parts = str(candidate.updates[0]).split(" -> ")
            if len(parts) == 2:
                edit_counts[(parts[0], parts[1])] += 1

    if edit_counts:
        dominant_edit = max(edit_counts, key=edit_counts.get)
        if edit_counts[dominant_edit] > 0.06 * len(text.split()):
            # One edit dominates: make it cheap and everything else expensive.
            gec = GranularEditCostConfig(
                default_cost=10,
                edit_costs=[EditCost(dominant_edit[0], dominant_edit[1], 1)],
            )
            for word, _ in misspelled:
                candidates = _matches(word, max_distance=9, edit_cost_config=gec)
                if candidates:
                    corrections[word] = candidates[0].word
                # Otherwise the pass-1 correction for `word` is kept.

    return "".join(corrections[token] for token in tokens)

In [11]:
# Run the spelling correction over every test essay and write the result back
# into the `text` column.
corrected_test = test['text'].apply(sentence_correcter)
test['text'] = corrected_test

In [12]:
# Tokenizer settings.
LOWERCASE = False         # when True, a Lowercase normalizer is added to the tokenizer
VOCAB_SIZE = 14_000_000   # vocab_size passed to the BPE trainer

In [13]:
# Creating Byte-Pair Encoding tokenizer
raw_tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))
# Adding normalization and pre_tokenizer
raw_tokenizer.normalizer = normalizers.Sequence([normalizers.NFC()] + [normalizers.Lowercase()] if LOWERCASE else [])
raw_tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
# Adding special tokens and creating trainer instance
special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]
trainer = trainers.BpeTrainer(vocab_size=VOCAB_SIZE, special_tokens=special_tokens)
# Creating huggingface dataset object
dataset = Dataset.from_pandas(test[['text']])
def train_corp_iter():
    for i in range(0, len(dataset), 1000):
        yield dataset[i : i + 1000]["text"]
raw_tokenizer.train_from_iterator(train_corp_iter(), trainer=trainer)
tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=raw_tokenizer,
    unk_token="[UNK]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    mask_token="[MASK]",
)
tokenized_texts_test = []

for text in tqdm(test['text'].tolist()):
    tokenized_texts_test.append(tokenizer.tokenize(text))

tokenized_texts_train = []

for text in tqdm(train['text'].tolist()):
    tokenized_texts_train.append(tokenizer.tokenize(text))






  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/26909 [00:00<?, ?it/s]

In [14]:
def dummy(text):
    """Identity function: return `text` unchanged.

    Passed to TfidfVectorizer as both `tokenizer` and `preprocessor` so that
    the already-tokenized documents are used as they are.
    """
    return text

# Options shared by both vectorizers: word-level 3- to 5-grams over the
# pre-tokenized documents, with `dummy` standing in for the tokenizer and the
# preprocessor.
tfidf_options = dict(
    ngram_range=(3, 5),
    lowercase=False,
    sublinear_tf=True,
    analyzer='word',
    tokenizer=dummy,
    preprocessor=dummy,
    token_pattern=None,
    strip_accents='unicode',
)

# Pass 1: fit on the tokenized test essays only, to collect the n-gram vocabulary.
vectorizer = TfidfVectorizer(**tfidf_options)
vectorizer.fit(tokenized_texts_test)
vocab = vectorizer.vocabulary_

# Pass 2: a vectorizer restricted to that vocabulary is fitted on the training
# documents and then used to transform both corpora.
vectorizer = TfidfVectorizer(vocabulary=vocab, **tfidf_options)
tf_train = vectorizer.fit_transform(tokenized_texts_train)
tf_test = vectorizer.transform(tokenized_texts_test)

# The vectorizer is no longer needed once both matrices exist.
del vectorizer, tfidf_options
gc.collect()

21

In [15]:
# Training targets as a NumPy array, in the same row order as `train`.
y_train = train['label'].to_numpy()

In [16]:
def get_model():
    """Build the unfitted soft-voting ensemble used for the final prediction.

    The ensemble combines four estimators, weighted 0.1 / 0.31 / 0.28 / 0.67 in
    this order: MultinomialNB, SGDClassifier (modified_huber loss),
    LGBMClassifier and CatBoostClassifier.

    Returns:
        A VotingClassifier with voting='soft' and n_jobs=-1.
    """
    lgb_params = dict(
        n_iter=3000,
        verbose=-1,
        objective='cross_entropy',
        metric='auc',
        learning_rate=0.00581909898961407,
        colsample_bytree=0.78,
        colsample_bynode=0.5,
        random_state=6743,
    )

    # Earlier CatBoost settings left disabled in the notebook:
    # iterations=3000, learning_rate=0.005599066836106983.
    cat_params = dict(
        iterations=1200,
        verbose=0,
        random_seed=6543,
        learning_rate=0.01,
        subsample=0.35,
        allow_const_label=True,
        loss_function='CrossEntropy',
    )

    estimators = [
        ('mnb_model', MultinomialNB(alpha=1)),
        ('sgd_model', SGDClassifier(max_iter=9000, tol=1e-4, loss="modified_huber", random_state=6743)),
        ('lgb_model', LGBMClassifier(**lgb_params)),
        ('cat_model', CatBoostClassifier(**cat_params)),
    ]

    return VotingClassifier(
        estimators=estimators,
        weights=[0.1, 0.31, 0.28, 0.67],
        voting='soft',
        n_jobs=-1,
    )

In [17]:
model = get_model()
print(model)

# When `test` has 5 rows or fewer, training is skipped and the sample
# submission is written unchanged; otherwise the ensemble is fitted and its
# class-1 probabilities fill the `generated` column.
if len(test) > 5:
    model.fit(tf_train, y_train)
    final_preds = model.predict_proba(tf_test)[:, 1]
    sub['generated'] = final_preds

# Written exactly once, whichever branch ran.
sub.to_csv('submission.csv', index=False)

VotingClassifier(estimators=[('mnb_model', MultinomialNB(alpha=1)),
                             ('sgd_model',
                              SGDClassifier(loss='modified_huber',
                                            max_iter=9000, random_state=6743,
                                            tol=0.0001)),
                             ('lgb_model',
                              LGBMClassifier(colsample_bynode=0.5,
                                             colsample_bytree=0.78,
                                             learning_rate=0.00581909898961407,
                                             metric='auc', n_iter=3000,
                                             objective='cross_entropy',
                                             random_state=6743, verbose=-1)),
                             ('cat_model',
                              <catboost.core.CatBoostClassifier object at 0x7b88a4f1cc40>)],
                 n_jobs=-1, voting='soft', weights=[0.1, 0.31, 0.28, 0