In [None]:
import warnings

# Suppress every warning for the rest of the session (this also hides deprecation
# and pandas copy warnings raised by later cells).
warnings.filterwarnings("ignore")

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import VotingClassifier, RandomForestClassifier
from sklearn.metrics import roc_auc_score
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier


from transformers import PreTrainedTokenizerFast
from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
)


from datasets import Dataset
from tqdm.auto import tqdm
import pandas as pd
import numpy as np
import sys
import gc

import nltk
import spacy
import string
import pickle

import re
from collections import Counter
import os
sys.path.append("../lib")
from leven_search import LevenSearch, EditCost, EditCostConfig, GranularEditCostConfig
# Environment bootstrap: Kaggle is detected by the presence of /kaggle/input. On Kaggle
# the required packages are installed from local wheel files; in both cases the pickled
# Levenshtein search index is loaded into `lev_search` from the matching location.
if os.path.isdir("/kaggle/input"):
    IS_KAGGLE = True
    # Install pyspellchecker from the attached wheel (imported below as `spellchecker`).
    !pip install /kaggle/input/pyspellchecker/pyspellchecker-0.7.2-py3-none-any.whl

    # Copy the WordNet corpus to /tmp/corpora; /tmp is added to nltk's data path below.
    !mkdir /tmp/corpora
    !cp -r /kaggle/input/wordnet/wordnet /tmp/corpora
    !pip install /kaggle/input/swifter-1-4-0/wheelhouse/swifter-1.4.0-py3-none-any.whl  
    nltk.data.path.append("/tmp")  
    # NOTE(review): pickle.load can execute arbitrary code; only load trusted pickles.
    with open('/kaggle/usr/lib/install_levenshtein_search_library/leven_search.pkl', 'rb') as file:
        lev_search = pickle.load(file)    
else:
    IS_KAGGLE = False
    # NOTE(review): pickle.load can execute arbitrary code; only load trusted pickles.
    with open("../lib/leven_search.pkl", "rb") as file:
        lev_search = pickle.load(file)



from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

from spellchecker import SpellChecker


import swifter

from joblib import Parallel, delayed

In [None]:
# Fast save: outside a competition rerun, write the sample submission straight away so
# the notebook always leaves a valid submission.csv behind.
IS_RERUN = bool(os.getenv("KAGGLE_IS_COMPETITION_RERUN"))
if not IS_RERUN:
    try:
        sub = pd.read_csv(
            "/kaggle/input/llm-detect-ai-generated-text/sample_submission.csv"
        )
    except FileNotFoundError:
        # Kaggle path is missing: fall back to the local copy of the competition data.
        # Any other error (e.g. a corrupt CSV) is allowed to surface instead of being
        # silently swallowed.
        sub = pd.read_csv("../input/llm-detect-ai-generated-text/sample_submission.csv")
    sub.to_csv("submission.csv", index=False)
# On Kaggle, a run that is not a competition rerun stops here; the rest of the
# notebook only executes during the rerun (or locally).
if (not IS_RERUN) and (IS_KAGGLE):
    sys.exit()

# Parameters

In [None]:
# Switches for the individual steps of preprocess_text_df (applied in the order coded there).
LOWER_CASE = True  # lowercase the text column
REMOVE_PUNCT = True  # strip string.punctuation characters
REMOVE_STOP_WORDS = True  # drop NLTK English stop words
REMOVE_FREQ_WORDS = False  # drop the 10 most frequent words of the frame
REMOVE_RARE_WORDS = False  # drop the 10 least frequent words of the frame
STEM_WORDS = True  # Porter stemming
LEMMATIZE = True  # WordNet lemmatisation (runs after stemming when both are on)
REMOVE_EMOJI = False  # NOTE(review): not read by preprocess_text_df in this notebook
CONVERT_EMOJI = True  # NOTE(review): not read by preprocess_text_df in this notebook
REMOVE_URLS = True  # strip http(s):// and www. links
REMOVE_HTML = True  # strip <...> tags
CHATWORD_CONV = True  # expand chat abbreviations from chat_words_str
SPELL_CORRECT = True  # run the Levenshtein-based correct_spellings step

# Default number of joblib jobs used by the parallel correct_spellings.
N_PROCESSES = 4

In [None]:
# Resolve dataset locations. On Kaggle the already-refined training CSV is read; a local
# run reads the raw DAIGT v2 training file instead.
if IS_KAGGLE:
    DATA_PATH = "/kaggle/input"
    train_path = (
        f"{DATA_PATH}/llm-daigt-5fold-split-seed7-train/train_v2_drcat_02_refined.csv"
    )
else:
    DATA_PATH = "../input"
    train_path = f"{DATA_PATH}/daigt-v2-train-dataset/train_v2_drcat_02.csv"

test = pd.read_csv(f"{DATA_PATH}/llm-detect-ai-generated-text/test_essays.csv")
sub = pd.read_csv(f"{DATA_PATH}/llm-detect-ai-generated-text/sample_submission.csv")
train = pd.read_csv(train_path, sep=",")

# Prompts whose essays are left out of the training set.
excluded_prompt_name_list = [
    "Distance learning",
    "Grades for extracurricular activities",
    "Summer projects",
]

# De-duplicate on the essay text first (keeping the first occurrence), then drop the
# excluded prompts. The order matters: a text whose first occurrence belongs to an
# excluded prompt is removed entirely. Filtering an already de-duplicated frame cannot
# introduce duplicates, so no second drop_duplicates pass is needed.
train = train.drop_duplicates(subset=["text"])
train = train[~train["prompt_name"].isin(excluded_prompt_name_list)].reset_index(
    drop=True
)

# Text preprocessing
Ref: https://www.kaggle.com/code/sudalairajkumar/getting-started-with-text-preprocessing

In [None]:
# Chat-abbreviation table, one "ABBREVIATION=Expansion" entry per line. It is split on
# newlines and "=" further down to build chat_words_map_dict; when a key appears more
# than once (e.g. B4N) the later entry overwrites the earlier one in that dict.
chat_words_str = """
AFAIK=As Far As I Know
AFK=Away From Keyboard
ASAP=As Soon As Possible
ATK=At The Keyboard
ATM=At The Moment
A3=Anytime, Anywhere, Anyplace
BAK=Back At Keyboard
BBL=Be Back Later
BBS=Be Back Soon
BFN=Bye For Now
B4N=Bye For Now
BRB=Be Right Back
BRT=Be Right There
BTW=By The Way
B4=Before
B4N=Bye For Now
CU=See You
CUL8R=See You Later
CYA=See You
FAQ=Frequently Asked Questions
FC=Fingers Crossed
FWIW=For What It's Worth
FYI=For Your Information
GAL=Get A Life
GG=Good Game
GN=Good Night
GMTA=Great Minds Think Alike
GR8=Great!
G9=Genius
IC=I See
ICQ=I Seek you (also a chat program)
ILU=ILU: I Love You
IMHO=In My Honest/Humble Opinion
IMO=In My Opinion
IOW=In Other Words
IRL=In Real Life
KISS=Keep It Simple, Stupid
LDR=Long Distance Relationship
LMAO=Laugh My A.. Off
LOL=Laughing Out Loud
LTNS=Long Time No See
L8R=Later
MTE=My Thoughts Exactly
M8=Mate
NRN=No Reply Necessary
OIC=Oh I See
PITA=Pain In The A..
PRT=Party
PRW=Parents Are Watching
ROFL=Rolling On The Floor Laughing
ROFLOL=Rolling On The Floor Laughing Out Loud
ROTFLMAO=Rolling On The Floor Laughing My A.. Off
SK8=Skate
STATS=Your sex and age
ASL=Age, Sex, Location
THX=Thank You
TTFN=Ta-Ta For Now!
TTYL=Talk To You Later
U=You
U2=You Too
U4E=Yours For Ever
WB=Welcome Back
WTF=What The F...
WTG=Way To Go!
WUF=Where Are You From?
W8=Wait...
7K=Sick:-D Laugher
"""

In [None]:
PUNCT_TO_REMOVE = string.punctuation


def remove_punctuation(text):
    """Return ``text`` with every character listed in ``PUNCT_TO_REMOVE`` deleted."""
    # Mapping a character to None in a translation table deletes it.
    deletion_table = str.maketrans(dict.fromkeys(PUNCT_TO_REMOVE))
    return text.translate(deletion_table)


# English stop-word list from NLTK, held as a set for the per-word membership test.
STOPWORDS = set(stopwords.words("english"))


def remove_stopwords(text):
    """Remove the stop words from ``text``.

    ``text`` is coerced with ``str`` and split on whitespace; tokens that are members
    of ``STOPWORDS`` (exact string comparison) are dropped and the remaining tokens
    are re-joined with single spaces.
    """
    return " ".join(word for word in str(text).split() if word not in STOPWORDS)


stemmer = PorterStemmer()


def stem_words(text):
    """Apply ``stemmer.stem`` to each whitespace-separated token and re-join with spaces."""
    stemmed_tokens = map(stemmer.stem, text.split())
    return " ".join(stemmed_tokens)


lemmatizer = WordNetLemmatizer()
wordnet_map = {"N": wordnet.NOUN, "V": wordnet.VERB, "J": wordnet.ADJ, "R": wordnet.ADV}


def lemmatize_words(text):
    """Lemmatise each whitespace-separated token of ``text`` using its POS tag.

    Tokens are tagged with ``nltk.pos_tag``; the first letter of each tag is looked up
    in ``wordnet_map`` (falling back to ``wordnet.NOUN``) and handed to the lemmatizer.
    The lemmas are re-joined with single spaces.
    """
    lemmas = []
    for token, tag in nltk.pos_tag(text.split()):
        wn_pos = wordnet_map.get(tag[0], wordnet.NOUN)
        lemmas.append(lemmatizer.lemmatize(token, wn_pos))
    return " ".join(lemmas)


# Reference : https://gist.github.com/slowkow/7a7f61f495e3dbb7e3d767f97bd7304b
def remove_emoji(string):
    """Delete every run of characters that falls in the code-point ranges below.

    NOTE(review): the last range spans U+24C2 through U+1F251, so besides emoji it also
    matches many other code points in between (CJK ideographs, for example).
    """
    code_point_ranges = (
        "\U0001F600-\U0001F64F",  # emoticons
        "\U0001F300-\U0001F5FF",  # symbols & pictographs
        "\U0001F680-\U0001F6FF",  # transport & map symbols
        "\U0001F1E0-\U0001F1FF",  # flags (iOS)
        "\U00002702-\U000027B0",
        "\U000024C2-\U0001F251",
    )
    character_class = "[" + "".join(code_point_ranges) + "]+"
    return re.sub(character_class, r"", string, flags=re.UNICODE)


def remove_urls(text):
    """Delete ``http://``/``https://`` and ``www.`` links, each up to the next whitespace."""
    return re.sub(r"https?://\S+|www\.\S+", r"", text)


def remove_html(text):
    """Delete every ``<...>`` span (shortest match, confined to a single line) from ``text``."""
    return re.sub("<.*?>", r"", text)


# Parse the "ABBREVIATION=Expansion" lines of chat_words_str. Empty lines are skipped;
# for a repeated abbreviation the later line wins.
chat_words_map_dict = {
    fields[0]: fields[1]
    for fields in (
        entry.split("=") for entry in chat_words_str.split("\n") if entry != ""
    )
}
# Set of known abbreviations, used for the membership test in chat_words_conversion.
chat_words_list = set(chat_words_map_dict)


def chat_words_conversion(text):
    """Expand known chat abbreviations in ``text``.

    Each whitespace-separated token is upper-cased for the lookup. Tokens found in
    ``chat_words_list`` are replaced by their ``chat_words_map_dict`` expansion; every
    other token is kept as written. The result is re-joined with single spaces.
    """
    expanded_tokens = (
        chat_words_map_dict[token.upper()]
        if token.upper() in chat_words_list
        else token
        for token in text.split()
    )
    return " ".join(expanded_tokens)


spell = SpellChecker()


def correct_spellings(text):
    """Replace the words ``spell`` reports as unknown with its suggested correction.

    Words for which ``spell.correction`` returns a falsy value are dropped from the
    output; all other words are kept and re-joined with single spaces.

    NOTE(review): a later cell of this notebook defines another function with the same
    name (taking a DataFrame), which rebinds ``correct_spellings`` once it has run.
    """
    tokens = text.split()
    unknown_tokens = spell.unknown(tokens)
    candidates = (
        spell.correction(token) if token in unknown_tokens else token
        for token in tokens
    )
    return " ".join(filter(None, candidates))

In [None]:
def fix_text(origin_text):
    """Repair systematic word-level corruption in ``origin_text`` using ``lev_search``.

    Pass 1: every whitespace-separated token is looked up in the module-level
    ``lev_search`` index with ``max_distance=1``. A token with exactly one candidate is
    replaced by that candidate's ``word``; any other token is kept unchanged. For tokens
    that have no candidate whose ``updates`` renders as ``"[]"`` (i.e. no exact hit), the
    string form of each candidate's ``updates`` is tallied.

    Pass 2 runs only when the most frequent tallied edit occurred at least
    ``int(0.06 * number_of_tokens)`` times. An edit-cost configuration is built with
    ``default_cost=10`` and a cost of 1 for that edit, and every pass-1 token is searched
    again with ``max_distance=9``. The first candidate returned is used; tokens with no
    candidate are dropped from the output.

    Args:
        origin_text: The text to repair.

    Returns:
        str: the resulting tokens joined with single spaces.
    """
    split_texts = origin_text.split()

    # --- Pass 1: distance-1 fixes plus a tally of the edits that were needed. ---
    update_stats = {}
    fixed_split_texts = []
    for text in split_texts:
        dist_result = lev_search.find_dist(text, max_distance=1).words
        if len(dist_result) == 1:
            only_key = next(iter(dist_result.keys()))
            fixed_split_texts.append(dist_result[only_key].word)
        else:
            fixed_split_texts.append(text)

        edits = [str(dist_result[k].updates) for k in dist_result]
        if "[]" in edits:  # the token itself is a known word: nothing to tally
            continue
        for edit in edits:
            update_stats[edit] = update_stats.get(edit, 0) + 1

    if not update_stats:
        return " ".join(fixed_split_texts)

    # max() returns the first maximal item, i.e. the earliest-seen edit wins ties.
    max_conv, max_freq = max(update_stats.items(), key=lambda item: item[1])
    if max_freq < int(0.06 * len(split_texts)):
        return " ".join(fixed_split_texts)

    # --- Pass 2: re-search with the dominant edit made cheap. ---
    # Strip the surrounding brackets of the edit's string form; its first and last
    # whitespace-separated pieces are passed to EditCost.
    # NOTE(review): presumably these are the source and target characters — confirm
    # against the leven_search EditCost signature.
    max_freq_change = max_conv[1:-1].split()
    gec = GranularEditCostConfig(
        default_cost=10,
        edit_costs=[EditCost(max_freq_change[0], max_freq_change[-1], 1)],
    )

    refixed_split_texts = []
    for text in fixed_split_texts:
        dist_result = lev_search.find_dist(
            text, max_distance=9, edit_cost_config=gec
        ).words
        # Tokens without any candidate are deliberately dropped.
        if len(dist_result) > 0:
            first_key = next(iter(dist_result.keys()))
            refixed_split_texts.append(dist_result[first_key].word)
    return " ".join(refixed_split_texts)


# origin_text = "Ai one poini I believed ihe elecioral college was a bad idea"

In [None]:
def correct_spellings(train, n_processes=N_PROCESSES):
    """Run ``fix_text`` over ``train["text"]`` in parallel and return the results.

    The texts are cut into ``n_processes`` contiguous slices (boundaries taken from
    ``np.linspace``), each slice is handled by one joblib job, and the per-slice results
    are concatenated in the original row order.

    NOTE: this rebinds the name ``correct_spellings``, replacing the earlier
    single-string helper of the same name defined in this notebook.

    Args:
        train: DataFrame with a ``text`` column.
        n_processes: Number of joblib jobs, and of slices the texts are cut into.

    Returns:
        list: one corrected string per row of ``train``, in row order.
    """
    # Extract the column once so each worker only receives its own slice of strings
    # rather than the whole DataFrame.
    texts = train["text"].tolist()
    bounds = np.linspace(0, len(texts), n_processes + 1, dtype=int)

    def process_wrapper(chunk):
        """Apply fix_text to every text in one slice, with a progress bar."""
        return [fix_text(text) for text in tqdm(chunk)]

    fixed_chunks = Parallel(n_jobs=n_processes)(
        delayed(process_wrapper)(texts[bounds[i] : bounds[i + 1]])
        for i in range(n_processes)
    )
    return [fixed for chunk in fixed_chunks for fixed in chunk]

In [None]:
def preprocess_text_df(df):
    """Apply the enabled text-cleaning steps to ``df["text"]`` and return ``df``.

    Steps run in this fixed order, each gated by its module-level flag: lowercasing,
    punctuation removal, stop-word removal, spelling repair, frequent-word removal,
    rare-word removal, stemming, lemmatisation, URL removal, HTML-tag removal and
    chat-word expansion. The frame is modified in place and also returned.

    NOTE(review): URL and HTML removal run after punctuation stripping, so with
    REMOVE_PUNCT enabled the characters their patterns rely on (":", "/", ".", "<",
    ">") are already gone. The order is left untouched here because changing it would
    alter the produced text.

    Args:
        df: DataFrame with a string ``text`` column.

    Returns:
        The same DataFrame object, with ``text`` rewritten.
    """
    if LOWER_CASE:
        df["text"] = df["text"].str.lower()
    if REMOVE_PUNCT:
        df["text"] = df["text"].swifter.apply(remove_punctuation)
    if REMOVE_STOP_WORDS:
        df["text"] = df["text"].swifter.apply(remove_stopwords)

    if SPELL_CORRECT:
        df["text"] = correct_spellings(df)

    # Word frequencies are only needed by the two frequency-based filters, so the
    # corpus is not scanned when both are switched off. Both word sets are derived from
    # the same counts, taken before either filter is applied.
    if REMOVE_FREQ_WORDS or REMOVE_RARE_WORDS:
        cnt = Counter(word for text in df["text"].values for word in text.split())

        FREQWORDS = set([w for (w, wc) in cnt.most_common(10)])
        n_rare_words = 10
        RAREWORDS = set(
            [w for (w, wc) in cnt.most_common()[: -n_rare_words - 1 : -1]]
        )

        def remove_freqwords(text):
            """custom function to remove the frequent words"""
            return " ".join(
                [word for word in str(text).split() if word not in FREQWORDS]
            )

        def remove_rarewords(text):
            """custom function to remove the rare words"""
            return " ".join(
                [word for word in str(text).split() if word not in RAREWORDS]
            )

        if REMOVE_FREQ_WORDS:
            df["text"] = df["text"].swifter.apply(remove_freqwords)
        if REMOVE_RARE_WORDS:
            df["text"] = df["text"].swifter.apply(remove_rarewords)

    if STEM_WORDS:
        df["text"] = df["text"].swifter.apply(stem_words)
    if LEMMATIZE:
        df["text"] = df["text"].swifter.apply(lemmatize_words)
    if REMOVE_URLS:
        df["text"] = df["text"].swifter.apply(remove_urls)
    if REMOVE_HTML:
        # Fixed: this branch previously called remove_urls a second time.
        df["text"] = df["text"].swifter.apply(remove_html)
    if CHATWORD_CONV:
        df["text"] = df["text"].swifter.apply(chat_words_conversion)

    return df

In [None]:
# Display the test frame as loaded, before preprocessing is applied to it.
test

In [None]:
# Off Kaggle the raw training frame is preprocessed here and the result is saved as the
# "refined" CSV; on Kaggle the training frame is used as loaded. The test frame is
# preprocessed in both cases.
if not IS_KAGGLE:
    refined_train_path = (
        f"{DATA_PATH}/daigt-v2-train-dataset/train_v2_drcat_02_refined.csv"
    )
    train = preprocess_text_df(train)
    train.to_csv(refined_train_path, index=False)
test = preprocess_text_df(test)

In [None]:
# Tokenizer settings. LOWERCASE is a separate flag from the LOWER_CASE preprocessing
# switch; it is read when the BPE tokenizer's normalizer sequence is built.
LOWERCASE = False
# Passed as `vocab_size` to trainers.BpeTrainer.
VOCAB_SIZE = 14_000_000

In [None]:
# Creating Byte-Pair Encoding tokenizer
raw_tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))


# Adding normalization and pre_tokenizer
# NFC is always applied; Lowercase is appended only when LOWERCASE is set. The
# parentheses are required: a conditional expression binds more loosely than "+", so
# without them LOWERCASE=False produced an empty normalizer sequence (no NFC at all).
raw_tokenizer.normalizer = normalizers.Sequence(
    [normalizers.NFC()] + ([normalizers.Lowercase()] if LOWERCASE else [])
)


raw_tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()

# Adding special tokens and creating trainer instance
special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]
trainer = trainers.BpeTrainer(vocab_size=VOCAB_SIZE, special_tokens=special_tokens)


# Creating huggingface dataset object (the tokenizer is trained on the test texts)
dataset = Dataset.from_pandas(test[["text"]])


def train_corp_iter():
    """
    Generator yielding the ``text`` column of ``dataset`` in consecutive
    batches of at most 1000 rows.
    """
    batch_size = 1000
    total_rows = len(dataset)
    start = 0
    while start < total_rows:
        yield dataset[start : start + batch_size]["text"]
        start += batch_size


# Training from iterator REMEMBER it's training on test set...
raw_tokenizer.train_from_iterator(train_corp_iter(), trainer=trainer)

# Wrap the trained tokenizer so it exposes the transformers tokenizer interface.
tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=raw_tokenizer,
    unk_token="[UNK]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    mask_token="[MASK]",
)


# Tokenize test set with new tokenizer
tokenized_texts_test = [
    tokenizer.tokenize(essay) for essay in tqdm(test["text"].tolist())
]


# Tokenize train set
tokenized_texts_train = [
    tokenizer.tokenize(essay) for essay in tqdm(train["text"].tolist())
]

In [None]:
# Peek at the token sequences of the second and third test essays (indices 1 and 2).
print(tokenized_texts_test[1])
print()
print(tokenized_texts_test[2])

In [None]:
def dummy(text):
    """
    Identity function handed to TfidfVectorizer as both tokenizer and
    preprocessor: the input is already a token list, so it is returned untouched.
    """
    return text


# Settings shared by both vectorizers below: word-level 3- to 5-grams over the
# pre-tokenized texts, with `dummy` bypassing sklearn's own tokenizing/preprocessing.
tfidf_params = dict(
    ngram_range=(3, 5),
    lowercase=False,
    sublinear_tf=True,
    analyzer="word",
    tokenizer=dummy,
    preprocessor=dummy,
    token_pattern=None,
    strip_accents="unicode",
)

# Fitting TfidfVectorizer on test set
vectorizer = TfidfVectorizer(**tfidf_params)
vectorizer.fit(tokenized_texts_test)

# Getting vocab (only its size is printed; the full n-gram dict is too large to dump)
vocab = vectorizer.vocabulary_
print(len(vocab))


# Here we fit our vectorizer on train set but this time we use vocabulary from test fit.
vectorizer = TfidfVectorizer(vocabulary=vocab, **tfidf_params)

tf_train = vectorizer.fit_transform(tokenized_texts_train)
tf_test = vectorizer.transform(tokenized_texts_test)
del vectorizer
gc.collect()

print(tf_train.shape)
print(tf_test.shape)

In [None]:
# Display the training frame.
train

In [None]:
# Target array for the classifiers, taken from the "label" column of the training frame.
y_train_label = train["label"].values

In [None]:
# Shape of the TF-IDF matrix built from the test essays.
tf_test.shape

In [None]:
# With at most 5 test essays (presumably the placeholder public test set), skip training
# and write the sample submission unchanged; otherwise fit the ensemble and predict.
if len(test.text.values) <= 5:
    sub.to_csv("submission.csv", index=False)
else:
    # Multinomial naive Bayes on the TF-IDF features.
    clf = MultinomialNB(alpha=0.0225)

    # Linear model trained with SGD; the modified_huber loss is used with soft voting below.
    sgd_model = SGDClassifier(
        max_iter=9000, tol=1e-4, random_state=6743, loss="modified_huber"
    )

    # LightGBM parameters.
    # NOTE(review): "n_iter" is presumably an alias for the number of boosting rounds —
    # confirm against the LightGBM parameter docs.
    p = {
        "verbose": -1,
        "n_iter": 3000,
        "colsample_bytree": 0.7800,
        "colsample_bynode": 0.8000,
        "random_state": 6743,
        "metric": "auc",
        "objective": "cross_entropy",
        "learning_rate": 0.00581909898961407,
    }
    lgb = LGBMClassifier(**p)

    # NOTE(review): random_seed here is 6543 while the SGD and LightGBM models use 6743 —
    # confirm this difference is intentional.
    cat = CatBoostClassifier(
        iterations=3000,
        verbose=0,
        subsample=0.35,
        random_seed=6543,
        allow_const_label=True,
        loss_function="CrossEntropy",
        learning_rate=0.005599066836106983,
    )

    # Soft-voting ensemble; the weights follow the order of `estimators`.
    ensemble = VotingClassifier(
        estimators=[("mnb", clf), ("sgd", sgd_model), ("lgb", lgb), ("cat", cat)],
        weights=[0.1, 0.31, 0.28, 0.67],
        voting="soft",
        n_jobs=-1,
    )

    ensemble.fit(tf_train, y_train_label)
    gc.collect()

    # Column 1 of predict_proba is written to the "generated" column of the submission.
    final_preds = ensemble.predict_proba(tf_test)[:, 1]
    sub["generated"] = final_preds
    sub.to_csv("submission.csv", index=False)
    # NOTE(review): nested inside the else-branch, this is not the cell's last top-level
    # expression, so presumably nothing is rendered for it.
    sub.head()

In [None]:
# Shape of the TF-IDF matrix built from the training essays.
tf_train.shape