In [1]:
!pip install "/kaggle/input/pyspellchecker/pyspellchecker-0.7.2-py3-none-any.whl"

Processing /kaggle/input/pyspellchecker/pyspellchecker-0.7.2-py3-none-any.whl
Installing collected packages: pyspellchecker
Successfully installed pyspellchecker-0.7.2


In [2]:
%%writefile deobfuscate.py

import spacy
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from collections import Counter
from tqdm import tqdm
from spellchecker import SpellChecker
from nltk import word_tokenize
import gc

# Module-level spell checker, created once and reused by get_typos_count.
spell = SpellChecker()


def get_typos_count(input_text):
    """Return the number of entries ``spell.unknown`` reports for a text.

    The text is split with NLTK's ``word_tokenize`` and the resulting tokens
    are handed to the shared ``spell`` checker; the size of the collection it
    returns is the result.
    """
    tokens = word_tokenize(input_text)
    return len(spell.unknown(tokens))

@torch.no_grad()
def clean_essay(text):
    """Rewrite an essay sentence by sentence with the seq2seq deobfuscator.

    The essay is split into sentences with the module-level spaCy pipeline
    ``nlp``, all sentences are tokenised as one padded batch, passed through
    ``deobfuscator.generate`` on ``DEVICE`` and decoded again.

    Args:
        text: Raw essay text.

    Returns:
        The decoded sentences, each stripped, joined with single spaces.
        An empty string is returned for blank input or when no sentence is
        found, since an empty batch cannot be generated from.
    """
    if not text or not text.strip():
        return ""

    sentences = [s.text for s in nlp(text).sents]
    if not sentences:
        return ""

    inputs = tokenizer(sentences, truncation=True, padding=True, return_tensors="pt")
    # The batch is padded, so hand the mask over explicitly instead of relying
    # on generate() to reconstruct it from the pad token id.
    outputs = deobfuscator.generate(
        inputs["input_ids"].to(DEVICE),
        attention_mask=inputs["attention_mask"].to(DEVICE),
        max_length=300,
    )
    decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    return " ".join(s.strip() for s in decoded)


MODEL_PATH = "/kaggle/input/essay-gec/deobfuscator-v1"
# Fall back to CPU so the script still runs (slowly) on a machine without a GPU.
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
# Essays with fewer flagged words than this are kept verbatim; the rest are rewritten.
TYPO_THRESHOLD = 15

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
deobfuscator = AutoModelForSeq2SeqLM.from_pretrained(MODEL_PATH).to(DEVICE).eval()

nlp = spacy.load("en_core_web_sm")

test = pd.read_csv("/kaggle/input/llm-detect-ai-generated-text/test_essays.csv")
test['n_typos'] = test['text'].apply(get_typos_count)

# Only essays at or above the typo threshold go through the (expensive) model.
test["text"] = [
    essay if n_typos < TYPO_THRESHOLD else clean_essay(essay)
    for essay, n_typos in tqdm(zip(test["text"], test["n_typos"]), total=len(test))
]

# Release the model before the next script loads its own models.
del deobfuscator
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# Written to the working directory; the inference script reads this copy,
# which also carries the added n_typos column.
test.to_csv('test_essays.csv', index=False)

Writing deobfuscate.py


In [3]:
%%writefile llm_inference.py

import pandas as pd

from pathlib import Path
import os
os.environ["TOKENIZERS_PARALLELISM"] = "true"

import warnings
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)
warnings.filterwarnings("ignore")

from tqdm.notebook import tqdm
tqdm.pandas()

import torch
from transformers import AutoTokenizer, AutoConfig

import sys
sys.path.append('/kaggle/input/llm-daig-src-code/src')

from config import load_config, dictionary_to_namespace
from data import clean_text, clean_text2, make_text, CustomDataset, Collator
from models import CustomModel
from training import seed_everything, get_optimizer, get_scheduler, get_valid_steps, metrics, criterion, Trainer


# Experiments to run, in order; one prediction CSV is written per entry.
model_names = [
    'exp222', 'exp200', 'exp184', 'exp179',
    'exp477', 'exp478',
    'exp489', 'exp492', 'exp510', 'exp512',
    'exp500',
    'exp511'
]
# Dataset root; configs/, backbone_configs/, tokenizer/ and models/ are read from it below.
models_path = Path('/kaggle/input/llm-daig-final-models')

for model_name in model_names:
    # Re-read the essays every iteration because the frame is modified below
    # (sorted, text cleaned, passed through make_text).
    df = pd.read_csv('test_essays.csv')
    
    config = load_config(models_path / 'configs' / f'{model_name}.yaml')
    config = dictionary_to_namespace(config)
    seed_everything(config.seed)
    
    # Inference-time overrides of the loaded experiment config.
    config.model.freeze_embeddings = False
    config.dataset.max_length = 1512
    
    # Order rows by character length.
    # NOTE(review): presumably so each batch holds similar-length texts and
    # needs less padding; confirm against Collator.
    df['text_len'] = df['text'].str.len()
    df = df.sort_values('text_len')
    
    # The first four experiments use clean_text2, all others clean_text.
    if model_name in ['exp222', 'exp200', 'exp184', 'exp179',]:
        df['text'] = df['text'].apply(clean_text2)
    else:
        df['text'] = df['text'].apply(clean_text)
    df = make_text(df, config)

    # The same tokenizer directory is used for every experiment.
    tokenizer = AutoTokenizer.from_pretrained(models_path / 'tokenizer', use_fast=False)
    config.tokenizer = tokenizer

    dataset = CustomDataset(df, config, train=False)
    collator = Collator(pad_to_multiple_of=0)

    config.dataset.valid_batch_size = 4
    # shuffle=False and drop_last=False keep one prediction per row, in df order,
    # which the assignment to df['preds'] below relies on.
    dataloader = torch.utils.data.DataLoader(
        dataset,
        batch_size=config.dataset.valid_batch_size,
        num_workers=2,
        shuffle=False,
        pin_memory=True,
        drop_last=False,
        collate_fn=collator,
    )

    # Build the model from its saved backbone config, then load the weights
    # from the checkpoint's 'model' entry.
    backbone_config = AutoConfig.from_pretrained(models_path / 'backbone_configs' / f'{model_name}.json')
    model_criterion = criterion.get_criterion(config)
    model = CustomModel(
        config, 
        init_from_config=True,
        criterion=model_criterion,
        backbone_config=backbone_config
    )
            
    # Weights are deserialised onto the CPU first.
    # NOTE(review): device placement is presumably handled inside Trainer; confirm.
    state = torch.load(
        models_path / 'models' / f'{model_name}_weights.pth', 
        map_location=torch.device('cpu')
    )
    model.load_state_dict(state['model'])

    trainer = Trainer(model, config)
    predictions = trainer.predict(dataloader)
    
    df['preds'] = predictions
    # The output file is named from the config's exp_name prefix, not from
    # model_name. The blending cells read submission_<exp>.csv, so the two
    # are assumed to match; verify when adding experiments.
    exp_name = config.exp_name.split('_')[0]
    df.to_csv(f'submission_{exp_name}.csv', index=False)

Writing llm_inference.py


In [4]:
!python deobfuscate.py
!python llm_inference.py

100%|███████████████████████████████████████████| 3/3 [00:00<00:00, 3986.98it/s]
100%|█████████████████████████████████████████████| 1/1 [00:01<00:00,  1.23s/it]
100%|█████████████████████████████████████████████| 1/1 [00:00<00:00,  4.27it/s]
100%|█████████████████████████████████████████████| 1/1 [00:00<00:00,  3.62it/s]
100%|█████████████████████████████████████████████| 1/1 [00:00<00:00,  2.99it/s]
100%|█████████████████████████████████████████████| 1/1 [00:00<00:00,  3.19it/s]
100%|█████████████████████████████████████████████| 1/1 [00:00<00:00,  3.20it/s]
100%|█████████████████████████████████████████████| 1/1 [00:00<00:00,  3.08it/s]
100%|█████████████████████████████████████████████| 1/1 [00:00<00:00,  3.35it/s]
100%|█████████████████████████████████████████████| 1/1 [00:00<00:00,  3.38it/s]
100%|█████████████████████████████████████████████| 1/1 [00:00<00:00,  2.96it/s]
100%|█████████████████████████████████████████████| 1/1 [00:00<00:00,  3.19it/s]
100%|███████████

In [5]:
import pandas as pd
import numpy as np

# Experiments whose predictions are averaged to pick pseudo-labels.
exps = ['exp200', 'exp184', 'exp222', 'exp179']

# Start from the id/text columns of the first experiment's output and attach
# one prediction column per experiment.
df = pd.read_csv(f'submission_{exps[0]}.csv', usecols=['id', 'text'])

for exp in exps:
    df_temp = pd.read_csv(f'submission_{exp}.csv')[['id', 'preds']].rename(columns={'preds': exp})
    df = pd.merge(df, df_temp, on='id', how='left')

df['generated'] = df[exps].mean(axis=1)

# Only essays the ensemble is nearly certain about become pseudo-labels.
confident = (df['generated'] < 0.01) | (df['generated'] > 0.99)

if not confident.any():
    # Nothing is confident enough: fall back to a single row so pseudo.csv is
    # never empty.
    df = df.head(1).copy()
else:
    # .copy() so the new column is written to an independent frame rather
    # than to a slice of the original.
    df = df[confident].copy()

    # Distance to the nearest extreme: the score itself below 0.01,
    # one minus the score above 0.99.
    df['dist'] = np.minimum(df['generated'], 1 - df['generated'])

    # Keep at most the 1000 most confident essays.
    df = df.sort_values('dist').head(1000)

# Hard 0/1 labels for every retained row. Confident rows map exactly as
# before (<0.01 -> 0, >0.99 -> 1). The fallback row is thresholded at 0.5
# instead of keeping its raw probability, because this file is concatenated
# into the training labels of the TF-IDF stage and a fractional value there
# would not be a valid class label.
df['generated'] = (df['generated'] > 0.5).astype(float)

df = df[['text', 'generated']].rename(columns={'generated': 'label'})

df.to_csv('pseudo.csv', index=False)

In [6]:
import sys
import gc

import pandas as pd
from sklearn.model_selection import StratifiedKFold
import numpy as np
from sklearn.metrics import roc_auc_score
import numpy as np
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
)

from datasets import Dataset
from tqdm.auto import tqdm
from transformers import PreTrainedTokenizerFast

from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import VotingClassifier

# Competition files plus the external training set.
test = pd.read_csv('/kaggle/input/llm-detect-ai-generated-text/test_essays.csv')
sub = pd.read_csv('/kaggle/input/llm-detect-ai-generated-text/sample_submission.csv')
org_train = pd.read_csv('/kaggle/input/llm-detect-ai-generated-text/train_essays.csv')
train = pd.read_csv("/kaggle/input/daigt-v2-train-dataset/train_v2_drcat_02.csv", sep=',')

# Append the pseudo-labelled essays written by the previous cell.
pseudo = pd.read_csv('pseudo.csv')
train = pd.concat([train, pseudo])

# Trim surrounding whitespace on both frames.
for frame in (train, test):
    frame['text'] = frame['text'].str.strip()

# Drop repeated essays (first occurrence wins) and renumber the rows.
train = train.drop_duplicates(subset=['text']).reset_index(drop=True)

LOWERCASE = False
VOCAB_SIZE = 30522

# Byte-level BPE tokenizer that is trained from scratch further down.
raw_tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))
# The conditional must be parenthesised: without the parentheses the whole
# `[NFC()] + [Lowercase()]` sum is the "if" branch, so LOWERCASE = False
# produced an empty normaliser sequence and NFC was silently dropped.
# NOTE(review): this changes tokenisation relative to earlier runs of the
# notebook, which ran without NFC; re-validate downstream scores.
raw_tokenizer.normalizer = normalizers.Sequence(
    [normalizers.NFC()] + ([normalizers.Lowercase()] if LOWERCASE else [])
)
raw_tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]
trainer = trainers.BpeTrainer(vocab_size=VOCAB_SIZE, special_tokens=special_tokens)

# Tokenizer corpus: the test essays plus PERSUADE essays with a holistic score above 4.
hq_pers = pd.read_csv('/kaggle/input/persaude-corpus-2/persuade_2.0_human_scores_demo_id_github.csv')
hq_pers = hq_pers[hq_pers['holistic_essay_score'] > 4]
hq_pers.rename(columns={'full_text': 'text'}, inplace=True)

tokenizer_df = pd.concat([test, hq_pers])
text_only = tokenizer_df[['text']]
dataset = Dataset.from_pandas(text_only)

def train_corp_iter():
    """Yield the ``text`` column of ``dataset`` in consecutive slices of up to 1000 rows."""
    batch_size = 1000
    start = 0
    while start < len(dataset):
        yield dataset[start : start + batch_size]["text"]
        start += batch_size
        
# Learn the BPE vocabulary from the corpus batches.
raw_tokenizer.train_from_iterator(train_corp_iter(), trainer=trainer)

# Wrap the trained tokenizer in the transformers fast-tokenizer interface,
# declaring the same special tokens it was trained with.
special_token_kwargs = {
    "unk_token": "[UNK]",
    "pad_token": "[PAD]",
    "cls_token": "[CLS]",
    "sep_token": "[SEP]",
    "mask_token": "[MASK]",
}
tokenizer = PreTrainedTokenizerFast(tokenizer_object=raw_tokenizer, **special_token_kwargs)

# Saved to disk so the post-processing cell can reload it by directory name.
tokenizer.save_pretrained('persuade_tokenizer')

# Token lists for every test and training essay, in row order.
tokenized_texts_test = [
    tokenizer.tokenize(text) for text in tqdm(test['text'].tolist())
]

tokenized_texts_train = [
    tokenizer.tokenize(text) for text in tqdm(train['text'].tolist())
]
    
def dummy(text):
    """Return the argument unchanged.

    Passed to ``TfidfVectorizer`` as both ``tokenizer`` and ``preprocessor``
    because the documents it receives are already lists of tokens.
    """
    return text

# Settings shared by both vectorizers: token 3- to 5-grams over the
# pre-tokenised essays, with tokenising and preprocessing bypassed via dummy.
tfidf_kwargs = dict(
    ngram_range=(3, 5),
    lowercase=False,
    sublinear_tf=True,
    analyzer='word',
    tokenizer=dummy,
    preprocessor=dummy,
    token_pattern=None,
    strip_accents='unicode',
)

# Pass 1: fit on the test essays only, to collect the n-grams that occur in them.
vocab = TfidfVectorizer(**tfidf_kwargs).fit(tokenized_texts_test).vocabulary_

# Pass 2: restrict the feature space to that vocabulary, fit the weights on
# the training essays and transform both sets.
vectorizer = TfidfVectorizer(vocabulary=vocab, **tfidf_kwargs)
tf_train = vectorizer.fit_transform(tokenized_texts_train)
tf_test = vectorizer.transform(tokenized_texts_test)

del vectorizer
gc.collect()

y_train = train['label'].values

# With five or fewer test essays the sample submission is written unchanged;
# otherwise the soft-voting ensemble is fitted and its class-1 probability
# replaces the 'generated' column first.
if len(test.text.values) > 5:
    clf = MultinomialNB(alpha=0.02)
    sgd_model = SGDClassifier(max_iter=8000, tol=1e-4, loss="modified_huber")
    p6 = {
        'n_iter': 1750,
        'verbose': -1,
        'objective': 'binary',
        'metric': 'auc',
        'learning_rate': 0.05073909898961407,
        'colsample_bytree': 0.726023996436955,
        'colsample_bynode': 0.5803681307354022,
        'lambda_l1': 8.562963348932286,
        'lambda_l2': 4.893256185259296,
        'min_data_in_leaf': 115,
        'max_depth': 23,
        'max_bin': 898,
    }
    lgb = LGBMClassifier(**p6)
    cat = CatBoostClassifier(
        iterations=1250,
        verbose=0,
        l2_leaf_reg=6.6591278779517808,
        learning_rate=0.005689066836106983,
        allow_const_label=True,
        loss_function='CrossEntropy',
    )

    # Voting weights, in the same order as the estimators below.
    weights = [0.05, 0.225, 0.225, 0.5]
    estimators = [
        ('mnb', clf),
        ('sgd', sgd_model),
        ('lgb', lgb),
        ('cat', cat),
    ]
    ensemble = VotingClassifier(estimators=estimators, weights=weights, voting='soft', n_jobs=-1)
    ensemble.fit(tf_train, y_train)
    gc.collect()

    final_preds = ensemble.predict_proba(tf_test)[:, 1]
    sub['generated'] = final_preds

sub.to_csv('submission_960.csv', index=False)

  if _pandas_api.is_sparse(col):







  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/44865 [00:00<?, ?it/s]

In [7]:
import pandas as pd
import numpy as np

# Equal-weight average of four experiments, written as submission_936.csv.
exp_name = 'exp222'
exps = ['exp200', 'exp184', 'exp222', 'exp179']

# id/text come from exp_name's file; each experiment then adds one column
# named after itself.
df = pd.read_csv(f'submission_{exp_name}.csv', usecols=['id', 'text'])

for exp in exps:
    df_temp = pd.read_csv(f'submission_{exp}.csv')
    df_temp = df_temp[['id', 'preds']].rename(columns={'preds': exp})
    df = df.merge(df_temp, on='id', how='left')

df['generated'] = df[exps].mean(axis=1)
df[['id', 'generated']].to_csv('submission_936.csv', index=False)

In [8]:
import pandas as pd
import numpy as np

# Dataset paths of the second model family; the experiment id is the part
# after the last hyphen.
model_names = [
    '/kaggle/input/llm-daig-exp477',
    '/kaggle/input/llm-daig-exp478',
    '/kaggle/input/llm-daig-exp489',
    '/kaggle/input/llm-daig-exp492',
    '/kaggle/input/llm-daig-exp510',
    '/kaggle/input/llm-daig-exp512',
    '/kaggle/input/llm-daig-exp500',
    '/kaggle/input/llm-daig-exp511',
]
exps = [path.split('-')[-1] for path in model_names]

df = pd.read_csv(f'submission_{exps[0]}.csv', usecols=['id', 'text'])

# Attach one prediction column per experiment, keyed by essay id.
for exp in exps:
    df_temp = pd.read_csv(f'submission_{exp}.csv')
    df_temp = df_temp[['id', 'preds']].rename(columns={'preds': exp})
    df = df.merge(df_temp, on='id', how='left')

# Two groups of four, each averaged, then mixed 80/20.
major_group = ['exp512', 'exp510', 'exp492', 'exp489']
minor_group = ['exp511', 'exp500', 'exp478', 'exp477']
df['generated'] = df[major_group].mean(axis=1) * 0.8 + df[minor_group].mean(axis=1) * 0.2

df[['id', 'generated']].to_csv('submission_959.csv', index=False)

In [9]:
df

Unnamed: 0,id,text,exp477,exp478,exp489,exp492,exp510,exp512,exp500,exp511,generated
0,0000aaaa,Aaa bbb ccc .,0.999,0.999,0.07367,0.05225,0.09125,0.02225,0.1807,0.05792,0.159715
1,1111bbbb,Bbb ccc ddd .,0.9995,0.9995,0.047,0.02806,0.0484,0.005867,0.3333,0.12,0.14848
2,2222cccc,CCC ddd eee .,0.999,0.999,0.1395,0.04224,0.0904,0.01869,0.0964,0.00978,0.163375


In [10]:
import pandas as pd

# Load the three intermediate submissions and give each score column a
# distinct name so they can sit side by side.
sub960 = pd.read_csv('submission_960.csv').rename(columns={'generated': 'sub960'})
sub936 = pd.read_csv('submission_936.csv').rename(columns={'generated': 'sub936'})
sub959 = pd.read_csv('submission_959.csv').rename(columns={'generated': 'sub959'})

# The sample submission fixes the set and order of ids in the final output.
df = pd.read_csv('/kaggle/input/llm-detect-ai-generated-text/sample_submission.csv')

df = pd.merge(df, sub960[['id', 'sub960']], on='id', how='left')
df = pd.merge(df, sub936[['id', 'sub936']], on='id', how='left')
df = pd.merge(df, sub959[['id', 'sub959']], on='id', how='left')

# mask1: sub936 is unsure, so only sub960 is used.
# mask2: sub936 is confident, so it contributes 30%.
# The boundaries 0.1 and 0.9 are included in mask2. With strict comparisons
# on both masks, a score of exactly 0.1 or 0.9 matched neither and the row
# kept the sample submission's placeholder value.
mask1 = (df['sub936'] > 0.1) & (df['sub936'] < 0.9)
mask2 = (df['sub936'] <= 0.1) | (df['sub936'] >= 0.9)

df.loc[mask1, 'generated'] = df.loc[mask1, 'sub936'] * 0.0 + df.loc[mask1, 'sub960'] * 1
df.loc[mask2, 'generated'] = df.loc[mask2, 'sub936'] * 0.3 + df.loc[mask2, 'sub960'] * 0.7

# Final mix with the second model family.
df['generated'] = df['generated'] * 0.85 + df['sub959'] * 0.15

In [11]:
df

Unnamed: 0,id,generated,sub960,sub936,sub959
0,0000aaaa,0.108957,0.1,0.467775,0.159715
1,1111bbbb,0.787272,0.9,0.2475,0.14848
2,2222cccc,0.364506,0.4,0.408725,0.163375


In [12]:
# Narrow the blended frame to id and generated, then write it for the
# post-processing stage to read back.
df = df.loc[:, ['id', 'generated']]
df.to_csv('submission_main.csv', index=False)

In [13]:
import numpy as np
import pandas as pd  
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_auc_score
from sklearn.cluster import KMeans
import umap

from transformers import AutoTokenizer

# Attach the blended score to every test essay and reload the tokenizer that
# was saved under 'persuade_tokenizer'.
preds = pd.read_csv('submission_main.csv')
test = pd.read_csv('/kaggle/input/llm-detect-ai-generated-text/test_essays.csv').merge(
    preds, on='id', how='left'
)
tokenizer = AutoTokenizer.from_pretrained('persuade_tokenizer')

def dummy(text):
    """Identity function used as the TF-IDF ``tokenizer`` and ``preprocessor``.

    The vectorizer below is fed token lists, so nothing is done to them here.
    """
    return text
    

# tqdm is used below but was not imported in this cell.
from tqdm.auto import tqdm

# Number of nearest neighbours averaged per group.
K_NEIGHBOURS = 7

dfs = []
for prompt_id in test.prompt_id.unique():
    sub = test[test['prompt_id'] == prompt_id].copy()

    # Row-aligned membership flags: essays already scored as clearly human
    # ("nat") or clearly generated ("gen").
    scores = sub['generated'].to_numpy()
    is_nat = scores < 0.2
    is_gen = scores > 0.8

    # Too little data for this prompt: leave its scores untouched.
    if sub.shape[0] < 1000 or is_nat.sum() < 500 or is_gen.sum() < 500:
        sub['mult'] = 1
        dfs.append(sub)
        continue

    tokenized_texts_test = [tokenizer.tokenize(text) for text in tqdm(sub['text'].tolist())]

    vectorizer = TfidfVectorizer(ngram_range=(3, 5), lowercase=False, sublinear_tf=True, analyzer = 'word',
        tokenizer = dummy,
        preprocessor = dummy,
        token_pattern = None, strip_accents='unicode')

    texts_tfidf = vectorizer.fit_transform(tokenized_texts_test)

    umap_vectorizer = umap.UMAP(random_state=2023, n_components=2).fit(texts_tfidf)
    embeddings = umap_vectorizer.transform(texts_tfidf)

    # Both groups are subsets of this prompt's essays, so their coordinates
    # are taken from the same embedding array instead of being tokenised,
    # vectorised and projected a second time. This also guarantees that a
    # member's distance to itself is exactly zero.
    nat_embeddings = embeddings[is_nat]
    gen_embeddings = embeddings[is_gen]

    multipliers = []
    for row, emb in enumerate(embeddings):
        nat_dist = np.sort(np.sum(np.square(emb - nat_embeddings), axis=1))
        gen_dist = np.sort(np.sum(np.square(emb - gen_embeddings), axis=1))

        # Exclude the essay itself from its own group's neighbours. Essays in
        # neither group keep every neighbour; dropping one for them would
        # discard a real neighbour and skew the ratio.
        if is_nat[row]:
            nat_dist = nat_dist[1:]
        elif is_gen[row]:
            gen_dist = gen_dist[1:]

        nat_dist = nat_dist[:K_NEIGHBOURS].mean()
        gen_dist = gen_dist[:K_NEIGHBOURS].mean()

        # Ratio > 1 when the essay sits closer to generated essays than to
        # human ones; clipped to [0.75, 1.25].
        mult = nat_dist / (gen_dist+1e-5)
        mult = min(mult, 1.25)
        mult = max(mult, 0.75)

        multipliers.append(mult)

    sub['mult'] = multipliers
    dfs.append(sub)

test = pd.concat(dfs)

# Only uncertain scores are rescaled; confident ones stay as they are.
mask = (test['generated'] > 0.1) & (test['generated'] < 0.9)
test.loc[mask, 'generated'] = test.loc[mask, 'generated'] * test.loc[mask, 'mult']

In [14]:
test

Unnamed: 0,id,prompt_id,text,generated,mult
0,0000aaaa,2,Aaa bbb ccc.,0.108957,1
1,1111bbbb,3,Bbb ccc ddd.,0.787272,1
2,2222cccc,4,CCC ddd eee.,0.364506,1


In [15]:
# Keep only the id and generated columns and write the final file.
test = test.loc[:, ['id', 'generated']]
test.to_csv('submission.csv', index=False)

In [16]:
test

Unnamed: 0,id,generated
0,0000aaaa,0.108957
1,1111bbbb,0.787272
2,2222cccc,0.364506
