# Simple baseline

Simple and very small (!) model trained on the real train dataset for this competition: 
https://www.kaggle.com/datasets/thedrcat/daigt-v2-train-dataset

**I'd appreciate it if you could upvote both the dataset and the notebook if you find them helpful! 🙏❤️**

I'm planning to keep maintaining the train dataset with more data and to publish properly instrumented training/inference notebooks with larger models (coming soon!).

In [28]:
import transformers
import datasets
import pandas as pd
import numpy as np

In [29]:
# Path (inside a Kaggle input dataset) of the pretrained DeBERTa-v3 xsmall checkpoint.
# Both the tokenizer and the sequence-classification model are loaded from this location.
model_checkpoint = "/kaggle/input/huggingfacedebertav3variants/deberta-v3-xsmall"

In [30]:
from datasets import Dataset

In [31]:
# Load the DAIGT v2 training CSV; the rest of the notebook relies on its
# `text`, `label`, `prompt_name` and `source` columns.
df = pd.read_csv('/kaggle/input/daigt-v2-train-dataset/train_v2_drcat_01.csv')

In [32]:
# Hold out every essay written for the 'Car-free cities' prompt as the validation set;
# all remaining prompts form the training pool.
is_holdout_prompt = df.prompt_name == 'Car-free cities'
train = df[~is_holdout_prompt].reset_index(drop=True)
valid = df[is_holdout_prompt].reset_index(drop=True)
train.head()

Unnamed: 0,text,label,prompt_name,source,RDizzl3_seven
0,Phones\n\nModern humans today are always on th...,0,Phones and driving,persuade_corpus,False
1,This essay will explain if drivers should or s...,0,Phones and driving,persuade_corpus,False
2,Driving while the use of cellular devices\n\nT...,0,Phones and driving,persuade_corpus,False
3,Phones & Driving\n\nDrivers should not be able...,0,Phones and driving,persuade_corpus,False
4,Cell Phone Operation While Driving\n\nThe abil...,0,Phones and driving,persuade_corpus,False


In [33]:
# Undersample the persuade_corpus: keep a fixed-seed random draw of 6000 of its essays
# and leave every other source untouched.
is_persuade = train['source'] == 'persuade_corpus'
not_persuade_df = train[~is_persuade]
persuade_df = train[is_persuade]
sampled_persuade_df = persuade_df.sample(n=6000, random_state=42)

In [34]:
# Testing idea from discussion with @nbroad about limited characters in human essays.
# Each essay string is treated as an iterable of characters, so the union over all essays
# is the set of distinct characters that occur anywhere in that group of texts.
all_human = set().union(*sampled_persuade_df.text)
other = set().union(*not_persuade_df.text)

In [35]:
# Characters that occur in the non-persuade essays but never in the sampled persuade essays.
# The set difference is sorted so the resulting string (and the printed cell output) is the
# same on every run; plain set iteration order for strings can change between kernels.
chars_to_remove = ''.join(sorted(other - all_human))
print(chars_to_remove)

🍭🏯🙀🤝🏜择д📚手集🍽🇸🌴🎬中🌈а💥🛋📱🎸^👏🔬😎📞🏢💃👯🦐🥭🏡🎹💜🍰с🏰🤞必🇷应选😴🌃✨🎄🤛😨😱🐭响💆😘🔭🤫🏃力😝é🍣🍕🎓🍲📅🍷🚌🥕🕵🇺📖在🐕唯á🏫🌭🧦🐟🍖🧘🚫📉将🕰💚😭🏞🌿👬ç🍿注🥯🚴🇵💉🤢🕹🌫💇完😢🌲🐒🙌💪🌷🍳是👕🦄🛫¬🗣部🏟意机禁🔜ó🎅💫🌊🍜🥪🚗😜🇪🦁🐸💤👀🌻🏀🥨🍮🔧💨时♀😓🌽🎤📸🥩🌐💰💸🤟😠有🎊全‘📊取🛸🎮�🤤🧙😬🥖💬😹🎃💖╯😕🏻…👦о🧽🏦—💁🌨🥑💦​🐝🏼🚕–😄🎥🛠😆驶护🎢🙃💧📧😋🛬☀🤩🙏🔑🎩😁保🐳🛑‍📹🚚👨🍁💅🍓😮”🕺😈🤒🥲п者🌏😻📷🤓💯­🙊🌠路🥗🥦🦎💡👮🌌😲🐶🦸🧬🌄🐦🙄📝ā🎉🛍□🇫🎶😅🍞“🍟🧡🛣🥳🐻🎨💭🏄⚽🧐🏆🇧🛀🌸🥤🥟🧩


In [36]:
# Translation table whose third argument lists the characters to delete:
# str.translate drops every character of `chars_to_remove` and leaves the rest unchanged.
translation_table = str.maketrans('', '', chars_to_remove)


def remove_chars(s):
    """Return `s` with every character listed in `chars_to_remove` deleted.

    Args:
        s: The essay text to clean.

    Returns:
        A new string; characters not in `chars_to_remove` are kept in their original order.
    """
    return s.translate(translation_table)


# `not_persuade_df` was produced by boolean-indexing `train`, so assigning a column into it
# raises pandas' SettingWithCopyWarning. `.assign` builds a new DataFrame with the cleaned
# `text` column instead of writing into that slice.
not_persuade_df = not_persuade_df.assign(text=not_persuade_df['text'].apply(remove_chars))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  not_persuade_df['text'] = not_persuade_df['text'].apply(remove_chars)


In [37]:
# Rebuild the training frame from the cleaned non-persuade essays plus the persuade sample,
# then shuffle all rows with a fixed seed and renumber the index.
train = (
    pd.concat([not_persuade_df, sampled_persuade_df])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
train.source.value_counts()

source
persuade_corpus                       6000
mistral7binstruct_v2                  2413
llama2_chat                           2409
mistral7binstruct_v1                  2406
original_moth                         2405
chat_gpt_moth                         2404
llama_70b_v1                          1138
falcon_180b_v1                        1031
train_essays                           979
darragh_claude_v7                      946
darragh_claude_v6                      941
mistralai/Mistral-7B-Instruct-v0.1     400
NousResearch/Llama-2-7b-chat-hf        400
cohere-command                         350
palm-text-bison1                       349
radek_500                              322
radekgpt4                              100
Name: count, dtype: int64

In [38]:
# Wrap the pandas frames as Hugging Face `Dataset` objects (train first, then validation).
ds_train, ds_valid = (Dataset.from_pandas(frame) for frame in (train, valid))

In [39]:
# Display the training Dataset summary (feature names and number of rows).
ds_train

Dataset({
    features: ['text', 'label', 'prompt_name', 'source', 'RDizzl3_seven'],
    num_rows: 24993
})

In [40]:
from transformers import AutoTokenizer
# Load the tokenizer stored alongside the checkpoint, requesting the fast implementation.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [41]:
def preprocess_function(examples):
    """Tokenize the `text` field of a batch of examples.

    Args:
        examples: Mapping with a 'text' entry holding the essays to encode.

    Returns:
        Whatever the module-level `tokenizer` returns for those texts, with sequences
        truncated to at most 128 tokens and padding enabled.
    """
    encode_options = {'max_length': 128, 'padding': True, 'truncation': True}
    return tokenizer(examples['text'], **encode_options)

In [42]:
# Tokenize the training set; batched=True hands preprocess_function batches of examples
# rather than one example at a time.
ds_train_enc = ds_train.map(preprocess_function, batched=True)

  0%|          | 0/25 [00:00<?, ?ba/s]

In [43]:
# Tokenize the held-out validation set with the same preprocessing as the training set.
ds_valid_enc = ds_valid.map(preprocess_function, batched=True)

  0%|          | 0/3 [00:00<?, ?ba/s]

In [44]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Two output classes; the logit/probability at index 1 is what the notebook later uses
# as the "generated" score.
num_labels = 2
# Load the pretrained encoder from the checkpoint with a sequence-classification head
# sized for `num_labels` outputs.
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at /kaggle/input/huggingfacedebertav3variants/deberta-v3-xsmall and are newly initialized: ['classifier.weight', 'classifier.bias', 'pooler.dense.weight', 'pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [45]:
# Training configuration shared by the cells below.
metric_name = "roc_auc"   # key reported by compute_metrics; passed as metric_for_best_model
model_name = "deberta-xsmall"   # prefix of the Trainer output directory
train_batch_size = 4   # per-device batch size for training
eval_batch_size = 32   # per-device batch size for evaluation / prediction
grad_acc = 4   # gradient accumulation steps; train_batch_size * grad_acc samples per optimizer step on one device

In [46]:
# Optimizer steps in one pass over the training data: each step consumes
# `train_batch_size * grad_acc` samples (this count assumes a single device).
samples_per_step = train_batch_size * grad_acc
num_steps = len(train) // samples_per_step
num_steps

1562

In [47]:
# Evaluate and checkpoint on the same schedule: every third of the expected step count.
checkpoint_interval = num_steps // 3

args = TrainingArguments(
    output_dir=f"{model_name}-finetuned",
    # optimisation schedule
    num_train_epochs=1,
    learning_rate=2e-5,
    weight_decay=0.01,
    # batching
    per_device_train_batch_size=train_batch_size,
    per_device_eval_batch_size=eval_batch_size,
    gradient_accumulation_steps=grad_acc,
    # evaluation and checkpointing
    evaluation_strategy="steps",
    eval_steps=checkpoint_interval,
    save_strategy="steps",
    save_steps=checkpoint_interval,
    load_best_model_at_end=False,
    metric_for_best_model=metric_name,
    # logging
    report_to='none',  # change to wandb after enabling internet access
)

In [48]:
from sklearn.metrics import roc_auc_score

def compute_metrics(eval_pred):
    """Compute ROC AUC of the class-1 probability from raw model logits.

    Args:
        eval_pred: A `(logits, labels)` pair. `logits` is array-like with one row per
            example and one column per class; `labels` holds the true class ids.

    Returns:
        dict: `{"roc_auc": <score>}`.
    """
    logits, labels = eval_pred
    logits = np.asarray(logits)
    # Softmax with the row maximum subtracted first. The result is mathematically the same,
    # but every exponent is <= 0, so np.exp cannot overflow to inf and turn the
    # probabilities into nan for very confident predictions.
    shifted = logits - np.max(logits, axis=-1, keepdims=True)
    exp_shifted = np.exp(shifted)
    probs = exp_shifted / np.sum(exp_shifted, axis=-1, keepdims=True)
    # Binary task: score on the probability of class 1. No `multi_class` argument is
    # passed because scikit-learn only consults it for multiclass targets.
    auc = roc_auc_score(labels, probs[:, 1])
    return {"roc_auc": auc}

In [49]:
# Assemble the Trainer: model and training arguments, the encoded train/validation sets,
# the tokenizer, and the ROC-AUC metric callback.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=ds_train_enc,
    eval_dataset=ds_valid_enc,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [50]:
# Fine-tune for the single epoch configured in `args`; validation metrics are computed
# every `num_steps // 3` steps.
trainer.train()

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Roc Auc
520,0.1191,0.459129,0.983985
1040,0.0491,0.14839,0.999789
1560,0.0442,0.299756,0.998314


TrainOutput(global_step=1562, training_loss=0.07003728268851697, metrics={'train_runtime': 429.4666, 'train_samples_per_second': 58.195, 'train_steps_per_second': 3.637, 'total_flos': 411587558621184.0, 'train_loss': 0.07003728268851697, 'epoch': 1.0})

In [51]:
# Load the competition test essays.
test = pd.read_csv('/kaggle/input/llm-detect-ai-generated-text/test_essays.csv')
# Convert to a Dataset and tokenize with the same preprocessing used for train/validation.
test_ds = Dataset.from_pandas(test)
test_ds_enc = test_ds.map(preprocess_function, batched=True)

  0%|          | 0/1 [00:00<?, ?ba/s]

In [52]:
# Run the fine-tuned model over the encoded test set; `test_preds.predictions` is read
# as the raw logits in the next cell.
test_preds = trainer.predict(test_ds_enc)

In [53]:
# Turn the raw test logits into class probabilities and write the submission file.
logits = test_preds.predictions
# Softmax with the row maximum subtracted first: same result mathematically, but np.exp
# never sees a positive exponent, so large logits cannot overflow to inf and yield nan.
shifted = logits - np.max(logits, axis=-1, keepdims=True)
exp_shifted = np.exp(shifted)
probs = exp_shifted / np.sum(exp_shifted, axis=-1, keepdims=True)
# One row per test essay: its id and the probability of class 1 ("generated").
sub = pd.DataFrame({'id': test['id'], 'generated': probs[:, 1]})
sub.to_csv('submission.csv', index=False)
sub.head()

Unnamed: 0,id,generated
0,0000aaaa,0.022306
1,1111bbbb,0.996725
2,2222cccc,0.996295


In [54]:
# Per-source validation score: for each remaining source, evaluate ROC AUC on that source's
# validation essays combined with the `train_essays` validation rows
# (presumably the human-written reference set; confirm against the dataset description).
skipped_sources = ('train_essays', 'persuade_corpus', 'original_moth')
res = []
for src in valid.source.unique():
    if src not in skipped_sources:
        subset = valid[valid['source'].isin([src, 'train_essays'])]
        subset_enc = Dataset.from_pandas(subset).map(preprocess_function, batched=True)
        score = trainer.evaluate(subset_enc)['eval_roc_auc']
        res.append(f'{src}: {score}')

for line in res:
    print(line)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

darragh_claude_v7: 0.9933166248955723
darragh_claude_v6: 0.9930759101142688
radek_500: 0.9956140350877194
llama_70b_v1: 0.9932920536635707
falcon_180b_v1: 0.9928989139515455
chat_gpt_moth: 0.9952823234556981
mistral7binstruct_v2: 0.9915413533834586
llama2_chat: 0.9922723475355054
mistral7binstruct_v1: 0.9908103592314119
radekgpt4: 0.9947869674185463
