In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score

# Local directory holding the fine-tuned checkpoint (a path under /kaggle/input,
# so it is read from an attached dataset rather than downloaded).
model_checkpoint = "/kaggle/input/roberta-large-argugpt-weights/RoBERTa-large-ArguGPT"
# Tokenizer and sequence-classification model are both loaded from the same
# checkpoint directory so that vocabulary and weights match.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)



In [2]:
# Inference-only configuration: outputs go to the working directory, one example
# per device per evaluation step, and no experiment-tracking integration.
args = TrainingArguments(
    output_dir='.',
    report_to='none',  # change to wandb while training with internet enabled
    per_device_eval_batch_size=1,
)

In [3]:
def preprocess_function(examples):
    """Tokenize the ``'text'`` field of a batch of examples.

    Sequences longer than 512 tokens are truncated; no padding is applied here.

    Args:
        examples: Mapping-like batch that exposes the raw essays under ``'text'``.

    Returns:
        The encoding produced by the module-level ``tokenizer``.
    """
    essays = examples['text']
    encoded = tokenizer(essays, truncation=True, padding=False, max_length=512)
    return encoded

In [4]:
def compute_metrics(eval_pred):
    """Compute ROC AUC from raw classifier logits.

    Args:
        eval_pred: A ``(logits, labels)`` pair. ``logits`` is indexed as a 2-D
            array whose last axis enumerates the classes; column 1 is used as
            the positive-class score.

    Returns:
        dict: ``{"roc_auc": <float>}``.
    """
    logits, labels = eval_pred
    # Softmax is invariant to subtracting a per-row constant. Shifting by the
    # row maximum keeps every exponent <= 0, so np.exp cannot overflow to inf
    # (which would otherwise yield inf / inf = NaN probabilities).
    shifted = logits - np.max(logits, axis=-1, keepdims=True)
    exp_shifted = np.exp(shifted)
    probs = exp_shifted / np.sum(exp_shifted, axis=-1, keepdims=True)
    auc = roc_auc_score(labels, probs[:, 1], multi_class='ovr')
    return {"roc_auc": auc}

In [5]:
# The Trainer is used here purely as a batched inference harness around the
# loaded model; compute_metrics is wired in for evaluations that carry labels.
trainer = Trainer(
    model=model,
    args=args,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [6]:
# Read the competition test essays and wrap them in a datasets.Dataset.
test = pd.read_csv(
    '/kaggle/input/llm-detect-ai-generated-text/test_essays.csv'
)
test_ds = Dataset.from_pandas(test)
# Tokenize in batches; the encoded columns are added alongside the originals.
test_ds_enc = test_ds.map(function=preprocess_function, batched=True)

  0%|          | 0/1 [00:00<?, ?ba/s]

In [7]:
# Run the model over the encoded test set; raw logits are read from
# `.predictions` in the next cell.
test_preds = trainer.predict(test_dataset=test_ds_enc)

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [8]:
logits = test_preds.predictions
# Numerically stable softmax: subtracting the row-wise maximum leaves the result
# unchanged mathematically but prevents np.exp from overflowing to inf, which
# would put NaN into the submission file.
shifted_logits = logits - np.max(logits, axis=-1, keepdims=True)
exp_logits = np.exp(shifted_logits)
probs = exp_logits / np.sum(exp_logits, axis=-1, keepdims=True)
# Column 1 of the class probabilities is written out as the 'generated' score.
sub = pd.DataFrame({'id': test['id'], 'generated': probs[:, 1]})
sub.to_csv('submission.csv', index=False)
sub.head()

Unnamed: 0,id,generated
0,0000aaaa,0.998002
1,1111bbbb,0.997703
2,2222cccc,0.997144
