In [1]:
from datasets import load_dataset
import torch

ds = load_dataset("aadityaubhat/GPT-wiki-intro", split="train")
ds = ds.rename_column("generated_intro", "fake")
ds = ds.rename_column("wiki_intro", "real")

ds = ds.train_test_split(test_size=0.33333)
ds = ds.with_format("torch")
ds_test = ds['test']
ds_train = ds['train']
ds

Using custom data configuration aadityaubhat--GPT-wiki-intro-10ad8b711a5f3880
Found cached dataset csv (C:/Users/andre/.cache/huggingface/datasets/aadityaubhat___csv/aadityaubhat--GPT-wiki-intro-10ad8b711a5f3880/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)


DatasetDict({
    train: Dataset({
        features: ['id', 'url', 'title', 'real', 'fake', 'title_len', 'wiki_intro_len', 'generated_intro_len', 'prompt', 'generated_text', 'prompt_tokens', 'generated_text_tokens'],
        num_rows: 100000
    })
    test: Dataset({
        features: ['id', 'url', 'title', 'real', 'fake', 'title_len', 'wiki_intro_len', 'generated_intro_len', 'prompt', 'generated_text', 'prompt_tokens', 'generated_text_tokens'],
        num_rows: 50000
    })
})

In [2]:
device = "cuda" if torch.cuda.is_available else "cpu"
print("Using device:", device)

Using device: cuda


In [19]:
from tqdm import tqdm
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
from transformers.pipelines.pt_utils import KeyDataset

tokenizer = AutoTokenizer.from_pretrained("roberta-large-openai-detector", max_length=512)
model = AutoModelForSequenceClassification.from_pretrained("roberta-large-openai-detector").to(device)
model.eval()

pipe = pipeline("text-classification", device=0, model=model, tokenizer=tokenizer)

labels = list(model.config.id2label.values())
true_neg = 0
false_pos = 0
dataset = ds['test'].select(range(1000))
print("Testing negatives...")
for out in tqdm(pipe(KeyDataset(dataset, "real"), batch_size=64, truncation=True), total=len(dataset)):
    if out['label'] == labels[0]:
        false_pos += 1
    else:
        true_neg += 1
print(f"Acc: {true_neg / (true_neg + false_pos)}\n")

true_pos = 0
false_neg = 0
print("Testing positives...")
for out in tqdm(pipe(KeyDataset(dataset, "fake"), batch_size=64, truncation=True), total=len(dataset)):
    if out['label'] == labels[0]:
        true_pos += 1
    else:
        false_neg += 1
print(f"Acc: {true_pos / (true_pos + false_neg)}")


Some weights of the model checkpoint at roberta-large-openai-detector were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Testing negatives...


100%|██████████| 1000/1000 [02:10<00:00,  7.68it/s]


Acc: 0.981

Testing positives...


100%|██████████| 1000/1000 [01:18<00:00, 12.76it/s]

Acc: 0.641





In [20]:
tot = true_pos+true_neg+false_pos+false_neg
print(f"""
Accuracy:           {(true_pos+true_neg) / (tot)}
Precision:          {true_pos/ (true_pos + false_pos)}
Recall:             {true_pos/ (true_pos + false_neg)}
n-samples:          {tot}

False positives:    {false_pos / tot}
False negatives:    {false_neg / tot}
""")


Accuracy:           0.811
Precision:          0.9712121212121212
Recall:             0.641
n-samples:          2000

False positives:    0.0095
False negatives:    0.1795



In [16]:
pipe("I like you. I love you")

[{'label': 'LABEL_0', 'score': 0.7373350858688354}]

In [None]:
def tokenized_length(text):
    return len(tokenizer(text, return_tensors="pt")['input_ids'][0])

longest = max(ds_test['real'], key=tokenized_length, )
tokens = tokenizer(longest, return_tensors="pt")['input_ids'][0]
print(len(longest))
print(len(tokens))

truncated_tokens = tokenizer(longest, truncation=True, return_tensors="pt")['input_ids'][0]
truncated_text = tokenizer.decode(truncated_tokens)
print(len(truncated_text))
print(len(longest))

3974
1465


In [None]:
tokenized_data = ds.map()

In [None]:
epochs = 1
lr = 0.002
variance_limit = 1e-10

