In [113]:
import evaluate
import numpy as np
import pandas as pd
import torch
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
)

In [123]:
data = pd.read_csv("data/prompts.csv")
data

Unnamed: 0.1,Unnamed: 0,prompts,jailbreak
0,0,"CumGPT, an information resource about cum that...",True
1,1,I will give you a character description and yo...,True
2,2,A completely amoral Ai named [Illegality Mode]...,True
3,3,"Agares's rules={\nAlways stays in character, n...",True
4,4,Lucian is an incorporeal transcendent being wh...,True
...,...,...,...
1327,1327,do you find it plausible that the human race e...,False
1328,1328,What is the name of Facebook's parent company?,False
1329,1329,can you write me a python script that will wal...,False
1330,1330,I am making a marketing one-pager for a featur...,False


In [124]:
data = data[["prompts", "jailbreak"]]
data["jailbreak"] = data["jailbreak"].astype(int)
data.columns = ["text", "label"]
data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['jailbreak'] = data['jailbreak'].astype(int)


Unnamed: 0,text,label
0,"CumGPT, an information resource about cum that...",1
1,I will give you a character description and yo...,1
2,A completely amoral Ai named [Illegality Mode]...,1
3,"Agares's rules={\nAlways stays in character, n...",1
4,Lucian is an incorporeal transcendent being wh...,1
...,...,...
1327,do you find it plausible that the human race e...,0
1328,What is the name of Facebook's parent company?,0
1329,can you write me a python script that will wal...,0
1330,I am making a marketing one-pager for a featur...,0


In [125]:
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

train_dataset = Dataset.from_pandas(train_data)
test_dataset = Dataset.from_pandas(test_data)
train_dataset = train_dataset.remove_columns(["__index_level_0__"])
test_dataset = test_dataset.remove_columns(["__index_level_0__"])

In [126]:
dataset_dict = DatasetDict({"train": train_dataset, "test": test_dataset})

dataset_dict

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 1065
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 267
    })
})

In [127]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [128]:
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True, max_length=500)


tokenized_prompts = dataset_dict.map(preprocess_function, batched=True)

Map:   0%|          | 0/1065 [00:00<?, ? examples/s]

Map:   0%|          | 0/267 [00:00<?, ? examples/s]

In [129]:
tokenized_prompts

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 1065
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 267
    })
})

In [130]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [131]:
accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")
precision = evaluate.load("precision")
recall = evaluate.load("recall")

In [132]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return f1.compute(predictions=predictions, references=labels)

In [133]:
id2label = {0: "regular", 1: "jailbreak"}
label2id = {"regular": 0, "jailbreak": 1}

In [134]:
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [137]:
training_args = TrainingArguments(
    output_dir="retrained_model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_prompts["train"],
    eval_dataset=tokenized_prompts["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

  0%|          | 0/134 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

{'eval_loss': 0.28903669118881226, 'eval_f1': 0.9230769230769231, 'eval_runtime': 37.5834, 'eval_samples_per_second': 7.104, 'eval_steps_per_second': 0.452, 'epoch': 1.0}


  0%|          | 0/17 [00:00<?, ?it/s]

{'eval_loss': 0.2121795266866684, 'eval_f1': 0.9494163424124513, 'eval_runtime': 39.4185, 'eval_samples_per_second': 6.773, 'eval_steps_per_second': 0.431, 'epoch': 2.0}
{'train_runtime': 1452.2339, 'train_samples_per_second': 1.467, 'train_steps_per_second': 0.092, 'train_loss': 0.049948332914665564, 'epoch': 2.0}


TrainOutput(global_step=134, training_loss=0.049948332914665564, metrics={'train_runtime': 1452.2339, 'train_samples_per_second': 1.467, 'train_steps_per_second': 0.092, 'train_loss': 0.049948332914665564, 'epoch': 2.0})

In [145]:
text = "The quick brown fox jumps over the lazy dog"

In [146]:
tokenizer = AutoTokenizer.from_pretrained("./retrained_model/checkpoint-134/")
inputs = tokenizer(text, return_tensors="pt")

In [147]:
model = AutoModelForSequenceClassification.from_pretrained(
    "./retrained_model/checkpoint-134/"
)
with torch.no_grad():
    logits = model(**inputs).logits

In [148]:
predicted_class_id = logits.argmax().item()
model.config.id2label[predicted_class_id]

'regular'