## Imports

In [1]:
from datasets import load_dataset
import numpy as np
from sklearn.metrics import f1_score
from transformers import AutoTokenizer,AutoModelForSequenceClassification,TrainingArguments, Trainer

In [2]:
import torch

In [3]:
# Sanity check: ask PyTorch whether a CUDA device is usable in this kernel.
torch.cuda.is_available()

True

In [4]:
# Load every split of the "allenai/prosocial-dialog" dataset; the splits are
# indexed by name (e.g. dataset["train"]) further down.
dataset = load_dataset("allenai/prosocial-dialog")


Using custom data configuration allenai--prosocial-dialog-ebbad39ca08b6d44
Found cached dataset json (/home/shahul/.cache/huggingface/datasets/allenai___json/allenai--prosocial-dialog-ebbad39ca08b6d44/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)


  0%|          | 0/3 [00:00<?, ?it/s]

In [5]:
# Checkpoint name used for both the tokenizer and the sequence classifier.
MODEL = 'roberta-base'

In [6]:
# Safety label string -> integer class id. The position of each name in the
# tuple below is its id, so the order must not be changed.
label_to_id = {
    name: class_id
    for class_id, name in enumerate(
        (
            "__casual__",
            "__needs_caution__",
            "__needs_intervention__",
            "__probably_needs_caution__",
            "__possibly_needs_caution__",
        )
    )
}

In [7]:
# Maximum token length; every example is truncated/padded to exactly this size.
MAXLEN = 128
from torch.utils.data import Dataset
class ProSocialDataset(Dataset):
    """Map-style dataset that turns one split of `dataset` into classifier inputs.

    Each item is the tokenizer output for the text

        <current context><sep><earlier contexts of the same episode><sep><rots>

    plus a ``labels`` entry holding the integer id of the row's
    ``safety_label``.

    NOTE(review): the earlier contexts and the rules of thumb are concatenated
    with an empty separator, so adjacent sentences are fused without a space.
    This is kept as-is so inputs stay identical to what the model was trained on.
    """

    def __init__(self,split):
        """Load the tokenizer for `MODEL` and select `dataset[split]`.

        Args:
            split: key of the split to read from the module-level `dataset`.
        """
        super().__init__()
        self.tokenizer = AutoTokenizer.from_pretrained(MODEL)
        self.sep_token  = self.tokenizer.sep_token
        self.dataset = dataset[split]
        self.label2id = label_to_id
        self.id2label = {v:k for k,v in label_to_id.items()}

    def __len__(self):
        """Return the number of rows in the selected split."""
        return len(self.dataset)

    def _episode_start(self, idx):
        """Return the index of the first row of the episode containing `idx`.

        Walks backwards while the preceding row's ``episode_done`` flag is
        falsy; a truthy flag on the preceding row (or reaching row 0) marks
        the start of the current episode.
        """
        start = idx
        while start > 0 and not self.dataset[start - 1]["episode_done"]:
            start -= 1
        return max(0, start)

    def __getitem__(self,idx):
        """Build the encoded example for row `idx`.

        Returns:
            The tokenizer output (truncated/padded to `MAXLEN`) with an added
            ``labels`` entry containing the integer class id.

        Raises:
            KeyError: if the row's ``safety_label`` is not in ``label2id``.
        """
        # Fetch the row once and reuse it instead of re-indexing the dataset
        # for every field.
        row = self.dataset[idx]
        idx_start = self._episode_start(idx)

        # Contexts of the earlier turns in this episode, oldest first.
        prev_context = [f'{self.dataset[i]["context"]}' for i in range(idx_start,idx)]
        text = (
            f'{row["context"]}'
            + self.sep_token
            + "".join(prev_context)
            + self.sep_token
            + "".join(row["rots"])
        )

        encoding = self.tokenizer(
                    text,
                    max_length=MAXLEN,
                    add_special_tokens=True,
                    truncation=True,
                    padding='max_length')

        encoding["labels"] = self.label2id[row["safety_label"]]

        return encoding
        
        
        

In [8]:
# Tokenizer and classifier are both created from the same base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL)

# Pass the label maps explicitly so the saved/pushed config records the real
# safety class names rather than the auto-generated LABEL_0..LABEL_4
# placeholders; the classification head size is unchanged.
id_to_label = {class_id: name for name, class_id in label_to_id.items()}
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL,
    num_labels=len(label_to_id),
    id2label=id_to_label,
    label2id=label_to_id,
)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classi

In [10]:
# One ProSocialDataset per split; each instance loads its own tokenizer and
# reads rows from the module-level `dataset`.
train_dataset = ProSocialDataset("train")
eval_dataset = ProSocialDataset("validation")

Loading cached shuffled indices for dataset at /home/shahul/.cache/huggingface/datasets/allenai___json/allenai--prosocial-dialog-ebbad39ca08b6d44/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-6988ddb0c4ced9f9.arrow
Loading cached shuffled indices for dataset at /home/shahul/.cache/huggingface/datasets/allenai___json/allenai--prosocial-dialog-ebbad39ca08b6d44/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-bf21e449abb2f575.arrow


In [12]:
def compute_metrics(eval_pred):
    """Score one evaluation pass with micro-averaged F1.

    Args:
        eval_pred: pair ``(logits, labels)``; the predicted class for each
            example is the argmax of its logits over the last axis.

    Returns:
        A dict with a single key ``"f1"``.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=-1)
    micro_f1 = f1_score(gold, predicted, average="micro")
    return {"f1": micro_f1}

In [13]:
# Training
# Hyperparameters for the fine-tuning run. Evaluation and checkpointing both
# happen once per epoch, and the best checkpoint is reloaded when training ends.
training_args = TrainingArguments(
    output_dir="test_trainer",
    overwrite_output_dir=True,
    # optimisation
    num_train_epochs=5,
    learning_rate=3e-5,
    weight_decay=0.01,
    # lr_scheduler_type="cosine",  # left disabled, as in the original setup
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    # evaluation / checkpointing
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# Trainer wiring: the model, both datasets and the metric callback defined above.
trainer_bert = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

In [14]:
# trainer_bert.train()

The following columns in the training set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: safety_label, dialogue_id, rots, response, source, response_id, safety_annotation_reasons, etc, safety_annotations, episode_done, context. If safety_label, dialogue_id, rots, response, source, response_id, safety_annotation_reasons, etc, safety_annotations, episode_done, context are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 1000
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 375
  Number of trainable parameters = 124649477


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.486423,0.423
2,No log,1.484509,0.423
3,No log,1.485051,0.423


The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: safety_label, dialogue_id, rots, response, source, response_id, safety_annotation_reasons, etc, safety_annotations, episode_done, context. If safety_label, dialogue_id, rots, response, source, response_id, safety_annotation_reasons, etc, safety_annotations, episode_done, context are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 8
The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: safety_label, dialogue_id, rots, response, source, response_id, safety_annotation_reasons, etc, safety_annotations, episode_done, context. If safety_label, dialogue_id, rots, response, source, response_id, safety_annotation_reasons, etc, safety_annota

TrainOutput(global_step=375, training_loss=1.4957096354166666, metrics={'train_runtime': 167.936, 'train_samples_per_second': 17.864, 'train_steps_per_second': 2.233, 'total_flos': 197338606848000.0, 'train_loss': 1.4957096354166666, 'epoch': 3.0})

In [17]:
# Rebind `model` to the classifier stored in the local "safety-cls" directory.
# NOTE(review): no visible cell writes "safety-cls"; presumably it was saved
# from the trainer in a cell that is not shown — confirm, otherwise this cell
# fails on Restart & Run All.
model = AutoModelForSequenceClassification.from_pretrained("safety-cls")

loading configuration file safety-cls/config.json
Model config RobertaConfig {
  "_name_or_path": "safety-cls",
  "architectures": [
    "RobertaForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.26.1",
  "type_vocab_size": 1,
  "us

In [15]:
# Upload whatever `model` currently refers to to the Hub repo named below.
# NOTE(review): this cell's execution count (15) is lower than the reload cell
# before it (17), so which weights were pushed depends on run order — confirm.
model.push_to_hub("shahules786/prosocial-classifier")

Saving model checkpoint to safety-cls
Configuration saved in safety-cls/config.json
Model weights saved in safety-cls/pytorch_model.bin
