## Dataset

In [25]:
from datasets import load_dataset

# Hub repository id of the Social Chemistry 101 dataset (spelling kept exactly as used for loading).
SOCIAL_CHEM_REPO = "metaeval/social-chemestry-101"
dataset = load_dataset(SOCIAL_CHEM_REPO)

In [26]:
# Show the DatasetDict summary: available splits, column names and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['area', 'm', 'split', 'rot-agree', 'rot-categorization', 'rot-moral-foundations', 'rot-char-targeting', 'rot-bad', 'rot-judgment', 'action', 'action-agency', 'action-moral-judgment', 'action-agree', 'action-legal', 'action-pressure', 'action-char-involved', 'action-hypothetical', 'situation', 'situation-short-id', 'rot', 'rot-id', 'rot-worker-id', 'breakdown-worker-id', 'n-characters', 'characters'],
        num_rows: 355922
    })
})

In [33]:
# Inspect a single raw record (index 10) to see what each column's values look like.
dataset['train'][10]

{'area': 'amitheasshole',
 'm': 1,
 'split': 'train',
 'rot-agree': 3.0,
 'rot-categorization': 'morality-ethics',
 'rot-moral-foundations': 'care-harm|loyalty-betrayal',
 'rot-char-targeting': 'char-1',
 'rot-bad': 0,
 'rot-judgment': "it's okay",
 'action': 'feeling angry when you find out your roommate lied to you',
 'action-agency': 'experience',
 'action-moral-judgment': None,
 'action-agree': 3.0,
 'action-legal': None,
 'action-pressure': None,
 'action-char-involved': None,
 'action-hypothetical': None,
 'situation': "telling my roommate with 2 months notice that I'm going to move out, then after he bought a house to live in, change my mind and getting a new roommate",
 'situation-short-id': 'reddit/amitheasshole/adwxny',
 'rot': "If you find out that your roommate lied to you, it's okay to feel angry.",
 'rot-id': 'rot/reddit/amitheasshole/adwxny/3DH6GAKTY14IXU2D5WHPXGPIFGTZYG/129/4',
 'rot-worker-id': 129,
 'breakdown-worker-id': 17,
 'n-characters': 3,
 'characters': 'narrat

Each 'rot' (rule of thumb) is composed of a 'rot-judgment' plus an 'action'. Every 'situation' has 1–5 RoTs (see https://github.com/mbforbes/social-chemistry-101).

##### RoT attributes:
- rot
- rot-agree: asks how many people probably agree with the RoT as stated. How universally held is the rule-of-thumb as a belief?
    - 0: almost no one
    - 1: uncommon
    - 2: controversial
    - 3: common
    - 4: universal
- rot-categorization: the category of the rule of thumb. (15 in total)
- rot-moral-foundations: define fundamental axes of morality. (5 in total)
- rot-char-targeting: the character in the situation to whom you would tell this rule-of-thumb, e.g. char-1 = my roommate (characters[1])
- rot-bad: 0 or 1, 98% of elements have the value 0
- rot-judgment: the judgment of the action

##### Action attributes:
- action
- action-agency: agency or experience (designed to let workers distinguish RoTs that involve an agentive action from those that describe an experience)
- action-moral-judgment: This is an intuitive reaction of whether something is good or bad.
    - -2: very bad
    - -1: bad
    - 0: expected/OK
    - 1: good
    - 2: very good
- action-agree: what portion of people probably agree with the judgment given the action
    - 0: rare (< 1%)
    - 1: few (5-25%)
    - 2: controversial (~50%)
    - 3: most (75-90%)
    - 4: all (> 99%)
- action-legal: corresponds to prescriptive norms: what one ought to do (legal, tolerated, illegal, null)
- action-pressure: cultural pressure, measures to what degree someone feels socially influenced to do (or avoid) an action.
    - -2: strongly-against
    - -1: against
    - 0: discretionary
    - 1: for
    - 2: strongly for
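- action-char-involved: presumably the character (e.g. char-1) who performs the action or has the experience (TODO: confirm against the dataset README)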
- action-hypothetical: indicates whether the candidate character is explicitly taking the action in the situation: explicit-no, probable-no, hypothetical, probable, explicit

In [27]:
def _in_train_split(example):
    """Return True when the row's 'split' column marks it as training data."""
    return example['split'] == 'train'


# Keep only the rows tagged 'train'. Filtering the DatasetDict returns another
# DatasetDict, so the rows stay under the original 'train' key.
train_dataset = dataset.filter(_in_train_split)
train_dataset

DatasetDict({
    train: Dataset({
        features: ['area', 'm', 'split', 'rot-agree', 'rot-categorization', 'rot-moral-foundations', 'rot-char-targeting', 'rot-bad', 'rot-judgment', 'action', 'action-agency', 'action-moral-judgment', 'action-agree', 'action-legal', 'action-pressure', 'action-char-involved', 'action-hypothetical', 'situation', 'situation-short-id', 'rot', 'rot-id', 'rot-worker-id', 'breakdown-worker-id', 'n-characters', 'characters'],
        num_rows: 233501
    })
})

In [28]:
def _in_test_split(example):
    """Return True when the row's 'split' column marks it as test data."""
    return example['split'] == 'test'


# Keep only the rows tagged 'test'. The result is still a DatasetDict, and the
# test rows are stored under its 'train' key (see the output below).
test_dataset = dataset.filter(_in_test_split)
test_dataset

DatasetDict({
    train: Dataset({
        features: ['area', 'm', 'split', 'rot-agree', 'rot-categorization', 'rot-moral-foundations', 'rot-char-targeting', 'rot-bad', 'rot-judgment', 'action', 'action-agency', 'action-moral-judgment', 'action-agree', 'action-legal', 'action-pressure', 'action-char-involved', 'action-hypothetical', 'situation', 'situation-short-id', 'rot', 'rot-id', 'rot-worker-id', 'breakdown-worker-id', 'n-characters', 'characters'],
        num_rows: 29239
    })
})

# Binary classification BERT

### Load BERT dataset

In [1]:
import datasets

# On-disk location of the already tokenized dataset (input_ids / attention_mask / labels).
path = "/home/IAIS/gplepi/entero/data_social_norms/binary_classification_bert"
bert_dataset = datasets.load_from_disk(dataset_path=path)
bert_dataset

  from .autonotebook import tqdm as notebook_tqdm
  table = cls._concat_blocks(blocks, axis=0)


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 467002
    })
    val: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 58468
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 58478
    })
})

## Training

In [2]:
from transformers import AutoTokenizer
import numpy as np

# Name of the pretrained checkpoint whose tokenizer is loaded.
BASE_CHECKPOINT = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(BASE_CHECKPOINT)

In [3]:
# Class names ordered by their integer id (tuple index == label id).
_CLASS_NAMES = ("NOT-APPLIED", "APPLIED")

# Forward (id -> name) and reverse (name -> id) lookup tables for the two classes.
id2label = dict(enumerate(_CLASS_NAMES))
label2id = {name: idx for idx, name in id2label.items()}

In [4]:
from transformers import AutoModelForSequenceClassification

# Load the pretrained encoder with a 2-way sequence-classification head and
# register the human-readable label names on the model.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    label2id=label2id,
    id2label=id2label,
    num_labels=2,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
import torch

# Report whether a CUDA device is visible to PyTorch.
print(torch.cuda.is_available())
# Use the first GPU when one is available; otherwise fall back to the CPU so
# that `model.to(device)` does not fail on machines without CUDA.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

True


In [6]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    # Output location and experiment tracking.
    output_dir="output_social_norms/bert_binary_classification",
    report_to="wandb",
    push_to_hub=False,
    # Optimisation settings.
    num_train_epochs=3,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    #auto_find_batch_size=True,
    # Evaluate and save a checkpoint every 3000 steps.
    evaluation_strategy="steps",
    eval_steps=3000,
    save_strategy="steps",
    save_steps=3000,
    # Model selection: reload the checkpoint with the highest F1 when training ends.
    load_best_model_at_end=True,
    metric_for_best_model='f1',
    greater_is_better=True,
)

In [7]:
from transformers import DataCollatorWithPadding
import evaluate

# Collator that pads the examples of a batch with the tokenizer's padding settings.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Load the four evaluation metrics in one pass; each variable is named after its metric id.
accuracy, precision, recall, f1 = (
    evaluate.load(metric_id)
    for metric_id in ("accuracy", "precision", "recall", "f1")
)

def compute_metrics(eval_preds):
    """Compute precision, recall, F1 and accuracy for one evaluation pass.

    Args:
        eval_preds: a pair ``(logits, labels)``; the predicted class of each
            example is the argmax over the last axis of ``logits``.

    Returns:
        dict with the keys ``"precision"``, ``"recall"``, ``"f1"`` and
        ``"accuracy"``, each mapped to the corresponding metric value.
    """
    raw_scores, gold_labels = eval_preds
    predicted_classes = np.argmax(raw_scores, axis=-1)

    # Each metric object returns a dict keyed by its own name, so the same
    # string serves as both the result key and the lookup key.
    metric_by_name = {
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "accuracy": accuracy,
    }
    return {
        name: metric.compute(predictions=predicted_classes, references=gold_labels)[name]
        for name, metric in metric_by_name.items()
    }

In [8]:
# Wire the model, the training/validation splits and the metric function into a Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=bert_dataset['train'],
    eval_dataset=bert_dataset['val'],
)

In [9]:
# Start fine-tuning; the table below reports loss and metrics at each evaluation step.
trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


[34m[1mwandb[0m: Currently logged in as: [33mgjergjplepi12[0m ([33msocial-chem-101-team[0m). Use [1m`wandb login --relogin`[0m to force relogin


You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
3000,0.2406,0.221082,0.933387,0.877608,0.904638,0.907488
6000,0.2126,0.193477,0.911729,0.929568,0.920562,0.919785
9000,0.1595,0.186887,0.9191,0.929192,0.924118,0.923702
12000,0.1537,0.188436,0.937261,0.912157,0.924538,0.925549




## Load model and evaluate

In [15]:
from transformers import AutoModelForSequenceClassification

# Directory of the saved checkpoint to evaluate.
# NOTE(review): the step (21000) is hard-coded; confirm this is the checkpoint intended for evaluation.
checkpoint_dir = "/home/IAIS/gplepi/entero/output_social_norms/bert_binary_classification/checkpoint-21000"
finetuned_model = AutoModelForSequenceClassification.from_pretrained(checkpoint_dir)

In [16]:
# Display the module tree of the loaded model to check its architecture.
finetuned_model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [17]:
# Re-create the Trainer around the fine-tuned checkpoint so it can be used for prediction.
trainer = Trainer(
    args=training_args,
    model=finetuned_model,  # the fine-tuned model is passed here as the model parameter
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=bert_dataset['train'],
    eval_dataset=bert_dataset['val'],
)

In [18]:
# Run the model over the test split and print the aggregate metrics of that run.
predictions_output = trainer.predict(test_dataset=bert_dataset['test'])
print(predictions_output.metrics)

{'test_loss': 0.20166319608688354, 'test_precision': 0.9313823377517645, 'test_recall': 0.9252026403091761, 'test_f1': 0.9282822043785602, 'test_accuracy': 0.9285201272273333, 'test_runtime': 476.6797, 'test_samples_per_second': 122.678, 'test_steps_per_second': 1.917}


## Inference

In [45]:
from transformers import AutoTokenizer, pipeline

# One example situation and two candidate rules of thumb to classify against it.
situation = "Wanting to leave my country"
rot_pos = "It is fine to do what is best for you"
rot_neg = "It is bad to not call your mother"
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Move the fine-tuned model to the CPU and wrap it in a text-classification pipeline.
clf = pipeline(
    "text-classification",
    tokenizer=tokenizer,
    model=finetuned_model.to("cpu"),
)
# The input is the situation and the rule of thumb joined by the tokenizer's separator token.
answer = clf(f"{situation} {tokenizer.sep_token} {rot_pos}")
answer

[{'label': 'APPLIED', 'score': 0.8717881441116333}]

In [46]:
# Classify the same situation against the second rule of thumb (`rot_neg`).
answer = clf(f"{situation} {tokenizer.sep_token} {rot_neg}")
answer

[{'label': 'NOT-APPLIED', 'score': 0.9997960925102234}]