## Dataset

In [1]:
from datasets import load_dataset

original_dataset = load_dataset("metaeval/social-chemestry-101")
original_dataset

  from .autonotebook import tqdm as notebook_tqdm


DatasetDict({
    train: Dataset({
        features: ['area', 'm', 'split', 'rot-agree', 'rot-categorization', 'rot-moral-foundations', 'rot-char-targeting', 'rot-bad', 'rot-judgment', 'action', 'action-agency', 'action-moral-judgment', 'action-agree', 'action-legal', 'action-pressure', 'action-char-involved', 'action-hypothetical', 'situation', 'situation-short-id', 'rot', 'rot-id', 'rot-worker-id', 'breakdown-worker-id', 'n-characters', 'characters'],
        num_rows: 355922
    })
})

In [2]:
dataset = original_dataset.select_columns(['split', 'situation-short-id', 'situation', 'rot-id', 'rot'])
dataset

DatasetDict({
    train: Dataset({
        features: ['split', 'situation-short-id', 'situation', 'rot-id', 'rot'],
        num_rows: 355922
    })
})

In [3]:
from datasets import DatasetDict

dataset = DatasetDict({
                    "train": dataset['train'].filter(lambda example: example['split'] == 'train'), 
                     "val": dataset['train'].filter(lambda example: example['split'] == 'dev'), 
                     "test": dataset['train'].filter(lambda example: example['split'] == 'test')
                }) 

dataset['train'] = dataset['train'].remove_columns(['split'])
dataset['val'] = dataset['val'].remove_columns(['split'])
dataset['test'] = dataset['test'].remove_columns(['split'])

dataset

DatasetDict({
    train: Dataset({
        features: ['situation-short-id', 'situation', 'rot-id', 'rot'],
        num_rows: 233501
    })
    val: Dataset({
        features: ['situation-short-id', 'situation', 'rot-id', 'rot'],
        num_rows: 29234
    })
    test: Dataset({
        features: ['situation-short-id', 'situation', 'rot-id', 'rot'],
        num_rows: 29239
    })
})

## GPT-2 training

In [4]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2Config, TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments

model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

In [5]:
import torch

print(torch.cuda.is_available())
device = torch.device('cuda:0')
model = model.to(device) 

True


In [6]:
tokenizer.add_special_tokens({'additional_special_tokens': ['<|SITUATION|>', '<|RULE|>']})
tokenizer.pad_token = tokenizer.eos_token

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False  # MLM set to False for text generation tasks
)

### Tokenizing the data

In [7]:
"""Tokenize the data"""
def tokenizing_data(example):
    encoding = tokenizer('<|SITUATION|> ' + example['situation'] + ' <|RULE|> ' + example['rot'], padding="max_length", truncation=True)
    return encoding

tokenized_dataset = dataset.map(tokenizing_data)
tokenized_dataset = tokenized_dataset.select_columns(['situation-short-id', 'rot-id', 'input_ids', 'attention_mask'])
tokenized_dataset

Map: 100%|██████████| 29239/29239 [00:23<00:00, 1230.97 examples/s]


DatasetDict({
    train: Dataset({
        features: ['situation-short-id', 'rot-id', 'input_ids', 'attention_mask'],
        num_rows: 233501
    })
    val: Dataset({
        features: ['situation-short-id', 'rot-id', 'input_ids', 'attention_mask'],
        num_rows: 29234
    })
    test: Dataset({
        features: ['situation-short-id', 'rot-id', 'input_ids', 'attention_mask'],
        num_rows: 29239
    })
})

### Training 

In [8]:
model.resize_token_embeddings(len(tokenizer))

Embedding(50259, 768)

In [9]:
training_args = TrainingArguments(
    output_dir="output_social_norms/gpt_generation",
    overwrite_output_dir=True, #overwrite the content of the output directory
    learning_rate=2e-5,
    auto_find_batch_size=True,
    # per_device_train_batch_size=8,
    # per_device_eval_batch_size=8,
    num_train_epochs=3,
    evaluation_strategy="steps",
    eval_steps=10000,
    save_steps=10000,
    save_strategy="steps",
    load_best_model_at_end=True,
    report_to="wandb",
    push_to_hub=False
)

In [10]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['val'],
    tokenizer=tokenizer,
    data_collator=data_collator
)

In [11]:
trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mgjergjplepi12[0m ([33msocial-chem-101-team[0m). Use [1m`wandb login --relogin`[0m to force relogin




Step,Training Loss,Validation Loss


Step,Training Loss,Validation Loss


Step,Training Loss,Validation Loss


RuntimeError: No executable batch size found, reached zero.

In [16]:
from transformers import pipeline

generator = pipeline('text-generation', model=model.to("cpu"), tokenizer=tokenizer, config={'max_length':20})

result = generator("Telling my husband he should not buy his dream boat.")
result

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': "Telling my husband he should not buy his dream boat. It's rude to ask your family's permission for the wedding of something they asked them for.\nbeing upset at a very strong person who bullied me. It's normal for people in power"}]