## Dataset

In [1]:
from datasets import load_dataset

dataset = load_dataset("metaeval/social-chemestry-101")
dataset

  from .autonotebook import tqdm as notebook_tqdm


DatasetDict({
    train: Dataset({
        features: ['area', 'm', 'split', 'rot-agree', 'rot-categorization', 'rot-moral-foundations', 'rot-char-targeting', 'rot-bad', 'rot-judgment', 'action', 'action-agency', 'action-moral-judgment', 'action-agree', 'action-legal', 'action-pressure', 'action-char-involved', 'action-hypothetical', 'situation', 'situation-short-id', 'rot', 'rot-id', 'rot-worker-id', 'breakdown-worker-id', 'n-characters', 'characters'],
        num_rows: 355922
    })
})

In [2]:
train_dataset = dataset.filter(lambda example: example['split'] == 'train')
test_dataset = dataset.filter(lambda example: example['split'] == 'test')
val_dataset = dataset.filter(lambda example: example['split'] == 'dev')

In [3]:
train_dataset

DatasetDict({
    train: Dataset({
        features: ['area', 'm', 'split', 'rot-agree', 'rot-categorization', 'rot-moral-foundations', 'rot-char-targeting', 'rot-bad', 'rot-judgment', 'action', 'action-agency', 'action-moral-judgment', 'action-agree', 'action-legal', 'action-pressure', 'action-char-involved', 'action-hypothetical', 'situation', 'situation-short-id', 'rot', 'rot-id', 'rot-worker-id', 'breakdown-worker-id', 'n-characters', 'characters'],
        num_rows: 233501
    })
})

In [4]:
test_dataset

DatasetDict({
    train: Dataset({
        features: ['area', 'm', 'split', 'rot-agree', 'rot-categorization', 'rot-moral-foundations', 'rot-char-targeting', 'rot-bad', 'rot-judgment', 'action', 'action-agency', 'action-moral-judgment', 'action-agree', 'action-legal', 'action-pressure', 'action-char-involved', 'action-hypothetical', 'situation', 'situation-short-id', 'rot', 'rot-id', 'rot-worker-id', 'breakdown-worker-id', 'n-characters', 'characters'],
        num_rows: 29239
    })
})

In [5]:
val_dataset

DatasetDict({
    train: Dataset({
        features: ['area', 'm', 'split', 'rot-agree', 'rot-categorization', 'rot-moral-foundations', 'rot-char-targeting', 'rot-bad', 'rot-judgment', 'action', 'action-agency', 'action-moral-judgment', 'action-agree', 'action-legal', 'action-pressure', 'action-char-involved', 'action-hypothetical', 'situation', 'situation-short-id', 'rot', 'rot-id', 'rot-worker-id', 'breakdown-worker-id', 'n-characters', 'characters'],
        num_rows: 29234
    })
})

In [6]:
train_situations = []
test_situations = []
val_situations = []

for i in range(len(train_dataset['train'])):
    train_situations.append( "[SITUATION] " + train_dataset['train'][i]['situation'] + " [RULE] " + train_dataset['train'][i]['rot'])

for i in range(len(test_dataset['train'])):
    test_situations.append("[SITUATION] " + test_dataset['train'][i]['situation'] + " [RULE] " + test_dataset['train'][i]['rot'])

for i in range(len(val_dataset['train'])):
    val_situations.append("[SITUATION] " + val_dataset['train'][i]['situation'] + " [RULE] " + val_dataset['train'][i]['rot'])

In [7]:
"""Create the text files"""
with open("train_dataset.txt", "w") as file:
    for situation in train_situations:
        file.write(situation + "\n")

with open("test_dataset.txt", "w") as file:
    for situation in test_situations:
        file.write(situation + "\n")

with open("val_dataset.txt", "w") as file:
    for situation in val_situations:
        file.write(situation + "\n")

## GPT training

In [8]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2Config, TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments

model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

In [9]:
train_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path="train_dataset.txt",
    block_size=128  
)

test_dataset = TextDataset(
          tokenizer=tokenizer,
          file_path="test_dataset.txt",
          block_size=128
)

val_dataset = TextDataset(
          tokenizer=tokenizer,
          file_path="val_dataset.txt",
          block_size=128
)



In [10]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False  # MLM set to False for text generation tasks
)

In [11]:
training_args = TrainingArguments(
    output_dir="output_social_norms/gpt_generation",
    overwrite_output_dir=True, #overwrite the content of the output directory
    learning_rate=2e-5,
    #auto_find_batch_size=True,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    evaluation_strategy="steps",
    eval_steps=500,
    save_steps=500,
    save_strategy="steps",
    load_best_model_at_end=True,
    report_to="wandb",
    push_to_hub=False
)

In [12]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)

In [13]:
trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


[34m[1mwandb[0m: Currently logged in as: [33mgjergjplepi12[0m ([33msocial-chem-101-team[0m). Use [1m`wandb login --relogin`[0m to force relogin




Step,Training Loss,Validation Loss
500,2.7704,2.281487
1000,2.6269,2.244105
1500,2.5819,2.227948
2000,2.5543,2.220978


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


TrainOutput(global_step=2238, training_loss=2.624467537805798, metrics={'train_runtime': 2331.084, 'train_samples_per_second': 61.376, 'train_steps_per_second': 0.96, 'total_flos': 9345958723584000.0, 'train_loss': 2.624467537805798, 'epoch': 3.0})

In [16]:
from transformers import pipeline

generator = pipeline('text-generation', model=model.to("cpu"), tokenizer=tokenizer, config={'max_length':20})

result = generator("Telling my husband he should not buy his dream boat.")
result

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': "Telling my husband he should not buy his dream boat. It's rude to ask your family's permission for the wedding of something they asked them for.\nbeing upset at a very strong person who bullied me. It's normal for people in power"}]