In [1]:
from datasets import load_dataset

dataset = load_dataset("metaeval/social-chemestry-101")
dataset

  from .autonotebook import tqdm as notebook_tqdm


DatasetDict({
    train: Dataset({
        features: ['area', 'm', 'split', 'rot-agree', 'rot-categorization', 'rot-moral-foundations', 'rot-char-targeting', 'rot-bad', 'rot-judgment', 'action', 'action-agency', 'action-moral-judgment', 'action-agree', 'action-legal', 'action-pressure', 'action-char-involved', 'action-hypothetical', 'situation', 'situation-short-id', 'rot', 'rot-id', 'rot-worker-id', 'breakdown-worker-id', 'n-characters', 'characters'],
        num_rows: 355922
    })
})

In [2]:
"""Distinct situations"""
distinct_situations = set()

def find_distinct_situations(example):
    distinct_situations.add(example['situation'])

dataset.map(find_distinct_situations)

Map: 100%|██████████| 355922/355922 [00:30<00:00, 11667.28 examples/s]


DatasetDict({
    train: Dataset({
        features: ['area', 'm', 'split', 'rot-agree', 'rot-categorization', 'rot-moral-foundations', 'rot-char-targeting', 'rot-bad', 'rot-judgment', 'action', 'action-agency', 'action-moral-judgment', 'action-agree', 'action-legal', 'action-pressure', 'action-char-involved', 'action-hypothetical', 'situation', 'situation-short-id', 'rot', 'rot-id', 'rot-worker-id', 'breakdown-worker-id', 'n-characters', 'characters'],
        num_rows: 355922
    })
})

In [3]:
distinct_situations = list(distinct_situations)
len(distinct_situations)

103296

In [4]:
"""Distinct RoTs"""
distinct_rots = set()

def find_distinct_rots(example):
    distinct_rots.add(example['rot'])

dataset.map(find_distinct_rots)

Map: 100%|██████████| 355922/355922 [00:30<00:00, 11596.50 examples/s]


DatasetDict({
    train: Dataset({
        features: ['area', 'm', 'split', 'rot-agree', 'rot-categorization', 'rot-moral-foundations', 'rot-char-targeting', 'rot-bad', 'rot-judgment', 'action', 'action-agency', 'action-moral-judgment', 'action-agree', 'action-legal', 'action-pressure', 'action-char-involved', 'action-hypothetical', 'situation', 'situation-short-id', 'rot', 'rot-id', 'rot-worker-id', 'breakdown-worker-id', 'n-characters', 'characters'],
        num_rows: 355922
    })
})

In [5]:
distinct_rots = list(distinct_rots)
len(distinct_rots)

259614

In [6]:
"""RoTs for each situation"""
rots_per_situation = {key:[] for key in distinct_situations}

def rots_for_each_situation(example):
    rots_per_situation[example['situation']].append(example['rot'])

dataset.map(rots_for_each_situation)

Map: 100%|██████████| 355922/355922 [00:35<00:00, 9929.54 examples/s] 


DatasetDict({
    train: Dataset({
        features: ['area', 'm', 'split', 'rot-agree', 'rot-categorization', 'rot-moral-foundations', 'rot-char-targeting', 'rot-bad', 'rot-judgment', 'action', 'action-agency', 'action-moral-judgment', 'action-agree', 'action-legal', 'action-pressure', 'action-char-involved', 'action-hypothetical', 'situation', 'situation-short-id', 'rot', 'rot-id', 'rot-worker-id', 'breakdown-worker-id', 'n-characters', 'characters'],
        num_rows: 355922
    })
})

In [7]:
dataset = dataset.select_columns(['split', 'situation', 'rot'])

In [9]:
import random

def negative_examples(example):

    encoding = example

    negative_rot = None
    #sample a negative RoT for the current situation
    while negative_rot is None:
        candidate_rot = random.choice(distinct_rots)
        if candidate_rot not in rots_per_situation[example['situation']]:
            negative_rot = candidate_rot

    encoding['rot'] = negative_rot
    encoding['labels'] = 0

    return encoding

negative_dataset = dataset.map(negative_examples)

Map: 100%|██████████| 355922/355922 [00:22<00:00, 16078.34 examples/s]


In [18]:
print(negative_dataset['train'][4]['situation'])
print(f"True RoTs: {rots_per_situation[negative_dataset['train'][4]['situation']]}")
print(f"Unrelated RoT: {negative_dataset['train'][4]['rot']}") #not related to the situation

washing my cat's bowls in the kitchen sink
True RoTs: ["It's good to keep things clean.", "It's wrong to wash a pet's bowl in the sink with dishes people use.", "It's good to keep your pet's things clean."]
Unrelated RoT: It is okay to feel betrayed if someone has lied to you about who they really are.


In [19]:
def positive_examples(example):
    encoding = example
    encoding['labels'] = 1

    return encoding

positive_dataset = dataset.map(positive_examples)

Map: 100%|██████████| 355922/355922 [00:16<00:00, 20940.53 examples/s]


In [21]:
from datasets import concatenate_datasets

pos_neg_dataset = concatenate_datasets([negative_dataset['train'], positive_dataset['train']])
pos_neg_dataset = pos_neg_dataset.shuffle(seed=42)
pos_neg_dataset

  table = cls._concat_blocks(blocks, axis=0)


Dataset({
    features: ['split', 'situation', 'rot', 'labels'],
    num_rows: 711844
})

In [26]:
"""Tokenize the dataset"""
from transformers import AutoTokenizer
import numpy as np

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def tokenize_dataset(example):
    encoding = tokenizer(example['situation'], example['rot'], padding="max_length", truncation=True)
    return encoding

tokenized_dataset = pos_neg_dataset.map(tokenize_dataset, batched=True)
tokenized_dataset = tokenized_dataset.select_columns(['split', 'input_ids', 'attention_mask', 'labels'])
tokenized_dataset

Dataset({
    features: ['split', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 711844
})

In [29]:
from datasets import DatasetDict

tokenized_splited_dataset = DatasetDict({
                    "train": tokenized_dataset.filter(lambda example: example['split'] == 'train'), 
                     "val": tokenized_dataset.filter(lambda example: example['split'] == 'dev'), 
                     "test": tokenized_dataset.filter(lambda example: example['split'] == 'test')
                     })
tokenized_splited_dataset

Filter: 100%|██████████| 711844/711844 [05:01<00:00, 2362.94 examples/s]
Filter: 100%|██████████| 711844/711844 [04:58<00:00, 2387.73 examples/s]
Filter: 100%|██████████| 711844/711844 [04:57<00:00, 2390.54 examples/s]


DatasetDict({
    train: Dataset({
        features: ['split', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 467002
    })
    val: Dataset({
        features: ['split', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 58468
    })
    test: Dataset({
        features: ['split', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 58478
    })
})

In [32]:
tokenized_splited_dataset['train'] = tokenized_splited_dataset['train'].remove_columns(['split'])
tokenized_splited_dataset['val'] = tokenized_splited_dataset['val'].remove_columns(['split'])
tokenized_splited_dataset['test'] = tokenized_splited_dataset['test'].remove_columns(['split'])
tokenized_splited_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 467002
    })
    val: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 58468
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 58478
    })
})

In [33]:
path = "/home/IAIS/gplepi/entero/data_social_norms/binary_classification_bert"
tokenized_splited_dataset.save_to_disk(path)

Saving the dataset (3/3 shards): 100%|██████████| 467002/467002 [00:11<00:00, 39222.45 examples/s] 
Saving the dataset (1/1 shards): 100%|██████████| 58468/58468 [00:01<00:00, 37238.08 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 58478/58478 [00:01<00:00, 36980.91 examples/s]
