# Dataset

In [1]:
from datasets import load_dataset

dataset = load_dataset("metaeval/social-chemestry-101")
dataset

  from .autonotebook import tqdm as notebook_tqdm


DatasetDict({
    train: Dataset({
        features: ['area', 'm', 'split', 'rot-agree', 'rot-categorization', 'rot-moral-foundations', 'rot-char-targeting', 'rot-bad', 'rot-judgment', 'action', 'action-agency', 'action-moral-judgment', 'action-agree', 'action-legal', 'action-pressure', 'action-char-involved', 'action-hypothetical', 'situation', 'situation-short-id', 'rot', 'rot-id', 'rot-worker-id', 'breakdown-worker-id', 'n-characters', 'characters'],
        num_rows: 355922
    })
})

In [2]:
"""Distinct RoTs"""
distinct_rots = set()

def find_distinct_rots(example):
    distinct_rots.add(example['rot'])

dataset.map(find_distinct_rots)

Map: 100%|██████████| 355922/355922 [00:28<00:00, 12434.56 examples/s]


DatasetDict({
    train: Dataset({
        features: ['area', 'm', 'split', 'rot-agree', 'rot-categorization', 'rot-moral-foundations', 'rot-char-targeting', 'rot-bad', 'rot-judgment', 'action', 'action-agency', 'action-moral-judgment', 'action-agree', 'action-legal', 'action-pressure', 'action-char-involved', 'action-hypothetical', 'situation', 'situation-short-id', 'rot', 'rot-id', 'rot-worker-id', 'breakdown-worker-id', 'n-characters', 'characters'],
        num_rows: 355922
    })
})

In [3]:
distinct_rots = list(distinct_rots)
len(distinct_rots)

259614

In [4]:
"""Distinct situations"""
distinct_situations = set()

def find_distinct_situations(example):
    distinct_situations.add(example['situation'])

dataset.map(find_distinct_situations)

Map: 100%|██████████| 355922/355922 [00:27<00:00, 12716.64 examples/s]


DatasetDict({
    train: Dataset({
        features: ['area', 'm', 'split', 'rot-agree', 'rot-categorization', 'rot-moral-foundations', 'rot-char-targeting', 'rot-bad', 'rot-judgment', 'action', 'action-agency', 'action-moral-judgment', 'action-agree', 'action-legal', 'action-pressure', 'action-char-involved', 'action-hypothetical', 'situation', 'situation-short-id', 'rot', 'rot-id', 'rot-worker-id', 'breakdown-worker-id', 'n-characters', 'characters'],
        num_rows: 355922
    })
})

In [5]:
distinct_situations = list(distinct_situations)
len(distinct_situations)

103296

In [6]:
"""RoTs for each situation"""
rots_per_situation = {key:[] for key in distinct_situations}

def rots_for_each_situation(example):
    rots_per_situation[example['situation']].append(example['rot'])

dataset.map(rots_for_each_situation)

Map: 100%|██████████| 355922/355922 [00:32<00:00, 10900.68 examples/s]


DatasetDict({
    train: Dataset({
        features: ['area', 'm', 'split', 'rot-agree', 'rot-categorization', 'rot-moral-foundations', 'rot-char-targeting', 'rot-bad', 'rot-judgment', 'action', 'action-agency', 'action-moral-judgment', 'action-agree', 'action-legal', 'action-pressure', 'action-char-involved', 'action-hypothetical', 'situation', 'situation-short-id', 'rot', 'rot-id', 'rot-worker-id', 'breakdown-worker-id', 'n-characters', 'characters'],
        num_rows: 355922
    })
})

In [7]:
dataset = dataset.select_columns(['split', 'situation', 'rot'])

In [8]:
import random

def negative_examples(example):

    encoding = example

    negative_rot = None
    #sample a negative RoT for the current situation
    while negative_rot is None:
        candidate_rot = random.choice(distinct_rots)
        if candidate_rot not in rots_per_situation[example['situation']]:
            negative_rot = candidate_rot

    encoding['rot'] = negative_rot
    encoding['labels'] = 0

    return encoding

negative_dataset = dataset.map(negative_examples)

Map: 100%|██████████| 355922/355922 [00:19<00:00, 18316.38 examples/s]


In [9]:
def positive_examples(example):
    encoding = example
    encoding['labels'] = 1

    return encoding

positive_dataset = dataset.map(positive_examples)

In [10]:
from datasets import concatenate_datasets

pos_neg_dataset = concatenate_datasets([negative_dataset['train'], positive_dataset['train']])
pos_neg_dataset = pos_neg_dataset.shuffle(seed=42)
pos_neg_dataset

  table = cls._concat_blocks(blocks, axis=0)


Dataset({
    features: ['split', 'situation', 'rot', 'labels'],
    num_rows: 711844
})

In [11]:
from datasets import DatasetDict

pos_neg_dataset = DatasetDict({
                    "train": pos_neg_dataset.filter(lambda example: example['split'] == 'train'), 
                     "val": pos_neg_dataset.filter(lambda example: example['split'] == 'dev'), 
                     "test": pos_neg_dataset.filter(lambda example: example['split'] == 'test')
                     })
pos_neg_dataset

Filter: 100%|██████████| 711844/711844 [00:11<00:00, 63146.38 examples/s]
Filter: 100%|██████████| 711844/711844 [00:10<00:00, 65314.23 examples/s]
Filter: 100%|██████████| 711844/711844 [00:13<00:00, 53452.30 examples/s]


DatasetDict({
    train: Dataset({
        features: ['split', 'situation', 'rot', 'labels'],
        num_rows: 467002
    })
    val: Dataset({
        features: ['split', 'situation', 'rot', 'labels'],
        num_rows: 58468
    })
    test: Dataset({
        features: ['split', 'situation', 'rot', 'labels'],
        num_rows: 58478
    })
})

In [12]:
pos_neg_dataset['train'] = pos_neg_dataset['train'].remove_columns(['split'])
pos_neg_dataset['val'] = pos_neg_dataset['val'].remove_columns(['split'])
pos_neg_dataset['test'] = pos_neg_dataset['test'].remove_columns(['split'])
pos_neg_dataset

DatasetDict({
    train: Dataset({
        features: ['situation', 'rot', 'labels'],
        num_rows: 467002
    })
    val: Dataset({
        features: ['situation', 'rot', 'labels'],
        num_rows: 58468
    })
    test: Dataset({
        features: ['situation', 'rot', 'labels'],
        num_rows: 58478
    })
})

# TF_IDF tokenization

In [13]:
"""TF-IDF tokenization"""
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

In [14]:
label_train = pos_neg_dataset['train'][:]['labels']
label_test = pos_neg_dataset['test'][:]['labels']
label_val = pos_neg_dataset['val'][:]['labels']

In [15]:
text_train = []
text_test = []
text_val = []

#TODO: Is this the right way to concatenate situations and RoTs?

for i in range(len(pos_neg_dataset['train'])):
    text_train.append(pos_neg_dataset['train'][i]['situation'] + ". "  + pos_neg_dataset['train'][i]['rot'])

for i in range(len(pos_neg_dataset['test'])):
    text_test.append(pos_neg_dataset['test'][i]['situation'] + ". "  + pos_neg_dataset['test'][i]['rot'])

for i in range(len(pos_neg_dataset['val'])):
    text_val.append(pos_neg_dataset['val'][i]['situation'] + ". "  + pos_neg_dataset['val'][i]['rot'])

# SVM classifier - (1-grams only)

In [28]:
"""Encode the text"""
tf_idf_vectorizer = TfidfVectorizer(ngram_range=(1, 1),) # 1-gram with words (only unigrams).
encoded_input_matrix = tf_idf_vectorizer.fit_transform(text_train) 
encoded_test_matrix = tf_idf_vectorizer.transform(text_test)
encoded_val_matrix = tf_idf_vectorizer.transform(text_val)

In [29]:
encoded_input_matrix.shape

(467002, 28710)

In [32]:
"""Initialize the classifier"""
svm_classifier = LinearSVC()

In [33]:
"""Train the classifier"""
svm_classifier.fit(encoded_input_matrix.toarray(), label_train)



In [34]:
"""Test the classifier"""
predictions = svm_classifier.predict(encoded_test_matrix.toarray())

In [39]:
"""Evaluate the model"""
import numpy as np
#accuracy
correct_answers = np.sum(np.equal(predictions, label_test))
accuracy = correct_answers / (len(predictions)) * 100

tp = 0 # true positives: predicted as positives (1) and the true label is also positive (1)
fn = 0 # false negatives: predicted as negatives (0) but the true label is positive (1)
fp = 0 # false positive: predicted as positives (1) but the true label is negative (0)

for i in range(len(predictions)):
    if predictions[i] == 1 and label_test[i] == 1:
        tp += 1
    elif predictions[i] == 0 and label_test[i] == 1:
        fn += 1
    elif predictions[i] == 1 and label_test[i] == 0:
        fp += 1

#recall
recall = tp / (tp + fn)

#precision
precision = tp / (tp + fp)

#f1-score
f1_score = 2 * precision * recall / (precision + recall)

print("Accuracy: ", accuracy)
print("Precision: ", precision)
print("Recall: ", recall)
print("F1-score: ", f1_score)

Accuracy:  58.84435172201512
Precision:  0.5897044540030526
Recall:  0.5814152330791067
F1-score:  0.5855305078616082


In [43]:
"""Evaluate the model"""
from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score

accuracy = accuracy_score(y_true=label_test, y_pred=predictions)
precision = precision_score(y_true=label_test, y_pred=predictions)
recall = recall_score(y_true=label_test, y_pred=predictions)
f1 = f1_score(y_true=label_test, y_pred=predictions)

print("Accuracy: ", accuracy)
print("Precision: ", precision)
print("Recall: ", recall)
print("F1 score", f1)

Accuracy:  0.5884435172201512
Precision:  0.5897044540030526
Recall:  0.5814152330791067
F1 score 0.5855305078616082


The reason of this poor result is that the tokenizer inteprets situations and RoTs as a single sentence, and does understand any correlation between them. 

# SVM classifier - (1-grams and 2-grams) -> The encoded vectors are too large!

In [23]:
"""Encode the text"""
tf_idf_vectorizer = TfidfVectorizer(ngram_range=(1, 2),) # unigrams and digrams
encoded_input_matrix = tf_idf_vectorizer.fit_transform(text_train) 
encoded_test_matrix = tf_idf_vectorizer.transform(text_test)
encoded_val_matrix = tf_idf_vectorizer.transform(text_val)

In [25]:
encoded_input_matrix.shape

(467002, 482460)

In [26]:
"""Initialize the classifier"""
svm_classifier = LinearSVC()

In [27]:
"""Train the classifier"""
svm_classifier.fit(encoded_input_matrix.toarray(), label_train)

MemoryError: Unable to allocate 1.64 TiB for an array with shape (467002, 482460) and data type float64

In [None]:
"""Test the classifier"""
predictions = svm_classifier.predict(encoded_test_matrix.toarray())

In [None]:
"""Evaluate the model"""
from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score

accuracy = accuracy_score(y_true=label_test, y_pred=predictions)
precision = precision_score(y_true=label_test, y_pred=predictions)
recall = recall_score(y_true=label_test, y_pred=predictions)
f1 = f1_score(y_true=label_test, y_pred=predictions)

print("Accuracy: ", accuracy)
print("Precision: ", precision)
print("Recall: ", recall)
print("F1 score", f1)