In [5]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = "1,2,3"

# Dataset

In [6]:
import pandas as pd

In [7]:
social_comments = pd.read_pickle('/home/IAIS/gplepi/entero/data_social_norms/social_comments_filtered.gzip', compression='gzip')
social_comments

Unnamed: 0,id,permalink,label,body,parent_id,author_fullname,author_name
13,eamexog,/r/AmItheAsshole/comments/a1311q/aita_for_tell...,NTA,Uh absolutely NTA. These are really really hor...,a1311q,t2_2kabg9z7,xormun
14,eameha5,/r/AmItheAsshole/comments/a1311q/aita_for_tell...,NTA,"NTA. Ok sweetie no, hell no this is not your f...",a1311q,t2_1jrodkow,tkPuncake
16,eamjnog,/r/AmItheAsshole/comments/a1311q/aita_for_tell...,NTA,NTA. My girlfriend has hypothyroidism and i kn...,a1311q,t2_14ub01,hawkbearpig
17,ef5kbsb,/r/AmItheAsshole/comments/akkcpn/aita_for_not_...,NTA,You're clearly NTA. Sorry about your homophobi...,akkcpn,t2_61b3s,sadsquash
20,ef5l208,/r/AmItheAsshole/comments/akkcpn/aita_for_not_...,NTA,"NTA. And it will get better, I promise. You'll...",akkcpn,t2_xvrsh,SheketBevakaSTFU
...,...,...,...,...,...,...,...
530631,ei9ofvo,/r/AmItheAsshole/comments/azofrl/aita_for_not_...,YTA,NAH (a bit towards yta) OP. You found the wors...,azofrl,t2_16fctm,xAlois
530636,ei9gkon,/r/AmItheAsshole/comments/azofrl/aita_for_not_...,NTA,"NTA &#x200B; but you handled it really poorly,...",azofrl,t2_dk4gojr,YoungDiscord
530637,ei9gl79,/r/AmItheAsshole/comments/azofrl/aita_for_not_...,NTA,NTA how are you the asshole? For like bigger b...,azofrl,t2_2xfoz1fv,Dark-_-Legacy
530639,ei9gmpk,/r/AmItheAsshole/comments/azofrl/aita_for_not_...,NTA,NTA. Your girlfriend is overreacting. You don'...,azofrl,t2_15bdqt5w,Broken_Angel-


In [8]:
import re
from typing import Any
import string

"""
Text preprocessing: lowercase; remove punctuation; remove NTA_KEYWORDS, YTA_KEYWORDS and 'ampx200b', 'x200b', 'AITA', 'aita'
"""

class KeywordsCleaner:
    def __init__(self) -> None:
        # NTA YTA keywords
        NTA_KEYWORDS = ['nta', 'nah', 'you are not the asshole', 'you\'re not the asshole', 'u are not the asshole', 'u re not the asshole', 
                        'you re not the asshole', 'u\'re not the asshole', 'not the asshole', 'not the ah', 'not asshole', 'not ah']
        YTA_KEYWORDS = ['yta', 'you are the asshole', 'you\'re the asshole', 'u are the asshole', 'u re the asshole', 
                        'you re the asshole', 'u\'re the asshole', 'you the ah', 'you the asshole', 'u the asshole', 'u the ah']

        keywords_rep = {'ampx200b': "", 'x200b': "", 'AITA': "", 'aita': ""}
        
        for key in NTA_KEYWORDS + YTA_KEYWORDS:
            keywords_rep[key] = ""
        keywords_rep = dict(sorted(keywords_rep.items(), key=lambda k: len(k[0]), reverse=True))

        self.rep = dict((re.escape(k), v) for k, v in keywords_rep.items())
        self.pattern = re.compile("|".join(self.rep.keys()))

    def __call__(self, text: str) -> str:
        text = self.pattern.sub(lambda m: self.rep[re.escape(m.group(0))], text.lower())
        return text.translate(str.maketrans('', '', string.punctuation))   


In [9]:
"""Example"""
keywordsCleaner = KeywordsCleaner()

print(social_comments["body"].at[530636])
keywordsCleaner(social_comments["body"].at[530636])

NTA &#x200B; but you handled it really poorly, like that was the absolute worst way in which you could have ever said it &#x200B; why not acknowledge that she's beautiful and sexy in her own way? make a lateral move that you know, wouldn't require you to directly say: I'm not attracted to you physically? because that would just open up a can of worms. &#x200B; well, you messed it up so now you have to fix it. &#x200B; You're not an asshole for having a personal body type preference, everyone has but you are an idiot for handling it the way you did, good luck with that.


'  but you handled it really poorly like that was the absolute worst way in which you could have ever said it  why not acknowledge that shes beautiful and sexy in her own way make a lateral move that you know wouldnt require you to directly say im not attracted to you physically because that would just open up a can of worms  well you messed it up so now you have to fix it  youre not an asshole for having a personal body type preference everyone has but you are an idiot for handling it the way you did good luck with that'

In [10]:
"""Filter social comments"""
keywordsCleaner = KeywordsCleaner()

for i, row in social_comments.iterrows():
    row['body'] = keywordsCleaner(row['body'])

social_comments

Unnamed: 0,id,permalink,label,body,parent_id,author_fullname,author_name
13,eamexog,/r/AmItheAsshole/comments/a1311q/aita_for_tell...,NTA,uh absolutely these are really really horrid ...,a1311q,t2_2kabg9z7,xormun
14,eameha5,/r/AmItheAsshole/comments/a1311q/aita_for_tell...,NTA,ok sweetie no hell no this is not your fault ...,a1311q,t2_1jrodkow,tkPuncake
16,eamjnog,/r/AmItheAsshole/comments/a1311q/aita_for_tell...,NTA,my girlfriend has hypothyroidism and i know t...,a1311q,t2_14ub01,hawkbearpig
17,ef5kbsb,/r/AmItheAsshole/comments/akkcpn/aita_for_not_...,NTA,youre clearly sorry about your homophobic fam...,akkcpn,t2_61b3s,sadsquash
20,ef5l208,/r/AmItheAsshole/comments/akkcpn/aita_for_not_...,NTA,and it will get better i promise youll make i...,akkcpn,t2_xvrsh,SheketBevakaSTFU
...,...,...,...,...,...,...,...
530631,ei9ofvo,/r/AmItheAsshole/comments/azofrl/aita_for_not_...,YTA,a bit towards op you found the worst way to ...,azofrl,t2_16fctm,xAlois
530636,ei9gkon,/r/AmItheAsshole/comments/azofrl/aita_for_not_...,NTA,but you handled it really poorly like that w...,azofrl,t2_dk4gojr,YoungDiscord
530637,ei9gl79,/r/AmItheAsshole/comments/azofrl/aita_for_not_...,NTA,how are for like bigger boobs,azofrl,t2_2xfoz1fv,Dark-_-Legacy
530639,ei9gmpk,/r/AmItheAsshole/comments/azofrl/aita_for_not_...,NTA,your girlfriend is overreacting you dont have...,azofrl,t2_15bdqt5w,Broken_Angel-


In [11]:
from datasets import Dataset

dataset = Dataset.from_pandas(social_comments)
dataset

Dataset({
    features: ['id', 'permalink', 'label', 'body', 'parent_id', 'author_fullname', 'author_name', '__index_level_0__'],
    num_rows: 212687
})

In [12]:
from datasets import DatasetDict

"""80-10-10 split"""
# 80% train, 20% test + validation
train_testvalid = dataset.train_test_split(test_size=0.2)
# Split the 20% test + valid in half test, half valid
test_valid = train_testvalid['test'].train_test_split(test_size=0.5)

train_test_valid_dataset = DatasetDict({
    'train': train_testvalid['train'],
    'test': test_valid['test'],
    'val': test_valid['train']})

train_test_valid_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'permalink', 'label', 'body', 'parent_id', 'author_fullname', 'author_name', '__index_level_0__'],
        num_rows: 170149
    })
    test: Dataset({
        features: ['id', 'permalink', 'label', 'body', 'parent_id', 'author_fullname', 'author_name', '__index_level_0__'],
        num_rows: 21269
    })
    val: Dataset({
        features: ['id', 'permalink', 'label', 'body', 'parent_id', 'author_fullname', 'author_name', '__index_level_0__'],
        num_rows: 21269
    })
})

In [13]:
id2label = {0: "NTA", 1: "YTA"}

label2id = {"NTA": 0, "YTA": 1}

# Dataset for modeling p(y|c) 

In [14]:
from transformers import AutoTokenizer

"""Tokenize the data"""
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def tokenize_data(example):
    encoding = tokenizer(example['body'], padding="max_length", truncation=True)
    encoding['labels'] = label2id[ example['label'] ]

    return encoding

tokenized_dataset = train_test_valid_dataset.map(tokenize_data)
tokenized_dataset

Map: 100%|██████████| 170149/170149 [02:17<00:00, 1239.34 examples/s]
Map: 100%|██████████| 21269/21269 [00:14<00:00, 1455.35 examples/s]
Map: 100%|██████████| 21269/21269 [00:14<00:00, 1436.40 examples/s]


DatasetDict({
    train: Dataset({
        features: ['id', 'permalink', 'label', 'body', 'parent_id', 'author_fullname', 'author_name', '__index_level_0__', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 170149
    })
    test: Dataset({
        features: ['id', 'permalink', 'label', 'body', 'parent_id', 'author_fullname', 'author_name', '__index_level_0__', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 21269
    })
    val: Dataset({
        features: ['id', 'permalink', 'label', 'body', 'parent_id', 'author_fullname', 'author_name', '__index_level_0__', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 21269
    })
})

In [15]:
tokenized_dataset = tokenized_dataset.select_columns(['input_ids', 'attention_mask', 'labels'])
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 170149
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 21269
    })
    val: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 21269
    })
})

In [16]:
tokenized_dataset['train'][:10]['labels']

[1, 1, 0, 0, 0, 0, 0, 1, 1, 1]

# BERT model for p(y|c)

In [21]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'classifier.weight', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="output_social_norms/bert_comments_classification_5_epochs",
    learning_rate=2e-5,
    #auto_find_batch_size=True,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=5,
    evaluation_strategy="steps",
    eval_steps=1000,
    save_steps=1000,
    save_strategy="steps",
    metric_for_best_model='f1',
    greater_is_better=True,
    load_best_model_at_end=True,
    report_to="wandb",
    push_to_hub=False
)

In [23]:
from transformers import DataCollatorWithPadding
import evaluate
import numpy as np

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

accuracy = evaluate.load("accuracy")
precision = evaluate.load("precision")
recall = evaluate.load("recall")
f1 = evaluate.load("f1")

def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    accuracy_score = accuracy.compute(predictions=predictions, references=labels)
    precision_score = precision.compute(predictions=predictions, references=labels)
    recall_score = recall.compute(predictions=predictions, references=labels)
    f1_score = f1.compute(predictions=predictions, references=labels)
    
    return {
        "precision": precision_score['precision'],
        "recall": recall_score['recall'],
        "f1": f1_score['f1'],
        "accuracy": accuracy_score['accuracy'],
    }

In [24]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['val'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

In [25]:
trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mgjergjplepi12[0m ([33msocial-chem-101-team[0m). Use [1m`wandb login --relogin`[0m to force relogin


You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1000,0.4196,0.394401,0.710412,0.671732,0.69053,0.821148
2000,0.3463,0.363715,0.827448,0.619342,0.708428,0.848559
3000,0.3115,0.337919,0.791887,0.685977,0.735137,0.853167
4000,0.2582,0.341485,0.762723,0.744856,0.753684,0.855376
5000,0.2498,0.333438,0.797051,0.718582,0.755785,0.862053
6000,0.21,0.346393,0.788708,0.729661,0.758037,0.86163
7000,0.2047,0.346428,0.8018,0.719057,0.758178,0.863745
8000,0.1793,0.376757,0.784205,0.740266,0.761602,0.862335




TrainOutput(global_step=8865, training_loss=0.27517285634754607, metrics={'train_runtime': 14911.3986, 'train_samples_per_second': 57.053, 'train_steps_per_second': 0.595, 'total_flos': 1.1269597706959872e+17, 'train_loss': 0.27517285634754607, 'epoch': 5.0})

In [26]:
trainer.save_model()

## Inference

In [27]:
from transformers import AutoTokenizer, pipeline

nta_comment = "If it's for your best, than do not worry about it."
yta_comment = "You shouldn't have done that, it's not allowed."
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

clf = pipeline("text-classification", model='/home/IAIS/gplepi/entero/output_social_norms/bert_comments_classification_5_epochs', tokenizer=tokenizer)

answer = clf(nta_comment)
print(f"{nta_comment} -> {answer}")

answer = clf(yta_comment)
print(f"{yta_comment} -> {answer}")

If it's for your best, than do not worry about it. -> [{'label': 'NTA', 'score': 0.9986791014671326}]
You shouldn't have done that, it's not allowed. -> [{'label': 'YTA', 'score': 0.5555195212364197}]


# Dataset for modeling p(y|c,s) 

In [1]:
"""Load the dataset of situations"""
from datasets import load_dataset

situations_dataset = load_dataset("metaeval/social-chemestry-101")
situations_dataset

  from .autonotebook import tqdm as notebook_tqdm


DatasetDict({
    train: Dataset({
        features: ['area', 'm', 'split', 'rot-agree', 'rot-categorization', 'rot-moral-foundations', 'rot-char-targeting', 'rot-bad', 'rot-judgment', 'action', 'action-agency', 'action-moral-judgment', 'action-agree', 'action-legal', 'action-pressure', 'action-char-involved', 'action-hypothetical', 'situation', 'situation-short-id', 'rot', 'rot-id', 'rot-worker-id', 'breakdown-worker-id', 'n-characters', 'characters'],
        num_rows: 355922
    })
})

In [4]:
situations_dataset['train'].add_faiss_index(column='situation-short-id')

ImportError: libmkl_intel_lp64.so.1: cannot open shared object file: No such file or directory

In [38]:
import numpy as np

situation_id = "reddit/amitheasshole/" + train_test_valid_dataset['train'][0]['parent_id']
print(situation_id)

found_situatiuon = situations_dataset.filter(lambda example: example['situation-short-id'] == situation_id)
print(found_situatiuon['train'][:]['situation-short-id'])
found_situatiuon['train'][:]['situation']

reddit/amitheasshole/ar4oib
['reddit/amitheasshole/ar4oib', 'reddit/amitheasshole/ar4oib', 'reddit/amitheasshole/ar4oib']


['asking a coworker out every few months to see if shes reconsidered',
 'asking a coworker out every few months to see if shes reconsidered',
 'asking a coworker out every few months to see if shes reconsidered']

In [41]:
situation_id = "reddit/amitheasshole/" + train_test_valid_dataset['train'][0]['parent_id']

situations_dataset['train'][:]['situation-short-id'].index(situation_id)

295380

In [44]:
"""Method 1 -> takes too long"""
parent_id_to_situation = {}

def situation_mapping(example):
    situation_id = "reddit/amitheasshole/" + example['parent_id']
    index = situations_dataset['train'][:]['situation-short-id'].index(situation_id)

    parent_id_to_situation[situation_id] = index


train_test_valid_dataset.map(situation_mapping)

Map:   0%|          | 0/170149 [00:00<?, ? examples/s]


TypeError: can only concatenate str (not "list") to str

In [39]:
"""Method 2 -> takes too long"""

from transformers import AutoTokenizer
import numpy as np


"""Tokenize the data"""
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def tokenize_data(example):

    situation_id = "reddit/amitheasshole/" + example['parent_id']

    found_situation = situations_dataset.filter(lambda e: e['situation-short-id'] == situation_id)
    situation = found_situation['train'][0]['situation']

    encoding = tokenizer(example['body'], situation, padding="max_length", truncation=True)
    encoding['labels'] = label2id[ example['label'] ]

    return encoding

tokenized_dataset = train_test_valid_dataset.map(tokenize_data)
tokenized_dataset

Filter: 100%|██████████| 355922/355922 [00:08<00:00, 43235.28 examples/s]
Filter: 100%|██████████| 355922/355922 [00:08<00:00, 42470.81 examples/s]
Filter:  91%|█████████ | 324000/355922 [00:07<00:00, 42872.43 examples/s]
Map:   0%|          | 2/170149 [00:24<571:57:21, 12.10s/ examples]


KeyboardInterrupt: 

# BERT model for p(y|c,s)