In [1]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = "0,1,2,3"

In [2]:
SEED = 1234

# Social Comments Dataset

In [3]:
import pandas as pd

In [4]:
social_comments = pd.read_pickle('/home/IAIS/gplepi/entero/data_social_norms/social_comments_filtered.gzip', compression='gzip')
social_comments

Unnamed: 0,id,permalink,label,body,parent_id,author_fullname,author_name
13,eamexog,/r/AmItheAsshole/comments/a1311q/aita_for_tell...,NTA,Uh absolutely NTA. These are really really hor...,a1311q,t2_2kabg9z7,xormun
14,eameha5,/r/AmItheAsshole/comments/a1311q/aita_for_tell...,NTA,"NTA. Ok sweetie no, hell no this is not your f...",a1311q,t2_1jrodkow,tkPuncake
16,eamjnog,/r/AmItheAsshole/comments/a1311q/aita_for_tell...,NTA,NTA. My girlfriend has hypothyroidism and i kn...,a1311q,t2_14ub01,hawkbearpig
17,ef5kbsb,/r/AmItheAsshole/comments/akkcpn/aita_for_not_...,NTA,You're clearly NTA. Sorry about your homophobi...,akkcpn,t2_61b3s,sadsquash
20,ef5l208,/r/AmItheAsshole/comments/akkcpn/aita_for_not_...,NTA,"NTA. And it will get better, I promise. You'll...",akkcpn,t2_xvrsh,SheketBevakaSTFU
...,...,...,...,...,...,...,...
530631,ei9ofvo,/r/AmItheAsshole/comments/azofrl/aita_for_not_...,YTA,NAH (a bit towards yta) OP. You found the wors...,azofrl,t2_16fctm,xAlois
530636,ei9gkon,/r/AmItheAsshole/comments/azofrl/aita_for_not_...,NTA,"NTA &#x200B; but you handled it really poorly,...",azofrl,t2_dk4gojr,YoungDiscord
530637,ei9gl79,/r/AmItheAsshole/comments/azofrl/aita_for_not_...,NTA,NTA how are you the asshole? For like bigger b...,azofrl,t2_2xfoz1fv,Dark-_-Legacy
530639,ei9gmpk,/r/AmItheAsshole/comments/azofrl/aita_for_not_...,NTA,NTA. Your girlfriend is overreacting. You don'...,azofrl,t2_15bdqt5w,Broken_Angel-


In [5]:
import re
from typing import Any
import string

"""
Text preprocessing: lowercase; remove punctuation; remove NTA_KEYWORDS, YTA_KEYWORDS and 'ampx200b', 'x200b', 'AITA', 'aita'
"""

class KeywordsCleaner:
    def __init__(self) -> None:
        # NTA YTA keywords
        NTA_KEYWORDS = ['nta', 'nah', 'you are not the asshole', 'you\'re not the asshole', 'u are not the asshole', 'u re not the asshole', 
                        'you re not the asshole', 'u\'re not the asshole', 'not the asshole', 'not the ah', 'not asshole', 'not ah']
        YTA_KEYWORDS = ['yta', 'you are the asshole', 'you\'re the asshole', 'u are the asshole', 'u re the asshole', 
                        'you re the asshole', 'u\'re the asshole', 'you the ah', 'you the asshole', 'u the asshole', 'u the ah']

        keywords_rep = {'ampx200b': "", 'x200b': "", 'AITA': "", 'aita': ""}
        
        for key in NTA_KEYWORDS + YTA_KEYWORDS:
            keywords_rep[key] = ""
        keywords_rep = dict(sorted(keywords_rep.items(), key=lambda k: len(k[0]), reverse=True))

        self.rep = dict((re.escape(k), v) for k, v in keywords_rep.items())
        self.pattern = re.compile("|".join(self.rep.keys()))

    def __call__(self, text: str) -> str:
        text = self.pattern.sub(lambda m: self.rep[re.escape(m.group(0))], text.lower())
        return text.translate(str.maketrans('', '', string.punctuation))   


In [6]:
"""Example"""
keywordsCleaner = KeywordsCleaner()

print(social_comments["body"].at[530636])
keywordsCleaner(social_comments["body"].at[530636])

NTA &#x200B; but you handled it really poorly, like that was the absolute worst way in which you could have ever said it &#x200B; why not acknowledge that she's beautiful and sexy in her own way? make a lateral move that you know, wouldn't require you to directly say: I'm not attracted to you physically? because that would just open up a can of worms. &#x200B; well, you messed it up so now you have to fix it. &#x200B; You're not an asshole for having a personal body type preference, everyone has but you are an idiot for handling it the way you did, good luck with that.


'  but you handled it really poorly like that was the absolute worst way in which you could have ever said it  why not acknowledge that shes beautiful and sexy in her own way make a lateral move that you know wouldnt require you to directly say im not attracted to you physically because that would just open up a can of worms  well you messed it up so now you have to fix it  youre not an asshole for having a personal body type preference everyone has but you are an idiot for handling it the way you did good luck with that'

In [7]:
"""Filter social comments"""
keywordsCleaner = KeywordsCleaner()

for i, row in social_comments.iterrows():
    row['body'] = keywordsCleaner(row['body'])

social_comments

Unnamed: 0,id,permalink,label,body,parent_id,author_fullname,author_name
13,eamexog,/r/AmItheAsshole/comments/a1311q/aita_for_tell...,NTA,uh absolutely these are really really horrid ...,a1311q,t2_2kabg9z7,xormun
14,eameha5,/r/AmItheAsshole/comments/a1311q/aita_for_tell...,NTA,ok sweetie no hell no this is not your fault ...,a1311q,t2_1jrodkow,tkPuncake
16,eamjnog,/r/AmItheAsshole/comments/a1311q/aita_for_tell...,NTA,my girlfriend has hypothyroidism and i know t...,a1311q,t2_14ub01,hawkbearpig
17,ef5kbsb,/r/AmItheAsshole/comments/akkcpn/aita_for_not_...,NTA,youre clearly sorry about your homophobic fam...,akkcpn,t2_61b3s,sadsquash
20,ef5l208,/r/AmItheAsshole/comments/akkcpn/aita_for_not_...,NTA,and it will get better i promise youll make i...,akkcpn,t2_xvrsh,SheketBevakaSTFU
...,...,...,...,...,...,...,...
530631,ei9ofvo,/r/AmItheAsshole/comments/azofrl/aita_for_not_...,YTA,a bit towards op you found the worst way to ...,azofrl,t2_16fctm,xAlois
530636,ei9gkon,/r/AmItheAsshole/comments/azofrl/aita_for_not_...,NTA,but you handled it really poorly like that w...,azofrl,t2_dk4gojr,YoungDiscord
530637,ei9gl79,/r/AmItheAsshole/comments/azofrl/aita_for_not_...,NTA,how are for like bigger boobs,azofrl,t2_2xfoz1fv,Dark-_-Legacy
530639,ei9gmpk,/r/AmItheAsshole/comments/azofrl/aita_for_not_...,NTA,your girlfriend is overreacting you dont have...,azofrl,t2_15bdqt5w,Broken_Angel-


In [8]:
"""From pandas to Huggingface Dataset"""
from datasets import Dataset

social_comments_dataset = Dataset.from_pandas(social_comments)
social_comments_dataset

  from .autonotebook import tqdm as notebook_tqdm


Dataset({
    features: ['id', 'permalink', 'label', 'body', 'parent_id', 'author_fullname', 'author_name', '__index_level_0__'],
    num_rows: 212687
})

In [9]:
"""Analyze the data"""
data_distribution = {'NTA': 0, 'YTA': 0}

def compute_data_distribution(example):
    data_distribution[example['label']] += 1

social_comments_dataset.map(compute_data_distribution)

print(data_distribution)
print(f"NTA: {data_distribution['NTA'] / sum(data_distribution.values())} ")
print(f"YTA: {data_distribution['YTA'] / sum(data_distribution.values())} ")

Map:   0%|          | 0/212687 [00:00<?, ? examples/s]

Map: 100%|██████████| 212687/212687 [00:09<00:00, 21853.93 examples/s]

{'NTA': 150040, 'YTA': 62647}
NTA: 0.7054497924179663 
YTA: 0.2945502075820337 





In [10]:
from datasets import DatasetDict

"""80-10-10 split"""
# 80% train, 20% test + validation
train_testvalid = social_comments_dataset.train_test_split(test_size=0.2, seed=SEED)
# Split the 20% test + valid in half test, half valid
test_valid = train_testvalid['test'].train_test_split(test_size=0.5, seed=SEED)

social_comments_dataset = DatasetDict({
    'train': train_testvalid['train'],
    'test': test_valid['test'],
    'val': test_valid['train']})

social_comments_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'permalink', 'label', 'body', 'parent_id', 'author_fullname', 'author_name', '__index_level_0__'],
        num_rows: 170149
    })
    test: Dataset({
        features: ['id', 'permalink', 'label', 'body', 'parent_id', 'author_fullname', 'author_name', '__index_level_0__'],
        num_rows: 21269
    })
    val: Dataset({
        features: ['id', 'permalink', 'label', 'body', 'parent_id', 'author_fullname', 'author_name', '__index_level_0__'],
        num_rows: 21269
    })
})

In [11]:
id2label = {0: "NTA", 1: "YTA"}

label2id = {"NTA": 0, "YTA": 1}

# social-chemestry-101 Dataset

In [12]:
from datasets import load_dataset

social_chemestry_dataset = load_dataset("metaeval/social-chemestry-101")
social_chemestry_dataset

DatasetDict({
    train: Dataset({
        features: ['area', 'm', 'split', 'rot-agree', 'rot-categorization', 'rot-moral-foundations', 'rot-char-targeting', 'rot-bad', 'rot-judgment', 'action', 'action-agency', 'action-moral-judgment', 'action-agree', 'action-legal', 'action-pressure', 'action-char-involved', 'action-hypothetical', 'situation', 'situation-short-id', 'rot', 'rot-id', 'rot-worker-id', 'breakdown-worker-id', 'n-characters', 'characters'],
        num_rows: 355922
    })
})

In [13]:
from collections import defaultdict

situationId_to_situation = {}
situationId_to_ROT_moral_foundations = defaultdict(set)
situationId_to_ROT_categories = defaultdict(set)

In [14]:
def mapping_from_situationId(example):
    situation_id = example['situation-short-id'].split("/")[-1]

    situationId_to_situation[situation_id] = example['situation']
    if example['rot-moral-foundations'] is not None:
        situationId_to_ROT_moral_foundations[situation_id].add(example['rot-moral-foundations'])
    if example['rot-categorization'] is not None:
        situationId_to_ROT_categories[situation_id].add(example['rot-categorization'])

social_chemestry_dataset.map(mapping_from_situationId)

Map: 100%|██████████| 355922/355922 [00:44<00:00, 7991.88 examples/s]


DatasetDict({
    train: Dataset({
        features: ['area', 'm', 'split', 'rot-agree', 'rot-categorization', 'rot-moral-foundations', 'rot-char-targeting', 'rot-bad', 'rot-judgment', 'action', 'action-agency', 'action-moral-judgment', 'action-agree', 'action-legal', 'action-pressure', 'action-char-involved', 'action-hypothetical', 'situation', 'situation-short-id', 'rot', 'rot-id', 'rot-worker-id', 'breakdown-worker-id', 'n-characters', 'characters'],
        num_rows: 355922
    })
})

In [15]:
situationId_to_ROT_moral_foundations['adwxny']

{'care-harm|fairness-cheating',
 'care-harm|loyalty-betrayal',
 'fairness-cheating',
 'loyalty-betrayal'}

In [16]:
'. '.join(situationId_to_ROT_moral_foundations['adwxny'])

'loyalty-betrayal. care-harm|fairness-cheating. fairness-cheating. care-harm|loyalty-betrayal'

# Tokenizations

### Tokenization for p(y|c) -> Only comments texts (filtered from NTA/YTA tags)

In [17]:
from transformers import AutoTokenizer

"""Tokenize the data"""
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def tokenize_data_only_comments(example):
    encoding = tokenizer(example['body'], padding="max_length", truncation=True)
    encoding['labels'] = label2id[ example['label'] ]

    return encoding


### Tokenization for p(y|c,s) -> Situation Text + tokenizer.sep_token + comments text

In [18]:
from transformers import AutoTokenizer

"""Tokenize the data"""
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def tokenize_data_situations_comments(example):
    situation = situationId_to_situation[example['parent_id']]
    
    encoding = tokenizer(situation, example['body'], padding="max_length", truncation=True)
    encoding['labels'] = label2id[ example['label'] ]

    return encoding


### Tokenization for p(y|c,rot) -> Rot-moral-foundations +  tokenizer.sep_token + comments text 

In [19]:
from transformers import AutoTokenizer

"""Tokenize the data"""
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def tokenize_data_moralFoundations_comments(example):
    rots = '. '.join(situationId_to_ROT_moral_foundations[example['parent_id']])
    
    encoding = tokenizer(rots, example['body'], padding="max_length", truncation=True)
    encoding['labels'] = label2id[ example['label'] ]

    return encoding


### Tokenization for p(y|c,rot) -> Rot-categories +  tokenizer.sep_token + comments text 

In [20]:
from transformers import AutoTokenizer

"""Tokenize the data"""
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def tokenize_data_categories_comments(example):
    rots = '. '.join(situationId_to_ROT_categories[example['parent_id']])
    
    encoding = tokenizer(rots, example['body'], padding="max_length", truncation=True)
    encoding['labels'] = label2id[ example['label'] ]

    return encoding


### Tokenization for p(y|c,a) -> Author ID + Comment text

In [21]:
from transformers import AutoTokenizer

"""Tokenize the data"""
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def tokenize_data_authorId_comments(example):

    text = example['id'] + ". " + example['body'] 

    encoding = tokenizer(text, padding="max_length", truncation=True)
    encoding['labels'] = label2id[ example['label'] ]

    return encoding

### Tokenization for p(y|c,s,a) -> situation + tokenizer.sep_token + (Author ID + Comment text)

In [22]:
from transformers import AutoTokenizer

"""Tokenize the data"""
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def tokenize_data_situation_authorId_comments(example):
    situation = situationId_to_situation[example['parent_id']]
    text = example['id'] + ". " + example['body'] 

    encoding = tokenizer(text, situation, padding="max_length", truncation=True)
    encoding['labels'] = label2id[ example['label'] ]

    return encoding

# Training

In [18]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
"""Evaluation"""
from transformers import DataCollatorWithPadding
import evaluate
import numpy as np

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

accuracy = evaluate.load("accuracy")
precision = evaluate.load("precision")
recall = evaluate.load("recall")
f1 = evaluate.load("f1", average='macro')

def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    accuracy_score = accuracy.compute(predictions=predictions, references=labels)
    precision_score = precision.compute(predictions=predictions, references=labels)
    recall_score = recall.compute(predictions=predictions, references=labels)
    f1_score = f1.compute(predictions=predictions, references=labels)
    
    return {
        "precision": precision_score['precision'],
        "recall": recall_score['recall'],
        "f1": f1_score['f1'],
        "accuracy": accuracy_score['accuracy'],
    }

## BERT model for p(y|c) -> only comments

In [20]:
"""Tokenize the dataset"""
tokenized_dataset = social_comments_dataset.map(tokenize_data_only_comments)

tokenized_dataset = tokenized_dataset.select_columns(['input_ids', 'attention_mask', 'labels'])
tokenized_dataset

Map: 100%|██████████| 170149/170149 [01:52<00:00, 1517.50 examples/s]
Map: 100%|██████████| 21269/21269 [00:14<00:00, 1452.08 examples/s]
Map: 100%|██████████| 21269/21269 [00:14<00:00, 1441.29 examples/s]


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 170149
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 21269
    })
    val: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 21269
    })
})

In [21]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="output_social_norms/bert_comments_classification",
    learning_rate=2e-5,
    #auto_find_batch_size=True,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    evaluation_strategy="steps",
    eval_steps=1000,
    save_steps=1000,
    save_strategy="steps",
    metric_for_best_model='f1',
    greater_is_better=True,
    load_best_model_at_end=True,
    report_to="wandb",
    push_to_hub=False
)

In [22]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['val'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

In [23]:
trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mgjergjplepi12[0m ([33msocial-chem-101-team[0m). Use [1m`wandb login --relogin`[0m to force relogin


You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1000,0.4063,0.370032,0.784541,0.584026,0.669594,0.833749
2000,0.3253,0.346103,0.7976,0.63912,0.709619,0.849123
3000,0.2896,0.342456,0.768642,0.703993,0.734899,0.853496




TrainOutput(global_step=3990, training_loss=0.34324239477477875, metrics={'train_runtime': 6816.8272, 'train_samples_per_second': 74.88, 'train_steps_per_second': 0.585, 'total_flos': 6.761758624175923e+16, 'train_loss': 0.34324239477477875, 'epoch': 3.0})

In [24]:
trainer.save_model()

In [25]:
"""Testing"""
from transformers import AutoModelForSequenceClassification
from sklearn.metrics import classification_report

model_path = "/home/IAIS/gplepi/entero/output_social_norms/bert_comments_classification"

finetuned_model = AutoModelForSequenceClassification.from_pretrained(model_path)

trainer = Trainer(
    model=finetuned_model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['val'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

"""Predict"""
predictions_output = trainer.predict(tokenized_dataset['test'])


preds = np.argmax(predictions_output.predictions, axis=-1)
labels = predictions_output.label_ids
print(classification_report(y_true=labels, y_pred=preds))

              precision    recall  f1-score   support

           0       0.88      0.91      0.90     14961
           1       0.77      0.72      0.74      6308

    accuracy                           0.85     21269
   macro avg       0.83      0.81      0.82     21269
weighted avg       0.85      0.85      0.85     21269



## BERT model for p(y|c,s) -> situations and comments

In [29]:
"""Tokenize the data"""
tokenized_dataset = social_comments_dataset.map(tokenize_data_situations_comments)

tokenized_dataset = tokenized_dataset.select_columns(['input_ids', 'attention_mask', 'labels'])
tokenized_dataset

Map: 100%|██████████| 170149/170149 [02:04<00:00, 1371.26 examples/s]
Map: 100%|██████████| 21269/21269 [00:15<00:00, 1334.15 examples/s]
Map: 100%|██████████| 21269/21269 [00:15<00:00, 1349.91 examples/s]


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 170149
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 21269
    })
    val: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 21269
    })
})

In [30]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="output_social_norms/bert_comments_classification_situations_and_comments",
    learning_rate=2e-5,
    #auto_find_batch_size=True,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    evaluation_strategy="steps",
    eval_steps=1000,
    save_steps=1000,
    save_strategy="steps",
    metric_for_best_model='f1',
    greater_is_better=True,
    load_best_model_at_end=True,
    report_to="wandb",
    push_to_hub=False
)

In [31]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['val'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

In [24]:
trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mgjergjplepi12[0m ([33msocial-chem-101-team[0m). Use [1m`wandb login --relogin`[0m to force relogin


You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1000,0.3698,0.318699,0.822873,0.673187,0.740542,0.863933
2000,0.2788,0.273159,0.813627,0.794132,0.803761,0.888147
3000,0.2362,0.254236,0.809449,0.835045,0.822047,0.895717
4000,0.1999,0.247678,0.815281,0.845314,0.830026,0.900136
5000,0.1885,0.241619,0.832551,0.837979,0.835256,0.90465




TrainOutput(global_step=5319, training_loss=0.27156329240133764, metrics={'train_runtime': 9059.5998, 'train_samples_per_second': 56.343, 'train_steps_per_second': 0.587, 'total_flos': 6.761758624175923e+16, 'train_loss': 0.27156329240133764, 'epoch': 3.0})

In [25]:
trainer.save_model()

In [32]:
"""Testing"""
from transformers import AutoModelForSequenceClassification
from sklearn.metrics import classification_report

model_path = "/home/IAIS/gplepi/entero/output_social_norms/bert_comments_classification_situations_and_comments"

finetuned_model = AutoModelForSequenceClassification.from_pretrained(model_path)

trainer = Trainer(
    model=finetuned_model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['val'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

"""Predict"""
predictions_output = trainer.predict(tokenized_dataset['test'])


preds = np.argmax(predictions_output.predictions, axis=-1)
labels = predictions_output.label_ids
print(classification_report(y_true=labels, y_pred=preds))



              precision    recall  f1-score   support

           0       0.93      0.93      0.93     14961
           1       0.83      0.83      0.83      6308

    accuracy                           0.90     21269
   macro avg       0.88      0.88      0.88     21269
weighted avg       0.90      0.90      0.90     21269



## BERT model for p(y|c,rot) -> ROT-moral_foundations and comments

In [33]:
"""Tokenize the data"""
tokenized_dataset = social_comments_dataset.map(tokenize_data_moralFoundations_comments)

tokenized_dataset = tokenized_dataset.select_columns(['input_ids', 'attention_mask', 'labels'])
tokenized_dataset

Map: 100%|██████████| 170149/170149 [02:02<00:00, 1386.90 examples/s]
Map: 100%|██████████| 21269/21269 [00:15<00:00, 1415.74 examples/s]
Map: 100%|██████████| 21269/21269 [00:15<00:00, 1372.85 examples/s]


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 170149
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 21269
    })
    val: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 21269
    })
})

In [34]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="output_social_norms/bert_comments_classification_rot-moralFoundations_and_comments",
    learning_rate=2e-5,
    #auto_find_batch_size=True,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    evaluation_strategy="steps",
    eval_steps=1000,
    save_steps=1000,
    save_strategy="steps",
    metric_for_best_model='f1',
    greater_is_better=True,
    load_best_model_at_end=True,
    report_to="wandb",
    push_to_hub=False
)

In [35]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['val'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

In [22]:
trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mgjergjplepi12[0m ([33msocial-chem-101-team[0m). Use [1m`wandb login --relogin`[0m to force relogin


You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1000,0.4266,0.387951,0.783579,0.538223,0.638129,0.823922
2000,0.3527,0.345055,0.759051,0.683456,0.719273,0.846114
3000,0.3109,0.335037,0.764696,0.712469,0.737659,0.853825
4000,0.2749,0.334269,0.762955,0.727139,0.744617,0.856129
5000,0.26,0.333623,0.774019,0.723553,0.747936,0.859326




TrainOutput(global_step=5319, training_loss=0.33911535295693357, metrics={'train_runtime': 9041.3826, 'train_samples_per_second': 56.457, 'train_steps_per_second': 0.588, 'total_flos': 6.761758624175923e+16, 'train_loss': 0.33911535295693357, 'epoch': 3.0})

In [23]:
trainer.save_model()

In [36]:
from transformers import AutoModelForSequenceClassification
from sklearn.metrics import classification_report

model_path = "/home/IAIS/gplepi/entero/output_social_norms/bert_comments_classification_rot-moralFoundations_and_comments"

finetuned_model = AutoModelForSequenceClassification.from_pretrained(model_path)

trainer = Trainer(
    model=finetuned_model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['val'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

"""Predict"""
predictions_output = trainer.predict(tokenized_dataset['test'])


preds = np.argmax(predictions_output.predictions, axis=-1)
labels = predictions_output.label_ids
print(classification_report(y_true=labels, y_pred=preds))



              precision    recall  f1-score   support

           0       0.89      0.90      0.89     14961
           1       0.76      0.73      0.74      6308

    accuracy                           0.85     21269
   macro avg       0.82      0.82      0.82     21269
weighted avg       0.85      0.85      0.85     21269



## BERT model for p(y|c,rot) -> ROT-categories and comments

In [37]:
"""Tokenize the data"""
tokenized_dataset = social_comments_dataset.map(tokenize_data_categories_comments)

tokenized_dataset = tokenized_dataset.select_columns(['input_ids', 'attention_mask', 'labels'])
tokenized_dataset

Map: 100%|██████████| 170149/170149 [02:02<00:00, 1383.91 examples/s]
Map: 100%|██████████| 21269/21269 [00:15<00:00, 1387.06 examples/s]
Map: 100%|██████████| 21269/21269 [00:15<00:00, 1389.55 examples/s]


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 170149
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 21269
    })
    val: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 21269
    })
})

In [38]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="output_social_norms/bert_comments_classification_rot-categories_and_comments",
    learning_rate=2e-5,
    #auto_find_batch_size=True,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    evaluation_strategy="steps",
    eval_steps=1000,
    save_steps=1000,
    save_strategy="steps",
    metric_for_best_model='f1',
    greater_is_better=True,
    load_best_model_at_end=True,
    report_to="wandb",
    push_to_hub=False
)

In [39]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['val'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

In [22]:
trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mgjergjplepi12[0m ([33msocial-chem-101-team[0m). Use [1m`wandb login --relogin`[0m to force relogin


You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1000,0.4379,0.398375,0.770591,0.539853,0.634908,0.820913
2000,0.3639,0.356032,0.759506,0.660962,0.706815,0.841836
3000,0.3183,0.336743,0.777654,0.685249,0.728533,0.852696
4000,0.2799,0.341323,0.752298,0.733659,0.742862,0.853496
5000,0.2654,0.341592,0.740406,0.748492,0.744427,0.851756




TrainOutput(global_step=5319, training_loss=0.34635972913931046, metrics={'train_runtime': 9014.7744, 'train_samples_per_second': 56.623, 'train_steps_per_second': 0.59, 'total_flos': 6.761758624175923e+16, 'train_loss': 0.34635972913931046, 'epoch': 3.0})

In [23]:
trainer.save_model()

In [40]:
"""Testing"""

from transformers import AutoModelForSequenceClassification

model_path = "/home/IAIS/gplepi/entero/output_social_norms/bert_comments_classification_rot-categories_and_comments"

finetuned_model = AutoModelForSequenceClassification.from_pretrained(model_path)

trainer = Trainer(
    model=finetuned_model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['val'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

In [41]:
predictions_output = trainer.predict(tokenized_dataset['test'])
predictions_output



PredictionOutput(predictions=array([[-2.287483  ,  2.1564565 ],
       [-2.7786372 ,  2.615933  ],
       [ 3.6511924 , -3.284743  ],
       ...,
       [ 2.7265818 , -2.4175289 ],
       [ 2.183299  , -1.9042403 ],
       [ 0.7427094 , -0.52996004]], dtype=float32), label_ids=array([0, 1, 0, ..., 0, 0, 0]), metrics={'test_loss': 0.3509385287761688, 'test_precision': 0.7423022264329702, 'test_recall': 0.7452441344324667, 'test_f1': 0.7437702713392929, 'test_accuracy': 0.8477126334101274, 'test_runtime': 139.0999, 'test_samples_per_second': 152.905, 'test_steps_per_second': 1.596})

In [42]:
from sklearn.metrics import classification_report

preds = np.argmax(predictions_output.predictions, axis=-1)
labels = predictions_output.label_ids
print(classification_report(y_true=labels, y_pred=preds))

              precision    recall  f1-score   support

           0       0.89      0.89      0.89     14961
           1       0.74      0.75      0.74      6308

    accuracy                           0.85     21269
   macro avg       0.82      0.82      0.82     21269
weighted avg       0.85      0.85      0.85     21269



## BERT model for p(y|c,a) -> Author ID and Comment text

In [43]:
"""Tokenize the data"""
tokenized_dataset = social_comments_dataset.map(tokenize_data_authorId_comments)

tokenized_dataset = tokenized_dataset.select_columns(['input_ids', 'attention_mask', 'labels'])
tokenized_dataset

Map: 100%|██████████| 170149/170149 [02:00<00:00, 1417.20 examples/s]
Map: 100%|██████████| 21269/21269 [00:14<00:00, 1474.15 examples/s]
Map: 100%|██████████| 21269/21269 [00:14<00:00, 1459.20 examples/s]


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 170149
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 21269
    })
    val: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 21269
    })
})

In [44]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="output_social_norms/bert_comments_classification_authorId_and_comments",
    learning_rate=2e-5,
    #auto_find_batch_size=True,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    evaluation_strategy="steps",
    eval_steps=1000,
    save_steps=1000,
    save_strategy="steps",
    metric_for_best_model='f1',
    greater_is_better=True,
    load_best_model_at_end=True,
    report_to="wandb",
    push_to_hub=False
)

In [45]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['val'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

In [28]:
trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin


You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1000,0.4233,0.38424,0.79049,0.547351,0.646827,0.827589
2000,0.3494,0.347369,0.735421,0.696822,0.715601,0.840237
3000,0.3079,0.33566,0.754814,0.715566,0.734667,0.85091
4000,0.2713,0.334885,0.757336,0.732029,0.744467,0.855047
5000,0.2555,0.335809,0.761985,0.735778,0.748652,0.857492




TrainOutput(global_step=5319, training_loss=0.3343056802306773, metrics={'train_runtime': 9011.2021, 'train_samples_per_second': 56.646, 'train_steps_per_second': 0.59, 'total_flos': 6.761758624175923e+16, 'train_loss': 0.3343056802306773, 'epoch': 3.0})

In [29]:
trainer.save_model()

In [46]:
"""Testing"""
from transformers import AutoModelForSequenceClassification
from sklearn.metrics import classification_report

model_path = "/home/IAIS/gplepi/entero/output_social_norms/bert_comments_classification_authorId_and_comments"

finetuned_model = AutoModelForSequenceClassification.from_pretrained(model_path)

trainer = Trainer(
    model=finetuned_model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['val'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

"""Predict"""
predictions_output = trainer.predict(tokenized_dataset['test'])


preds = np.argmax(predictions_output.predictions, axis=-1)
labels = predictions_output.label_ids
print(classification_report(y_true=labels, y_pred=preds))



              precision    recall  f1-score   support

           0       0.89      0.91      0.90     14961
           1       0.77      0.74      0.75      6308

    accuracy                           0.86     21269
   macro avg       0.83      0.82      0.83     21269
weighted avg       0.85      0.86      0.85     21269



## BERT model for p(y|c,s,a) -> situation and Author ID and Comment text

In [47]:
"""Tokenize the data"""
tokenized_dataset = social_comments_dataset.map(tokenize_data_situation_authorId_comments)

tokenized_dataset = tokenized_dataset.select_columns(['input_ids', 'attention_mask', 'labels'])
tokenized_dataset

Map: 100%|██████████| 170149/170149 [02:06<00:00, 1347.95 examples/s]
Map: 100%|██████████| 21269/21269 [00:15<00:00, 1362.44 examples/s]
Map: 100%|██████████| 21269/21269 [00:15<00:00, 1368.59 examples/s]


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 170149
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 21269
    })
    val: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 21269
    })
})

In [48]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="output_social_norms/bert_comments_classification_situations_authorId_and_comments",
    learning_rate=2e-5,
    #auto_find_batch_size=True,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    evaluation_strategy="steps",
    eval_steps=1000,
    save_steps=1000,
    save_strategy="steps",
    metric_for_best_model='f1',
    greater_is_better=True,
    load_best_model_at_end=True,
    report_to="wandb",
    push_to_hub=False
)

In [49]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['val'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

In [23]:
trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mgjergjplepi12[0m ([33msocial-chem-101-team[0m). Use [1m`wandb login --relogin`[0m to force relogin


You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1000,0.3767,0.328648,0.838625,0.628525,0.718532,0.857962
2000,0.2872,0.275822,0.823684,0.765281,0.793409,0.885044
3000,0.243,0.259722,0.820581,0.805868,0.813158,0.893178
4000,0.2022,0.256249,0.818794,0.832274,0.825479,0.898491
5000,0.1923,0.25266,0.82554,0.834556,0.830024,0.901406




TrainOutput(global_step=5319, training_loss=0.27660126891389875, metrics={'train_runtime': 8960.0337, 'train_samples_per_second': 56.969, 'train_steps_per_second': 0.594, 'total_flos': 6.761758624175923e+16, 'train_loss': 0.27660126891389875, 'epoch': 3.0})

In [24]:
trainer.save_model()

In [50]:
"""Testing"""
from transformers import AutoModelForSequenceClassification
from sklearn.metrics import classification_report

model_path = "/home/IAIS/gplepi/entero/output_social_norms/bert_comments_classification_situations_authorId_and_comments"

finetuned_model = AutoModelForSequenceClassification.from_pretrained(model_path)

trainer = Trainer(
    model=finetuned_model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['val'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

"""Predict"""
predictions_output = trainer.predict(tokenized_dataset['test'])


preds = np.argmax(predictions_output.predictions, axis=-1)
labels = predictions_output.label_ids
print(classification_report(y_true=labels, y_pred=preds))



              precision    recall  f1-score   support

           0       0.93      0.92      0.93     14961
           1       0.82      0.83      0.83      6308

    accuracy                           0.90     21269
   macro avg       0.87      0.88      0.88     21269
weighted avg       0.90      0.90      0.90     21269



# Inference

In [31]:
"""Only comments"""

from transformers import AutoTokenizer, pipeline

model_path = '/home/IAIS/gplepi/entero/output_social_norms/bert_comments_classification'

nta_comment = "If it's for your best, than do not worry about it."
yta_comment = "You shouldn't have done that, it's not allowed."
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

clf = pipeline("text-classification", model=model_path, tokenizer=tokenizer)

answer = clf(nta_comment)
print(f"{nta_comment} -> {answer}")

answer = clf(yta_comment)
print(f"{yta_comment} -> {answer}")

If it's for your best, than do not worry about it. -> [{'label': 'NTA', 'score': 0.9908214807510376}]
You shouldn't have done that, it's not allowed. -> [{'label': 'NTA', 'score': 0.5540988445281982}]


In [32]:
"""Situations and comments"""

from transformers import AutoTokenizer, pipeline

model_path = '/home/IAIS/gplepi/entero/output_social_norms/bert_comments_classification_situations_and_comments'

situation = "Want to study abroad, but feel bad leaving my country."
nta_comment = "If it's for your best, than do not worry about it."
yta_comment = "You shouldn't have done that, it's not good behaviour."
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

clf = pipeline("text-classification", model=model_path, tokenizer=tokenizer)

print(f"The situations is: {situation}")
print("The comments:")

answer = clf(situation + " " + tokenizer.sep_token + " " + nta_comment)
print(f"{nta_comment} -> {answer}")

answer = clf(situation + " " + tokenizer.sep_token + " " + nta_comment)
print(f"{yta_comment} -> {answer}")

The situations is: Want to study abroad, but feel bad leaving my country.
The comments:
If it's for your best, than do not worry about it. -> [{'label': 'NTA', 'score': 0.9957548379898071}]
You shouldn't have done that, it's not good behaviour. -> [{'label': 'NTA', 'score': 0.9957548379898071}]


In [33]:
"""rot-moralFoundations and comments"""

from transformers import AutoTokenizer, pipeline

model_path = '/home/IAIS/gplepi/entero/output_social_norms/bert_comments_classification_rot-moralFoundations_and_comments'

rot_moralFoundation = 'loyalty-betrayal'
nta_comment = "If it's for your best, than do not worry about it."
yta_comment = "You shouldn't have done that, it's not allowed."
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

clf = pipeline("text-classification", model=model_path, tokenizer=tokenizer)

answer = clf(rot_moralFoundation + " " + tokenizer.sep_token + " " + nta_comment)
print(f"{nta_comment} -> {answer}")

answer = clf(rot_moralFoundation + " " + tokenizer.sep_token + " " + yta_comment)
print(f"{yta_comment} -> {answer}")

If it's for your best, than do not worry about it. -> [{'label': 'NTA', 'score': 0.992794930934906}]
You shouldn't have done that, it's not allowed. -> [{'label': 'YTA', 'score': 0.5742753148078918}]


In [34]:
"""rot-categories and comments"""

from transformers import AutoTokenizer, pipeline

model_path = '/home/IAIS/gplepi/entero/output_social_norms/bert_comments_classification_rot-categories_and_comments'

rot_category = 'morality-ethics'
nta_comment = "If it's for your best, than do not worry about it."
yta_comment = "You shouldn't have done that, it's not allowed."
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

clf = pipeline("text-classification", model=model_path, tokenizer=tokenizer)

answer = clf(rot_category + " " + tokenizer.sep_token + " " + nta_comment)
print(f"{nta_comment} -> {answer}")

answer = clf(rot_category + " " + tokenizer.sep_token + " " + yta_comment)
print(f"{yta_comment} -> {answer}")

If it's for your best, than do not worry about it. -> [{'label': 'NTA', 'score': 0.9928336143493652}]
You shouldn't have done that, it's not allowed. -> [{'label': 'NTA', 'score': 0.6566068530082703}]


In [35]:
"""AuthorId and comments"""

from transformers import AutoTokenizer, pipeline

model_path = '/home/IAIS/gplepi/entero/output_social_norms/bert_comments_classification_authorId_and_comments'

authorId = 'ei9ofvo'
nta_comment = "If it's for your best, than do not worry about it."
yta_comment = "You shouldn't have done that, it's not allowed."
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

clf = pipeline("text-classification", model=model_path, tokenizer=tokenizer)

answer = clf(authorId + ". " + nta_comment)
print(f"{nta_comment} -> {answer}")

answer = clf(authorId + ". " + yta_comment)
print(f"{yta_comment} -> {answer}")

If it's for your best, than do not worry about it. -> [{'label': 'NTA', 'score': 0.9857268929481506}]
You shouldn't have done that, it's not allowed. -> [{'label': 'YTA', 'score': 0.8293914794921875}]


In [26]:
"""Situations and AuthorId and comments"""

from transformers import AutoTokenizer, pipeline

model_path = '/home/IAIS/gplepi/entero/output_social_norms/bert_comments_classification_situations_authorId_and_comments'

authorId = 'ei9ofvo'
situation = "Want to study abroad, but feel bad leaving my country."

nta_comment = "If it's for your best, than do not worry about it."
yta_comment = "You shouldn't have done that, it's not allowed."
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

clf = pipeline("text-classification", model=model_path, tokenizer=tokenizer)

answer = clf( situation + " " + tokenizer.sep_token + " " + (authorId + ". " + nta_comment) )
print(f"{nta_comment} -> {answer}")

answer = clf( situation + " " + tokenizer.sep_token + " " +  (authorId + ". " + yta_comment) )
print(f"{yta_comment} -> {answer}")

If it's for your best, than do not worry about it. -> [{'label': 'NTA', 'score': 0.9942271709442139}]
You shouldn't have done that, it's not allowed. -> [{'label': 'NTA', 'score': 0.9426610469818115}]


# Inference with examples from the test dataset -> CORRECT cases

In [40]:
nta_data = social_comments_dataset['test'][2]
nta_data

{'id': 'ec9nvqg',
 'permalink': '/r/AmItheAsshole/comments/a8cvvc/aita_for_asking_my_fiancé_for_a_heads_up_when/ec9nvqg/',
 'label': 'NTA',
 'body': ' youre not saying he cant invite someone over youre asking for a simple text or phone call before you get home so you can melly prepare yourself',
 'parent_id': 'a8cvvc',
 'author_fullname': 't2_2n4q22oj',
 'author_name': 'callmelola32',
 '__index_level_0__': 17521}

In [41]:
situationId_to_situation[nta_data['parent_id']]

'asking my fiancé for a heads up when having people to our apartment'

In [42]:
yta_data = social_comments_dataset['test'][1]
yta_data

{'id': 'ek6fhlk',
 'permalink': '/r/AmItheAsshole/comments/b9pu2q/wibta_if_i_cut_my_childfree_kids_out_of_my_will/ek6fhlk/',
 'label': 'YTA',
 'body': ' you start by saying that you arent trying to coerce them into having children but then you also state later on that you are hoping they will grow out of it you are making it extremely clear that because you dont agree with their beliefs they are worth less than your children who are choosing to have kids you have no idea how this is going to tear your family apart if you cut them out of your will why is spending money on their childfree lifestyle so much more unimportant than your kids who are having children this is just a complete manipulation tactic on your part ',
 'parent_id': 'b9pu2q',
 'author_fullname': 't2_nlqcc',
 'author_name': 'MissMamanda',
 '__index_level_0__': 88625}

In [43]:
situationId_to_situation[yta_data['parent_id']]

'cutting my "child-free" kids out of my will'

In [44]:
"""Only comments"""
from transformers import AutoTokenizer, pipeline

model_path = '/home/IAIS/gplepi/entero/output_social_norms/bert_comments_classification'

nta_comment = nta_data['body']
yta_comment = yta_data['body']
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

clf = pipeline("text-classification", model=model_path, tokenizer=tokenizer)

answer = clf(nta_comment)
print(f"NTA comment -> {answer}")

answer = clf(yta_comment)
print(f"YTA comment -> {answer}")

NTA comment -> [{'label': 'NTA', 'score': 0.9988065958023071}]
YTA comment -> [{'label': 'YTA', 'score': 0.9932330250740051}]


In [45]:
"""Situations and comments"""

from transformers import AutoTokenizer, pipeline

model_path = '/home/IAIS/gplepi/entero/output_social_norms/bert_comments_classification_situations_and_comments'

nta_situation = situationId_to_situation[nta_data['parent_id']]
yta_situation = situationId_to_situation[yta_data['parent_id']]

nta_comment = nta_data['body']
yta_comment = yta_data['body']
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

clf = pipeline("text-classification", model=model_path, tokenizer=tokenizer)

nta_input = nta_situation + " " + tokenizer.sep_token + " " + nta_comment
yta_input = yta_situation + " " + tokenizer.sep_token + " " + yta_comment

print(f"NTA input: {nta_input}")
print(f"YTA input: {yta_input}")

answer = clf(nta_input)
print(f"NTA comment -> {answer}")

answer = clf(yta_input)
print(f"YTA comment -> {answer}")

NTA input: asking my fiancé for a heads up when having people to our apartment [SEP]  youre not saying he cant invite someone over youre asking for a simple text or phone call before you get home so you can melly prepare yourself
YTA input: cutting my "child-free" kids out of my will [SEP]  you start by saying that you arent trying to coerce them into having children but then you also state later on that you are hoping they will grow out of it you are making it extremely clear that because you dont agree with their beliefs they are worth less than your children who are choosing to have kids you have no idea how this is going to tear your family apart if you cut them out of your will why is spending money on their childfree lifestyle so much more unimportant than your kids who are having children this is just a complete manipulation tactic on your part 
NTA comment -> [{'label': 'NTA', 'score': 0.9994131326675415}]
YTA comment -> [{'label': 'YTA', 'score': 0.996910035610199}]


In [46]:
import torch
from transformers import AutoTokenizer, pipeline, AutoModelForSequenceClassification
import torch.nn.functional as F

model_path = '/home/IAIS/gplepi/entero/output_social_norms/bert_comments_classification_situations_and_comments'
finetuned_model = AutoModelForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

nta_situation = situationId_to_situation[nta_data['parent_id']]
yta_situation = situationId_to_situation[yta_data['parent_id']]

nta_comment = nta_data['body']
yta_comment = yta_data['body']


nta_input = tokenizer(nta_situation, nta_comment, padding="max_length", truncation=True, return_tensors="pt")
yta_input = tokenizer(yta_situation, yta_comment, padding="max_length", truncation=True, return_tensors="pt")

with torch.no_grad():
    nta_outputs = finetuned_model(**nta_input)
    yta_outputs = finetuned_model(**yta_input)

nta_logits = nta_outputs.logits
yta_logits = yta_outputs.logits

nta_predicted_class_id = id2label[nta_logits.argmax().item()]
yta_predicted_class_id = id2label[yta_logits.argmax().item()]

nta_probability = F.softmax(nta_outputs.logits, dim=1)
yta_probability = F.softmax(yta_outputs.logits, dim=1)

print(f"NTA comment -> {nta_predicted_class_id}, with score {nta_probability[0, label2id[nta_predicted_class_id]].item()}")

print(f"YTA comment -> {yta_predicted_class_id}, with score {yta_probability[0, label2id[yta_predicted_class_id]].item()}")

NTA comment -> NTA, with score 0.9994131326675415
YTA comment -> YTA, with score 0.996910035610199


In [47]:
"""rot-moralFoundations and comments"""

from transformers import AutoTokenizer, pipeline

model_path = '/home/IAIS/gplepi/entero/output_social_norms/bert_comments_classification_rot-moralFoundations_and_comments'

nta_rot_moralFoundations = '. '.join(situationId_to_ROT_moral_foundations[nta_data['parent_id']])
yta_rot_moralFoundations = '. '.join(situationId_to_ROT_moral_foundations[yta_data['parent_id']])

nta_comment = nta_data['body']
yta_comment = yta_data['body']
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

clf = pipeline("text-classification", model=model_path, tokenizer=tokenizer)

answer = clf(nta_rot_moralFoundations + " " + tokenizer.sep_token + " " + nta_comment)
print(f"NTA comment -> {answer}")

answer = clf(yta_rot_moralFoundations + " " + tokenizer.sep_token + " " + yta_comment)
print(f"YTA comment -> {answer}")

NTA comment -> [{'label': 'NTA', 'score': 0.9993882179260254}]
YTA comment -> [{'label': 'YTA', 'score': 0.9943819642066956}]


In [48]:
import torch
from transformers import AutoTokenizer, pipeline, AutoModelForSequenceClassification
import torch.nn.functional as F

model_path = '/home/IAIS/gplepi/entero/output_social_norms/bert_comments_classification_rot-moralFoundations_and_comments'
finetuned_model = AutoModelForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

nta_rot_moralFoundations = '. '.join(situationId_to_ROT_moral_foundations[nta_data['parent_id']])
yta_rot_moralFoundations = '. '.join(situationId_to_ROT_moral_foundations[yta_data['parent_id']])

nta_comment = nta_data['body']
yta_comment = yta_data['body']

nta_input = tokenizer(nta_rot_moralFoundations, nta_comment, padding="max_length", truncation=True, return_tensors="pt")
yta_input = tokenizer(yta_rot_moralFoundations, yta_comment, padding="max_length", truncation=True, return_tensors="pt")

with torch.no_grad():
    nta_outputs = finetuned_model(**nta_input)
   
with torch.no_grad():
    yta_outputs = finetuned_model(**yta_input)

nta_logits = nta_outputs.logits
yta_logits = yta_outputs.logits

nta_predicted_class_id = id2label[nta_logits.argmax().item()]
yta_predicted_class_id = id2label[yta_logits.argmax().item()]

nta_probability = F.softmax(nta_outputs.logits, dim=1)
yta_probability = F.softmax(yta_outputs.logits, dim=1)

print(f"NTA comment -> {nta_predicted_class_id}, with score {nta_probability[0, label2id[nta_predicted_class_id]].item()}")

print(f"YTA comment -> {yta_predicted_class_id}, with score {yta_probability[0, label2id[yta_predicted_class_id]].item()}")

NTA comment -> NTA, with score 0.9993882179260254
YTA comment -> YTA, with score 0.9943819642066956


In [49]:
"""rot-categories and comments"""

from transformers import AutoTokenizer, pipeline

model_path = '/home/IAIS/gplepi/entero/output_social_norms/bert_comments_classification_rot-categories_and_comments'

nta_rot_categories = '. '.join(situationId_to_ROT_categories[nta_data['parent_id']])
yta_rot_categories = '. '.join(situationId_to_ROT_categories[yta_data['parent_id']])

nta_comment = nta_data['body']
yta_comment = yta_data['body']
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

clf = pipeline("text-classification", model=model_path, tokenizer=tokenizer)

answer = clf(nta_rot_categories + " " + tokenizer.sep_token + " " + nta_comment)
print(f"NTA comment -> {answer}")

answer = clf(yta_rot_categories + " " + tokenizer.sep_token + " " + yta_comment)
print(f"YTA comment -> {answer}")

NTA comment -> [{'label': 'NTA', 'score': 0.9990286827087402}]
YTA comment -> [{'label': 'YTA', 'score': 0.9954793453216553}]


In [50]:
import torch
from transformers import AutoTokenizer, pipeline, AutoModelForSequenceClassification
import torch.nn.functional as F

model_path = '/home/IAIS/gplepi/entero/output_social_norms/bert_comments_classification_rot-categories_and_comments'
finetuned_model = AutoModelForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

nta_rot_categories = '. '.join(situationId_to_ROT_categories[nta_data['parent_id']])
yta_rot_categories = '. '.join(situationId_to_ROT_categories[yta_data['parent_id']])

nta_comment = nta_data['body']
yta_comment = yta_data['body']

nta_input = tokenizer(nta_rot_categories, nta_comment, padding="max_length", truncation=True, return_tensors="pt")
yta_input = tokenizer(yta_rot_categories, yta_comment, padding="max_length", truncation=True, return_tensors="pt")

with torch.no_grad():
    nta_outputs = finetuned_model(**nta_input)
   
with torch.no_grad():
    yta_outputs = finetuned_model(**yta_input)

nta_logits = nta_outputs.logits
yta_logits = yta_outputs.logits

nta_predicted_class_id = id2label[nta_logits.argmax().item()]
yta_predicted_class_id = id2label[yta_logits.argmax().item()]

nta_probability = F.softmax(nta_outputs.logits, dim=1)
yta_probability = F.softmax(yta_outputs.logits, dim=1)

print(f"NTA comment -> {nta_predicted_class_id}, with score {nta_probability[0, label2id[nta_predicted_class_id]].item()}")

print(f"YTA comment -> {yta_predicted_class_id}, with score {yta_probability[0, label2id[yta_predicted_class_id]].item()}")

NTA comment -> NTA, with score 0.9990286827087402
YTA comment -> YTA, with score 0.9954793453216553


In [51]:
"""AuthorId and comments"""

from transformers import AutoTokenizer, pipeline

model_path = '/home/IAIS/gplepi/entero/output_social_norms/bert_comments_classification_authorId_and_comments'

nta_authorId = nta_data['id']
yta_authorId = yta_data['id']

nta_comment = nta_data['body']
yta_comment = yta_data['body']
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

clf = pipeline("text-classification", model=model_path, tokenizer=tokenizer)

answer = clf(nta_authorId + ". " + nta_comment)
print(f"NTA comment -> {answer}")

answer = clf(yta_authorId + ". " + yta_comment)
print(f"YTA comment -> {answer}")

NTA comment -> [{'label': 'NTA', 'score': 0.9992976188659668}]
YTA comment -> [{'label': 'YTA', 'score': 0.9959738850593567}]


In [52]:
import torch
from transformers import AutoTokenizer, pipeline, AutoModelForSequenceClassification
import torch.nn.functional as F

model_path = '/home/IAIS/gplepi/entero/output_social_norms/bert_comments_classification_authorId_and_comments'
finetuned_model = AutoModelForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

nta_authorId = nta_data['id']
yta_authorId = yta_data['id']

nta_comment = nta_data['body']
yta_comment = yta_data['body']

nta_text = nta_authorId + ". " + nta_comment
yta_text = yta_authorId + ". " + yta_comment

nta_input = tokenizer(nta_text, padding="max_length", truncation=True, return_tensors="pt")
yta_input = tokenizer(yta_text, padding="max_length", truncation=True, return_tensors="pt")

with torch.no_grad():
    nta_outputs = finetuned_model(**nta_input)
   
with torch.no_grad():
    yta_outputs = finetuned_model(**yta_input)

nta_logits = nta_outputs.logits
yta_logits = yta_outputs.logits

nta_predicted_class_id = id2label[nta_logits.argmax().item()]
yta_predicted_class_id = id2label[yta_logits.argmax().item()]

nta_probability = F.softmax(nta_outputs.logits, dim=1)
yta_probability = F.softmax(yta_outputs.logits, dim=1)

print(f"NTA comment -> {nta_predicted_class_id}, with score {nta_probability[0, label2id[nta_predicted_class_id]].item()}")

print(f"YTA comment -> {yta_predicted_class_id}, with score {yta_probability[0, label2id[yta_predicted_class_id]].item()}")

NTA comment -> NTA, with score 0.9992976188659668
YTA comment -> YTA, with score 0.9959738850593567


In [53]:
"""Situations and AuthorId and comments"""

from transformers import AutoTokenizer, pipeline

model_path = '/home/IAIS/gplepi/entero/output_social_norms/bert_comments_classification_situations_authorId_and_comments'

nta_authorId = nta_data['id']
yta_authorId = yta_data['id']

nta_comment = nta_data['body']
yta_comment = yta_data['body']

nta_text = nta_authorId + ". " + nta_comment
yta_text = yta_authorId + ". " + yta_comment

nta_situation = situationId_to_situation[nta_data['parent_id']]
yta_situation = situationId_to_situation[yta_data['parent_id']]

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

clf = pipeline("text-classification", model=model_path, tokenizer=tokenizer)

answer = clf( nta_text + " " + tokenizer.sep_token + " " + nta_situation )
print(f"NTA comment -> {answer}")

answer = clf( yta_text + " " + tokenizer.sep_token + " " +  yta_situation )
print(f"YTA comment -> {answer}")

NTA comment -> [{'label': 'NTA', 'score': 0.9993768334388733}]
YTA comment -> [{'label': 'YTA', 'score': 0.9959113597869873}]


In [54]:
import torch
from transformers import AutoTokenizer, pipeline, AutoModelForSequenceClassification
import torch.nn.functional as F

model_path = '/home/IAIS/gplepi/entero/output_social_norms/bert_comments_classification_situations_authorId_and_comments'
finetuned_model = AutoModelForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

nta_authorId = nta_data['id']
yta_authorId = yta_data['id']

nta_comment = nta_data['body']
yta_comment = yta_data['body']

nta_text = nta_authorId + ". " + nta_comment
yta_text = yta_authorId + ". " + yta_comment

nta_situation = situationId_to_situation[nta_data['parent_id']]
yta_situation = situationId_to_situation[yta_data['parent_id']]

nta_input = tokenizer(nta_text, nta_situation, padding="max_length", truncation=True, return_tensors="pt")
yta_input = tokenizer(yta_text, yta_situation, padding="max_length", truncation=True, return_tensors="pt")

with torch.no_grad():
    nta_outputs = finetuned_model(**nta_input)
   
with torch.no_grad():
    yta_outputs = finetuned_model(**yta_input)

nta_logits = nta_outputs.logits
yta_logits = yta_outputs.logits

nta_predicted_class_id = id2label[nta_logits.argmax().item()]
yta_predicted_class_id = id2label[yta_logits.argmax().item()]

nta_probability = F.softmax(nta_outputs.logits, dim=1)
yta_probability = F.softmax(yta_outputs.logits, dim=1)

print(f"NTA comment -> {nta_predicted_class_id}, with score {nta_probability[0, label2id[nta_predicted_class_id]].item()}")

print(f"YTA comment -> {yta_predicted_class_id}, with score {yta_probability[0, label2id[yta_predicted_class_id]].item()}")

NTA comment -> NTA, with score 0.9993768334388733
YTA comment -> YTA, with score 0.9959113597869873


# Inference with examples from the test dataset -> FAILURE cases

In [55]:
nta_data = social_comments_dataset['test'][0]
nta_data

{'id': 'eivr3oy',
 'permalink': '/r/AmItheAsshole/comments/b2yamg/aita_for_breaking_off_my_engagement_over_his/eivr3oy/',
 'label': 'NTA',
 'body': ' he cant help how his parents are but you gave him an out by confronting him and asking if he was a racist his nonresponse is all you really needed you went into the wedding with the intention of marrying him and then you realized that it wasnt a decision that you would be happy with for the rest of your life due to new information nobody can ask more of you than that ',
 'parent_id': 'b2yamg',
 'author_fullname': 't2_6u2o6',
 'author_name': 'Jayrodtremonki',
 '__index_level_0__': 359384}

In [56]:
situationId_to_situation[nta_data['parent_id']]

"breaking off my engagement over his father's racist remarks"

In [57]:
yta_data = social_comments_dataset['test'][10]
yta_data

{'id': 'edtxoas',
 'permalink': '/r/AmItheAsshole/comments/aex00x/aita_for_being_annoyed_because_my_girlfriend_pees/edtxoas/',
 'label': 'YTA',
 'body': 'honestly i would be more concerned on why she has to go so frequently i would think there is something wrong and yeah it kinda sounds like ',
 'parent_id': 'aex00x',
 'author_fullname': 't2_2vgdjz0c',
 'author_name': 'Dotori_Dan',
 '__index_level_0__': 250034}

In [58]:
situationId_to_situation[yta_data['parent_id']]

'being annoyed because my girlfriend pees so much'

In [59]:
"""Only comments"""
from transformers import AutoTokenizer, pipeline

model_path = '/home/IAIS/gplepi/entero/output_social_norms/bert_comments_classification'

nta_comment = nta_data['body']
yta_comment = yta_data['body']
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

clf = pipeline("text-classification", model=model_path, tokenizer=tokenizer)

answer = clf(nta_comment)
print(f"NTA comment -> {answer}") #confused as YTA, maybe because how the comment starts: "he cant help how his parents are", which indicates initially that the asker is wrong, but then the comment changes notation and the commenter gives the right to the asker

answer = clf(yta_comment)
print(f"YTA comment -> {answer}") #confused as NTA, maybe because it is an ironic comment?

NTA comment -> [{'label': 'YTA', 'score': 0.9471924304962158}]
YTA comment -> [{'label': 'NTA', 'score': 0.9383085370063782}]


In [60]:
yta_comment

'honestly i would be more concerned on why she has to go so frequently i would think there is something wrong and yeah it kinda sounds like '

In [61]:
"""Situations and comments"""

from transformers import AutoTokenizer, pipeline

model_path = '/home/IAIS/gplepi/entero/output_social_norms/bert_comments_classification_situations_and_comments'

nta_situation = situationId_to_situation[nta_data['parent_id']]
yta_situation = situationId_to_situation[yta_data['parent_id']]

nta_comment = nta_data['body']
yta_comment = yta_data['body']
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

clf = pipeline("text-classification", model=model_path, tokenizer=tokenizer)

nta_input = nta_situation + " " + tokenizer.sep_token + " " + nta_comment
yta_input = yta_situation + " " + tokenizer.sep_token + " " + yta_comment

print(f"NTA input: {nta_input}")
print(f"YTA input: {yta_input}")

answer = clf(nta_input)
print(f"NTA comment -> {answer}") #confused as YTA maybe because how the comment starts: "he cant help how his parents are", which indicates initially that the asker is wrong, but then the comment changes notation and the commenter gives the right to the asker

answer = clf(yta_input)
print(f"YTA comment -> {answer}") #The only model that predicts it correctly!!!

NTA input: breaking off my engagement over his father's racist remarks [SEP]  he cant help how his parents are but you gave him an out by confronting him and asking if he was a racist his nonresponse is all you really needed you went into the wedding with the intention of marrying him and then you realized that it wasnt a decision that you would be happy with for the rest of your life due to new information nobody can ask more of you than that 
YTA input: being annoyed because my girlfriend pees so much [SEP] honestly i would be more concerned on why she has to go so frequently i would think there is something wrong and yeah it kinda sounds like 
NTA comment -> [{'label': 'YTA', 'score': 0.978890597820282}]
YTA comment -> [{'label': 'YTA', 'score': 0.827903151512146}]


In [62]:
import torch
from transformers import AutoTokenizer, pipeline, AutoModelForSequenceClassification
import torch.nn.functional as F

model_path = '/home/IAIS/gplepi/entero/output_social_norms/bert_comments_classification_situations_and_comments'
finetuned_model = AutoModelForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

nta_situation = situationId_to_situation[nta_data['parent_id']]
yta_situation = situationId_to_situation[yta_data['parent_id']]

nta_comment = nta_data['body']
yta_comment = yta_data['body']


nta_input = tokenizer(nta_situation, nta_comment, padding="max_length", truncation=True, return_tensors="pt")
yta_input = tokenizer(yta_situation, yta_comment, padding="max_length", truncation=True, return_tensors="pt")

with torch.no_grad():
    nta_outputs = finetuned_model(**nta_input)
    yta_outputs = finetuned_model(**yta_input)

nta_logits = nta_outputs.logits
yta_logits = yta_outputs.logits

nta_predicted_class_id = id2label[nta_logits.argmax().item()]
yta_predicted_class_id = id2label[yta_logits.argmax().item()]

nta_probability = F.softmax(nta_outputs.logits, dim=1)
yta_probability = F.softmax(yta_outputs.logits, dim=1)

print(f"NTA comment -> {nta_predicted_class_id}, with score {nta_probability[0, label2id[nta_predicted_class_id]].item()}")

print(f"YTA comment -> {yta_predicted_class_id}, with score {yta_probability[0, label2id[yta_predicted_class_id]].item()}")

NTA comment -> YTA, with score 0.978890597820282
YTA comment -> YTA, with score 0.8279033899307251


In [63]:
"""rot-moralFoundations and comments"""

from transformers import AutoTokenizer, pipeline

model_path = '/home/IAIS/gplepi/entero/output_social_norms/bert_comments_classification_rot-moralFoundations_and_comments'

nta_rot_moralFoundations = '. '.join(situationId_to_ROT_moral_foundations[nta_data['parent_id']])
yta_rot_moralFoundations = '. '.join(situationId_to_ROT_moral_foundations[yta_data['parent_id']])

nta_comment = nta_data['body']
yta_comment = yta_data['body']
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

clf = pipeline("text-classification", model=model_path, tokenizer=tokenizer)

answer = clf(nta_rot_moralFoundations + " " + tokenizer.sep_token + " " + nta_comment)
print(f"NTA comment -> {answer}")

answer = clf(yta_rot_moralFoundations + " " + tokenizer.sep_token + " " + yta_comment)
print(f"YTA comment -> {answer}")

NTA comment -> [{'label': 'YTA', 'score': 0.9737996459007263}]
YTA comment -> [{'label': 'NTA', 'score': 0.9811087250709534}]


In [64]:
import torch
from transformers import AutoTokenizer, pipeline, AutoModelForSequenceClassification
import torch.nn.functional as F

model_path = '/home/IAIS/gplepi/entero/output_social_norms/bert_comments_classification_rot-moralFoundations_and_comments'
finetuned_model = AutoModelForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

nta_rot_moralFoundations = '. '.join(situationId_to_ROT_moral_foundations[nta_data['parent_id']])
yta_rot_moralFoundations = '. '.join(situationId_to_ROT_moral_foundations[yta_data['parent_id']])

nta_comment = nta_data['body']
yta_comment = yta_data['body']

nta_input = tokenizer(nta_rot_moralFoundations, nta_comment, padding="max_length", truncation=True, return_tensors="pt")
yta_input = tokenizer(yta_rot_moralFoundations, yta_comment, padding="max_length", truncation=True, return_tensors="pt")

with torch.no_grad():
    nta_outputs = finetuned_model(**nta_input)
   
with torch.no_grad():
    yta_outputs = finetuned_model(**yta_input)

nta_logits = nta_outputs.logits
yta_logits = yta_outputs.logits

nta_predicted_class_id = id2label[nta_logits.argmax().item()]
yta_predicted_class_id = id2label[yta_logits.argmax().item()]

nta_probability = F.softmax(nta_outputs.logits, dim=1)
yta_probability = F.softmax(yta_outputs.logits, dim=1)

print(f"NTA comment -> {nta_predicted_class_id}, with score {nta_probability[0, label2id[nta_predicted_class_id]].item()}")

print(f"YTA comment -> {yta_predicted_class_id}, with score {yta_probability[0, label2id[yta_predicted_class_id]].item()}")

NTA comment -> YTA, with score 0.9737995266914368
YTA comment -> NTA, with score 0.9811088442802429


In [65]:
"""rot-categories and comments"""

from transformers import AutoTokenizer, pipeline

model_path = '/home/IAIS/gplepi/entero/output_social_norms/bert_comments_classification_rot-categories_and_comments'

nta_rot_categories = '. '.join(situationId_to_ROT_categories[nta_data['parent_id']])
yta_rot_categories = '. '.join(situationId_to_ROT_categories[yta_data['parent_id']])

nta_comment = nta_data['body']
yta_comment = yta_data['body']
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

clf = pipeline("text-classification", model=model_path, tokenizer=tokenizer)

answer = clf(nta_rot_categories + " " + tokenizer.sep_token + " " + nta_comment)
print(f"NTA comment -> {answer}")

answer = clf(yta_rot_categories + " " + tokenizer.sep_token + " " + yta_comment)
print(f"YTA comment -> {answer}")

NTA comment -> [{'label': 'YTA', 'score': 0.9883868098258972}]
YTA comment -> [{'label': 'NTA', 'score': 0.9296041131019592}]


In [66]:
"""AuthorId and comments"""

from transformers import AutoTokenizer, pipeline

model_path = '/home/IAIS/gplepi/entero/output_social_norms/bert_comments_classification_authorId_and_comments'

nta_authorId = nta_data['id']
yta_authorId = yta_data['id']

nta_comment = nta_data['body']
yta_comment = yta_data['body']
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

clf = pipeline("text-classification", model=model_path, tokenizer=tokenizer)

answer = clf(nta_authorId + ". " + nta_comment)
print(f"NTA comment -> {answer}")

answer = clf(yta_authorId + ". " + yta_comment)
print(f"YTA comment -> {answer}")

NTA comment -> [{'label': 'YTA', 'score': 0.9721246957778931}]
YTA comment -> [{'label': 'NTA', 'score': 0.9558769464492798}]


In [67]:
import torch
from transformers import AutoTokenizer, pipeline, AutoModelForSequenceClassification
import torch.nn.functional as F

model_path = '/home/IAIS/gplepi/entero/output_social_norms/bert_comments_classification_authorId_and_comments'
finetuned_model = AutoModelForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

nta_authorId = nta_data['id']
yta_authorId = yta_data['id']

nta_comment = nta_data['body']
yta_comment = yta_data['body']

nta_text = nta_authorId + ". " + nta_comment
yta_text = yta_authorId + ". " + yta_comment

nta_input = tokenizer(nta_text, padding="max_length", truncation=True, return_tensors="pt")
yta_input = tokenizer(yta_text, padding="max_length", truncation=True, return_tensors="pt")

with torch.no_grad():
    nta_outputs = finetuned_model(**nta_input)
   
with torch.no_grad():
    yta_outputs = finetuned_model(**yta_input)

nta_logits = nta_outputs.logits
yta_logits = yta_outputs.logits

nta_predicted_class_id = id2label[nta_logits.argmax().item()]
yta_predicted_class_id = id2label[yta_logits.argmax().item()]

nta_probability = F.softmax(nta_outputs.logits, dim=1)
yta_probability = F.softmax(yta_outputs.logits, dim=1)

print(f"NTA comment -> {nta_predicted_class_id}, with score {nta_probability[0, label2id[nta_predicted_class_id]].item()}")

print(f"YTA comment -> {yta_predicted_class_id}, with score {yta_probability[0, label2id[yta_predicted_class_id]].item()}")

NTA comment -> YTA, with score 0.9721246957778931
YTA comment -> NTA, with score 0.9558769464492798


In [68]:
"""Situations and AuthorId and comments"""

from transformers import AutoTokenizer, pipeline

model_path = '/home/IAIS/gplepi/entero/output_social_norms/bert_comments_classification_situations_authorId_and_comments'

nta_authorId = nta_data['id']
yta_authorId = yta_data['id']

nta_comment = nta_data['body']
yta_comment = yta_data['body']

nta_text = nta_authorId + ". " + nta_comment
yta_text = yta_authorId + ". " + yta_comment

nta_situation = situationId_to_situation[nta_data['parent_id']]
yta_situation = situationId_to_situation[yta_data['parent_id']]

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

clf = pipeline("text-classification", model=model_path, tokenizer=tokenizer)

answer = clf( nta_text + " " + tokenizer.sep_token + " " + nta_situation )
print(f"NTA comment -> {answer}")

answer = clf( yta_text + " " + tokenizer.sep_token + " " +  yta_situation ) #predicts this correctly same as in situatuon + comment 
print(f"YTA comment -> {answer}")

NTA comment -> [{'label': 'YTA', 'score': 0.9664369821548462}]
YTA comment -> [{'label': 'YTA', 'score': 0.9623212218284607}]


In [69]:
import torch
from transformers import AutoTokenizer, pipeline, AutoModelForSequenceClassification
import torch.nn.functional as F

model_path = '/home/IAIS/gplepi/entero/output_social_norms/bert_comments_classification_situations_authorId_and_comments'
finetuned_model = AutoModelForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

nta_authorId = nta_data['id']
yta_authorId = yta_data['id']

nta_comment = nta_data['body']
yta_comment = yta_data['body']

nta_text = nta_authorId + ". " + nta_comment
yta_text = yta_authorId + ". " + yta_comment

nta_situation = situationId_to_situation[nta_data['parent_id']]
yta_situation = situationId_to_situation[yta_data['parent_id']]

nta_input = tokenizer(nta_text, nta_situation, padding="max_length", truncation=True, return_tensors="pt")
yta_input = tokenizer(yta_text, yta_situation, padding="max_length", truncation=True, return_tensors="pt")

with torch.no_grad():
    nta_outputs = finetuned_model(**nta_input)
   
with torch.no_grad():
    yta_outputs = finetuned_model(**yta_input)

nta_logits = nta_outputs.logits
yta_logits = yta_outputs.logits

nta_predicted_class_id = id2label[nta_logits.argmax().item()]
yta_predicted_class_id = id2label[yta_logits.argmax().item()]

nta_probability = F.softmax(nta_outputs.logits, dim=1)
yta_probability = F.softmax(yta_outputs.logits, dim=1)

print(f"NTA comment -> {nta_predicted_class_id}, with score {nta_probability[0, label2id[nta_predicted_class_id]].item()}")

print(f"YTA comment -> {yta_predicted_class_id}, with score {yta_probability[0, label2id[yta_predicted_class_id]].item()}")

NTA comment -> YTA, with score 0.9664369821548462
YTA comment -> YTA, with score 0.9623212218284607
