In [1]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = "0,1,2,3"

In [2]:
SEED = 1234

# Social Comments Dataset

In [3]:
import pandas as pd

In [4]:
social_comments = pd.read_pickle('/home/IAIS/gplepi/entero/data_social_norms/social_comments_filtered.gzip', compression='gzip')
social_comments

Unnamed: 0,id,permalink,label,body,parent_id,author_fullname,author_name
13,eamexog,/r/AmItheAsshole/comments/a1311q/aita_for_tell...,NTA,Uh absolutely NTA. These are really really hor...,a1311q,t2_2kabg9z7,xormun
14,eameha5,/r/AmItheAsshole/comments/a1311q/aita_for_tell...,NTA,"NTA. Ok sweetie no, hell no this is not your f...",a1311q,t2_1jrodkow,tkPuncake
16,eamjnog,/r/AmItheAsshole/comments/a1311q/aita_for_tell...,NTA,NTA. My girlfriend has hypothyroidism and i kn...,a1311q,t2_14ub01,hawkbearpig
17,ef5kbsb,/r/AmItheAsshole/comments/akkcpn/aita_for_not_...,NTA,You're clearly NTA. Sorry about your homophobi...,akkcpn,t2_61b3s,sadsquash
20,ef5l208,/r/AmItheAsshole/comments/akkcpn/aita_for_not_...,NTA,"NTA. And it will get better, I promise. You'll...",akkcpn,t2_xvrsh,SheketBevakaSTFU
...,...,...,...,...,...,...,...
530631,ei9ofvo,/r/AmItheAsshole/comments/azofrl/aita_for_not_...,YTA,NAH (a bit towards yta) OP. You found the wors...,azofrl,t2_16fctm,xAlois
530636,ei9gkon,/r/AmItheAsshole/comments/azofrl/aita_for_not_...,NTA,"NTA &#x200B; but you handled it really poorly,...",azofrl,t2_dk4gojr,YoungDiscord
530637,ei9gl79,/r/AmItheAsshole/comments/azofrl/aita_for_not_...,NTA,NTA how are you the asshole? For like bigger b...,azofrl,t2_2xfoz1fv,Dark-_-Legacy
530639,ei9gmpk,/r/AmItheAsshole/comments/azofrl/aita_for_not_...,NTA,NTA. Your girlfriend is overreacting. You don'...,azofrl,t2_15bdqt5w,Broken_Angel-


In [5]:
import re
from typing import Any
import string

"""
Text preprocessing: lowercase; remove punctuation; remove NTA_KEYWORDS, YTA_KEYWORDS and 'ampx200b', 'x200b', 'AITA', 'aita'
"""

class KeywordsCleaner:
    def __init__(self) -> None:
        # NTA YTA keywords
        NTA_KEYWORDS = ['nta', 'nah', 'you are not the asshole', 'you\'re not the asshole', 'u are not the asshole', 'u re not the asshole', 
                        'you re not the asshole', 'u\'re not the asshole', 'not the asshole', 'not the ah', 'not asshole', 'not ah']
        YTA_KEYWORDS = ['yta', 'you are the asshole', 'you\'re the asshole', 'u are the asshole', 'u re the asshole', 
                        'you re the asshole', 'u\'re the asshole', 'you the ah', 'you the asshole', 'u the asshole', 'u the ah']

        keywords_rep = {'ampx200b': "", 'x200b': "", 'AITA': "", 'aita': ""}
        
        for key in NTA_KEYWORDS + YTA_KEYWORDS:
            keywords_rep[key] = ""
        keywords_rep = dict(sorted(keywords_rep.items(), key=lambda k: len(k[0]), reverse=True))

        self.rep = dict((re.escape(k), v) for k, v in keywords_rep.items())
        self.pattern = re.compile("|".join(self.rep.keys()))

    def __call__(self, text: str) -> str:
        text = self.pattern.sub(lambda m: self.rep[re.escape(m.group(0))], text.lower())
        return text.translate(str.maketrans('', '', string.punctuation))   


In [6]:
"""Example"""
keywordsCleaner = KeywordsCleaner()

print(social_comments["body"].at[530636])
keywordsCleaner(social_comments["body"].at[530636])

NTA &#x200B; but you handled it really poorly, like that was the absolute worst way in which you could have ever said it &#x200B; why not acknowledge that she's beautiful and sexy in her own way? make a lateral move that you know, wouldn't require you to directly say: I'm not attracted to you physically? because that would just open up a can of worms. &#x200B; well, you messed it up so now you have to fix it. &#x200B; You're not an asshole for having a personal body type preference, everyone has but you are an idiot for handling it the way you did, good luck with that.


'  but you handled it really poorly like that was the absolute worst way in which you could have ever said it  why not acknowledge that shes beautiful and sexy in her own way make a lateral move that you know wouldnt require you to directly say im not attracted to you physically because that would just open up a can of worms  well you messed it up so now you have to fix it  youre not an asshole for having a personal body type preference everyone has but you are an idiot for handling it the way you did good luck with that'

In [7]:
"""Filter social comments"""
keywordsCleaner = KeywordsCleaner()

for i, row in social_comments.iterrows():
    row['body'] = keywordsCleaner(row['body'])

social_comments

Unnamed: 0,id,permalink,label,body,parent_id,author_fullname,author_name
13,eamexog,/r/AmItheAsshole/comments/a1311q/aita_for_tell...,NTA,uh absolutely these are really really horrid ...,a1311q,t2_2kabg9z7,xormun
14,eameha5,/r/AmItheAsshole/comments/a1311q/aita_for_tell...,NTA,ok sweetie no hell no this is not your fault ...,a1311q,t2_1jrodkow,tkPuncake
16,eamjnog,/r/AmItheAsshole/comments/a1311q/aita_for_tell...,NTA,my girlfriend has hypothyroidism and i know t...,a1311q,t2_14ub01,hawkbearpig
17,ef5kbsb,/r/AmItheAsshole/comments/akkcpn/aita_for_not_...,NTA,youre clearly sorry about your homophobic fam...,akkcpn,t2_61b3s,sadsquash
20,ef5l208,/r/AmItheAsshole/comments/akkcpn/aita_for_not_...,NTA,and it will get better i promise youll make i...,akkcpn,t2_xvrsh,SheketBevakaSTFU
...,...,...,...,...,...,...,...
530631,ei9ofvo,/r/AmItheAsshole/comments/azofrl/aita_for_not_...,YTA,a bit towards op you found the worst way to ...,azofrl,t2_16fctm,xAlois
530636,ei9gkon,/r/AmItheAsshole/comments/azofrl/aita_for_not_...,NTA,but you handled it really poorly like that w...,azofrl,t2_dk4gojr,YoungDiscord
530637,ei9gl79,/r/AmItheAsshole/comments/azofrl/aita_for_not_...,NTA,how are for like bigger boobs,azofrl,t2_2xfoz1fv,Dark-_-Legacy
530639,ei9gmpk,/r/AmItheAsshole/comments/azofrl/aita_for_not_...,NTA,your girlfriend is overreacting you dont have...,azofrl,t2_15bdqt5w,Broken_Angel-


In [8]:
"""From pandas to Huggingface Dataset"""
from datasets import Dataset

social_comments_dataset = Dataset.from_pandas(social_comments)
social_comments_dataset

  from .autonotebook import tqdm as notebook_tqdm


Dataset({
    features: ['id', 'permalink', 'label', 'body', 'parent_id', 'author_fullname', 'author_name', '__index_level_0__'],
    num_rows: 212687
})

In [9]:
"""Analyze the data"""
data_distribution = {'NTA': 0, 'YTA': 0}

def compute_data_distribution(example):
    data_distribution[example['label']] += 1

social_comments_dataset.map(compute_data_distribution)

print(data_distribution)
print(f"NTA: {data_distribution['NTA'] / sum(data_distribution.values())} ")
print(f"YTA: {data_distribution['YTA'] / sum(data_distribution.values())} ")

Map: 100%|██████████| 212687/212687 [00:09<00:00, 21686.95 examples/s]

{'NTA': 150040, 'YTA': 62647}
NTA: 0.7054497924179663 
YTA: 0.2945502075820337 





In [10]:
from datasets import DatasetDict

"""80-10-10 split"""
# 80% train, 20% test + validation
train_testvalid = social_comments_dataset.train_test_split(test_size=0.2, seed=SEED)
# Split the 20% test + valid in half test, half valid
test_valid = train_testvalid['test'].train_test_split(test_size=0.5, seed=SEED)

social_comments_dataset = DatasetDict({
    'train': train_testvalid['train'],
    'test': test_valid['test'],
    'val': test_valid['train']})

social_comments_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'permalink', 'label', 'body', 'parent_id', 'author_fullname', 'author_name', '__index_level_0__'],
        num_rows: 170149
    })
    test: Dataset({
        features: ['id', 'permalink', 'label', 'body', 'parent_id', 'author_fullname', 'author_name', '__index_level_0__'],
        num_rows: 21269
    })
    val: Dataset({
        features: ['id', 'permalink', 'label', 'body', 'parent_id', 'author_fullname', 'author_name', '__index_level_0__'],
        num_rows: 21269
    })
})

In [11]:
id2label = {0: "NTA", 1: "YTA"}

label2id = {"NTA": 0, "YTA": 1}

# social-chemestry-101 Dataset

In [12]:
from datasets import load_dataset

social_chemestry_dataset = load_dataset("metaeval/social-chemestry-101")
social_chemestry_dataset

DatasetDict({
    train: Dataset({
        features: ['area', 'm', 'split', 'rot-agree', 'rot-categorization', 'rot-moral-foundations', 'rot-char-targeting', 'rot-bad', 'rot-judgment', 'action', 'action-agency', 'action-moral-judgment', 'action-agree', 'action-legal', 'action-pressure', 'action-char-involved', 'action-hypothetical', 'situation', 'situation-short-id', 'rot', 'rot-id', 'rot-worker-id', 'breakdown-worker-id', 'n-characters', 'characters'],
        num_rows: 355922
    })
})

In [13]:
from collections import defaultdict

situationId_to_situation = {}
situationId_to_ROT_moral_foundations = defaultdict(set)
situationId_to_ROT_categories = defaultdict(set)

In [14]:
def mapping_from_situationId(example):
    situation_id = example['situation-short-id'].split("/")[-1]

    situationId_to_situation[situation_id] = example['situation']
    if example['rot-moral-foundations'] is not None:
        situationId_to_ROT_moral_foundations[situation_id].add(example['rot-moral-foundations'])
    if example['rot-categorization'] is not None:
        situationId_to_ROT_categories[situation_id].add(example['rot-categorization'])

social_chemestry_dataset.map(mapping_from_situationId)

Map: 100%|██████████| 355922/355922 [00:45<00:00, 7769.94 examples/s]


DatasetDict({
    train: Dataset({
        features: ['area', 'm', 'split', 'rot-agree', 'rot-categorization', 'rot-moral-foundations', 'rot-char-targeting', 'rot-bad', 'rot-judgment', 'action', 'action-agency', 'action-moral-judgment', 'action-agree', 'action-legal', 'action-pressure', 'action-char-involved', 'action-hypothetical', 'situation', 'situation-short-id', 'rot', 'rot-id', 'rot-worker-id', 'breakdown-worker-id', 'n-characters', 'characters'],
        num_rows: 355922
    })
})

In [15]:
situationId_to_ROT_moral_foundations['adwxny']

{'care-harm|fairness-cheating',
 'care-harm|loyalty-betrayal',
 'fairness-cheating',
 'loyalty-betrayal'}

In [16]:
'. '.join(situationId_to_ROT_moral_foundations['adwxny'])

'loyalty-betrayal. fairness-cheating. care-harm|fairness-cheating. care-harm|loyalty-betrayal'

# Tokenizations

### Tokenization for p(y|c) -> Only comments texts (filtered from NTA/YTA tags)

In [17]:
from transformers import AutoTokenizer

"""Tokenize the data"""
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def tokenize_data_only_comments(example):
    encoding = tokenizer(example['body'], padding="max_length", truncation=True)
    encoding['labels'] = label2id[ example['label'] ]

    return encoding


### Tokenization for p(y|c,s) -> Situation Text + tokenizer.sep_token + comments text

In [18]:
from transformers import AutoTokenizer

"""Tokenize the data"""
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def tokenize_data_situations_comments(example):
    situation = situationId_to_situation[example['parent_id']]
    
    encoding = tokenizer(situation, example['body'], padding="max_length", truncation=True)
    encoding['labels'] = label2id[ example['label'] ]

    return encoding


### Tokenization for p(y|c,rot) -> Rot-moral-foundations +  tokenizer.sep_token + comments text 

In [17]:
from transformers import AutoTokenizer

"""Tokenize the data"""
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def tokenize_data_moralFoundations_comments(example):
    rots = '. '.join(situationId_to_ROT_moral_foundations[example['parent_id']])
    
    encoding = tokenizer(rots, example['body'], padding="max_length", truncation=True)
    encoding['labels'] = label2id[ example['label'] ]

    return encoding


### Tokenization for p(y|c,rot) -> Rot-categories +  tokenizer.sep_token + comments text 

In [17]:
from transformers import AutoTokenizer

"""Tokenize the data"""
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def tokenize_data_categories_comments(example):
    rots = '. '.join(situationId_to_ROT_categories[example['parent_id']])
    
    encoding = tokenizer(rots, example['body'], padding="max_length", truncation=True)
    encoding['labels'] = label2id[ example['label'] ]

    return encoding


### Tokenization for p(y|c,a) -> Author ID + Comment text

In [17]:
from transformers import AutoTokenizer

"""Tokenize the data"""
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def tokenize_data_authorId_comments(example):

    text = example['id'] + ". " + example['body'] 

    encoding = tokenizer(text, padding="max_length", truncation=True)
    encoding['labels'] = label2id[ example['label'] ]

    return encoding

### Tokenization for p(y|c,s,a) -> situation + tokenizer.sep_token + (Author ID + Comment text)

In [17]:
from transformers import AutoTokenizer

"""Tokenize the data"""
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def tokenize_data_situation_authorId_comments(example):
    situation = situationId_to_situation[example['parent_id']]
    text = example['id'] + ". " + example['body'] 

    encoding = tokenizer(text, situation, padding="max_length", truncation=True)
    encoding['labels'] = label2id[ example['label'] ]

    return encoding

# Training

In [18]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'classifier.bias', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
"""Evaluation"""
from transformers import DataCollatorWithPadding
import evaluate
import numpy as np

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

accuracy = evaluate.load("accuracy")
precision = evaluate.load("precision")
recall = evaluate.load("recall")
f1 = evaluate.load("f1", average='macro')

def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    accuracy_score = accuracy.compute(predictions=predictions, references=labels)
    precision_score = precision.compute(predictions=predictions, references=labels)
    recall_score = recall.compute(predictions=predictions, references=labels)
    f1_score = f1.compute(predictions=predictions, references=labels)
    
    return {
        "precision": precision_score['precision'],
        "recall": recall_score['recall'],
        "f1": f1_score['f1'],
        "accuracy": accuracy_score['accuracy'],
    }

## BERT model for p(y|c) -> only comments

In [16]:
"""Tokenize the dataset"""
tokenized_dataset = social_comments_dataset.map(tokenize_data_only_comments)

tokenized_dataset = tokenized_dataset.select_columns(['input_ids', 'attention_mask', 'labels'])
tokenized_dataset

Map:   0%|          | 454/170149 [00:00<01:51, 1521.44 examples/s]

Map: 100%|██████████| 170149/170149 [01:56<00:00, 1459.09 examples/s]
Map: 100%|██████████| 21269/21269 [00:15<00:00, 1410.60 examples/s]
Map: 100%|██████████| 21269/21269 [00:15<00:00, 1406.97 examples/s]


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 170149
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 21269
    })
    val: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 21269
    })
})

In [17]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="output_social_norms/bert_comments_classification_custom_trainer",
    learning_rate=2e-5,
    #auto_find_batch_size=True,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    evaluation_strategy="steps",
    eval_steps=1000,
    save_steps=1000,
    save_strategy="steps",
    metric_for_best_model='f1',
    greater_is_better=True,
    load_best_model_at_end=True,
    report_to="wandb",
    push_to_hub=False
)

In [27]:
import torch

"""
We have 119945 training samples with label 0 (NTA) and 50204 training samples with label 1 (YTA). 
A clear disproportion, thats why we need to assign different weights to the labels when computing the loss. 
The label with the least amount of data (label 1 in our case), should have the higher weight.
"""

def get_samples_per_class(labels):
    return torch.bincount(labels)

samples_per_class_train = get_samples_per_class(torch.Tensor(tokenized_dataset["train"]['labels']).int())
print(f"{samples_per_class_train[0]} training samples have label 0 (NTA), {samples_per_class_train[1]} training samples have label 1 (YTA)")

total_training_samples = samples_per_class_train.sum()
weights = torch.Tensor( [samples_per_class_train[1]/total_training_samples, samples_per_class_train[0]/total_training_samples] )
weights

119945 training samples have label 0 (NTA), 50204 training samples have label 1 (YTA)


tensor([0.2951, 0.7049])

In [28]:
from torch import nn
from transformers import Trainer


"""Create Custom Trainer to assign different weights to classes when computing the loss due to the large data imbalance"""
class CustomTrainer(Trainer):
    #override the compute_loss function of the Trainer class
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        # forward pass
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # compute custom loss 
        loss_fct = nn.CrossEntropyLoss(weight=weights.to(model.module.device)) # weights is a 1D Tensor assigning weight to each of the classes
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

In [29]:
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['val'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

In [30]:
trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


[34m[1mwandb[0m: Currently logged in as: [33mgjergjplepi12[0m ([33msocial-chem-101-team[0m). Use [1m`wandb login --relogin`[0m to force relogin


You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1000,0.4555,0.419685,0.621259,0.785004,0.693598,0.799944
2000,0.3644,0.39655,0.671423,0.782396,0.722674,0.82679
3000,0.3247,0.389822,0.636352,0.840587,0.724349,0.815459


TrainOutput(global_step=3990, training_loss=0.3838732994289924, metrics={'train_runtime': 6818.1287, 'train_samples_per_second': 74.866, 'train_steps_per_second': 0.585, 'total_flos': 6.761758624175923e+16, 'train_loss': 0.3838732994289924, 'epoch': 3.0})

In [31]:
trainer.save_model()

In [32]:
"""Testing"""
from transformers import AutoModelForSequenceClassification
from sklearn.metrics import classification_report

model_path = "/home/IAIS/gplepi/entero/output_social_norms/bert_comments_classification_custom_trainer"

finetuned_model = AutoModelForSequenceClassification.from_pretrained(model_path)

trainer = Trainer(
    model=finetuned_model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['val'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

"""Predict"""
predictions_output = trainer.predict(tokenized_dataset['test'])


preds = np.argmax(predictions_output.predictions, axis=-1)
labels = predictions_output.label_ids
print(classification_report(y_true=labels, y_pred=preds))



              precision    recall  f1-score   support

           0       0.93      0.80      0.86     14961
           1       0.64      0.85      0.73      6308

    accuracy                           0.82     21269
   macro avg       0.78      0.83      0.80     21269
weighted avg       0.84      0.82      0.82     21269



## BERT model for p(y|c,s) -> situations and comments

In [21]:
"""Tokenize the data"""
tokenized_dataset = social_comments_dataset.map(tokenize_data_situations_comments)

tokenized_dataset = tokenized_dataset.select_columns(['input_ids', 'attention_mask', 'labels'])
tokenized_dataset

Map: 100%|██████████| 170149/170149 [02:11<00:00, 1289.75 examples/s]
Map: 100%|██████████| 21269/21269 [00:16<00:00, 1262.53 examples/s]
Map: 100%|██████████| 21269/21269 [00:16<00:00, 1254.75 examples/s]


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 170149
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 21269
    })
    val: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 21269
    })
})

In [22]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="output_social_norms/bert_comments_classification_situations_and_comments_custom_trainer",
    learning_rate=2e-5,
    #auto_find_batch_size=True,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    evaluation_strategy="steps",
    eval_steps=1000,
    save_steps=1000,
    save_strategy="steps",
    metric_for_best_model='f1',
    greater_is_better=True,
    load_best_model_at_end=True,
    report_to="wandb",
    push_to_hub=False
)

In [23]:
import torch

"""
We have 119945 training samples with label 0 (NTA) and 50204 training samples with label 1 (YTA). 
A clear disproportion, thats why we need to assign different weights to the labels when computing the loss. 
The label with the least amount of data (label 1 in our case), should have the higher weight.
"""

def get_samples_per_class(labels):
    return torch.bincount(labels)

samples_per_class_train = get_samples_per_class(torch.Tensor(tokenized_dataset["train"]['labels']).int())
print(f"{samples_per_class_train[0]} training samples have label 0 (NTA), {samples_per_class_train[1]} training samples have label 1 (YTA)")

total_training_samples = samples_per_class_train.sum()
weights = torch.Tensor( [samples_per_class_train[1]/total_training_samples, samples_per_class_train[0]/total_training_samples] )
weights

119945 training samples have label 0 (NTA), 50204 training samples have label 1 (YTA)


tensor([0.2951, 0.7049])

In [24]:
from torch import nn
from transformers import Trainer


"""Create Custom Trainer to assign different weights to classes when computing the loss due to the large data imbalance"""
class CustomTrainer(Trainer):
    #override the compute_loss function of the Trainer class
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        # forward pass
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # compute custom loss 
        loss_fct = nn.CrossEntropyLoss(weight=weights.to(model.module.device)) # weights is a 1D Tensor assigning weight to each of the classes
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

In [25]:
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['val'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

In [26]:
trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


[34m[1mwandb[0m: Currently logged in as: [33mgjergjplepi12[0m ([33msocial-chem-101-team[0m). Use [1m`wandb login --relogin`[0m to force relogin


You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1000,0.3977,0.338625,0.717486,0.835371,0.771954,0.857633
2000,0.2908,0.303273,0.786898,0.836023,0.810717,0.887395
3000,0.2435,0.286675,0.761999,0.882478,0.817825,0.886596


TrainOutput(global_step=3990, training_loss=0.3152834096349272, metrics={'train_runtime': 6785.8115, 'train_samples_per_second': 75.223, 'train_steps_per_second': 0.588, 'total_flos': 6.761758624175923e+16, 'train_loss': 0.3152834096349272, 'epoch': 3.0})

In [27]:
trainer.save_model()

In [28]:
"""Testing"""
from transformers import AutoModelForSequenceClassification
from sklearn.metrics import classification_report

model_path = "/home/IAIS/gplepi/entero/output_social_norms/bert_comments_classification_situations_and_comments_custom_trainer"

finetuned_model = AutoModelForSequenceClassification.from_pretrained(model_path)

trainer = Trainer(
    model=finetuned_model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['val'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

"""Predict"""
predictions_output = trainer.predict(tokenized_dataset['test'])


preds = np.argmax(predictions_output.predictions, axis=-1)
labels = predictions_output.label_ids
print(classification_report(y_true=labels, y_pred=preds))



              precision    recall  f1-score   support

           0       0.95      0.88      0.91     14961
           1       0.76      0.88      0.82      6308

    accuracy                           0.88     21269
   macro avg       0.85      0.88      0.86     21269
weighted avg       0.89      0.88      0.88     21269



## BERT model for p(y|c,rot) -> ROT-moral_foundations and comments

In [20]:
"""Tokenize the data"""
tokenized_dataset = social_comments_dataset.map(tokenize_data_moralFoundations_comments)

tokenized_dataset = tokenized_dataset.select_columns(['input_ids', 'attention_mask', 'labels'])
tokenized_dataset

Map: 100%|██████████| 170149/170149 [02:07<00:00, 1332.71 examples/s]
Map: 100%|██████████| 21269/21269 [00:15<00:00, 1368.72 examples/s]
Map: 100%|██████████| 21269/21269 [00:15<00:00, 1355.07 examples/s]


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 170149
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 21269
    })
    val: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 21269
    })
})

In [21]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="output_social_norms/bert_comments_classification_rot-moralFoundations_and_comments_custom_trainer",
    learning_rate=2e-5,
    #auto_find_batch_size=True,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    evaluation_strategy="steps",
    eval_steps=1000,
    save_steps=1000,
    save_strategy="steps",
    metric_for_best_model='f1',
    greater_is_better=True,
    load_best_model_at_end=True,
    report_to="wandb",
    push_to_hub=False
)

In [22]:
import torch

"""
We have 119945 training samples with label 0 (NTA) and 50204 training samples with label 1 (YTA). 
A clear disproportion, thats why we need to assign different weights to the labels when computing the loss. 
The label with the least amount of data (label 1 in our case), should have the higher weight.
"""

def get_samples_per_class(labels):
    return torch.bincount(labels)

samples_per_class_train = get_samples_per_class(torch.Tensor(tokenized_dataset["train"]['labels']).int())
print(f"{samples_per_class_train[0]} training samples have label 0 (NTA), {samples_per_class_train[1]} training samples have label 1 (YTA)")

total_training_samples = samples_per_class_train.sum()
weights = torch.Tensor( [samples_per_class_train[1]/total_training_samples, samples_per_class_train[0]/total_training_samples] )
weights

119945 training samples have label 0 (NTA), 50204 training samples have label 1 (YTA)


tensor([0.2951, 0.7049])

In [23]:
from torch import nn
from transformers import Trainer


"""Create Custom Trainer to assign different weights to classes when computing the loss due to the large data imbalance"""
class CustomTrainer(Trainer):
    #override the compute_loss function of the Trainer class
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        # forward pass
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # compute custom loss 
        loss_fct = nn.CrossEntropyLoss(weight=weights.to(model.module.device)) # weights is a 1D Tensor assigning weight to each of the classes
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

In [24]:
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['val'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

In [25]:
trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


[34m[1mwandb[0m: Currently logged in as: [33mgjergjplepi12[0m ([33msocial-chem-101-team[0m). Use [1m`wandb login --relogin`[0m to force relogin


You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1000,0.4563,0.416587,0.644847,0.765933,0.700194,0.810804
2000,0.3647,0.389641,0.669064,0.803749,0.730248,0.828718
3000,0.3224,0.374641,0.663636,0.832926,0.738706,0.830034


TrainOutput(global_step=3990, training_loss=0.38457842375102796, metrics={'train_runtime': 6808.1265, 'train_samples_per_second': 74.976, 'train_steps_per_second': 0.586, 'total_flos': 6.761758624175923e+16, 'train_loss': 0.38457842375102796, 'epoch': 3.0})

In [26]:
trainer.save_model()

In [27]:
from transformers import AutoModelForSequenceClassification
from sklearn.metrics import classification_report

model_path = "/home/IAIS/gplepi/entero/output_social_norms/bert_comments_classification_rot-moralFoundations_and_comments_custom_trainer"

finetuned_model = AutoModelForSequenceClassification.from_pretrained(model_path)

trainer = Trainer(
    model=finetuned_model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['val'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

"""Predict"""
predictions_output = trainer.predict(tokenized_dataset['test'])


preds = np.argmax(predictions_output.predictions, axis=-1)
labels = predictions_output.label_ids
print(classification_report(y_true=labels, y_pred=preds))



              precision    recall  f1-score   support

           0       0.92      0.83      0.87     14961
           1       0.67      0.84      0.74      6308

    accuracy                           0.83     21269
   macro avg       0.80      0.83      0.81     21269
weighted avg       0.85      0.83      0.83     21269



## BERT model for p(y|c,rot) -> ROT-categories and comments

In [20]:
"""Tokenize the data"""
tokenized_dataset = social_comments_dataset.map(tokenize_data_categories_comments)

tokenized_dataset = tokenized_dataset.select_columns(['input_ids', 'attention_mask', 'labels'])
tokenized_dataset

Map: 100%|██████████| 170149/170149 [02:12<00:00, 1283.24 examples/s]
Map: 100%|██████████| 21269/21269 [00:16<00:00, 1291.01 examples/s]
Map: 100%|██████████| 21269/21269 [00:17<00:00, 1250.76 examples/s]


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 170149
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 21269
    })
    val: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 21269
    })
})

In [21]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="output_social_norms/bert_comments_classification_rot-categories_and_comments_custom_trainer",
    learning_rate=2e-5,
    #auto_find_batch_size=True,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    evaluation_strategy="steps",
    eval_steps=1000,
    save_steps=1000,
    save_strategy="steps",
    metric_for_best_model='f1',
    greater_is_better=True,
    load_best_model_at_end=True,
    report_to="wandb",
    push_to_hub=False
)

In [22]:
import torch

"""
We have 119945 training samples with label 0 (NTA) and 50204 training samples with label 1 (YTA). 
A clear disproportion, thats why we need to assign different weights to the labels when computing the loss. 
The label with the least amount of data (label 1 in our case), should have the higher weight.
"""

def get_samples_per_class(labels):
    return torch.bincount(labels)

samples_per_class_train = get_samples_per_class(torch.Tensor(tokenized_dataset["train"]['labels']).int())
print(f"{samples_per_class_train[0]} training samples have label 0 (NTA), {samples_per_class_train[1]} training samples have label 1 (YTA)")

total_training_samples = samples_per_class_train.sum()
weights = torch.Tensor( [samples_per_class_train[1]/total_training_samples, samples_per_class_train[0]/total_training_samples] )
weights

119945 training samples have label 0 (NTA), 50204 training samples have label 1 (YTA)


tensor([0.2951, 0.7049])

In [23]:
from torch import nn
from transformers import Trainer


"""Create Custom Trainer to assign different weights to classes when computing the loss due to the large data imbalance"""
class CustomTrainer(Trainer):
    #override the compute_loss function of the Trainer class
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        # forward pass
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # compute custom loss 
        loss_fct = nn.CrossEntropyLoss(weight=weights.to(model.module.device)) # weights is a 1D Tensor assigning weight to each of the classes
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

In [24]:
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['val'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

In [25]:
trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


[34m[1mwandb[0m: Currently logged in as: [33mgjergjplepi12[0m ([33msocial-chem-101-team[0m). Use [1m`wandb login --relogin`[0m to force relogin


You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1000,0.454,0.415414,0.636038,0.781907,0.70147,0.80803
2000,0.3624,0.388972,0.678642,0.788264,0.729357,0.831257
3000,0.3237,0.382069,0.658092,0.829829,0.734049,0.826555


TrainOutput(global_step=3990, training_loss=0.38267225298965185, metrics={'train_runtime': 6832.7009, 'train_samples_per_second': 74.706, 'train_steps_per_second': 0.584, 'total_flos': 6.761758624175923e+16, 'train_loss': 0.38267225298965185, 'epoch': 3.0})

In [26]:
trainer.save_model()

In [27]:
"""Testing"""

from transformers import AutoModelForSequenceClassification

model_path = "/home/IAIS/gplepi/entero/output_social_norms/bert_comments_classification_rot-categories_and_comments_custom_trainer"

finetuned_model = AutoModelForSequenceClassification.from_pretrained(model_path)

trainer = Trainer(
    model=finetuned_model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['val'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

In [28]:
predictions_output = trainer.predict(tokenized_dataset['test'])
predictions_output



PredictionOutput(predictions=array([[-2.0028272 ,  2.348375  ],
       [-2.521682  ,  2.969072  ],
       [ 3.3939176 , -3.3149705 ],
       ...,
       [ 2.5599544 , -2.472094  ],
       [ 2.024031  , -2.026351  ],
       [ 0.4161413 , -0.27432227]], dtype=float32), label_ids=array([0, 1, 0, ..., 0, 0, 0]), metrics={'test_loss': 0.38528215885162354, 'test_precision': 0.6633863234923039, 'test_recall': 0.8335447051363348, 'test_f1': 0.7387944358578052, 'test_accuracy': 0.8251915933988434, 'test_runtime': 110.4458, 'test_samples_per_second': 192.574, 'test_steps_per_second': 1.512})

In [29]:
from sklearn.metrics import classification_report

preds = np.argmax(predictions_output.predictions, axis=-1)
labels = predictions_output.label_ids
print(classification_report(y_true=labels, y_pred=preds))

              precision    recall  f1-score   support

           0       0.92      0.82      0.87     14961
           1       0.66      0.83      0.74      6308

    accuracy                           0.83     21269
   macro avg       0.79      0.83      0.80     21269
weighted avg       0.84      0.83      0.83     21269



## BERT model for p(y|c,a) -> Author ID and Comment text

In [20]:
"""Tokenize the data"""
tokenized_dataset = social_comments_dataset.map(tokenize_data_authorId_comments)

tokenized_dataset = tokenized_dataset.select_columns(['input_ids', 'attention_mask', 'labels'])
tokenized_dataset

Map: 100%|██████████| 170149/170149 [02:14<00:00, 1266.52 examples/s]
Map: 100%|██████████| 21269/21269 [00:15<00:00, 1348.63 examples/s]
Map: 100%|██████████| 21269/21269 [00:15<00:00, 1384.82 examples/s]


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 170149
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 21269
    })
    val: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 21269
    })
})

In [21]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="output_social_norms/bert_comments_classification_authorId_and_comments_custom_trainer",
    learning_rate=2e-5,
    #auto_find_batch_size=True,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    evaluation_strategy="steps",
    eval_steps=1000,
    save_steps=1000,
    save_strategy="steps",
    metric_for_best_model='f1',
    greater_is_better=True,
    load_best_model_at_end=True,
    report_to="wandb",
    push_to_hub=False
)

In [22]:
import torch

"""
We have 119945 training samples with label 0 (NTA) and 50204 training samples with label 1 (YTA). 
A clear disproportion, thats why we need to assign different weights to the labels when computing the loss. 
The label with the least amount of data (label 1 in our case), should have the higher weight.
"""

def get_samples_per_class(labels):
    return torch.bincount(labels)

samples_per_class_train = get_samples_per_class(torch.Tensor(tokenized_dataset["train"]['labels']).int())
print(f"{samples_per_class_train[0]} training samples have label 0 (NTA), {samples_per_class_train[1]} training samples have label 1 (YTA)")

total_training_samples = samples_per_class_train.sum()
weights = torch.Tensor( [samples_per_class_train[1]/total_training_samples, samples_per_class_train[0]/total_training_samples] )
weights

119945 training samples have label 0 (NTA), 50204 training samples have label 1 (YTA)


tensor([0.2951, 0.7049])

In [23]:
from torch import nn
from transformers import Trainer


"""Create Custom Trainer to assign different weights to classes when computing the loss due to the large data imbalance"""
class CustomTrainer(Trainer):
    #override the compute_loss function of the Trainer class
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        # forward pass
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # compute custom loss 
        loss_fct = nn.CrossEntropyLoss(weight=weights.to(model.module.device)) # weights is a 1D Tensor assigning weight to each of the classes
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

In [24]:
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['val'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

In [25]:
trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


[34m[1mwandb[0m: Currently logged in as: [33mgjergjplepi12[0m ([33msocial-chem-101-team[0m). Use [1m`wandb login --relogin`[0m to force relogin


You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1000,0.4602,0.419288,0.615029,0.803097,0.696593,0.798204
2000,0.3669,0.395703,0.693018,0.770171,0.729561,0.8353
3000,0.3234,0.378603,0.6454,0.839283,0.729682,0.820631


TrainOutput(global_step=3990, training_loss=0.3858281453450521, metrics={'train_runtime': 6859.5815, 'train_samples_per_second': 74.414, 'train_steps_per_second': 0.582, 'total_flos': 6.761758624175923e+16, 'train_loss': 0.3858281453450521, 'epoch': 3.0})

In [26]:
trainer.save_model()

In [27]:
"""Testing"""
from transformers import AutoModelForSequenceClassification
from sklearn.metrics import classification_report

model_path = "/home/IAIS/gplepi/entero/output_social_norms/bert_comments_classification_authorId_and_comments_custom_trainer"

finetuned_model = AutoModelForSequenceClassification.from_pretrained(model_path)

trainer = Trainer(
    model=finetuned_model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['val'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

"""Predict"""
predictions_output = trainer.predict(tokenized_dataset['test'])


preds = np.argmax(predictions_output.predictions, axis=-1)
labels = predictions_output.label_ids
print(classification_report(y_true=labels, y_pred=preds))



              precision    recall  f1-score   support

           0       0.93      0.81      0.86     14961
           1       0.65      0.85      0.74      6308

    accuracy                           0.82     21269
   macro avg       0.79      0.83      0.80     21269
weighted avg       0.84      0.82      0.82     21269



## BERT model for p(y|c,s,a) -> situation and Author ID and Comment text

In [20]:
"""Tokenize the data"""
tokenized_dataset = social_comments_dataset.map(tokenize_data_situation_authorId_comments)

tokenized_dataset = tokenized_dataset.select_columns(['input_ids', 'attention_mask', 'labels'])
tokenized_dataset

Map: 100%|██████████| 170149/170149 [02:09<00:00, 1318.47 examples/s]
Map: 100%|██████████| 21269/21269 [00:15<00:00, 1336.26 examples/s]
Map: 100%|██████████| 21269/21269 [00:16<00:00, 1292.81 examples/s]


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 170149
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 21269
    })
    val: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 21269
    })
})

In [21]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="output_social_norms/bert_comments_classification_situations_authorId_and_comments_custom_trainer",
    learning_rate=2e-5,
    #auto_find_batch_size=True,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    evaluation_strategy="steps",
    eval_steps=1000,
    save_steps=1000,
    save_strategy="steps",
    metric_for_best_model='f1',
    greater_is_better=True,
    load_best_model_at_end=True,
    report_to="wandb",
    push_to_hub=False
)

In [22]:
import torch

"""
We have 119945 training samples with label 0 (NTA) and 50204 training samples with label 1 (YTA). 
A clear disproportion, thats why we need to assign different weights to the labels when computing the loss. 
The label with the least amount of data (label 1 in our case), should have the higher weight.
"""

def get_samples_per_class(labels):
    return torch.bincount(labels)

samples_per_class_train = get_samples_per_class(torch.Tensor(tokenized_dataset["train"]['labels']).int())
print(f"{samples_per_class_train[0]} training samples have label 0 (NTA), {samples_per_class_train[1]} training samples have label 1 (YTA)")

total_training_samples = samples_per_class_train.sum()
weights = torch.Tensor( [samples_per_class_train[1]/total_training_samples, samples_per_class_train[0]/total_training_samples] )
weights

119945 training samples have label 0 (NTA), 50204 training samples have label 1 (YTA)


tensor([0.2951, 0.7049])

In [23]:
from torch import nn
from transformers import Trainer

"""Create Custom Trainer to assign different weights to classes when computing the loss due to the large data imbalance"""
class CustomTrainer(Trainer):
    #override the compute_loss function of the Trainer class
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        # forward pass
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # compute custom loss 
        loss_fct = nn.CrossEntropyLoss(weight=weights.to(model.module.device)) # weights is a 1D Tensor assigning weight to each of the classes
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

In [24]:
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['val'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

In [25]:
trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


[34m[1mwandb[0m: Currently logged in as: [33mgjergjplepi12[0m ([33msocial-chem-101-team[0m). Use [1m`wandb login --relogin`[0m to force relogin


You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1000,0.4065,0.346978,0.704694,0.824613,0.759952,0.849734
2000,0.2964,0.30489,0.760704,0.84564,0.800926,0.878744
3000,0.2485,0.295045,0.721679,0.902363,0.80197,0.871456


TrainOutput(global_step=3990, training_loss=0.32085349099677907, metrics={'train_runtime': 6897.4579, 'train_samples_per_second': 74.005, 'train_steps_per_second': 0.578, 'total_flos': 6.761758624175923e+16, 'train_loss': 0.32085349099677907, 'epoch': 3.0})

In [26]:
trainer.save_model()

In [27]:
"""Testing"""
from transformers import AutoModelForSequenceClassification
from sklearn.metrics import classification_report

model_path = "/home/IAIS/gplepi/entero/output_social_norms/bert_comments_classification_situations_authorId_and_comments_custom_trainer"

finetuned_model = AutoModelForSequenceClassification.from_pretrained(model_path)

trainer = Trainer(
    model=finetuned_model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['val'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

"""Predict"""
predictions_output = trainer.predict(tokenized_dataset['test'])


preds = np.argmax(predictions_output.predictions, axis=-1)
labels = predictions_output.label_ids
print(classification_report(y_true=labels, y_pred=preds))



              precision    recall  f1-score   support

           0       0.95      0.85      0.90     14961
           1       0.72      0.90      0.80      6308

    accuracy                           0.87     21269
   macro avg       0.84      0.88      0.85     21269
weighted avg       0.88      0.87      0.87     21269



# Inference

In [33]:
"""Only comments"""

from transformers import AutoTokenizer, pipeline

model_path = '/home/IAIS/gplepi/entero/output_social_norms/bert_comments_classification_custom_trainer'

nta_comment = "If it's for your best, than do not worry about it."
yta_comment = "You shouldn't have done that, it's not allowed."
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

clf = pipeline("text-classification", model=model_path, tokenizer=tokenizer)

answer = clf(nta_comment)
print(f"{nta_comment} -> {answer}")

answer = clf(yta_comment)
print(f"{yta_comment} -> {answer}")

If it's for your best, than do not worry about it. -> [{'label': 'NTA', 'score': 0.9414990544319153}]
You shouldn't have done that, it's not allowed. -> [{'label': 'YTA', 'score': 0.8737062811851501}]


Observe the large increace in certainty when predicting a YTA label!

In [31]:
"""Situations and comments"""

from transformers import AutoTokenizer, pipeline

model_path = '/home/IAIS/gplepi/entero/output_social_norms/bert_comments_classification_situations_and_comments_custom_trainer/checkpoint-1000/'

situation = "Want to study abroad, but feel bad leaving my country."
nta_comment = "If it's for your best, than do not worry about it."
yta_comment = "You shouldn't have done that, it's not good behaviour."
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

clf = pipeline("text-classification", model=model_path, tokenizer=tokenizer)

print(f"The situations is: {situation}")
print("The comments:")

answer = clf(situation + " " + tokenizer.sep_token + " " + nta_comment)
print(f"{nta_comment} -> {answer}")

answer = clf(situation + " " + tokenizer.sep_token + " " + nta_comment)
print(f"{yta_comment} -> {answer}")

The situations is: Want to study abroad, but feel bad leaving my country.
The comments:
If it's for your best, than do not worry about it. -> [{'label': 'NTA', 'score': 0.9377086162567139}]
You shouldn't have done that, it's not good behaviour. -> [{'label': 'NTA', 'score': 0.9377086162567139}]


In [30]:
"""rot-moralFoundations and comments"""

from transformers import AutoTokenizer, pipeline

model_path = '/home/IAIS/gplepi/entero/output_social_norms/bert_comments_classification_rot-moralFoundations_and_comments_custom_trainer'

rot_moralFoundation = 'loyalty-betrayal'
nta_comment = "If it's for your best, than do not worry about it."
yta_comment = "You shouldn't have done that, it's not allowed."
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

clf = pipeline("text-classification", model=model_path, tokenizer=tokenizer)

answer = clf(rot_moralFoundation + " " + tokenizer.sep_token + " " + nta_comment)
print(f"{nta_comment} -> {answer}")

answer = clf(rot_moralFoundation + " " + tokenizer.sep_token + " " + yta_comment)
print(f"{yta_comment} -> {answer}")

If it's for your best, than do not worry about it. -> [{'label': 'NTA', 'score': 0.9722619652748108}]
You shouldn't have done that, it's not allowed. -> [{'label': 'YTA', 'score': 0.8665910363197327}]


In [31]:
"""rot-categories and comments"""

from transformers import AutoTokenizer, pipeline

model_path = '/home/IAIS/gplepi/entero/output_social_norms/bert_comments_classification_rot-categories_and_comments_custom_trainer'

rot_category = 'morality-ethics'
nta_comment = "If it's for your best, than do not worry about it."
yta_comment = "You shouldn't have done that, it's not allowed."
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

clf = pipeline("text-classification", model=model_path, tokenizer=tokenizer)

answer = clf(rot_category + " " + tokenizer.sep_token + " " + nta_comment)
print(f"{nta_comment} -> {answer}")

answer = clf(rot_category + " " + tokenizer.sep_token + " " + yta_comment)
print(f"{yta_comment} -> {answer}")

If it's for your best, than do not worry about it. -> [{'label': 'NTA', 'score': 0.9723081588745117}]
You shouldn't have done that, it's not allowed. -> [{'label': 'YTA', 'score': 0.8744572401046753}]


In [28]:
"""AuthorId and comments"""

from transformers import AutoTokenizer, pipeline

model_path = '/home/IAIS/gplepi/entero/output_social_norms/bert_comments_classification_authorId_and_comments_custom_trainer'

authorId = 'ei9ofvo'
nta_comment = "If it's for your best, than do not worry about it."
yta_comment = "You shouldn't have done that, it's not allowed."
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

clf = pipeline("text-classification", model=model_path, tokenizer=tokenizer)

answer = clf(authorId + ". " + nta_comment)
print(f"{nta_comment} -> {answer}")

answer = clf(authorId + ". " + yta_comment)
print(f"{yta_comment} -> {answer}")

If it's for your best, than do not worry about it. -> [{'label': 'NTA', 'score': 0.9316185116767883}]
You shouldn't have done that, it's not allowed. -> [{'label': 'YTA', 'score': 0.8692306876182556}]


In [1]:
"""Situations and AuthorId and comments"""

from transformers import AutoTokenizer, pipeline

model_path = '/home/IAIS/gplepi/entero/output_social_norms/bert_comments_classification_situations_authorId_and_comments_custom_trainer'

authorId = 'ei9ofvo'
situation = "Want to study abroad, but feel bad leaving my country."

nta_comment = "If it's for your best, than do not worry about it."
yta_comment = "You shouldn't have done that, it's not allowed."
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

clf = pipeline("text-classification", model=model_path, tokenizer=tokenizer)

answer = clf( situation + " " + tokenizer.sep_token + " " + (authorId + ". " + nta_comment) )
print(f"{nta_comment} -> {answer}")

answer = clf( situation + " " + tokenizer.sep_token + " " +  (authorId + ". " + yta_comment) )
print(f"{yta_comment} -> {answer}")

  from .autonotebook import tqdm as notebook_tqdm


If it's for your best, than do not worry about it. -> [{'label': 'NTA', 'score': 0.9864906072616577}]
You shouldn't have done that, it's not allowed. -> [{'label': 'NTA', 'score': 0.8591297268867493}]
