In [4]:
# https://towardsdatascience.com/fine-tuning-pretrained-nlp-models-with-huggingfaces-trainer-6326a4456e7b

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
import torch
from transformers import TrainingArguments, Trainer
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import EarlyStoppingCallback

In [5]:
# List the files in the dataset directory (shell escape; requires a POSIX `ls`)
!ls moral-judgement-reddit-whmd-2021/

comments_for_bert_no_etiquettes_split1.csv
comments_for_bert_no_etiquettes_split2.csv


In [6]:
# Dataset source: https://github.com/CLArg-group/moral-judgement-reddit-whmd-2021/tree/main
split1_csv = "moral-judgement-reddit-whmd-2021/comments_for_bert_no_etiquettes_split1.csv"
df = pd.read_csv(split1_csv)

In [7]:
# Peek at the first five rows to check the column layout
df.head(n=5)

Unnamed: 0,id,score,body,verdict,is_asshole
0,cg08970,7,I've been on the receiving end of this before ...,NTA,0
1,cgdobj3,8,What if there were no arseholes? These guys wa...,NTA,0
2,cgcxbgb,11,You're not an asshole in my eyes. You did pay ...,NTA,0
3,cgcfv1n,4,I was going to say you are but then I read u/m...,NTA,0
4,cgcdfh0,4,Unless you are grossly exaggerating the value ...,NTA,0


In [8]:
# Number of rows in the loaded split
df.shape[0]

306470

In [9]:
# Class balance of the binary label column
df["is_asshole"].value_counts()

is_asshole
0    202232
1    104238
Name: count, dtype: int64

In [10]:
# Read data

# https://github.com/CLArg-group/moral-judgement-reddit-whmd-2021/tree/main
df = pd.read_csv("moral-judgement-reddit-whmd-2021/comments_for_bert_no_etiquettes_split1.csv")

# Build a balanced subsample: the first 1000 rows of each class.
# The variable suffix is the label value it holds
# (df_0 -> is_asshole == 0, df_1 -> is_asshole == 1); previously the two
# names were swapped relative to their contents.
df_0 = df.loc[df.is_asshole == 0].iloc[:1000]
df_1 = df.loc[df.is_asshole == 1].iloc[:1000]

# Label-0 rows first, then label-1 rows (same row order as before the rename)
df_small = pd.concat([df_0, df_1])

In [11]:
# Load the pretrained checkpoint: a sequence classifier with a 2-label head,
# plus the tokenizer that belongs to the same checkpoint.
model_name = "bert-base-cased"
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)
tokenizer = BertTokenizer.from_pretrained(model_name)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# ----- 1. Preprocess data -----#
# Comment texts and their binary labels from the balanced subsample
X = list(df_small["body"])
y = list(df_small["is_asshole"])

# Seed the split so that re-running the notebook reproduces the same
# train/validation partition, and stratify on the label so both partitions
# keep the class balance of the subsample.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=0, stratify=y
)

# Tokenize each partition; sequences are truncated to at most 512 tokens and
# padded within the partition.
X_train_tokenized = tokenizer(X_train, padding=True, truncation=True, max_length=512)
X_val_tokenized = tokenizer(X_val, padding=True, truncation=True, max_length=512)

# Create torch dataset
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset over tokenizer encodings with optional labels.

    Args:
        encodings: Mapping from feature name to a per-example indexable
            sequence. Must contain the key ``"input_ids"``, which defines
            the dataset length.
        labels: Optional per-example labels (list, NumPy array, tensor, ...).
            When ``None`` or empty, items carry no ``"labels"`` entry.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors (plus ``"labels"`` if available)."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Test for presence explicitly instead of relying on truthiness:
        # `if self.labels:` raises ValueError for multi-element NumPy arrays
        # and tensors. The length check keeps the previous behaviour of
        # treating an empty label container as "no labels".
        if self.labels is not None and len(self.labels) > 0:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the ``"input_ids"`` feature."""
        return len(self.encodings["input_ids"])

# Wrap the tokenized partitions and their labels for the Trainer
train_dataset = Dataset(encodings=X_train_tokenized, labels=y_train)
val_dataset = Dataset(encodings=X_val_tokenized, labels=y_val)

In [13]:
# ----- 2. Fine-tune pretrained model -----#
# Define Trainer parameters
def compute_metrics(p):
    """Score an evaluation pass.

    Args:
        p: A pair ``(predictions, labels)``; ``predictions`` holds per-class
            scores along axis 1 and is reduced to class ids with argmax.

    Returns:
        dict with the keys ``"accuracy"``, ``"precision"``, ``"recall"`` and
        ``"f1"``, each computed by the corresponding scikit-learn scorer.
    """
    scores, labels = p
    predicted = np.argmax(scores, axis=1)

    scorers = {
        "accuracy": accuracy_score,
        "precision": precision_score,
        "recall": recall_score,
        "f1": f1_score,
    }
    return {
        name: scorer(y_true=labels, y_pred=predicted)
        for name, scorer in scorers.items()
    }

# Training configuration: evaluate every 500 steps, batches of 8 for both
# training and evaluation, 3 epochs, fixed seed, and reload the best
# checkpoint once training finishes.
training_kwargs = dict(
    output_dir="output",
    evaluation_strategy="steps",
    eval_steps=500,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    seed=0,
    load_best_model_at_end=True,
)
args = TrainingArguments(**training_kwargs)

# Early stopping callback with a patience of 3
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

# Assemble the Trainer around the model, both datasets and the metric function
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

In [76]:
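# UNCOMMENT TO TRAIN

# NOTE(review): while this cell stays commented out (and the checkpoint-loading
# cell below is commented out too), `trainer` wraps the freshly initialised
# model, so the prediction cells further down will not reflect the metrics
# logged by this training run. Run this cell, or load a checkpoint, before
# predicting.

# # Train pre-trained model
# # took 2h20m
# trainer.train()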

Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
500,0.3652,0.505111,0.8675,0.912088,0.817734,0.862338


TrainOutput(global_step=600, training_loss=0.33042333761850995, metrics={'train_runtime': 8295.9373, 'train_samples_per_second': 0.579, 'train_steps_per_second': 0.072, 'total_flos': 1262933065728000.0, 'train_loss': 0.33042333761850995, 'epoch': 3.0})

In [14]:
# List the files saved for the step-500 checkpoint (shell escape; requires a POSIX `ls`)
!ls -alh output/checkpoint-500/

total 2548192
drwxr-xr-x  9 thama  staff   288B  4 29 13:53 [34m.[m[m/
drwxr-xr-x  3 thama  staff    96B  4 29 13:53 [34m..[m[m/
-rw-r--r--  1 thama  staff   725B  4 29 13:53 config.json
-rw-r--r--  1 thama  staff   413M  4 29 13:53 model.safetensors
-rw-r--r--  1 thama  staff   827M  4 29 13:53 optimizer.pt
-rw-r--r--  1 thama  staff    14K  4 29 13:53 rng_state.pth
-rw-r--r--  1 thama  staff   1.0K  4 29 13:53 scheduler.pt
-rw-r--r--  1 thama  staff   1.0K  4 29 13:53 trainer_state.json
-rw-r--r--  1 thama  staff   4.8K  4 29 13:53 training_args.bin


In [15]:
# UNCOMMENT TO LOAD TRAINED MODEL

# args = TrainingArguments(
#     output_dir="output",          # change this to the path where trained data stored
#     evaluation_strategy="steps",
#     eval_steps=500,
#     per_device_train_batch_size=8,
#     per_device_eval_batch_size=8,
#     num_train_epochs=3,
#     seed=0,
#     load_best_model_at_end=True,
# )
# trainer = Trainer(
#     model=model,
#     args=args,
#     train_dataset=train_dataset,
#     eval_dataset=val_dataset,
#     compute_metrics=compute_metrics,
#     callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
# )

# # Load trained model
# model_path = "output/checkpoint-500"
# model = BertForSequenceClassification.from_pretrained(model_path, num_labels=2)

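# # Define test trainer
# # NOTE(review): the prediction cells below call `trainer.predict(...)`, not
# # `test_trainer.predict(...)`. The `trainer` built above was constructed
# # before `model` was re-assigned, so it still wraps the earlier model object;
# # use `test_trainer` for prediction (or rebuild `trainer` after loading) to
# # actually evaluate the checkpoint.
# test_trainer = Trainer(model)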

In [16]:
# ----- 3. Predict (for validation data) -----#
# took 4min

# Run inference on the held-out validation split and keep the raw class scores
val_output = trainer.predict(val_dataset)
raw_pred = val_output.predictions

# Reduce per-class scores to predicted class ids
y_pred = np.argmax(raw_pred, axis=1)

# Predicted labels
y_pred

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [17]:
# Ground-truth validation labels, for comparison with the predictions above
np.asarray(y_val)

array([1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0,
       0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1,
       0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1,
       1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0,
       0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0,
       1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0,
       1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1,
       0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1,
       0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1,
       1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0,
       0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1,

In [18]:
# ----- 4. Predict (for test data which is not used for train) -----#

# Build a test set from rows 1000-1099 of each class, i.e. the 100 rows right
# after the first 1000 per class that were sampled for training/validation.
test_negatives = df.loc[df.is_asshole == 0][1000:1100]
test_positives = df.loc[df.is_asshole == 1][1000:1100]
df_test = pd.concat([test_negatives, test_positives])

# Texts and labels as plain lists, tokenized with the same settings as training
X_test = list(df_test["body"])
y_test = list(df_test["is_asshole"])
X_test_tokenized = tokenizer(X_test, padding=True, truncation=True, max_length=512)

test_dataset = Dataset(X_test_tokenized, y_test)

In [19]:
# took 2min
# Run inference on the test split and keep the raw class scores
test_output = trainer.predict(test_dataset)
raw_pred = test_output.predictions

# Reduce per-class scores to predicted class ids
y_pred = np.argmax(raw_pred, axis=1)

y_pred

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0])

In [20]:
# Show the raw text of the test example at index 4
X_test[4]

"Your family are totally failing to see the advantage of having a vegan in the family. At my family get togethers I'm on salads and veg, cater for everyone's allergies, and they're all super pleased. I even do sugar free raw cake for diabetic grandmama.\n\nI don't think it would hurt to sit out one or two and engage in the next when you're feeling ready. "