In [59]:
# https://towardsdatascience.com/fine-tuning-pretrained-nlp-models-with-huggingfaces-trainer-6326a4456e7b

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
import torch
from transformers import TrainingArguments, Trainer
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import EarlyStoppingCallback

from sklearn.metrics import classification_report

In [23]:
df = pd.read_parquet("submissions.parquet")

In [28]:
df.head()

Unnamed: 0,link_flair_text,num_comments,over_18,score,url,selftext,title,id,edited,is_self,permalink,downs,ups,created
0,not the asshole,1,False,2,http://www.reddit.com/r/AmItheAsshole/comments...,I work in an office that requires me to wear a...,AItA: I like air conditioning and my coworkers...,1fy0bx,0.0,True,/r/AmItheAsshole/comments/1fy0bx/aita_i_like_a...,0.0,2.0,
1,too close to call,9,False,62,http://www.reddit.com/r/AmItheAsshole/comments...,I have been on a parking structure project for...,[AITA] Construction worker here,1ytr72,0.0,True,/r/AmItheAsshole/comments/1ytr72/aita_construc...,0.0,62.0,1393275000.0
2,asshole,13,False,47,http://www.reddit.com/r/AmItheAsshole/comments...,[Here is the post in question](http://www.redd...,[AITA] I wrote an explanation in TIL and came ...,1ytxov,0.0,True,/r/AmItheAsshole/comments/1ytxov/aita_i_wrote_...,0.0,47.0,1393279000.0
3,asshole,27,False,140,http://www.reddit.com/r/AmItheAsshole/comments...,"My parents are diabetic, morbidly obese, and a...",[AITA] Threw my parent's donuts away,1yu29c,1393291000.0,True,/r/AmItheAsshole/comments/1yu29c/aita_threw_my...,0.0,140.0,1393281000.0
4,nothing happened,7,False,44,http://www.reddit.com/r/AmItheAsshole/comments...,"Relevant Facts:\n\n1) It was a crowded bar, th...",[AITA] I Put My Empty Beer on a Bar Table,1yu41e,0.0,True,/r/AmItheAsshole/comments/1yu41e/aita_i_put_my...,0.0,44.0,1393282000.0


In [29]:
len(df)

2178385

In [31]:
flairs = {
    'Not the A-hole': "NTA",
    'Asshole': "YTA",
#     'No A-holes here': "NAH",
#     'Everyone Sucks': "ESH",
#     'Not enough info': "INFO",
    # 'UPDATE': "?",
    # 'TL;DR': "?",
    'not the a-hole': "NTA",
    # 'POO Mode Activated 💩',
    'asshole': "YTA",
    # '': "?",
    # 'META': "?",
#     'Shitpost': "?",
    'not the asshole': "NTA",
#     'no a--holes here': "NAH",
#     'everyone sucks': "ESH",
#     'too close to call': "?",
#     'not enough info': "INFO",
}

In [32]:
# filter data and label verdicts
df = df.loc[df.link_flair_text.isin(flairs)].copy()
df['verdict'] = df.link_flair_text.map(flairs)
df.verdict.value_counts()

verdict
NTA    430825
YTA    119030
Name: count, dtype: int64

In [44]:
df["is_asshole"] = 0
df.loc[df.verdict == "YTA", "is_asshole"] =1

In [46]:
# make a small dataset

num_samples_per_verdict = 100

df_small = pd.concat([
    df.loc[df.is_asshole == 0].sample(num_samples_per_verdict),
    df.loc[df.is_asshole == 1].sample(num_samples_per_verdict)
])

In [47]:
df_remain = df.loc[~df.index.isin(df_small.index)]

In [48]:
len(df), len(df_small), len(df_remain)

(549855, 200, 549655)

In [49]:
# Define pretrained tokenizer and model
model_name = "bert-base-cased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [50]:
# ----- 1. Preprocess data -----#
# Preprocess data
X = list(df_small["selftext"])
y = list(df_small["is_asshole"])
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)
X_train_tokenized = tokenizer(X_train, padding=True, truncation=True, max_length=512)
X_val_tokenized = tokenizer(X_val, padding=True, truncation=True, max_length=512)

# Create torch dataset
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.encodings["input_ids"])

train_dataset = Dataset(X_train_tokenized, y_train)
val_dataset = Dataset(X_val_tokenized, y_val)

In [51]:
# ----- 2. Fine-tune pretrained model -----#
# Define Trainer parameters
def compute_metrics(p):
    pred, labels = p
    pred = np.argmax(pred, axis=1)

    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    recall = recall_score(y_true=labels, y_pred=pred)
    precision = precision_score(y_true=labels, y_pred=pred)
    f1 = f1_score(y_true=labels, y_pred=pred)

    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

# Define Trainer
args = TrainingArguments(
    output_dir="output-100sample-2labels",
    evaluation_strategy="steps",
    eval_steps=500,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    seed=0,
    load_best_model_at_end=True,
)
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

In [53]:
%%time
# UNCOMMENT TO TRAIN

# Train pre-trained model
# took 20min for 200 samples (submissions data)
trainer.train()

Step,Training Loss,Validation Loss


CPU times: user 55min 51s, sys: 12min 46s, total: 1h 8min 37s
Wall time: 19min 41s


TrainOutput(global_step=60, training_loss=0.6448036829630533, metrics={'train_runtime': 1180.8583, 'train_samples_per_second': 0.406, 'train_steps_per_second': 0.051, 'total_flos': 128398195015680.0, 'train_loss': 0.6448036829630533, 'epoch': 3.0})

In [15]:
# UNCOMMENT TO LOAD TRAINED MODEL

# args = TrainingArguments(
#     output_dir="output",          # change this to the path where trained data stored
#     evaluation_strategy="steps",
#     eval_steps=500,
#     per_device_train_batch_size=8,
#     per_device_eval_batch_size=8,
#     num_train_epochs=3,
#     seed=0,
#     load_best_model_at_end=True,
# )
# trainer = Trainer(
#     model=model,
#     args=args,
#     train_dataset=train_dataset,
#     eval_dataset=val_dataset,
#     compute_metrics=compute_metrics,
#     callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
# )

# # Load trained model
# model_path = "output/checkpoint-500"
# model = BertForSequenceClassification.from_pretrained(model_path, num_labels=2)

# # Define test trainer
# test_trainer = Trainer(model)

In [54]:
# ----- 3. Predict (for train data) -----#
# took 30sec for 

# Make prediction for train data
raw_pred, _, _ = trainer.predict(val_dataset)

# Preprocess raw predictions
y_pred = np.argmax(raw_pred, axis=1)

# prediction
y_pred

array([1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1])

In [55]:
# answer
np.array(y_val)

array([0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0,
       1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0])

In [56]:
# ----- 3. Predict (for test data which is not used for train)-----#

# create test data
df_test = pd.concat([
    df.loc[df.is_asshole == 0][num_samples_per_verdict:num_samples_per_verdict*2],
    df.loc[df.is_asshole == 1][num_samples_per_verdict:num_samples_per_verdict*2]
])

X_test = list(df_test["selftext"])
y_test = list(df_test["is_asshole"])
X_test_tokenized = tokenizer(X_test, padding=True, truncation=True, max_length=512)
test_dataset = Dataset(X_test_tokenized, y_test)

In [57]:
# took 1.5min for 200 records
# Make prediction
raw_pred, _, _ = trainer.predict(test_dataset)

# Preprocess raw predictions
y_pred = np.argmax(raw_pred, axis=1)

y_pred

array([0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0,
       0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1,
       1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1,
       0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 1])

In [64]:
np.array(y_test)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1])

In [61]:
X_test[0]

'So I order a computer case fulfilled through an amazon seller at about 1pm on a thursday.  Base shipping was around $11, "Expedited" is $20.  I want it by the weekend so I grab the "Expedited" option. \n\nOn saturday, I decide to track the package and there still isn\'t tracking info up.  I send them a friendly email to the extent of "hey, could you update the tracking info?"  They do, and it turns out that it was shipped FedEx ground.  When I send them an email asking "What gives, I paid for expedited," they didn\'t sent me anything back.\n\nFinally, about a week after I get the package (7 days after they ship it), I post a review that reflects my displeasure.  Yesterday, they get back to me that if I remove the review, they will refund the excess money I paid for expedited shipping.  \n\nI say I will as long as they tell me what the alternate method for non-expedited shipping is.  They just got back to me and told me that it\'s ground.  They literally charged me $10 for nothing, the

In [65]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.68      0.50      0.58       100
           1       0.61      0.77      0.68       100

    accuracy                           0.64       200
   macro avg       0.65      0.64      0.63       200
weighted avg       0.65      0.64      0.63       200

