In [1]:
import pandas as pd
from datasets import Dataset, DatasetDict, load_dataset
from sklearn.model_selection import train_test_split
import re 

In [2]:
def preprocess_text(text):
    text = text.lower()
    # use regex to replace all 'ytr' and 'ntr' with empty spaces 
    text = re.sub(r'yta|nta', '', text)
    # remove all empty spaces 
    # remove all non-alphanumeric characters
    text = re.sub(r'[^a-zA-Z0-9]', ' ', text)

    text = text.strip()

    #remove extra spaces
    text = re.sub(r'\s+', ' ', text)

    #truncate text to at most 600 characters
    if len(text) > 600:
        text = text[:600]

    return text

In [3]:
#load data and preprocess
data = pd.read_csv("./users_perception/social_comments.csv")
data['body'] = data['body'].apply(preprocess_text)
use_data = data[["label", "body"]].copy()#use only label and body columns

label_mapping = {"NTA" : 0, "YTA":  1}#add a mapping for the labels
use_data["label"] = use_data["label"].map(label_mapping) 

#split the data to 0.6, 0.2, 0.2
train_data, tmp_data = train_test_split(use_data, test_size=0.4, random_state=42)
test_data, val_data = train_test_split(tmp_data, test_size=0.5, random_state=42)

#fuse all subsets into one dataset:
train_dataset = Dataset.from_pandas(train_data)
test_dataset = Dataset.from_pandas(test_data)
val_dataset = Dataset.from_pandas(val_data)

dataset = DatasetDict({'train': train_dataset, 'test': test_dataset, 'val': val_dataset})

In [4]:
from transformers import AutoTokenizer, GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")#load pretrained gpt 2
def preprocess(data):
    return tokenizer(data["body"], truncation=True)

tokenizer.padding_side = "left"

#Define PAD Token = EOS Token = 50256
tokenizer.pad_token = tokenizer.eos_token

dataset = dataset.map(preprocess)

Map:   0%|          | 0/127612 [00:00<?, ? examples/s]

Map:   0%|          | 0/42537 [00:00<?, ? examples/s]

Map:   0%|          | 0/42538 [00:00<?, ? examples/s]

In [5]:
id2label = {0: "NTA", 1: "YTA"}
label2id = {"NTA": 0, "YTA": 1}

from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, GPT2ForSequenceClassification

model = GPT2ForSequenceClassification.from_pretrained('gpt2')
model.resize_token_embeddings(len(tokenizer))

#fix model padding token id
model.config.pad_token_id = model.config.eos_token_id

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
dataset["val"]

Dataset({
    features: ['label', 'body', '__index_level_0__', 'input_ids', 'attention_mask'],
    num_rows: 42538
})

In [7]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

import evaluate

accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")
import numpy as np
f1_scores = []

def compute_metrics(eval_pred):
    predictions, labels = eval_pred

    predictions = np.argmax(predictions, axis=1)

    #add f1 metric
    f1_score = f1.compute(predictions=predictions, references=labels)
    f1_scores.append(f1_score)

    return accuracy.compute(predictions=predictions, references=labels) 

training_args = TrainingArguments(
    output_dir="gpt2_aita",
    learning_rate=2e-5,
    per_device_train_batch_size=6,
    per_device_eval_batch_size=6,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    tokenizer=tokenizer,
    #data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()



  0%|          | 0/42538 [00:00<?, ?it/s]

{'loss': 0.7656, 'learning_rate': 1.9764916075038793e-05, 'epoch': 0.02}
{'loss': 0.6313, 'learning_rate': 1.952983215007758e-05, 'epoch': 0.05}
{'loss': 0.6322, 'learning_rate': 1.9294748225116368e-05, 'epoch': 0.07}
{'loss': 0.6215, 'learning_rate': 1.9059664300155156e-05, 'epoch': 0.09}
{'loss': 0.6069, 'learning_rate': 1.8824580375193947e-05, 'epoch': 0.12}
{'loss': 0.5987, 'learning_rate': 1.8589496450232734e-05, 'epoch': 0.14}
{'loss': 0.596, 'learning_rate': 1.8354412525271522e-05, 'epoch': 0.16}
{'loss': 0.5886, 'learning_rate': 1.8119328600310313e-05, 'epoch': 0.19}
{'loss': 0.5804, 'learning_rate': 1.78842446753491e-05, 'epoch': 0.21}
{'loss': 0.5846, 'learning_rate': 1.764916075038789e-05, 'epoch': 0.24}
{'loss': 0.5916, 'learning_rate': 1.741407682542668e-05, 'epoch': 0.26}
{'loss': 0.5792, 'learning_rate': 1.7178992900465467e-05, 'epoch': 0.28}
{'loss': 0.574, 'learning_rate': 1.6943908975504258e-05, 'epoch': 0.31}
{'loss': 0.5729, 'learning_rate': 1.6708825050543046e-05, 

  0%|          | 0/7090 [00:00<?, ?it/s]

{'eval_loss': 0.4974004626274109, 'eval_accuracy': 0.7702470790135647, 'eval_runtime': 91.4257, 'eval_samples_per_second': 465.263, 'eval_steps_per_second': 77.549, 'epoch': 1.0}
{'loss': 0.5165, 'learning_rate': 9.891391226667921e-06, 'epoch': 1.01}
{'loss': 0.5017, 'learning_rate': 9.65630730170671e-06, 'epoch': 1.03}
{'loss': 0.5128, 'learning_rate': 9.421223376745498e-06, 'epoch': 1.06}
{'loss': 0.4945, 'learning_rate': 9.186139451784287e-06, 'epoch': 1.08}
{'loss': 0.4982, 'learning_rate': 8.951055526823077e-06, 'epoch': 1.1}
{'loss': 0.4971, 'learning_rate': 8.715971601861866e-06, 'epoch': 1.13}
{'loss': 0.484, 'learning_rate': 8.480887676900654e-06, 'epoch': 1.15}
{'loss': 0.5148, 'learning_rate': 8.245803751939443e-06, 'epoch': 1.18}
{'loss': 0.5104, 'learning_rate': 8.010719826978232e-06, 'epoch': 1.2}
{'loss': 0.4905, 'learning_rate': 7.77563590201702e-06, 'epoch': 1.22}
{'loss': 0.5013, 'learning_rate': 7.54055197705581e-06, 'epoch': 1.25}
{'loss': 0.4979, 'learning_rate': 7

  0%|          | 0/7090 [00:00<?, ?it/s]

{'eval_loss': 0.5007114410400391, 'eval_accuracy': 0.7844464818863578, 'eval_runtime': 90.9365, 'eval_samples_per_second': 467.766, 'eval_steps_per_second': 77.967, 'epoch': 2.0}
{'train_runtime': 2464.7725, 'train_samples_per_second': 103.549, 'train_steps_per_second': 17.258, 'train_loss': 0.5245063800256664, 'epoch': 2.0}


TrainOutput(global_step=42538, training_loss=0.5245063800256664, metrics={'train_runtime': 2464.7725, 'train_samples_per_second': 103.549, 'train_steps_per_second': 17.258, 'train_loss': 0.5245063800256664, 'epoch': 2.0})

In [8]:
# get f1 score of eval dataset
trainer.evaluate()

  0%|          | 0/7090 [00:00<?, ?it/s]

{'eval_loss': 0.4974004626274109,
 'eval_accuracy': 0.7702470790135647,
 'eval_runtime': 91.1727,
 'eval_samples_per_second': 466.554,
 'eval_steps_per_second': 77.764,
 'epoch': 2.0}

In [10]:
f1_scores

[{'f1': 0.5240808375943511},
 {'f1': 0.5465156535931549},
 {'f1': 0.5240808375943511}]