In [1]:
import numpy as np
import pandas as pd

from sklearn.metrics import (
    accuracy_score,
    precision_recall_fscore_support,
    pairwise,
)
from torch.utils.data import DataLoader

import torch
import pandas as pd

from datasets import Dataset
from transformers import BertTokenizerFast, BertForSequenceClassification, Trainer, TrainingArguments

from utils.data_preprocessing import preprocess_data, split_data, split_data_all

%load_ext autoreload
%autoreload 2

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [2]:
topics, opinions, conclusions = preprocess_data()

In [3]:
label_mapping = {
    'Claim': 'Supportive',
    # 'Evidence': 'Supportive',
    # 'Rebuttal': 'Supportive',
    # 'Position': 'Supportive',
    'Counterclaim': 'Opposing'
}

opinions = opinions.merge(
    topics[['topic_id', 'text']],
    on='topic_id',
    suffixes=('_opinion', '_topic')
)

opinions['stance'] = opinions['type'].map(label_mapping)
opinions = opinions.dropna(subset=['stance'])

label_to_id = {
    'Opposing': 0,
    'Supportive': 1
}

opinions['label'] = opinions['stance'].map(label_to_id)

In [4]:
train_data, val_data, test_data = split_data_all(opinions, val_size=0.15)

In [5]:
train_dataset = Dataset.from_pandas(train_data[['text_topic', 'text_opinion', 'label']].reset_index(drop=True))
val_dataset = Dataset.from_pandas(val_data[['text_topic', 'text_opinion', 'label']].reset_index(drop=True))
test_dataset = Dataset.from_pandas(test_data[['text_topic', 'text_opinion', 'label']].reset_index(drop=True))

In [6]:
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
def tokenize(batch):
    return tokenizer(
        batch['text_topic'],
        batch['text_opinion'],
        padding='max_length',
        truncation=True,
        max_length=128
    )

train_encoding = train_dataset.map(tokenize, batched=True)
val_encoding = val_dataset.map(tokenize, batched=True)
test_encoding = test_dataset.map(tokenize, batched=True)

Map:   0%|          | 0/10322 [00:00<?, ? examples/s]

Map:   0%|          | 0/1989 [00:00<?, ? examples/s]

Map:   0%|          | 0/1427 [00:00<?, ? examples/s]

In [8]:
columns = ['input_ids', 'token_type_ids', 'attention_mask', 'label']

train_encoding.set_format(type='torch', columns=columns)
val_encoding.set_format(type='torch', columns=columns)
test_encoding.set_format(type='torch', columns=columns)

In [9]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [10]:
training_args = TrainingArguments(
    output_dir='./saved_models/classification',
    num_train_epochs=2,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    warmup_steps=200,
    # learning_rate=3e-03,
    weight_decay=0.01,
    optim="adamw_torch",
    logging_dir='./logs',
    logging_steps=10,
    eval_strategy='steps',
    eval_steps=50,
    save_steps=200,
    load_best_model_at_end=True,
    metric_for_best_model='accuracy'
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_encoding,
    eval_dataset=val_encoding,
    compute_metrics=compute_metrics
)

trainer.train()

Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
50,0.4978,0.393307,0.869281,0.93007,0.869281,1.0
100,0.2843,0.280043,0.887883,0.93932,0.886948,0.998265
150,0.2766,0.243087,0.911011,0.950071,0.927313,0.973973
200,0.2985,0.275025,0.90548,0.945093,0.954572,0.935801
250,0.2017,0.22827,0.917547,0.953881,0.928298,0.980914
300,0.225,0.25226,0.916541,0.952975,0.933926,0.972817
350,0.1702,0.252767,0.922574,0.956374,0.937257,0.976287
400,0.2107,0.24796,0.922071,0.956276,0.93337,0.980335
450,0.2261,0.228452,0.922071,0.955241,0.953864,0.956622
500,0.1701,0.217331,0.927602,0.958857,0.947487,0.970503


TrainOutput(global_step=646, training_loss=0.23133196679431217, metrics={'train_runtime': 545.0657, 'train_samples_per_second': 37.874, 'train_steps_per_second': 1.185, 'total_flos': 1357916156712960.0, 'train_loss': 0.23133196679431217, 'epoch': 2.0})

In [11]:
trainer.evaluate()

{'eval_loss': 0.23433418571949005,
 'eval_accuracy': 0.9270990447461036,
 'eval_f1': 0.9585358879039176,
 'eval_precision': 0.9479638009049773,
 'eval_recall': 0.9693464430306535,
 'eval_runtime': 12.4501,
 'eval_samples_per_second': 159.757,
 'eval_steps_per_second': 2.57,
 'epoch': 2.0}

In [12]:
predictions = trainer.predict(test_encoding)
# preds = np.argmax(predictions.predictions, axis=-1)
# id_to_label = {0: 'Opposing', 1: 'Supportive'}
# predicted_labels = [id_to_label[pred] for pred in preds]

In [14]:
predictions.metrics

{'test_loss': 0.26490676403045654,
 'test_accuracy': 0.920812894183602,
 'test_f1': 0.9555293191656828,
 'test_precision': 0.946219797349961,
 'test_recall': 0.9650238473767886,
 'test_runtime': 8.9506,
 'test_samples_per_second': 159.431,
 'test_steps_per_second': 2.57}

In [15]:
trainer.save_model('saved_models/classification/')

In [16]:
tokenizer.save_pretrained('saved_models/classification/')

('saved_models/classification/tokenizer_config.json',
 'saved_models/classification/special_tokens_map.json',
 'saved_models/classification/vocab.txt',
 'saved_models/classification/added_tokens.json',
 'saved_models/classification/tokenizer.json')