In [1]:
import random
import csv
import sklearn.metrics
import evaluate

import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
import torch

from transformers import TrainingArguments, Trainer
from torch import nn, tensor
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from datasets import load_dataset
from datetime import datetime
from os import path
from scipy.stats import spearmanr


from sklearn.model_selection import train_test_split
import pandas as pd

In [2]:
# Load the two CSV files; both are decoded as cp437.
train_data = pd.read_csv("train_data.csv", encoding='cp437')
holdout_data = pd.read_csv("test_data.csv", encoding='cp437')

# Split the held-out file 50/50 into a test part and a validation part.
# A fixed random_state makes the partition identical on every
# "Restart Kernel -> Run All", so reported metrics are comparable across runs.
SPLIT_SEED = 42
test_data, val_data = train_test_split(
    holdout_data, test_size=0.50, random_state=SPLIT_SEED)

In [3]:
from datasets import Dataset, DatasetDict

# Columns the rest of the notebook reads: "text" is tokenized, "label" is read
# by the training and metric cells.
# NOTE(review): assumes both CSV files provide a "label" column - confirm.
MODEL_COLUMNS = ["text", "label"]


def to_hf_dataset(frame):
    """Convert a DataFrame into a Dataset holding only the modelling columns.

    preserve_index=False keeps the pandas index (non-default after
    train_test_split) from being stored as an extra "__index_level_0__"
    column, so no remove_columns() call is needed afterwards.
    """
    return Dataset.from_pandas(frame[MODEL_COLUMNS], preserve_index=False)


datasets_dict = DatasetDict({
    "train": to_hf_dataset(train_data),
    "test": to_hf_dataset(test_data),
    "val": to_hf_dataset(val_data),
})

In [4]:
# Tokenizer
model_type = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_type)

# Seed for the row shuffles below so the row order is reproducible.
SHUFFLE_SEED = 42


def tokenize_function(examples):
    """Tokenize the "text" field of a batch of examples.

    Sequences are truncated and padded to the tokenizer's maximum length, so
    every example ends up with the same number of tokens.
    """
    return tokenizer(examples["text"], padding="max_length", truncation=True)


tokenized_datasets = datasets_dict.map(tokenize_function, batched=True)
toke_train = tokenized_datasets['train'].shuffle(seed=SHUFFLE_SEED)
toke_test = tokenized_datasets['test'].shuffle(seed=SHUFFLE_SEED)
toke_val = tokenized_datasets['val'].shuffle(seed=SHUFFLE_SEED)


Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [None]:
# Creating Model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

NUM_LABELS = 4
# One loss weight per class. All ones is the same as unweighted cross-entropy;
# change individual entries to re-balance classes.
CLASS_WEIGHTS = [1.0] * NUM_LABELS

model = AutoModelForSequenceClassification.from_pretrained(
    model_type, num_labels=NUM_LABELS).to(device)


class CustomTrainer(Trainer):
    """Trainer that replaces the model's built-in loss with a class-weighted
    cross-entropy using CLASS_WEIGHTS."""

    def compute_loss(self, model, inputs, return_outputs=False,
                     num_items_in_batch=None, **kwargs):
        """Compute the weighted cross-entropy loss for one batch.

        Args:
            model: the model to run the forward pass on.
            inputs: dict of batch tensors; must contain the targets under
                "labels" (or "label" as a fallback).
            return_outputs: when True, return ``(loss, outputs)``.
            num_items_in_batch: accepted (and ignored) so the override also
                works with transformers versions that pass this argument.

        Returns:
            The scalar loss, or ``(loss, outputs)`` if ``return_outputs``.

        Raises:
            ValueError: if the batch carries no labels.
        """
        # The Trainer's collator delivers targets under "labels"; "label" is
        # only checked for batches that bypass that renaming.
        labels = inputs.get("labels")
        if labels is None:
            labels = inputs.get("label")
        if labels is None:
            raise ValueError(
                "compute_loss needs a 'labels' entry in the batch")

        # forward pass without the targets, so the model does not also
        # compute its own (unweighted) loss
        model_inputs = {key: value for key, value in inputs.items()
                        if key not in ("labels", "label")}
        outputs = model(**model_inputs)
        logits = outputs.get("logits")

        # The weight vector must have exactly one entry per class and live on
        # the same device / dtype as the logits.
        num_labels = self.model.config.num_labels
        class_weights = torch.as_tensor(
            CLASS_WEIGHTS, dtype=logits.dtype, device=logits.device)
        if class_weights.numel() != num_labels:
            raise ValueError(
                f"CLASS_WEIGHTS has {class_weights.numel()} entries but the "
                f"model has {num_labels} labels")

        loss_fct = nn.CrossEntropyLoss(weight=class_weights)
        loss = loss_fct(logits.view(-1, num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss


Downloading pytorch_model.bin:   0%|          | 0.00/478M [00:00<?, ?B/s]

In [None]:
# Creating Training Arguments
#
# Grouped by purpose: where results go, how long/how fast to train, and the
# per-device batch sizes for training and evaluation.

training_args = TrainingArguments(
    # output locations
    output_dir="test_trainer",
    logging_dir="test_trainer_tf",
    # schedule: evaluate once per epoch, warm up over the first 10% of steps
    num_train_epochs=10,
    evaluation_strategy="epoch",
    learning_rate=1.5e-6,
    warmup_ratio=0.1,
    # batch sizes
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
)


In [None]:
# Creating Metrics for Training

metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Return the accuracy of the arg-max predictions.

    Args:
        eval_pred: pair ``(logits, labels)`` as handed over by the Trainer.

    Returns:
        The result of the "accuracy" metric for the predicted class ids.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)


trainer = CustomTrainer(
        model=model,
        args=training_args,
        train_dataset=toke_train,
        eval_dataset=toke_test,
        compute_metrics=compute_metrics,
    )

# The Trainer builds its batches from train_dataset itself. train() takes no
# data argument: its first positional parameter is resume_from_checkpoint, so
# passing a dict of columns there would be treated as a checkpoint path.
trainer.train()


In [None]:
# Predict on validation
predictions = trainer.predict(toke_val)

# Normalize logits to probabilities.
# Done with torch (already used for the model) instead of TensorFlow, so no
# second framework touches the GPU and the result is a plain NumPy array
# that NumPy / scikit-learn consume directly.
# NOTE(review): assumes predictions.predictions is a single logits array of
# shape (n_examples, n_classes) - confirm for this model.
val_logits = torch.as_tensor(np.asarray(predictions.predictions))
probabilities = torch.softmax(val_logits, dim=-1).numpy()
pred = np.argmax(probabilities, axis=1)

# Evaluate on validation
evaluation = trainer.evaluate(toke_val)

In [None]:
# Calculating Custom Metrics
# Convert to an array so the element-wise comparison with `pred` is
# well-defined regardless of the container type the dataset column returns.
truth_labels = np.asarray(toke_val["label"])
results = pred == truth_labels
count = np.count_nonzero(results)

accuracy = count / len(truth_labels)
balance_acc = sklearn.metrics.balanced_accuracy_score(truth_labels, pred)
roc_auc = sklearn.metrics.roc_auc_score(
    truth_labels, probabilities, multi_class='ovo', average="macro")
rho, p = spearmanr(truth_labels, pred)

print("Metrics:", "\n", "Accuracy:", round(accuracy, ndigits=2), "\n",
      "Balanced Accuracy:", round(balance_acc, ndigits=2), "\n",
      "ROC_AUC:", round(roc_auc, ndigits=2), "\n",
      "Spearman Rank", round(rho, ndigits=2))


# Print the metrics reported by trainer.evaluate().
# Its keys carry an "eval_" prefix (e.g. "eval_accuracy", "eval_loss"), and
# only the metrics returned by compute_metrics are present, so the dict is
# iterated instead of indexing keys that were never computed.
print("Trainer metrics:")
for metric_name, metric_value in evaluation.items():
    if isinstance(metric_value, (int, float)):
        metric_value = round(metric_value, ndigits=2)
    print(" ", f"{metric_name}:", metric_value)


In [None]:
# Prediction Set

#dev_path = pd.read_csv("data/test_prepros.csv")

#dev_DataSet = Dataset.from_pandas(dev_path)

#dev_toke = dev_DataSet.map(tokenize_function, batched=True)


#predictions = trainer.predict(dev_toke)
