In [1]:
!pip install torch torchvision
!pip install transformers
!pip install tensorboardx
!pip install simpletransformers



In [2]:
import random

import numpy as np
import torch

# Seed every random number generator this notebook can reach, not just
# torch's: Python's `random` and NumPy keep their own independent state.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
# Kept as the last expression so the cell still displays the torch Generator.
torch.manual_seed(SEED)

<torch._C.Generator at 0x796a802c54f0>

In [3]:
import warnings

from google.colab import drive

# Attach Google Drive to the Colab filesystem. Later cells read and write
# under `gdrive/MyDrive/...`, which resolves to this mount when the working
# directory is `/content`.
DRIVE_MOUNT_POINT = '/content/gdrive'
drive.mount(DRIVE_MOUNT_POINT)

# Silence every Python warning raised from this point onwards.
warnings.filterwarnings(action='ignore')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [4]:
from simpletransformers.classification import ClassificationModel
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report
from scipy.special import softmax


# Locations of the tab-separated train/test splits.
# NOTE(review): these paths are relative, so they only point at the Drive
# mount while the working directory is /content — confirm before running
# from elsewhere.
TRAIN_TSV = 'gdrive/MyDrive/data/comparg_train.tsv'
TEST_TSV = 'gdrive/MyDrive/data/comparg_test.tsv'

# Both files share the same parsing options.
tsv_options = {"sep": '\t', "encoding": 'utf-8'}

df_train = pd.read_csv(TRAIN_TSV, **tsv_options)
df_test = pd.read_csv(TEST_TSV, **tsv_options)

#df_train.head()

In [5]:
def mask_objects(row):
    """Add a ``text`` field to ``row`` with the two compared objects masked.

    Every occurrence of ``row["object_0"]`` in ``row["answer"]`` becomes
    ``[FIRST_ENTITY]`` and every occurrence of ``row["object_1"]`` becomes
    ``[SECOND_ENTITY]``. Matching is case-sensitive and literal.

    Both objects are substituted in a single pass, so:
      * an object that is a substring of the other (e.g. ``Java`` vs
        ``JavaScript``) no longer breaks the longer one apart;
      * an object that happens to occur inside a placeholder (e.g.
        ``ENTITY``) cannot corrupt an already inserted placeholder;
      * an empty object string is ignored instead of being "found" between
        every character.

    Args:
        row: a mapping-like record (e.g. a DataFrame row or a dict) with
            string fields ``answer``, ``object_0`` and ``object_1``.

    Returns:
        The same ``row`` object, with ``row["text"]`` set.
    """
    import re  # local import: keeps this cell independent of the imports cell

    # Map each object string to its placeholder. object_1 is registered first
    # so that, if both objects are the same string, object_0 takes precedence.
    placeholders = {}
    if row["object_1"]:
        placeholders[row["object_1"]] = "[SECOND_ENTITY]"
    if row["object_0"]:
        placeholders[row["object_0"]] = "[FIRST_ENTITY]"

    if not placeholders:
        # Nothing to mask: the text is the answer unchanged.
        row["text"] = row["answer"]
        return row

    # Longest alternative first, so the longer object wins where both could
    # match at the same position.
    alternatives = sorted(placeholders, key=len, reverse=True)
    pattern = re.compile("|".join(re.escape(alt) for alt in alternatives))

    row["text"] = pattern.sub(lambda match: placeholders[match.group(0)], row["answer"])
    return row

def transform_data(df):
    """Apply `mask_objects` to each row of `df` (axis=1) and return the result."""
    return df.apply(mask_objects, axis=1)


In [6]:
# Mask the compared objects in the training split, then shuffle all rows
# with a fixed seed and renumber the index from zero.
masked_train = transform_data(df_train)
df_train = masked_train.sample(frac=1, random_state=42).reset_index(drop=True)

# The test split is masked only; its row order is left as loaded.
df_test = transform_data(df_test)

In [7]:
# Training options passed to `train_model` below.
args = dict(
    overwrite_output_dir=True,
    num_train_epochs=10,
    fp16=False,
    train_batch_size=16,
    gradient_accumulation_steps=1,
    evaluate_during_training=False,
    max_seq_length=64,
    learning_rate=3e-5,
    no_cache=True,
    # save_model_every_epoch=False,  # left disabled; later cells load per-epoch checkpoints from output_dir
    reprocess_input_data=True,
    output_dir="gdrive/MyDrive/checkpoints",
)

# Fine-tune roberta-large as a 4-label classifier, on GPU when one is available.
use_gpu = torch.cuda.is_available()
model = ClassificationModel("roberta", "roberta-large", num_labels=4, use_cuda=use_gpu)
model.train_model(df_train, args=args)

# Evaluate on the test split; only the second returned element (the raw
# model outputs) is kept.
_, raw_outputs, _ = model.eval_model(df_test)

# Row-wise softmax turns the raw outputs into probabilities; the predicted
# label is the index of the largest raw output in each row.
pred_probs = softmax(raw_outputs, axis=1)
pred_labels = np.argmax(raw_outputs, axis=1)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/5759 [00:00<?, ?it/s]

Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

Running Epoch 0 of 10:   0%|          | 0/360 [00:00<?, ?it/s]

Running Epoch 1 of 10:   0%|          | 0/360 [00:00<?, ?it/s]

Running Epoch 2 of 10:   0%|          | 0/360 [00:00<?, ?it/s]

Running Epoch 3 of 10:   0%|          | 0/360 [00:00<?, ?it/s]

Running Epoch 4 of 10:   0%|          | 0/360 [00:00<?, ?it/s]

Running Epoch 5 of 10:   0%|          | 0/360 [00:00<?, ?it/s]

Running Epoch 6 of 10:   0%|          | 0/360 [00:00<?, ?it/s]

Running Epoch 7 of 10:   0%|          | 0/360 [00:00<?, ?it/s]

Running Epoch 8 of 10:   0%|          | 0/360 [00:00<?, ?it/s]

Running Epoch 9 of 10:   0%|          | 0/360 [00:00<?, ?it/s]

  0%|          | 0/1440 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/180 [00:00<?, ?it/s]

In [8]:
# Per-class precision/recall/F1 for the predictions of the model trained above.
true_labels = df_test["labels"].tolist()
print(classification_report(y_true=true_labels, y_pred=pred_labels))

              precision    recall  f1-score   support

           0       0.94      0.95      0.94      1048
           2       0.85      0.83      0.84       273
           3       0.68      0.68      0.68       119

    accuracy                           0.90      1440
   macro avg       0.82      0.82      0.82      1440
weighted avg       0.90      0.90      0.90      1440



In [9]:
# Reload the model stored in the training output directory and score the
# test split with it.
saved_model_dir = "gdrive/MyDrive/checkpoints"
test_model = ClassificationModel(
    "roberta",
    saved_model_dir,
    num_labels=4,
    use_cuda=torch.cuda.is_available(),
)

# Keep only the raw model outputs (second returned element).
_, raw_outputs, _ = test_model.eval_model(df_test)

# Probabilities via row-wise softmax; labels via row-wise argmax of the raw outputs.
pred_probs_test = softmax(raw_outputs, axis=1)
pred_labels_test = np.argmax(raw_outputs, axis=1)

  0%|          | 0/1440 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/180 [00:00<?, ?it/s]

In [10]:
# Per-class precision/recall/F1 for the most recently computed `pred_labels_test`.
true_labels = df_test["labels"].tolist()
print(classification_report(y_true=true_labels, y_pred=pred_labels_test))

              precision    recall  f1-score   support

           0       0.94      0.95      0.94      1048
           2       0.85      0.83      0.84       273
           3       0.68      0.68      0.68       119

    accuracy                           0.90      1440
   macro avg       0.82      0.82      0.82      1440
weighted avg       0.90      0.90      0.90      1440



In [11]:
# Score the test split with the checkpoint saved after epoch 5.
checkpoint_dir = "gdrive/MyDrive/checkpoints/checkpoint-1800-epoch-5"
test_model = ClassificationModel(
    "roberta",
    checkpoint_dir,
    num_labels=4,
    use_cuda=torch.cuda.is_available(),
)

# Keep only the raw model outputs (second returned element).
_, raw_outputs, _ = test_model.eval_model(df_test)

# Probabilities via row-wise softmax; labels via row-wise argmax of the raw outputs.
pred_probs_test = softmax(raw_outputs, axis=1)
pred_labels_test = np.argmax(raw_outputs, axis=1)

true_labels = df_test.labels.tolist()
print(classification_report(y_true=true_labels, y_pred=pred_labels_test))

  0%|          | 0/1440 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/180 [00:00<?, ?it/s]

              precision    recall  f1-score   support

           0       0.93      0.95      0.94      1048
           2       0.87      0.83      0.85       273
           3       0.73      0.65      0.68       119

    accuracy                           0.91      1440
   macro avg       0.84      0.81      0.83      1440
weighted avg       0.90      0.91      0.90      1440



In [12]:
#list(pred_probs_test)

In [13]:
# Attach the latest predictions to the test frame and write it out as TSV.
# NOTE(review): `pred_labels_test` / `pred_probs_test` hold the results of
# whichever evaluation cell ran last; the file name suggests the epoch-5
# checkpoint is intended — confirm before re-running out of order.
predictions_path = 'gdrive/MyDrive/data/5_epochs_comparg_test.tsv'

df_test = df_test.assign(
    pred_labels=pred_labels_test,
    # One array of class probabilities per row.
    pred_probs=list(pred_probs_test),
)
df_test.to_csv(predictions_path, sep='\t', index=False)

In [15]:

# Score the test split with the checkpoint saved after epoch 8.
checkpoint_dir = "gdrive/MyDrive/checkpoints/checkpoint-2880-epoch-8"
test_model = ClassificationModel(
    "roberta",
    checkpoint_dir,
    num_labels=4,
    use_cuda=torch.cuda.is_available(),
)

# Keep only the raw model outputs (second returned element).
_, raw_outputs, _ = test_model.eval_model(df_test)

# Probabilities via row-wise softmax; labels via row-wise argmax of the raw outputs.
pred_probs_test = softmax(raw_outputs, axis=1)
pred_labels_test = np.argmax(raw_outputs, axis=1)

true_labels = df_test.labels.tolist()
print(classification_report(y_true=true_labels, y_pred=pred_labels_test))

  0%|          | 0/1440 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/180 [00:00<?, ?it/s]

              precision    recall  f1-score   support

           0       0.96      0.93      0.95      1048
           2       0.84      0.86      0.85       273
           3       0.66      0.76      0.71       119

    accuracy                           0.91      1440
   macro avg       0.82      0.85      0.84      1440
weighted avg       0.91      0.91      0.91      1440



In [16]:

# Score the test split with the checkpoint saved after epoch 10.
checkpoint_dir = "gdrive/MyDrive/checkpoints/checkpoint-3600-epoch-10"
test_model = ClassificationModel(
    "roberta",
    checkpoint_dir,
    num_labels=4,
    use_cuda=torch.cuda.is_available(),
)

# Keep only the raw model outputs (second returned element).
_, raw_outputs, _ = test_model.eval_model(df_test)

# Probabilities via row-wise softmax; labels via row-wise argmax of the raw outputs.
pred_probs_test = softmax(raw_outputs, axis=1)
pred_labels_test = np.argmax(raw_outputs, axis=1)

true_labels = df_test.labels.tolist()
print(classification_report(y_true=true_labels, y_pred=pred_labels_test))

  0%|          | 0/1440 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/180 [00:00<?, ?it/s]

              precision    recall  f1-score   support

           0       0.94      0.95      0.94      1048
           2       0.85      0.83      0.84       273
           3       0.68      0.68      0.68       119

    accuracy                           0.90      1440
   macro avg       0.82      0.82      0.82      1440
weighted avg       0.90      0.90      0.90      1440

