In [46]:
import json
import pandas as pd
import numpy as np
import sys

pd.set_option('display.max_colwidth', None)
sys.path.append('./src-py')

In [47]:
import sbert_training
from utils import *

In [48]:
from datasets import load_dataset, load_metric, Dataset, Split
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer, AutoTokenizer, AutoModelForSequenceClassification, TextClassificationPipeline, DebertaForSequenceClassification
from transformers import TrainingArguments, Trainer
import wandb
import torch
from tqdm import tqdm

from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=123)

In [49]:
output_path = "../../data-ceph/arguana/argmining22-sharedtask/models/"

In [50]:
taska_training_df = pd.read_csv('../data/TaskA_train.csv')
taska_valid_df    = pd.read_csv('../data/TaskA_dev.csv')
taska_test_df     = pd.read_csv('../data/TaskA_test.csv') #Replace this with the path to the test file

taska_training_df.insert(loc=0,column='row_num',value=np.arange(len(taska_training_df)))
taska_valid_df.insert(loc=0,column='row_num',value=np.arange(len(taska_valid_df)))
taska_test_df.insert(loc=0,column='row_num',value=np.arange(len(taska_test_df)))

#The </s></s>  is the separator used in the pre-trained nli model..
taska_training_df['input_txt'] = taska_training_df.apply(lambda x: '{}:{}  </s></s> {} '.format(x['topic'], x['Premise'], x['Conclusion']), axis=1)
taska_valid_df['input_txt']    = taska_valid_df.apply(lambda x: '{}:{} </s></s> {}'.format(x['topic'], x['Premise'], x['Conclusion']), axis=1)
taska_test_df['input_txt']     = taska_test_df.apply(lambda x: '{}:{} </s></s> {}'.format(x['topic'], x['Premise'], x['Conclusion']), axis=1)

taska_validity_train_df = taska_training_df[taska_training_df.Validity != 0].copy()
taska_validity_valid_df = taska_valid_df[taska_valid_df.Validity != 0].copy()
taska_validity_test_df  = taska_test_df[taska_test_df.Validity != 0].copy()

taska_validity_train_df['label'] = taska_validity_train_df.Validity.apply(lambda x : "valid" if x == 1 else "invalid")
taska_validity_valid_df['label'] = taska_validity_valid_df.Validity.apply(lambda x : "valid" if x == 1 else "invalid")
taska_validity_test_df['label']  = taska_validity_test_df.Validity.apply(lambda x  : "valid" if x == 1 else "invalid")


taska_novelty_train_df = taska_training_df[taska_training_df.Novelty != 0].copy()
taska_novelty_valid_df = taska_valid_df[taska_valid_df.Novelty != 0].copy()
taska_novelty_test_df  = taska_test_df[taska_test_df.Novelty != 0].copy()

#Balancing the data for novelty task..
taska_novelty_train_balanced_df, y = ros.fit_resample(taska_novelty_train_df, taska_novelty_train_df['Novelty'])
taska_novelty_train_balanced_df['Novelty'] = y

taska_novelty_train_df['label'] = taska_novelty_train_df.Novelty.apply(lambda x : "novel" if x == 1 else "conservative")
taska_novelty_train_balanced_df['label'] = taska_novelty_train_balanced_df.Novelty.apply(lambda x : "novel" if x == 1 else "conservative")
taska_novelty_valid_df['label'] = taska_novelty_valid_df.Novelty.apply(lambda x : "novel" if x == 1 else "conservative")
taska_novelty_test_df['label']  = taska_novelty_test_df.Novelty.apply(lambda x  : "novel" if x == 1 else "conservative")

## Fine-tune simple RoBERTa model on the training data for Novelty:

In [73]:
bert_tokenizer = AutoTokenizer.from_pretrained('roberta-base')

Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file https://huggingface.co/roberta-base/resolve/main/config.json from cache at /mnt/ceph/storage/data-tmp/current//sile2804/.cache/huggingface/transformers/733bade19e5f0ce98e6531021dd5180994bb2f7b8bd7e80c7968805834ba351e.35205c6cfc956461d8515139f0f8dd5d207a2f336c0c3a83b4bc8dca3518e37b
Model config RobertaConfig {
  "_name_or_path": "roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.18.

In [74]:
taska_novelty_train_df.columns

Index(['row_num', 'topic', 'Premise', 'Conclusion', 'Validity',
       'Validity-Confidence', 'Novelty', 'Novelty-Confidence', 'input_txt',
       'label'],
      dtype='object')

In [75]:
taska_novelty_train_df.Novelty.value_counts()

-1    595
 1    123
Name: Novelty, dtype: int64

In [76]:
validity_map = dict([ # avoid negative labels
    ("novel", 1), 
    ("conservative", 0)
])

In [77]:
train_dataset = Dataset.from_pandas(taska_novelty_train_df)
eval_dataset = Dataset.from_pandas(taska_novelty_valid_df)
test_dataset = Dataset.from_pandas(taska_novelty_test_df)

In [78]:
def preprocess(example):
    inputs = bert_tokenizer(example["input_txt"], add_special_tokens=False, padding=True, truncation=True, max_length=512)
    inputs['label'] = list(map(validity_map.get, example['label']))
    return inputs

In [79]:
train_dataset = train_dataset.map(preprocess, batched=True)
eval_dataset = eval_dataset.map(preprocess, batched=True)
test_dataset = test_dataset.map(preprocess, batched=True)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [80]:
bert_model     = AutoModelForSequenceClassification.from_pretrained('roberta-base', num_labels=2)

training_args = TrainingArguments(
    output_dir= output_path + "/novelty/roberta", 
    #report_to="wandb",
    logging_dir='/var/argmining-sharedtask/roberta-baseline-novelty',
    overwrite_output_dir=True,
    metric_for_best_model = 'f1',
    evaluation_strategy = 'steps',          # check evaluation metrics at each epoch
    learning_rate = 5e-6,                   # we can customize learning rate
    max_steps = 600,
    logging_steps = 50,                    # we will log every 50 steps which is an epoch given the 700 examples and 16 batch size
    eval_steps = 50,                      # we will perform evaluation every 500 steps
    save_steps = 50,
    load_best_model_at_end = True,
)

trainer = Trainer(
    model=bert_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=lambda x: compute_metrics(x, average='macro')
)

loading configuration file https://huggingface.co/roberta-base/resolve/main/config.json from cache at /mnt/ceph/storage/data-tmp/current//sile2804/.cache/huggingface/transformers/733bade19e5f0ce98e6531021dd5180994bb2f7b8bd7e80c7968805834ba351e.35205c6cfc956461d8515139f0f8dd5d207a2f336c0c3a83b4bc8dca3518e37b
Model config RobertaConfig {
  "_name_or_path": "roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.18.0",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading weights fil

In [81]:
trainer.train()

The following columns in the training set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: Novelty-Confidence, Premise, __index_level_0__, Validity, Validity-Confidence, Novelty, topic, row_num, Conclusion, input_txt. If Novelty-Confidence, Premise, __index_level_0__, Validity, Validity-Confidence, Novelty, topic, row_num, Conclusion, input_txt are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 718
  Num Epochs = 7
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 600
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Step,Training Loss,Validation Loss,Recall,Precision,F1
50,0.5762,0.817878,0.5,0.295,0.371069
100,0.4671,0.775655,0.5,0.295,0.371069
150,0.4763,0.831936,0.5,0.295,0.371069
200,0.466,0.84652,0.5,0.295,0.371069
250,0.4396,0.94212,0.5,0.295,0.371069
300,0.5223,0.786422,0.5,0.295,0.371069
350,0.4073,0.956316,0.5,0.295,0.371069
400,0.5138,0.830384,0.5,0.295,0.371069
450,0.3935,0.901612,0.5,0.295,0.371069
500,0.4275,0.901559,0.5,0.295,0.371069


The following columns in the evaluation set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: Novelty-Confidence, Premise, __index_level_0__, Validity, Validity-Confidence, Novelty, topic, row_num, Conclusion, input_txt. If Novelty-Confidence, Premise, __index_level_0__, Validity, Validity-Confidence, Novelty, topic, row_num, Conclusion, input_txt are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 200
  Batch size = 8
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to ../../data-ceph/arguana/argmining22-sharedtask/models//novelty/roberta/checkpoint-50
Configuration saved in ../../data-ceph/arguana/argmining22-sharedtask/models//novelty/roberta/checkpoint-50/config.json
Model weights saved in ../../data-ceph/arguana/argmining22-sharedtask/models//novelty/roberta/checkpoint-50/pytorch_model.bin
The followi

TrainOutput(global_step=600, training_loss=0.46075950304667157, metrics={'train_runtime': 59.4436, 'train_samples_per_second': 80.749, 'train_steps_per_second': 10.094, 'total_flos': 305101935351360.0, 'train_loss': 0.46075950304667157, 'epoch': 6.67})

In [82]:
trainer.evaluate(test_dataset)

The following columns in the evaluation set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: Novelty-Confidence, Premise, __index_level_0__, Validity, Validity-Confidence, Novelty, topic, row_num, Conclusion, Topic-in-dev-split, input_txt. If Novelty-Confidence, Premise, __index_level_0__, Validity, Validity-Confidence, Novelty, topic, row_num, Conclusion, Topic-in-dev-split, input_txt are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 520
  Batch size = 8


  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.8431864976882935,
 'eval_recall': 0.5,
 'eval_precision': 0.2826923076923077,
 'eval_f1': 0.36117936117936117,
 'eval_runtime': 0.7183,
 'eval_samples_per_second': 723.928,
 'eval_steps_per_second': 90.491,
 'epoch': 6.67}

In [83]:
results = trainer.predict(test_dataset)

The following columns in the test set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: Novelty-Confidence, Premise, __index_level_0__, Validity, Validity-Confidence, Novelty, topic, row_num, Conclusion, Topic-in-dev-split, input_txt. If Novelty-Confidence, Premise, __index_level_0__, Validity, Validity-Confidence, Novelty, topic, row_num, Conclusion, Topic-in-dev-split, input_txt are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 520
  Batch size = 8
  _warn_prf(average, modifier, msg_start, len(result))


In [84]:
taska_test_df['predicted novelty']  = [1 if np.argmax(x) == 1 else -1 for x in results.predictions]

## Fine-tune simple RoBERTa model on the training data for Validity:

In [85]:
train_dataset = Dataset.from_pandas(taska_validity_train_df)
eval_dataset = Dataset.from_pandas(taska_validity_valid_df)
test_dataset = Dataset.from_pandas(taska_validity_test_df)

In [86]:
validity_map = dict([ # avoid negative labels
    ("valid", 1), 
    ("invalid", 0)
])

train_dataset = train_dataset.map(preprocess, batched=True)
eval_dataset = eval_dataset.map(preprocess, batched=True)
test_dataset = test_dataset.map(preprocess, batched=True)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [87]:
taska_validity_train_df.label.value_counts()

valid      401
invalid    320
Name: label, dtype: int64

In [88]:
bert_model     = AutoModelForSequenceClassification.from_pretrained('roberta-base', num_labels=2)

training_args = TrainingArguments(
    output_dir= output_path + "/validity/roberta", 
    #report_to="wandb",
    logging_dir='/var/argmining-sharedtask/roberta-baseline-validity',
    overwrite_output_dir=True,
    metric_for_best_model = 'f1',
    evaluation_strategy = 'steps',          # check evaluation metrics at each epoch
    learning_rate = 5e-6,                   # we can customize learning rate
    max_steps = 600,
    logging_steps = 50,                    # we will log every 50 steps which is an epoch given the 700 examples and 16 batch size
    eval_steps = 50,                      # we will perform evaluation every 500 steps
    save_steps = 50,
    load_best_model_at_end = True,
)

trainer = Trainer(
    model=bert_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=lambda x: compute_metrics(x, average='macro')
)

loading configuration file https://huggingface.co/roberta-base/resolve/main/config.json from cache at /mnt/ceph/storage/data-tmp/current//sile2804/.cache/huggingface/transformers/733bade19e5f0ce98e6531021dd5180994bb2f7b8bd7e80c7968805834ba351e.35205c6cfc956461d8515139f0f8dd5d207a2f336c0c3a83b4bc8dca3518e37b
Model config RobertaConfig {
  "_name_or_path": "roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.18.0",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading weights fil

In [89]:
trainer.train()

The following columns in the training set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: Novelty-Confidence, Premise, __index_level_0__, Validity, Validity-Confidence, Novelty, topic, row_num, Conclusion, input_txt. If Novelty-Confidence, Premise, __index_level_0__, Validity, Validity-Confidence, Novelty, topic, row_num, Conclusion, input_txt are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 721
  Num Epochs = 7
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 600
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Step,Training Loss,Validation Loss,Recall,Precision,F1
50,0.6869,0.675806,0.5,0.31407,0.385802
100,0.6863,0.677795,0.5,0.31407,0.385802
150,0.6955,0.674904,0.5,0.31407,0.385802
200,0.6852,0.670072,0.5,0.31407,0.385802
250,0.6793,0.66452,0.5,0.31407,0.385802
300,0.6688,0.659185,0.496,0.313131,0.383901
350,0.596,0.640975,0.543838,0.596548,0.512653
400,0.5201,0.636395,0.605892,0.693195,0.596481
450,0.4654,0.62675,0.66973,0.713605,0.676423
500,0.4475,0.645163,0.639676,0.725087,0.639916


The following columns in the evaluation set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: Novelty-Confidence, Premise, __index_level_0__, Validity, Validity-Confidence, Novelty, topic, row_num, Conclusion, input_txt. If Novelty-Confidence, Premise, __index_level_0__, Validity, Validity-Confidence, Novelty, topic, row_num, Conclusion, input_txt are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 199
  Batch size = 8
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to ../../data-ceph/arguana/argmining22-sharedtask/models//validity/roberta/checkpoint-50
Configuration saved in ../../data-ceph/arguana/argmining22-sharedtask/models//validity/roberta/checkpoint-50/config.json
Model weights saved in ../../data-ceph/arguana/argmining22-sharedtask/models//validity/roberta/checkpoint-50/pytorch_model.bin
The foll

TrainOutput(global_step=600, training_loss=0.5810151735941569, metrics={'train_runtime': 63.7875, 'train_samples_per_second': 75.25, 'train_steps_per_second': 9.406, 'total_flos': 303190269089760.0, 'train_loss': 0.5810151735941569, 'epoch': 6.59})

In [90]:
trainer.evaluate(test_dataset)

The following columns in the evaluation set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: Novelty-Confidence, Premise, __index_level_0__, Validity, Validity-Confidence, Novelty, topic, row_num, Conclusion, Topic-in-dev-split, input_txt. If Novelty-Confidence, Premise, __index_level_0__, Validity, Validity-Confidence, Novelty, topic, row_num, Conclusion, Topic-in-dev-split, input_txt are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 520
  Batch size = 8


{'eval_loss': 0.7109214067459106,
 'eval_recall': 0.5717024302764208,
 'eval_precision': 0.6002074150894479,
 'eval_f1': 0.5621527777777778,
 'eval_runtime': 0.7111,
 'eval_samples_per_second': 731.235,
 'eval_steps_per_second': 91.404,
 'epoch': 6.59}

In [93]:
results = trainer.predict(test_dataset)
taska_test_df['predicted validity']  = [1 if np.argmax(x) == 1 else -1 for x in results.label_ids]

The following columns in the test set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: Novelty-Confidence, Premise, __index_level_0__, Validity, Validity-Confidence, Novelty, topic, row_num, Conclusion, Topic-in-dev-split, input_txt. If Novelty-Confidence, Premise, __index_level_0__, Validity, Validity-Confidence, Novelty, topic, row_num, Conclusion, Topic-in-dev-split, input_txt are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 520
  Batch size = 8


In [94]:
taska_test_df[['row_num', 'topic', 'Premise', 'Conclusion', 'predicted validity', 'predicted novelty']].to_csv('../data/output/roberta_single_predictions.csv')