In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=123)

import os

### To obtain the predictions:
1. Replace the location of the test file and the output_path
2. Run the two commands to train our approach for validity and novelty (During training, the predictions for the test file would be generated after every epoch)
3. Extract the predictions as liklihood of novelty and validity

In [2]:
output_path = "../../data-ceph/arguana/argmining22-sharedtask/models/multitask" # Replace this with your equivelant path in your docker image: /mnt/ceph/storage/data-in-progress/data-research/arguana/argmining22-sharedtask/models/multitask

### Prepare the data:

In [3]:
taska_training_df = pd.read_csv('../data/TaskA_train.csv')
taska_valid_df    = pd.read_csv('../data/TaskA_dev.csv')
taska_test_df     = pd.read_csv('../data/TaskA_test.csv') #Replace this with the path to the test file

taska_training_df.insert(loc=0,column='row_num',value=np.arange(len(taska_training_df)))
taska_valid_df.insert(loc=0,column='row_num',value=np.arange(len(taska_valid_df)))
taska_test_df.insert(loc=0,column='row_num',value=np.arange(len(taska_test_df)))

#The </s></s>  is the separator used in the pre-trained nli model..
taska_training_df['input_txt'] = taska_training_df.apply(lambda x: '{}:{}  </s></s> {} '.format(x['topic'], x['Premise'], x['Conclusion']), axis=1)
taska_valid_df['input_txt']    = taska_valid_df.apply(lambda x: '{}:{} </s></s> {}'.format(x['topic'], x['Premise'], x['Conclusion']), axis=1)
taska_test_df['input_txt']     = taska_test_df.apply(lambda x: '{}:{} </s></s> {}'.format(x['topic'], x['Premise'], x['Conclusion']), axis=1)

taska_validity_train_df = taska_training_df[taska_training_df.Validity != 0].copy()
taska_validity_valid_df = taska_valid_df[taska_valid_df.Validity != 0].copy()
taska_validity_test_df  = taska_test_df[taska_test_df.Validity != 0].copy()

taska_validity_train_df['label'] = taska_validity_train_df.Validity.apply(lambda x : "valid" if x == 1 else "invalid")
taska_validity_valid_df['label'] = taska_validity_valid_df.Validity.apply(lambda x : "valid" if x == 1 else "invalid")
taska_validity_test_df['label']  = taska_validity_test_df.Validity.apply(lambda x  : "valid" if x == 1 else "invalid")


taska_novelty_train_df = taska_training_df[taska_training_df.Novelty != 0].copy()
taska_novelty_valid_df = taska_valid_df[taska_valid_df.Novelty != 0].copy()
taska_novelty_test_df  = taska_test_df[taska_test_df.Novelty != 0].copy()

#Balancing the data for novelty task..
taska_novelty_train_balanced_df, y = ros.fit_resample(taska_novelty_train_df, taska_novelty_train_df['Novelty'])
taska_novelty_train_balanced_df['Novelty'] = y

taska_novelty_train_df['label'] = taska_novelty_train_df.Novelty.apply(lambda x : "novel" if x == 1 else "conservative")
taska_novelty_train_balanced_df['label'] = taska_novelty_train_balanced_df.Novelty.apply(lambda x : "novel" if x == 1 else "conservative")
taska_novelty_valid_df['label'] = taska_novelty_valid_df.Novelty.apply(lambda x : "novel" if x == 1 else "conservative")
taska_novelty_test_df['label']  = taska_novelty_test_df.Novelty.apply(lambda x  : "novel" if x == 1 else "conservative")

In [4]:
taska_validity_train_df[['row_num',  'label', 'input_txt']].to_csv('../data/multitask_data/validity_training_df.tsv', sep='\t', header=False, index=False)
taska_validity_valid_df[['row_num', 'label', 'input_txt' ]].to_csv('../data/multitask_data/validity_valid_df.tsv', sep='\t', header=False, index=False)
taska_validity_test_df[['row_num', 'label', 'input_txt' ]].to_csv('../data/multitask_data/validity_test_df.tsv', sep='\t', header=False, index=False)

taska_novelty_train_df[['row_num', 'label', 'input_txt']].to_csv('../data/multitask_data/novelty_training_df.tsv', sep='\t', header=False, index=False)
taska_novelty_train_balanced_df[['row_num', 'label', 'input_txt']].to_csv('../data/multitask_data/novelty_training_balanced_df.tsv', sep='\t', header=False, index=False)
taska_novelty_valid_df[['row_num', 'label', 'input_txt']].to_csv('../data/multitask_data/novelty_valid_df.tsv', sep='\t', header=False, index=False)
taska_novelty_test_df[['row_num', 'label', 'input_txt']].to_csv('../data/multitask_data/novelty_test_df.tsv', sep='\t', header=False, index=False)

### Train our Approach:
- We train one multitask model for novelty and one for validity with different task weights and learning rates.

In [10]:
! sh ./src-py/run_nli_based_multitask_experiment.sh ../data/multitask_data/nli_based_multitask_for_novelty.yml \
                                          ../data/multitask_data/ \
                                          ../../data-ceph/arguana/argmining22-sharedtask/models/multitask/nli_model/final-multitask-for-novelty/ \
                                          /var/argmining-sharedtask/nli-final-multitask-for-novelty\
                                          roberta-large-mnli_prepared_data 2e-5 10

Preparing data...
task object created from task file...
loading file https://huggingface.co/roberta-large-mnli/resolve/main/vocab.json from cache at /mnt/ceph/storage/data-tmp/current//sile2804/.cache/huggingface/transformers/64a1d72b2bd05b0aff1a4dd9e7a90a6eea0312b4f914e80b0a923aa8f72219bd.d67d6b367eb24ab43b08ad55e014cf254076934f71d832bbab9ad35644a375ab
loading file https://huggingface.co/roberta-large-mnli/resolve/main/merges.txt from cache at /mnt/ceph/storage/data-tmp/current//sile2804/.cache/huggingface/transformers/425529714b758f50b6d3f93f8093d859856fd41cf1cec7c8edf2ab44aee632b6.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b
loading file https://huggingface.co/roberta-large-mnli/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/roberta-large-mnli/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/roberta-large-mnli/resolve/main/tokenizer_config.json from cache at None
loading configurati

In [11]:
! sh ./src-py/run_nli_based_multitask_experiment.sh ../data/multitask_data/nli_based_multitask_for_validity.yml \
                                          ../data/multitask_data/ \
                                          ../../data-ceph/arguana/argmining22-sharedtask/models/multitask/nli_model/final-multitask-for-validity/ \
                                          /var/argmining-sharedtask/nli-final-multitask-for-validity\
                                          roberta-large-mnli_prepared_data 5e-6 10

Preparing data...
task object created from task file...
loading file https://huggingface.co/roberta-large-mnli/resolve/main/vocab.json from cache at /mnt/ceph/storage/data-tmp/current//sile2804/.cache/huggingface/transformers/64a1d72b2bd05b0aff1a4dd9e7a90a6eea0312b4f914e80b0a923aa8f72219bd.d67d6b367eb24ab43b08ad55e014cf254076934f71d832bbab9ad35644a375ab
loading file https://huggingface.co/roberta-large-mnli/resolve/main/merges.txt from cache at /mnt/ceph/storage/data-tmp/current//sile2804/.cache/huggingface/transformers/425529714b758f50b6d3f93f8093d859856fd41cf1cec7c8edf2ab44aee632b6.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b
loading file https://huggingface.co/roberta-large-mnli/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/roberta-large-mnli/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/roberta-large-mnli/resolve/main/tokenizer_config.json from cache at None
loading configurati

#### Training on the balanced data:

In [5]:
! sh ./src-py/run_nli_based_multitask_experiment.sh ../data/multitask_data/nli_based_multitask_for_novelty.yml \
                                          ../data/multitask_data/ \
                                          ../../data-ceph/arguana/argmining22-sharedtask/models/multitask/nli_model/final-multitask-for-novelty-balanced/ \
                                          /var/argmining-sharedtask/nli-final-multitask-for-novelty-balanced\
                                          roberta-large-mnli_prepared_data 2e-5 10

Preparing data...
task object created from task file...
loading file https://huggingface.co/roberta-large-mnli/resolve/main/vocab.json from cache at /mnt/ceph/storage/data-tmp/current//sile2804/.cache/huggingface/transformers/64a1d72b2bd05b0aff1a4dd9e7a90a6eea0312b4f914e80b0a923aa8f72219bd.d67d6b367eb24ab43b08ad55e014cf254076934f71d832bbab9ad35644a375ab
loading file https://huggingface.co/roberta-large-mnli/resolve/main/merges.txt from cache at /mnt/ceph/storage/data-tmp/current//sile2804/.cache/huggingface/transformers/425529714b758f50b6d3f93f8093d859856fd41cf1cec7c8edf2ab44aee632b6.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b
loading file https://huggingface.co/roberta-large-mnli/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/roberta-large-mnli/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/roberta-large-mnli/resolve/main/tokenizer_config.json from cache at None
loading configurati

In [None]:
! sh ./src-py/run_nli_based_multitask_experiment.sh ../data/multitask_data/nli_based_multitask_for_validity.yml \
                                          ../data/multitask_data/ \
                                          ../../data-ceph/arguana/argmining22-sharedtask/models/multitask/nli_model/final-multitask-for-validity-balanced/ \
                                          /var/argmining-sharedtask/nli-final-multitask-for-validity-balanced\
                                          roberta-large-mnli_prepared_data 5e-6 10

Preparing data...
task object created from task file...
loading file https://huggingface.co/roberta-large-mnli/resolve/main/vocab.json from cache at /mnt/ceph/storage/data-tmp/current//sile2804/.cache/huggingface/transformers/64a1d72b2bd05b0aff1a4dd9e7a90a6eea0312b4f914e80b0a923aa8f72219bd.d67d6b367eb24ab43b08ad55e014cf254076934f71d832bbab9ad35644a375ab
loading file https://huggingface.co/roberta-large-mnli/resolve/main/merges.txt from cache at /mnt/ceph/storage/data-tmp/current//sile2804/.cache/huggingface/transformers/425529714b758f50b6d3f93f8093d859856fd41cf1cec7c8edf2ab44aee632b6.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b
loading file https://huggingface.co/roberta-large-mnli/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/roberta-large-mnli/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/roberta-large-mnli/resolve/main/tokenizer_config.json from cache at None
loading configurati

### Extract Predictions:

Look at the best epoch from the f1-score of the corrsponding task and choose the corrsponding generated predictions for the test.
- For novelty: The best f1-score on validation is 0.669 after the third epoch
- For validity: The best f1-score on validation is 0.73 after the third epoch

In [13]:
import sys
sys.path.append('./src-py')

from utils import *
from sklearn.metrics import precision_recall_fscore_support

In [27]:
#best_novelty_pred = '../../data-ceph/arguana/argmining22-sharedtask/models/multitask/nli_model/final-multitask-for-novelty/NoveltyTask_test_predictions_3.tsv'
#best_validity_pred = '../../data-ceph/arguana/argmining22-sharedtask/models/multitask/nli_model/final-multitask-for-validity/ValidityTask_test_predictions_4.tsv'

best_novelty_pred = '../../data-ceph/arguana/argmining22-sharedtask/models/multitask/nli_model/final-multitask-for-novelty-balanced/NoveltyTask_test_predictions_5.tsv'
best_validity_pred = '../../data-ceph/arguana/argmining22-sharedtask/models/multitask/nli_model/final-multitask-for-validity-balanced/ValidityTask_test_predictions_1.tsv'

In [28]:
novelty_predictions_df = pd.read_csv(best_novelty_pred, delimiter='\t')
validity_predictions_df = pd.read_csv(best_validity_pred, delimiter='\t')

#prediction dictionary
novelty_preds  = pd.Series(novelty_predictions_df.score.values, index=novelty_predictions_df.uid).to_dict()
validity_preds = pd.Series(validity_predictions_df.score.values, index=validity_predictions_df.uid).to_dict()

In [29]:
taska_test_df = taska_test_df[(taska_test_df.Novelty!=0) & (taska_test_df.Validity!=0)]

In [30]:
taska_test_df['is_validity'] = taska_test_df.row_num.apply(lambda x: validity_preds[x])
taska_test_df['is_novelty']  = taska_test_df.row_num.apply(lambda x: novelty_preds[x])


taska_test_df['predicted validity'] = taska_test_df.is_validity.apply(lambda x: 1 if x >= 0.5 else -1)
taska_test_df['predicted novelty']  = taska_test_df.is_novelty.apply(lambda x: 1 if x >= 0.5 else -1)

In [31]:
taska_test_df.head()

Unnamed: 0,row_num,topic,Premise,Conclusion,Validity,Validity-Confidence,Novelty,Novelty-Confidence,Topic-in-dev-split,input_txt,is_validity,is_novelty,predicted validity,predicted novelty,nli_single_is_validity,nli_single_is_novelty
0,0,Veal,Another selling point is that it's fast to coo...,"Veal appeals to young, fast-cooking",1,very confident,1,majority,no,Veal:Another selling point is that it's fast t...,0.741766,0.124622,1,-1,0.908244,0.707665
1,1,Veal,In addition to the unnecessary cruelty that mi...,Animal food is very expensive,-1,very confident,1,majority,no,Veal:In addition to the unnecessary cruelty th...,0.405237,0.114225,-1,-1,0.091664,0.293457
2,2,Veal,In addition to the unnecessary cruelty that mi...,Veal production is inefficient.,1,very confident,1,very confident,no,Veal:In addition to the unnecessary cruelty th...,0.329954,0.114642,-1,-1,0.90793,0.293558
3,3,Veal,In addition to the unnecessary cruelty that mi...,Veal meat production requires a huge amount of...,1,very confident,1,majority,no,Veal:In addition to the unnecessary cruelty th...,0.741555,0.12601,1,-1,0.908178,0.707968
4,4,Veal,In addition to the unnecessary cruelty that mi...,Veal production has a high carbon footprint.,-1,majority,-1,majority,no,Veal:In addition to the unnecessary cruelty th...,0.351044,0.118939,-1,-1,0.091688,0.707879


Now the `taska_test_df` dataframe contains the predictions for validity and novelty... Submit it please :)

#### Compute scores on Test Set:

In [32]:
precision_recall_fscore_support(taska_test_df.Validity.tolist(), taska_test_df['predicted validity'], average='macro')

(0.734085414987913, 0.6976068270360523, 0.703686575052854, None)

In [34]:
precision_recall_fscore_support(taska_test_df.Novelty.tolist(), taska_test_df['predicted novelty'], average='macro')

(0.6346097201767305, 0.5343898621395461, 0.45524028743009753, None)

In [35]:
taska_test_df[['row_num', 'topic', 'Premise', 'Conclusion', 'predicted validity', 'predicted novelty']].to_csv('../data/output/final_predictions-on-balanced-data.csv')

In [37]:
#! python ../Evaluator.py A ../data/TaskA_test.csv ../data/output/final_predictions.csv --verbose
! python ../Evaluator.py A ../data/TaskA_test.csv ../data/output/final_predictions-on-balanced-data.csv --verbose

[32m2022-08-24 14:32:13.891[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m30[0m - [1mReference: ../data/TaskA_test.csv <> Predictions: ../data/output/final_predictions-on-balanced-data.csv[0m
[32m2022-08-24 14:32:13.941[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<module>[0m:[36m61[0m - [34m[1mFound following keys: : 520 rows / in_topic: 190 rows / out_topic: 330 rows[0m
[32m2022-08-24 14:32:13.945[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m118[0m - [1mCalculated the scores: {'Novelty__-1_f1': 0.7265725288831836,
 'Novelty__-1_precision': 0.5835051546391753,
 'Novelty__-1_recall': 0.9625850340136054,
 'Novelty__1_f1': 0.1839080459770115,
 'Novelty__1_precision': 0.6857142857142857,
 'Novelty__1_recall': 0.10619469026548672,
 'Novelty__f1': 0.45524028743009753,
 'Novelty__precision': 0.6346097201767305,
 'Novelty__recall': 0.5343898621395461,
 'Novelty_in_topic_-1_f1': 0.7947019867549668,
 'Novelty_in_topic_-1_precision':

#### Compute baseline scores:

In [19]:
#singl_novelty_pred = "../../data-ceph/arguana/argmining22-sharedtask/models/multitask/nli_model/novelty/NoveltyTask_test_predictions_8.tsv"
single_validity_pred  = "../../data-ceph/arguana/argmining22-sharedtask/models/multitask/nli_model/novelty/NoveltyTask_test_predictions_8.tsv"

singl_novelty_pred = "../../data-ceph/arguana/argmining22-sharedtask/models/multitask/nli_model/balanced-novelty/NoveltyTask_test_predictions_2.tsv"

In [20]:
novelty_predictions_df = pd.read_csv(singl_novelty_pred, delimiter='\t')
validity_predictions_df = pd.read_csv(single_validity_pred, delimiter='\t')

#prediction dictionary
novelty_preds  = pd.Series(novelty_predictions_df.score.values, index=novelty_predictions_df.uid).to_dict()
validity_preds = pd.Series(validity_predictions_df.score.values, index=validity_predictions_df.uid).to_dict()

In [21]:
taska_test_df = taska_test_df[(taska_test_df.Novelty!=0) & (taska_test_df.Validity!=0)]

In [22]:
taska_test_df['nli_single_is_validity'] = taska_test_df.row_num.apply(lambda x: validity_preds[x])
taska_test_df['nli_single_is_novelty']  = taska_test_df.row_num.apply(lambda x: novelty_preds[x])

taska_test_df['predicted validity'] = taska_test_df.nli_single_is_validity.apply(lambda x: 1 if x >= 0.5 else -1)
taska_test_df['predicted novelty']  = taska_test_df.nli_single_is_novelty.apply(lambda x: 1 if x >= 0.5 else -1)

In [23]:
taska_test_df[['row_num', 'topic', 'Premise', 'Conclusion', 'predicted validity', 'predicted novelty']].to_csv('../data/output/nli_single_predictions-on-balanced-data.csv')

In [None]:
! python ../Evaluator.py A ../data/TaskA_test.csv ../data/output/nli_single_predictions-on-balanced-data.csv --verbose

[32m2022-08-24 14:17:24.620[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m30[0m - [1mReference: ../data/TaskA_test.csv <> Predictions: ../data/output/nli_single_predictions-on-balanced-data.csv[0m
[32m2022-08-24 14:17:24.671[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<module>[0m:[36m61[0m - [34m[1mFound following keys: : 520 rows / in_topic: 190 rows / out_topic: 330 rows[0m
[32m2022-08-24 14:17:24.674[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m118[0m - [1mCalculated the scores: {'Novelty__-1_f1': 0.7298136645962732,
 'Novelty__-1_precision': 0.6714285714285714,
 'Novelty__-1_recall': 0.7993197278911565,
 'Novelty__1_f1': 0.5606060606060606,
 'Novelty__1_precision': 0.6529411764705882,
 'Novelty__1_recall': 0.4911504424778761,
 'Novelty__f1': 0.6452098626011669,
 'Novelty__precision': 0.6621848739495798,
 'Novelty__recall': 0.6452350851845163,
 'Novelty_in_topic_-1_f1': 0.8075471698113208,
 'Novelty_in_topic_-1_precisio

In [22]:
! python ../Evaluator.py A ../data/TaskA_test.csv ../data/output/roberta_single_predictions.csv --verbose

[32m2022-08-22 16:27:39.889[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m30[0m - [1mReference: ../data/TaskA_test.csv <> Predictions: ../data/output/roberta_single_predictions.csv[0m
[32m2022-08-22 16:27:39.943[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<module>[0m:[36m61[0m - [34m[1mFound following keys: : 520 rows / in_topic: 190 rows / out_topic: 330 rows[0m
[32m2022-08-22 16:27:39.946[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m118[0m - [1mCalculated the scores: {'Novelty__-1_f1': 0.7223587223587223,
 'Novelty__-1_precision': 0.5653846153846154,
 'Novelty__-1_recall': 1.0,
 'Novelty__1_f1': 0.0,
 'Novelty__1_precision': 0.0,
 'Novelty__1_recall': 0.0,
 'Novelty__f1': 0.36117936117936117,
 'Novelty__precision': 0.2826923076923077,
 'Novelty__recall': 0.5,
 'Novelty_in_topic_-1_f1': 0.7936507936507937,
 'Novelty_in_topic_-1_precision': 0.6578947368421053,
 'Novelty_in_topic_-1_recall': 1.0,
 'Novelty_in_topic_1_f1': 0

In [25]:
! python ../Evaluator.py A ../data/TaskA_test.csv ../data/output/balanced_roberta_single_predictions.csv --verbose

[32m2022-08-24 14:19:01.039[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m30[0m - [1mReference: ../data/TaskA_test.csv <> Predictions: ../data/output/balanced_roberta_single_predictions.csv[0m
[32m2022-08-24 14:19:01.110[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<module>[0m:[36m61[0m - [34m[1mFound following keys: : 520 rows / in_topic: 190 rows / out_topic: 330 rows[0m
[32m2022-08-24 14:19:01.114[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m118[0m - [1mCalculated the scores: {'Novelty__-1_f1': 0.619047619047619,
 'Novelty__-1_precision': 0.5803571428571429,
 'Novelty__-1_recall': 0.6632653061224489,
 'Novelty__1_f1': 0.41463414634146334,
 'Novelty__1_precision': 0.46195652173913043,
 'Novelty__1_recall': 0.37610619469026546,
 'Novelty__f1': 0.5168408826945412,
 'Novelty__precision': 0.5211568322981367,
 'Novelty__recall': 0.5196857504063572,
 'Novelty_in_topic_-1_f1': 0.7169811320754718,
 'Novelty_in_topic_-1_precision'