In [1]:
# Load IPython's autoreload extension so that the `%autoreload` magic used further down is available.
%load_ext autoreload

In [2]:
import json
import pandas as pd
import numpy as np
import sys

# Never truncate long text columns when a DataFrame is displayed.
pd.options.display.max_colwidth = None
# Make the project-local helper modules under ./src-py importable from this notebook.
sys.path += ['./src-py']

In [18]:
# Ask the autoreload extension to reload modules now, then (re-)import the project training helpers.
# NOTE(review): this cell carries execution count 18, i.e. it was re-run out of order
# (presumably after editing sbert_training); on a fresh kernel it runs third.
%autoreload
import sbert_training
from sklearn.metrics import precision_recall_fscore_support

In [4]:
from sentence_transformers import SentenceTransformer, InputExample, LoggingHandler, losses, models, util
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator, BinaryClassificationEvaluator
from torch.utils.data import DataLoader
from sentence_transformers.evaluation import TripletEvaluator
from datetime import datetime
from sentence_transformers import util
from zipfile import ZipFile
from sentence_transformers.datasets import SentenceLabelDataset
from sentence_transformers.datasets import NoDuplicatesDataLoader

import logging

# Route INFO-level log records through sentence_transformers' LoggingHandler, with a
# "timestamp - message" layout (this is the format of the training logs shown below).
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])
# Module-level logger for this notebook.
logger = logging.getLogger(__name__)

In [5]:
# Root directory under which the training cells store their models (task-specific sub-paths are appended).
# NOTE(review): the path is relative to the notebook's working directory and already ends in '/',
# while the cells below append a sub-path starting with '/', producing a double slash.
output_path = "../../data-ceph/arguana/argmining22-sharedtask/models/"

In [6]:
def _binary_task_frame(frame, score_column):
    """Return a copy of the rows whose `score_column` is non-zero, with a 0/1 `label` column.

    `label` is 1 where `score_column` equals 1 and 0 for every other remaining value.
    """
    subset = frame.loc[frame[score_column].ne(0)].copy()
    subset['label'] = subset[score_column].eq(1).astype('int64')
    return subset


# Raw Task A splits.
taska_training_df = pd.read_csv('../data/TaskA_train.csv')
taska_valid_df = pd.read_csv('../data/TaskA_dev.csv')

# Prefix every premise with its topic ("<topic> : <premise>") in both splits.
for split_df in (taska_training_df, taska_valid_df):
    split_df['Premise'] = [
        topic + ' : ' + premise
        for topic, premise in zip(split_df['topic'], split_df['Premise'])
    ]

# Validity sub-task: drop rows scored 0, binarise the rest.
taska_validity_train_df = _binary_task_frame(taska_training_df, 'Validity')
taska_validity_valid_df = _binary_task_frame(taska_valid_df, 'Validity')

# Novelty sub-task: same treatment on the Novelty column.
taska_novelty_train_df = _binary_task_frame(taska_training_df, 'Novelty')
taska_novelty_valid_df = _binary_task_frame(taska_valid_df, 'Novelty')

In [7]:
# Class balance of the binarised Novelty training labels.
taska_novelty_train_df.label.value_counts()

0    595
1    123
Name: label, dtype: int64

In [8]:
# Class balance of the binarised Validity training labels.
taska_validity_train_df.label.value_counts()

1    401
0    320
Name: label, dtype: int64

### Evaluate sbert on Validity:

In [9]:
# Number of repeated train/evaluate runs performed by the evaluation loops below.
n=10

In [19]:
# Train and score the SBERT model on the Validity sub-task `n` times, keeping the F1 of every run.
all_f1_scores = []
validity_model_dir = output_path + '/task-A/validity/sbert/'
for i in range(n):
    trained_model, evaluator = sbert_training.train_model(
        taska_validity_train_df,
        taska_validity_valid_df,
        validity_model_dir,
        'sentence-transformers/nli-roberta-large',
        num_epochs=5,
        train_batch_size=32,
        model_suffix='',
        max_seq_length=512,
        special_tokens=[],
        loss='ContrastiveLoss',
        sentence_transformer=False,
        evaluation_steps=10,
    )

    # The predictions are read back from taska_validity_valid_df below, not from eval_df.
    # NOTE(review): this presumes predict_labels writes 'pred_validity' into the frame it is given - confirm.
    eval_df = sbert_training.predict_labels(
        taska_validity_valid_df, trained_model, 'Premise', 'Conclusion', 'pred_validity', 0.5
    )

    # NOTE(review): gold values come from the raw Validity column, not the binarised `label` column;
    # verify that predict_labels emits the same encoding.
    gold_validity = taska_validity_valid_df.Validity.tolist()
    predicted_validity = taska_validity_valid_df.pred_validity.tolist()
    precision, recall, f1, _ = precision_recall_fscore_support(
        gold_validity, predicted_validity, average='binary'
    )
    print(f'Precision: {precision}, Recall {recall}, F1: {f1}')
    all_f1_scores += [f1]

2022-07-04 14:55:41 - Use pytorch device: cuda
2022-07-04 14:55:41 - Read Triplet train dataset
Len of training: 721
Len of Dev: 199
Evaluating before start learning.....
2022-07-04 14:55:41 - Binary Accuracy Evaluation of the model on sts-dev dataset:
2022-07-04 14:55:41 - Accuracy with Cosine-Similarity:           72.36	(Threshold: 0.6503)
2022-07-04 14:55:41 - F1 with Cosine-Similarity:                 80.69	(Threshold: 0.5985)
2022-07-04 14:55:41 - Precision with Cosine-Similarity:          70.91
2022-07-04 14:55:41 - Recall with Cosine-Similarity:             93.60
2022-07-04 14:55:41 - Average Precision with Cosine-Similarity:  82.54

2022-07-04 14:55:41 - Accuracy with Manhatten-Distance:           72.36	(Threshold: 646.3732)
2022-07-04 14:55:41 - F1 with Manhatten-Distance:                 80.56	(Threshold: 679.7071)
2022-07-04 14:55:41 - Precision with Manhatten-Distance:          71.17
2022-07-04 14:55:41 - Recall with Manhatten-Distance:             92.80
2022-07-04 14:55:41



Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/23 [00:00<?, ?it/s]

2022-07-04 14:55:43 - Binary Accuracy Evaluation of the model on sts-dev dataset in epoch 0 after 10 steps:
2022-07-04 14:55:43 - Accuracy with Cosine-Similarity:           73.37	(Threshold: 0.7017)
2022-07-04 14:55:43 - F1 with Cosine-Similarity:                 80.13	(Threshold: 0.6247)
2022-07-04 14:55:43 - Precision with Cosine-Similarity:          69.19
2022-07-04 14:55:43 - Recall with Cosine-Similarity:             95.20
2022-07-04 14:55:43 - Average Precision with Cosine-Similarity:  82.63

2022-07-04 14:55:43 - Accuracy with Manhatten-Distance:           73.87	(Threshold: 601.6650)
2022-07-04 14:55:43 - F1 with Manhatten-Distance:                 80.43	(Threshold: 625.3463)
2022-07-04 14:55:43 - Precision with Manhatten-Distance:          72.44
2022-07-04 14:55:43 - Recall with Manhatten-Distance:             90.40
2022-07-04 14:55:43 - Average Precision with Manhatten-Distance:  82.47

2022-07-04 14:55:43 - Accuracy with Euclidean-Distance:           73.37	(Threshold: 24.0499

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Precision: 0.6410256410256411, Recall 1.0, F1: 0.7812500000000001
2022-07-04 14:56:25 - Use pytorch device: cuda
2022-07-04 14:56:25 - Read Triplet train dataset
Len of training: 721
Len of Dev: 199
Evaluating before start learning.....
2022-07-04 14:56:25 - Binary Accuracy Evaluation of the model on sts-dev dataset:
2022-07-04 14:56:25 - Accuracy with Cosine-Similarity:           72.36	(Threshold: 0.6503)
2022-07-04 14:56:25 - F1 with Cosine-Similarity:                 80.69	(Threshold: 0.5985)
2022-07-04 14:56:25 - Precision with Cosine-Similarity:          70.91
2022-07-04 14:56:25 - Recall with Cosine-Similarity:             93.60
2022-07-04 14:56:25 - Average Precision with Cosine-Similarity:  82.54

2022-07-04 14:56:25 - Accuracy with Manhatten-Distance:           72.36	(Threshold: 646.3732)
2022-07-04 14:56:25 - F1 with Manhatten-Distance:                 80.56	(Threshold: 679.7071)
2022-07-04 14:56:25 - Precision with Manhatten-Distance:          71.17
2022-07-04 14:56:25 - Rec



Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/23 [00:00<?, ?it/s]

2022-07-04 14:56:27 - Binary Accuracy Evaluation of the model on sts-dev dataset in epoch 0 after 10 steps:
2022-07-04 14:56:27 - Accuracy with Cosine-Similarity:           72.36	(Threshold: 0.7092)
2022-07-04 14:56:27 - F1 with Cosine-Similarity:                 80.29	(Threshold: 0.6712)
2022-07-04 14:56:27 - Precision with Cosine-Similarity:          72.73
2022-07-04 14:56:27 - Recall with Cosine-Similarity:             89.60
2022-07-04 14:56:27 - Average Precision with Cosine-Similarity:  82.13

2022-07-04 14:56:27 - Accuracy with Manhatten-Distance:           72.86	(Threshold: 597.1991)
2022-07-04 14:56:27 - F1 with Manhatten-Distance:                 80.71	(Threshold: 619.3949)
2022-07-04 14:56:27 - Precision with Manhatten-Distance:          72.90
2022-07-04 14:56:27 - Recall with Manhatten-Distance:             90.40
2022-07-04 14:56:27 - Average Precision with Manhatten-Distance:  82.21

2022-07-04 14:56:27 - Accuracy with Euclidean-Distance:           72.36	(Threshold: 22.9859

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Precision: 0.6410256410256411, Recall 1.0, F1: 0.7812500000000001
2022-07-04 14:57:06 - Use pytorch device: cuda
2022-07-04 14:57:06 - Read Triplet train dataset
Len of training: 721
Len of Dev: 199
Evaluating before start learning.....
2022-07-04 14:57:06 - Binary Accuracy Evaluation of the model on sts-dev dataset:
2022-07-04 14:57:06 - Accuracy with Cosine-Similarity:           72.36	(Threshold: 0.6503)
2022-07-04 14:57:06 - F1 with Cosine-Similarity:                 80.69	(Threshold: 0.5985)
2022-07-04 14:57:06 - Precision with Cosine-Similarity:          70.91
2022-07-04 14:57:06 - Recall with Cosine-Similarity:             93.60
2022-07-04 14:57:06 - Average Precision with Cosine-Similarity:  82.54

2022-07-04 14:57:06 - Accuracy with Manhatten-Distance:           72.36	(Threshold: 646.3732)
2022-07-04 14:57:06 - F1 with Manhatten-Distance:                 80.56	(Threshold: 679.7071)
2022-07-04 14:57:06 - Precision with Manhatten-Distance:          71.17
2022-07-04 14:57:06 - Rec



Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/23 [00:00<?, ?it/s]

2022-07-04 14:57:08 - Binary Accuracy Evaluation of the model on sts-dev dataset in epoch 0 after 10 steps:
2022-07-04 14:57:08 - Accuracy with Cosine-Similarity:           74.87	(Threshold: 0.7105)
2022-07-04 14:57:08 - F1 with Cosine-Similarity:                 80.16	(Threshold: 0.7105)
2022-07-04 14:57:08 - Precision with Cosine-Similarity:          79.53
2022-07-04 14:57:08 - Recall with Cosine-Similarity:             80.80
2022-07-04 14:57:08 - Average Precision with Cosine-Similarity:  82.56

2022-07-04 14:57:08 - Accuracy with Manhatten-Distance:           73.37	(Threshold: 576.7632)
2022-07-04 14:57:08 - F1 with Manhatten-Distance:                 80.55	(Threshold: 650.4788)
2022-07-04 14:57:08 - Precision with Manhatten-Distance:          70.24
2022-07-04 14:57:08 - Recall with Manhatten-Distance:             94.40
2022-07-04 14:57:08 - Average Precision with Manhatten-Distance:  82.54

2022-07-04 14:57:08 - Accuracy with Euclidean-Distance:           73.87	(Threshold: 23.1577

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Precision: 0.6410256410256411, Recall 1.0, F1: 0.7812500000000001
2022-07-04 14:57:47 - Use pytorch device: cuda
2022-07-04 14:57:47 - Read Triplet train dataset
Len of training: 721
Len of Dev: 199
Evaluating before start learning.....
2022-07-04 14:57:47 - Binary Accuracy Evaluation of the model on sts-dev dataset:
2022-07-04 14:57:48 - Accuracy with Cosine-Similarity:           72.36	(Threshold: 0.6503)
2022-07-04 14:57:48 - F1 with Cosine-Similarity:                 80.69	(Threshold: 0.5985)
2022-07-04 14:57:48 - Precision with Cosine-Similarity:          70.91
2022-07-04 14:57:48 - Recall with Cosine-Similarity:             93.60
2022-07-04 14:57:48 - Average Precision with Cosine-Similarity:  82.54

2022-07-04 14:57:48 - Accuracy with Manhatten-Distance:           72.36	(Threshold: 646.3732)
2022-07-04 14:57:48 - F1 with Manhatten-Distance:                 80.56	(Threshold: 679.7071)
2022-07-04 14:57:48 - Precision with Manhatten-Distance:          71.17
2022-07-04 14:57:48 - Rec



Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/23 [00:00<?, ?it/s]

2022-07-04 14:57:50 - Binary Accuracy Evaluation of the model on sts-dev dataset in epoch 0 after 10 steps:
2022-07-04 14:57:50 - Accuracy with Cosine-Similarity:           74.87	(Threshold: 0.6960)
2022-07-04 14:57:50 - F1 with Cosine-Similarity:                 80.41	(Threshold: 0.6241)
2022-07-04 14:57:50 - Precision with Cosine-Similarity:          69.59
2022-07-04 14:57:50 - Recall with Cosine-Similarity:             95.20
2022-07-04 14:57:50 - Average Precision with Cosine-Similarity:  82.56

2022-07-04 14:57:50 - Accuracy with Manhatten-Distance:           74.37	(Threshold: 583.0293)
2022-07-04 14:57:50 - F1 with Manhatten-Distance:                 80.82	(Threshold: 660.8470)
2022-07-04 14:57:50 - Precision with Manhatten-Distance:          70.66
2022-07-04 14:57:50 - Recall with Manhatten-Distance:             94.40
2022-07-04 14:57:50 - Average Precision with Manhatten-Distance:  82.71

2022-07-04 14:57:50 - Accuracy with Euclidean-Distance:           73.37	(Threshold: 23.3999

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Precision: 0.6410256410256411, Recall 1.0, F1: 0.7812500000000001
2022-07-04 14:58:34 - Use pytorch device: cuda
2022-07-04 14:58:34 - Read Triplet train dataset
Len of training: 721
Len of Dev: 199
Evaluating before start learning.....
2022-07-04 14:58:34 - Binary Accuracy Evaluation of the model on sts-dev dataset:
2022-07-04 14:58:34 - Accuracy with Cosine-Similarity:           72.36	(Threshold: 0.6503)
2022-07-04 14:58:34 - F1 with Cosine-Similarity:                 80.69	(Threshold: 0.5985)
2022-07-04 14:58:34 - Precision with Cosine-Similarity:          70.91
2022-07-04 14:58:34 - Recall with Cosine-Similarity:             93.60
2022-07-04 14:58:34 - Average Precision with Cosine-Similarity:  82.54

2022-07-04 14:58:34 - Accuracy with Manhatten-Distance:           72.36	(Threshold: 646.3732)
2022-07-04 14:58:34 - F1 with Manhatten-Distance:                 80.56	(Threshold: 679.7071)
2022-07-04 14:58:34 - Precision with Manhatten-Distance:          71.17
2022-07-04 14:58:34 - Rec



Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/23 [00:00<?, ?it/s]

2022-07-04 14:58:36 - Binary Accuracy Evaluation of the model on sts-dev dataset in epoch 0 after 10 steps:
2022-07-04 14:58:36 - Accuracy with Cosine-Similarity:           73.87	(Threshold: 0.7030)
2022-07-04 14:58:36 - F1 with Cosine-Similarity:                 80.14	(Threshold: 0.6740)
2022-07-04 14:58:36 - Precision with Cosine-Similarity:          73.03
2022-07-04 14:58:36 - Recall with Cosine-Similarity:             88.80
2022-07-04 14:58:36 - Average Precision with Cosine-Similarity:  82.51

2022-07-04 14:58:36 - Accuracy with Manhatten-Distance:           73.37	(Threshold: 595.0889)
2022-07-04 14:58:36 - F1 with Manhatten-Distance:                 80.43	(Threshold: 620.3030)
2022-07-04 14:58:36 - Precision with Manhatten-Distance:          72.44
2022-07-04 14:58:36 - Recall with Manhatten-Distance:             90.40
2022-07-04 14:58:36 - Average Precision with Manhatten-Distance:  82.50

2022-07-04 14:58:36 - Accuracy with Euclidean-Distance:           72.86	(Threshold: 22.6609

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Precision: 0.6410256410256411, Recall 1.0, F1: 0.7812500000000001
2022-07-04 14:59:15 - Use pytorch device: cuda
2022-07-04 14:59:15 - Read Triplet train dataset
Len of training: 721
Len of Dev: 199
Evaluating before start learning.....
2022-07-04 14:59:15 - Binary Accuracy Evaluation of the model on sts-dev dataset:
2022-07-04 14:59:16 - Accuracy with Cosine-Similarity:           72.36	(Threshold: 0.6503)
2022-07-04 14:59:16 - F1 with Cosine-Similarity:                 80.69	(Threshold: 0.5985)
2022-07-04 14:59:16 - Precision with Cosine-Similarity:          70.91
2022-07-04 14:59:16 - Recall with Cosine-Similarity:             93.60
2022-07-04 14:59:16 - Average Precision with Cosine-Similarity:  82.54

2022-07-04 14:59:16 - Accuracy with Manhatten-Distance:           72.36	(Threshold: 646.3732)
2022-07-04 14:59:16 - F1 with Manhatten-Distance:                 80.56	(Threshold: 679.7071)
2022-07-04 14:59:16 - Precision with Manhatten-Distance:          71.17
2022-07-04 14:59:16 - Rec



Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/23 [00:00<?, ?it/s]

2022-07-04 14:59:18 - Binary Accuracy Evaluation of the model on sts-dev dataset in epoch 0 after 10 steps:
2022-07-04 14:59:18 - Accuracy with Cosine-Similarity:           72.86	(Threshold: 0.6940)
2022-07-04 14:59:18 - F1 with Cosine-Similarity:                 80.13	(Threshold: 0.6263)
2022-07-04 14:59:18 - Precision with Cosine-Similarity:          69.19
2022-07-04 14:59:18 - Recall with Cosine-Similarity:             95.20
2022-07-04 14:59:18 - Average Precision with Cosine-Similarity:  82.33

2022-07-04 14:59:18 - Accuracy with Manhatten-Distance:           72.86	(Threshold: 624.0936)
2022-07-04 14:59:18 - F1 with Manhatten-Distance:                 80.71	(Threshold: 624.0936)
2022-07-04 14:59:18 - Precision with Manhatten-Distance:          72.90
2022-07-04 14:59:18 - Recall with Manhatten-Distance:             90.40
2022-07-04 14:59:18 - Average Precision with Manhatten-Distance:  82.30

2022-07-04 14:59:18 - Accuracy with Euclidean-Distance:           72.36	(Threshold: 23.9398

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Precision: 0.6410256410256411, Recall 1.0, F1: 0.7812500000000001
2022-07-04 14:59:56 - Use pytorch device: cuda
2022-07-04 14:59:56 - Read Triplet train dataset
Len of training: 721
Len of Dev: 199
Evaluating before start learning.....
2022-07-04 14:59:56 - Binary Accuracy Evaluation of the model on sts-dev dataset:
2022-07-04 14:59:56 - Accuracy with Cosine-Similarity:           72.36	(Threshold: 0.6503)
2022-07-04 14:59:56 - F1 with Cosine-Similarity:                 80.69	(Threshold: 0.5985)
2022-07-04 14:59:56 - Precision with Cosine-Similarity:          70.91
2022-07-04 14:59:56 - Recall with Cosine-Similarity:             93.60
2022-07-04 14:59:56 - Average Precision with Cosine-Similarity:  82.54

2022-07-04 14:59:56 - Accuracy with Manhatten-Distance:           72.36	(Threshold: 646.3732)
2022-07-04 14:59:56 - F1 with Manhatten-Distance:                 80.56	(Threshold: 679.7071)
2022-07-04 14:59:56 - Precision with Manhatten-Distance:          71.17
2022-07-04 14:59:56 - Rec



Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/23 [00:00<?, ?it/s]

2022-07-04 14:59:58 - Binary Accuracy Evaluation of the model on sts-dev dataset in epoch 0 after 10 steps:
2022-07-04 14:59:58 - Accuracy with Cosine-Similarity:           74.37	(Threshold: 0.7074)
2022-07-04 14:59:58 - F1 with Cosine-Similarity:                 80.41	(Threshold: 0.6423)
2022-07-04 14:59:58 - Precision with Cosine-Similarity:          69.59
2022-07-04 14:59:58 - Recall with Cosine-Similarity:             95.20
2022-07-04 14:59:58 - Average Precision with Cosine-Similarity:  82.45

2022-07-04 14:59:58 - Accuracy with Manhatten-Distance:           73.37	(Threshold: 575.3694)
2022-07-04 14:59:58 - F1 with Manhatten-Distance:                 80.41	(Threshold: 653.1539)
2022-07-04 14:59:58 - Precision with Manhatten-Distance:          69.59
2022-07-04 14:59:58 - Recall with Manhatten-Distance:             95.20
2022-07-04 14:59:58 - Average Precision with Manhatten-Distance:  82.38

2022-07-04 14:59:58 - Accuracy with Euclidean-Distance:           72.86	(Threshold: 23.0154

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Precision: 0.6410256410256411, Recall 1.0, F1: 0.7812500000000001
2022-07-04 15:00:39 - Use pytorch device: cuda
2022-07-04 15:00:39 - Read Triplet train dataset
Len of training: 721
Len of Dev: 199
Evaluating before start learning.....
2022-07-04 15:00:39 - Binary Accuracy Evaluation of the model on sts-dev dataset:
2022-07-04 15:00:39 - Accuracy with Cosine-Similarity:           72.36	(Threshold: 0.6503)
2022-07-04 15:00:39 - F1 with Cosine-Similarity:                 80.69	(Threshold: 0.5985)
2022-07-04 15:00:39 - Precision with Cosine-Similarity:          70.91
2022-07-04 15:00:39 - Recall with Cosine-Similarity:             93.60
2022-07-04 15:00:39 - Average Precision with Cosine-Similarity:  82.54

2022-07-04 15:00:39 - Accuracy with Manhatten-Distance:           72.36	(Threshold: 646.3732)
2022-07-04 15:00:39 - F1 with Manhatten-Distance:                 80.56	(Threshold: 679.7071)
2022-07-04 15:00:39 - Precision with Manhatten-Distance:          71.17
2022-07-04 15:00:39 - Rec



Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/23 [00:00<?, ?it/s]

2022-07-04 15:00:41 - Binary Accuracy Evaluation of the model on sts-dev dataset in epoch 0 after 10 steps:
2022-07-04 15:00:41 - Accuracy with Cosine-Similarity:           72.36	(Threshold: 0.7093)
2022-07-04 15:00:41 - F1 with Cosine-Similarity:                 80.29	(Threshold: 0.6698)
2022-07-04 15:00:41 - Precision with Cosine-Similarity:          72.73
2022-07-04 15:00:41 - Recall with Cosine-Similarity:             89.60
2022-07-04 15:00:41 - Average Precision with Cosine-Similarity:  82.13

2022-07-04 15:00:41 - Accuracy with Manhatten-Distance:           72.36	(Threshold: 573.3354)
2022-07-04 15:00:41 - F1 with Manhatten-Distance:                 80.28	(Threshold: 625.2722)
2022-07-04 15:00:41 - Precision with Manhatten-Distance:          71.70
2022-07-04 15:00:41 - Recall with Manhatten-Distance:             91.20
2022-07-04 15:00:41 - Average Precision with Manhatten-Distance:  82.17

2022-07-04 15:00:41 - Accuracy with Euclidean-Distance:           72.86	(Threshold: 22.5893

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Precision: 0.6410256410256411, Recall 1.0, F1: 0.7812500000000001
2022-07-04 15:01:23 - Use pytorch device: cuda
2022-07-04 15:01:23 - Read Triplet train dataset
Len of training: 721
Len of Dev: 199
Evaluating before start learning.....
2022-07-04 15:01:23 - Binary Accuracy Evaluation of the model on sts-dev dataset:
2022-07-04 15:01:23 - Accuracy with Cosine-Similarity:           72.36	(Threshold: 0.6503)
2022-07-04 15:01:23 - F1 with Cosine-Similarity:                 80.69	(Threshold: 0.5985)
2022-07-04 15:01:23 - Precision with Cosine-Similarity:          70.91
2022-07-04 15:01:23 - Recall with Cosine-Similarity:             93.60
2022-07-04 15:01:23 - Average Precision with Cosine-Similarity:  82.54

2022-07-04 15:01:23 - Accuracy with Manhatten-Distance:           72.36	(Threshold: 646.3732)
2022-07-04 15:01:23 - F1 with Manhatten-Distance:                 80.56	(Threshold: 679.7071)
2022-07-04 15:01:23 - Precision with Manhatten-Distance:          71.17
2022-07-04 15:01:23 - Rec



Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/23 [00:00<?, ?it/s]

2022-07-04 15:01:25 - Binary Accuracy Evaluation of the model on sts-dev dataset in epoch 0 after 10 steps:
2022-07-04 15:01:25 - Accuracy with Cosine-Similarity:           73.87	(Threshold: 0.7043)
2022-07-04 15:01:25 - F1 with Cosine-Similarity:                 80.58	(Threshold: 0.6747)
2022-07-04 15:01:25 - Precision with Cosine-Similarity:          73.20
2022-07-04 15:01:25 - Recall with Cosine-Similarity:             89.60
2022-07-04 15:01:25 - Average Precision with Cosine-Similarity:  82.43

2022-07-04 15:01:25 - Accuracy with Manhatten-Distance:           73.37	(Threshold: 593.6807)
2022-07-04 15:01:25 - F1 with Manhatten-Distance:                 80.43	(Threshold: 619.1473)
2022-07-04 15:01:25 - Precision with Manhatten-Distance:          72.44
2022-07-04 15:01:25 - Recall with Manhatten-Distance:             90.40
2022-07-04 15:01:25 - Average Precision with Manhatten-Distance:  82.42

2022-07-04 15:01:25 - Accuracy with Euclidean-Distance:           72.86	(Threshold: 23.1316

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Precision: 0.6410256410256411, Recall 1.0, F1: 0.7812500000000001
2022-07-04 15:02:05 - Use pytorch device: cuda
2022-07-04 15:02:05 - Read Triplet train dataset
Len of training: 721
Len of Dev: 199
Evaluating before start learning.....
2022-07-04 15:02:05 - Binary Accuracy Evaluation of the model on sts-dev dataset:
2022-07-04 15:02:05 - Accuracy with Cosine-Similarity:           72.36	(Threshold: 0.6503)
2022-07-04 15:02:05 - F1 with Cosine-Similarity:                 80.69	(Threshold: 0.5985)
2022-07-04 15:02:05 - Precision with Cosine-Similarity:          70.91
2022-07-04 15:02:05 - Recall with Cosine-Similarity:             93.60
2022-07-04 15:02:05 - Average Precision with Cosine-Similarity:  82.54

2022-07-04 15:02:05 - Accuracy with Manhatten-Distance:           72.36	(Threshold: 646.3732)
2022-07-04 15:02:05 - F1 with Manhatten-Distance:                 80.56	(Threshold: 679.7071)
2022-07-04 15:02:05 - Precision with Manhatten-Distance:          71.17
2022-07-04 15:02:05 - Rec



Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/23 [00:00<?, ?it/s]

2022-07-04 15:02:07 - Binary Accuracy Evaluation of the model on sts-dev dataset in epoch 0 after 10 steps:
2022-07-04 15:02:07 - Accuracy with Cosine-Similarity:           72.86	(Threshold: 0.7045)
2022-07-04 15:02:07 - F1 with Cosine-Similarity:                 80.13	(Threshold: 0.6373)
2022-07-04 15:02:07 - Precision with Cosine-Similarity:          69.19
2022-07-04 15:02:07 - Recall with Cosine-Similarity:             95.20
2022-07-04 15:02:07 - Average Precision with Cosine-Similarity:  82.22

2022-07-04 15:02:07 - Accuracy with Manhatten-Distance:           72.86	(Threshold: 616.0450)
2022-07-04 15:02:07 - F1 with Manhatten-Distance:                 80.71	(Threshold: 616.0450)
2022-07-04 15:02:07 - Precision with Manhatten-Distance:          72.90
2022-07-04 15:02:07 - Recall with Manhatten-Distance:             90.40
2022-07-04 15:02:07 - Average Precision with Manhatten-Distance:  82.26

2022-07-04 15:02:07 - Accuracy with Euclidean-Distance:           72.36	(Threshold: 23.1337

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Precision: 0.6410256410256411, Recall 1.0, F1: 0.7812500000000001


In [22]:
# Report the mean F1 across all runs collected in `all_f1_scores`.
# (`f1` on its own only holds the score of the last loop iteration, so it is not an average.)
print('Average F1-score: {}'.format(np.mean(all_f1_scores)))

Average F1-score: 0.7812500000000001


### Evaluate sbert on Novelty:

In [None]:
# Train and score the SBERT model on the Novelty sub-task `n` times, keeping the F1 of every run.
# This cell uses the novelty frames, the Novelty gold column, its own prediction column and its own
# model directory, so it does not re-run (or overwrite) the Validity experiment.
all_f1_scores = []
for i in range(n):
    trained_model, evaluator = sbert_training.train_model(taska_novelty_train_df, taska_novelty_valid_df, output_path + '/task-A/novelty/sbert/', 
            'sentence-transformers/nli-roberta-large', 
            num_epochs=5, train_batch_size=32,
            model_suffix='', max_seq_length=512, special_tokens=[], 
            loss='ContrastiveLoss', sentence_transformer=False, evaluation_steps=10)
    
    # Predictions are read back from taska_novelty_valid_df below.
    # NOTE(review): this presumes predict_labels writes 'pred_novelty' into the frame it is given - confirm.
    eval_df = sbert_training.predict_labels(taska_novelty_valid_df, trained_model, 'Premise', 'Conclusion', 'pred_novelty', 0.5)
    
    # NOTE(review): gold values come from the raw Novelty column (mirroring the Validity cell);
    # verify that predict_labels emits the same encoding.
    precision, recall, f1, _ = precision_recall_fscore_support(taska_novelty_valid_df.Novelty.tolist(), taska_novelty_valid_df.pred_novelty.tolist(), average='binary')
    print('Precision: {}, Recall {}, F1: {}'.format(precision, recall, f1))
    all_f1_scores.append(f1)