In [2]:
import json
import pandas as pd
import numpy as np
import sys

# Show full cell contents when DataFrames are displayed (no column-width truncation),
# so long premises/conclusions can be read in the output.
pd.set_option('display.max_colwidth', None)
# Make modules under ./src-py importable. The path is relative, so it resolves against
# the kernel's current working directory.
sys.path.append('./src-py')

In [3]:
import sbert_training

In [4]:
from sentence_transformers import SentenceTransformer, InputExample, LoggingHandler, losses, models, util
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator, BinaryClassificationEvaluator
from torch.utils.data import DataLoader
from sentence_transformers.evaluation import TripletEvaluator
from datetime import datetime
from zipfile import ZipFile

from sentence_transformers.datasets import SentenceLabelDataset
from sentence_transformers.datasets import NoDuplicatesDataLoader

import logging

# Configure root logging at INFO level with timestamped messages, routed through
# sentence-transformers' LoggingHandler so library progress messages appear in cell output.
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])
# Module-level logger used by get_training_examples().
logger = logging.getLogger(__name__)

In [5]:
# Task A: training and dev CSVs.
taska_training_df = pd.read_csv('../data/TaskA_train.csv')
taska_valid_df = pd.read_csv('../data/TaskA_dev.csv')

# Task B: training and dev CSVs.
taskb_training_df = pd.read_csv('../data/TaskB_train.csv')
taskb_valid_df = pd.read_csv('../data/TaskB_dev.csv')

In [6]:
# Mapping labels
# Drop rows whose Validity is 0. `.copy()` makes each result an independent frame, so the
# column assignments below do not write into a slice of the loaded data
# (the pattern that triggers pandas' SettingWithCopyWarning).
taska_training_df = taska_training_df[taska_training_df.Validity != 0].copy()
taska_valid_df = taska_valid_df[taska_valid_df.Validity != 0].copy()

# Binary target: 1 when Validity == 1, otherwise 0 (after the filter above that means -1 -> 0).
taska_training_df['label'] = taska_training_df.Validity.eq(1).astype('int64')
taska_valid_df['label'] = taska_valid_df.Validity.eq(1).astype('int64')

# Prefix every premise with its topic ("<topic> : <premise>").
# NOTE: this overwrites 'Premise' in place, so re-running this cell without re-running the
# loading cell first prepends the topic a second time.
taska_training_df['Premise'] = taska_training_df.apply(lambda row: row['topic'] + ' : ' + row['Premise'], axis=1)
taska_valid_df['Premise'] = taska_valid_df.apply(lambda row: row['topic'] + ' : ' + row['Premise'], axis=1)

In [7]:
# Build a single "[CLS] topic [SEP] premise [SEP] conclusion [SEP]" string per training row.
# The [CLS]/[SEP] markers are inserted as literal text here.
# NOTE(review): 'Premise' was already prefixed with the topic in the label-mapping cell, so the
# topic appears twice in input_txt — confirm this is intended. The column is only built for the
# training frame and is not read by the training helpers visible in this notebook.
taska_training_df['input_txt'] = taska_training_df.apply(lambda x: '[CLS] {} [SEP] {} [SEP] {} [SEP]'.format(x['topic'], x['Premise'], x['Conclusion']), axis=1)

In [8]:
# Preview the first rows to sanity-check the derived 'label', 'Premise' and 'input_txt' columns.
taska_training_df.head()

Unnamed: 0,topic,Premise,Conclusion,Validity,Validity-Confidence,Novelty,Novelty-Confidence,label,input_txt
0,TV viewing is harmful to children,"TV viewing is harmful to children : The popularity of TV watching is among the reasons of this phenomenon. Violence, aggression, crimes and wars are broadcast through the daily news as well as in movies, showing dark pictures that encourage psychological tension, pessimism and negative emotions.",Depression is a well-known psychological problem of modern society that is partly caused by TV watching:,1,confident,1,confident,1,"[CLS] TV viewing is harmful to children [SEP] TV viewing is harmful to children : The popularity of TV watching is among the reasons of this phenomenon. Violence, aggression, crimes and wars are broadcast through the daily news as well as in movies, showing dark pictures that encourage psychological tension, pessimism and negative emotions. [SEP] Depression is a well-known psychological problem of modern society that is partly caused by TV watching: [SEP]"
1,TV viewing is harmful to children,"TV viewing is harmful to children : The popularity of TV watching is among the reasons of this phenomenon. Violence, aggression, crimes and wars are broadcast through the daily news as well as in movies, showing dark pictures that encourage psychological tension, pessimism and negative emotions.",Children's TV viewing fosters negative emotions,1,very confident,-1,majority,1,"[CLS] TV viewing is harmful to children [SEP] TV viewing is harmful to children : The popularity of TV watching is among the reasons of this phenomenon. Violence, aggression, crimes and wars are broadcast through the daily news as well as in movies, showing dark pictures that encourage psychological tension, pessimism and negative emotions. [SEP] Children's TV viewing fosters negative emotions [SEP]"
2,TV viewing is harmful to children,"TV viewing is harmful to children : The popularity of TV watching is among the reasons of this phenomenon. Violence, aggression, crimes and wars are broadcast through the daily news as well as in movies, showing dark pictures that encourage psychological tension, pessimism and negative emotions.",Popularity of TV is harmful to children,1,very confident,1,majority,1,"[CLS] TV viewing is harmful to children [SEP] TV viewing is harmful to children : The popularity of TV watching is among the reasons of this phenomenon. Violence, aggression, crimes and wars are broadcast through the daily news as well as in movies, showing dark pictures that encourage psychological tension, pessimism and negative emotions. [SEP] Popularity of TV is harmful to children [SEP]"
3,TV viewing is harmful to children,"TV viewing is harmful to children : The popularity of TV watching is among the reasons of this phenomenon. Violence, aggression, crimes and wars are broadcast through the daily news as well as in movies, showing dark pictures that encourage psychological tension, pessimism and negative emotions.",Violence on TV and in movies encourages psychological stress,1,very confident,1,majority,1,"[CLS] TV viewing is harmful to children [SEP] TV viewing is harmful to children : The popularity of TV watching is among the reasons of this phenomenon. Violence, aggression, crimes and wars are broadcast through the daily news as well as in movies, showing dark pictures that encourage psychological tension, pessimism and negative emotions. [SEP] Violence on TV and in movies encourages psychological stress [SEP]"
4,TV viewing is harmful to children,"TV viewing is harmful to children : The popularity of TV watching is among the reasons of this phenomenon. Violence, aggression, crimes and wars are broadcast through the daily news as well as in movies, showing dark pictures that encourage psychological tension, pessimism and negative emotions.",US-India deal does not cap or limit Indian fissile material production.,-1,very confident,-1,very confident,0,"[CLS] TV viewing is harmful to children [SEP] TV viewing is harmful to children : The popularity of TV watching is among the reasons of this phenomenon. Violence, aggression, crimes and wars are broadcast through the daily news as well as in movies, showing dark pictures that encourage psychological tension, pessimism and negative emotions. [SEP] US-India deal does not cap or limit Indian fissile material production. [SEP]"


In [9]:
# Class balance of the training frame after dropping Validity == 0 rows.
taska_training_df.label.value_counts()

1    401
0    320
Name: label, dtype: int64

In [10]:
# Class balance of the dev frame after dropping Validity == 0 rows.
taska_valid_df.label.value_counts()

1    125
0     74
Name: label, dtype: int64

In [27]:
def get_training_examples(df, eval_df, loss):
    """Build sentence-transformers ``InputExample`` lists for training and dev evaluation.

    Args:
        df: Training DataFrame. For ``'ContrastiveLoss'`` and
            ``'MultipleNegativesRankingLoss'`` it must have the columns ``Premise``,
            ``Conclusion`` and ``label``; for any other ``loss`` value it is treated as
            triplet data and must have ``anchor``, ``pos`` and ``neg``.
        eval_df: Dev DataFrame with the columns ``Premise``, ``Conclusion`` and ``label``.
        loss: Name of the training objective; selects how training examples are built.

    Returns:
        ``(train_examples, dev_samples)``, two lists of ``InputExample``.
    """
    # Log the objective actually in use (the message used to say "Triplet" for every loss).
    logger.info("Read %s train dataset", loss)

    if loss == 'ContrastiveLoss':
        # Every (premise, conclusion) pair is used, labelled 1 (valid) or 0 (not valid).
        train_examples = [
            InputExample(texts=[premise, conclusion], label=int(label))
            for premise, conclusion, label in zip(df['Premise'], df['Conclusion'], df['label'])
        ]
    elif loss == 'MultipleNegativesRankingLoss':
        # This objective is fed positive pairs only, so rows with label != 1 are dropped.
        positives = df[df['label'] == 1]
        train_examples = [
            InputExample(texts=[premise, conclusion], label=1)
            for premise, conclusion in zip(positives['Premise'], positives['Conclusion'])
        ]
    else:
        # Any other loss name is treated as triplet training data.
        train_examples = [
            InputExample(texts=[anchor, pos, neg], label=0)
            for anchor, pos, neg in zip(df['anchor'], df['pos'], df['neg'])
        ]

    # Dev examples are always labelled (premise, conclusion) pairs, whatever the training loss.
    dev_samples = [
        InputExample(texts=[premise, conclusion], label=int(label))
        for premise, conclusion, label in zip(eval_df['Premise'], eval_df['Conclusion'], eval_df['label'])
    ]

    return train_examples, dev_samples

            
def train_model(df, eval_df, output_path, model_name, num_epochs=3, train_batch_size=16, model_suffix='',
                data_file_suffix='', max_seq_length=256,
                special_tokens=[], loss='Triplet', sentence_transformer=False):
    """Fine-tune a mean-pooled sentence-embedding model and evaluate it on the dev pairs.

    Args:
        df: Training DataFrame (columns as required by ``get_training_examples``).
        eval_df: Dev DataFrame with ``Premise``, ``Conclusion`` and ``label`` columns.
        output_path: Prefix of the output directory; ``model_name``, ``model_suffix`` and a
            timestamp are appended to it by plain string concatenation.
        model_name: Hugging Face / sentence-transformers model identifier or local path.
        num_epochs: Number of training epochs.
        train_batch_size: Batch size used for both training and dev evaluation.
        model_suffix: Free-text tag embedded in the output directory name.
        data_file_suffix: Unused; kept so existing calls keep working.
        max_seq_length: Maximum sequence length set on the transformer module.
        special_tokens: Extra tokens to add to the tokenizer (never mutated here).
        loss: ``'ContrastiveLoss'``, ``'MultipleNegativesRankingLoss'``, or anything else
            for triplet loss.
        sentence_transformer: If True, load ``model_name`` as a full ``SentenceTransformer``
            pipeline and reuse its first module as the encoder; otherwise load it directly
            with ``models.Transformer``.

    Returns:
        None. The best model according to the dev evaluator is saved under the generated
        output directory (``save_best_model=True``).
    """
    output_path = output_path + model_name + "-" + model_suffix + "-" + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

    if sentence_transformer:
        # A SentenceTransformer is a pipeline of modules and has no
        # get_word_embedding_dimension(), so it cannot be handed to models.Pooling directly.
        # Its first module is reused as the token encoder instead.
        # NOTE(review): assumes that first module is a models.Transformer — confirm for the
        # checkpoints this is used with.
        word_embedding_model = SentenceTransformer(model_name)[0]
    else:
        word_embedding_model = models.Transformer(model_name)
    word_embedding_model.max_seq_length = max_seq_length

    if special_tokens is not None and len(special_tokens) > 0:
        word_embedding_model.tokenizer.add_tokens(special_tokens, special_tokens=True)
        # resize_token_embeddings lives on the wrapped Hugging Face model (auto_model),
        # not on the sentence-transformers Transformer module itself.
        word_embedding_model.auto_model.resize_token_embeddings(len(word_embedding_model.tokenizer))

    # Apply mean pooling to get one fixed sized sentence vector
    pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                                   pooling_mode_mean_tokens=True,
                                   pooling_mode_cls_token=False,
                                   pooling_mode_max_tokens=False)

    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

    train_examples, dev_samples = get_training_examples(df, eval_df, loss)

    print('Len of training: {}'.format(len(train_examples)))
    print('Len of Dev: {}'.format(len(dev_samples)))

    if loss == 'MultipleNegativesRankingLoss':
        # Special data loader that avoids duplicate texts within a batch, since the other
        # pairs of the batch serve as negatives for this loss.
        train_dataloader = NoDuplicatesDataLoader(train_examples, batch_size=train_batch_size)
        train_loss = losses.MultipleNegativesRankingLoss(model)
    elif loss == 'ContrastiveLoss':
        train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=train_batch_size)
        train_loss = losses.ContrastiveLoss(model)
    else:
        train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=train_batch_size)
        train_loss = losses.TripletLoss(model)

    # The dev set is always scored as binary pair classification. The name 'sts-dev' only
    # labels the log lines and result file.
    evaluator = BinaryClassificationEvaluator.from_input_examples(dev_samples, batch_size=train_batch_size, name='sts-dev')

    # Warm up the learning rate over the first 10% of all training steps.
    warmup_steps = int(len(train_dataloader) * num_epochs * 0.1)

    # Train the model
    # NOTE(review): no checkpoint_path is passed, so the two checkpoint_* settings below
    # presumably have no effect — confirm against the installed sentence-transformers version.
    model.fit(train_objectives=[(train_dataloader, train_loss)],
              evaluator=evaluator,
              epochs=num_epochs,
              save_best_model=True,
              checkpoint_save_steps=1000,
              checkpoint_save_total_limit=3,
              evaluation_steps=20,
              warmup_steps=warmup_steps,
              output_path=output_path)

In [12]:
# Task A validity run with contrastive loss: 20 epochs, batch size 16.
train_model(
    taska_training_df,
    taska_valid_df,
    '../data/output/',
    'sentence-transformers/nli-roberta-large',
    num_epochs=20,
    train_batch_size=16,
    model_suffix='',
    data_file_suffix='',
    max_seq_length=256,
    special_tokens=[],
    loss='ContrastiveLoss',
    sentence_transformer=False,
)

Downloading:   0%|          | 0.00/673 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.32G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.14k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/780k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

2022-06-25 14:45:33 - Use pytorch device: cuda
2022-06-25 14:45:33 - Read Triplet train dataset


Epoch:   0%|          | 0/20 [00:00<?, ?it/s]

Iteration:   0%|          | 0/46 [00:00<?, ?it/s]

2022-06-25 14:46:13 - Binary Accuracy Evaluation of the model on sts-dev dataset after epoch 0:
2022-06-25 14:46:13 - Accuracy with Cosine-Similarity:           72.36	(Threshold: 0.8106)
2022-06-25 14:46:13 - F1 with Cosine-Similarity:                 79.87	(Threshold: 0.7468)
2022-06-25 14:46:13 - Precision with Cosine-Similarity:          67.98
2022-06-25 14:46:13 - Recall with Cosine-Similarity:             96.80
2022-06-25 14:46:13 - Average Precision with Cosine-Similarity:  81.21

2022-06-25 14:46:13 - Accuracy with Manhatten-Distance:           72.36	(Threshold: 474.5625)
2022-06-25 14:46:13 - F1 with Manhatten-Distance:                 79.87	(Threshold: 552.6662)
2022-06-25 14:46:13 - Precision with Manhatten-Distance:          67.98
2022-06-25 14:46:13 - Recall with Manhatten-Distance:             96.80
2022-06-25 14:46:13 - Average Precision with Manhatten-Distance:  81.15

2022-06-25 14:46:13 - Accuracy with Euclidean-Distance:           73.37	(Threshold: 18.6928)
2022-06-25

Iteration:   0%|          | 0/46 [00:00<?, ?it/s]

2022-06-25 14:46:57 - Binary Accuracy Evaluation of the model on sts-dev dataset after epoch 1:
2022-06-25 14:46:58 - Accuracy with Cosine-Similarity:           70.85	(Threshold: 0.7664)
2022-06-25 14:46:58 - F1 with Cosine-Similarity:                 80.53	(Threshold: 0.7040)
2022-06-25 14:46:58 - Precision with Cosine-Similarity:          68.54
2022-06-25 14:46:58 - Recall with Cosine-Similarity:             97.60
2022-06-25 14:46:58 - Average Precision with Cosine-Similarity:  81.74

2022-06-25 14:46:58 - Accuracy with Manhatten-Distance:           71.36	(Threshold: 533.6433)
2022-06-25 14:46:58 - F1 with Manhatten-Distance:                 80.53	(Threshold: 592.7543)
2022-06-25 14:46:58 - Precision with Manhatten-Distance:          68.54
2022-06-25 14:46:58 - Recall with Manhatten-Distance:             97.60
2022-06-25 14:46:58 - Average Precision with Manhatten-Distance:  81.83

2022-06-25 14:46:58 - Accuracy with Euclidean-Distance:           70.85	(Threshold: 21.2381)
2022-06-25

Iteration:   0%|          | 0/46 [00:00<?, ?it/s]

2022-06-25 14:47:43 - Binary Accuracy Evaluation of the model on sts-dev dataset after epoch 2:
2022-06-25 14:47:43 - Accuracy with Cosine-Similarity:           67.84	(Threshold: 0.6147)
2022-06-25 14:47:43 - F1 with Cosine-Similarity:                 79.49	(Threshold: 0.6147)
2022-06-25 14:47:43 - Precision with Cosine-Similarity:          66.31
2022-06-25 14:47:43 - Recall with Cosine-Similarity:             99.20
2022-06-25 14:47:43 - Average Precision with Cosine-Similarity:  80.93

2022-06-25 14:47:43 - Accuracy with Manhatten-Distance:           67.84	(Threshold: 557.3306)
2022-06-25 14:47:43 - F1 with Manhatten-Distance:                 79.49	(Threshold: 674.0411)
2022-06-25 14:47:43 - Precision with Manhatten-Distance:          66.31
2022-06-25 14:47:43 - Recall with Manhatten-Distance:             99.20
2022-06-25 14:47:43 - Average Precision with Manhatten-Distance:  81.23

2022-06-25 14:47:43 - Accuracy with Euclidean-Distance:           67.84	(Threshold: 25.7982)
2022-06-25

Iteration:   0%|          | 0/46 [00:00<?, ?it/s]

2022-06-25 14:47:50 - Binary Accuracy Evaluation of the model on sts-dev dataset after epoch 3:
2022-06-25 14:47:50 - Accuracy with Cosine-Similarity:           68.84	(Threshold: 0.6773)
2022-06-25 14:47:50 - F1 with Cosine-Similarity:                 79.33	(Threshold: 0.6773)
2022-06-25 14:47:50 - Precision with Cosine-Similarity:          68.00
2022-06-25 14:47:50 - Recall with Cosine-Similarity:             95.20
2022-06-25 14:47:50 - Average Precision with Cosine-Similarity:  80.33

2022-06-25 14:47:50 - Accuracy with Manhatten-Distance:           68.34	(Threshold: 602.4142)
2022-06-25 14:47:50 - F1 with Manhatten-Distance:                 78.96	(Threshold: 641.9557)
2022-06-25 14:47:50 - Precision with Manhatten-Distance:          66.30
2022-06-25 14:47:50 - Recall with Manhatten-Distance:             97.60
2022-06-25 14:47:50 - Average Precision with Manhatten-Distance:  80.14

2022-06-25 14:47:50 - Accuracy with Euclidean-Distance:           68.84	(Threshold: 24.0574)
2022-06-25

Iteration:   0%|          | 0/46 [00:00<?, ?it/s]

2022-06-25 14:47:57 - Binary Accuracy Evaluation of the model on sts-dev dataset after epoch 4:
2022-06-25 14:47:57 - Accuracy with Cosine-Similarity:           68.84	(Threshold: 0.7195)
2022-06-25 14:47:57 - F1 with Cosine-Similarity:                 79.37	(Threshold: 0.6099)
2022-06-25 14:47:57 - Precision with Cosine-Similarity:          65.79
2022-06-25 14:47:57 - Recall with Cosine-Similarity:             100.00
2022-06-25 14:47:57 - Average Precision with Cosine-Similarity:  80.13

2022-06-25 14:47:57 - Accuracy with Manhatten-Distance:           69.35	(Threshold: 572.5992)
2022-06-25 14:47:57 - F1 with Manhatten-Distance:                 79.35	(Threshold: 630.6748)
2022-06-25 14:47:57 - Precision with Manhatten-Distance:          66.49
2022-06-25 14:47:57 - Recall with Manhatten-Distance:             98.40
2022-06-25 14:47:57 - Average Precision with Manhatten-Distance:  80.16

2022-06-25 14:47:57 - Accuracy with Euclidean-Distance:           68.84	(Threshold: 23.3673)
2022-06-2

Iteration:   0%|          | 0/46 [00:00<?, ?it/s]

2022-06-25 14:48:04 - Binary Accuracy Evaluation of the model on sts-dev dataset after epoch 5:
2022-06-25 14:48:04 - Accuracy with Cosine-Similarity:           69.85	(Threshold: 0.6816)
2022-06-25 14:48:04 - F1 with Cosine-Similarity:                 79.87	(Threshold: 0.6160)
2022-06-25 14:48:04 - Precision with Cosine-Similarity:          67.98
2022-06-25 14:48:04 - Recall with Cosine-Similarity:             96.80
2022-06-25 14:48:04 - Average Precision with Cosine-Similarity:  78.26

2022-06-25 14:48:04 - Accuracy with Manhatten-Distance:           68.84	(Threshold: 629.8688)
2022-06-25 14:48:04 - F1 with Manhatten-Distance:                 79.62	(Threshold: 750.1376)
2022-06-25 14:48:04 - Precision with Manhatten-Distance:          66.14
2022-06-25 14:48:04 - Recall with Manhatten-Distance:             100.00
2022-06-25 14:48:04 - Average Precision with Manhatten-Distance:  78.49

2022-06-25 14:48:04 - Accuracy with Euclidean-Distance:           69.85	(Threshold: 24.8456)
2022-06-2

Iteration:   0%|          | 0/46 [00:00<?, ?it/s]

2022-06-25 14:48:10 - Binary Accuracy Evaluation of the model on sts-dev dataset after epoch 6:
2022-06-25 14:48:10 - Accuracy with Cosine-Similarity:           69.85	(Threshold: 0.7180)
2022-06-25 14:48:10 - F1 with Cosine-Similarity:                 80.13	(Threshold: 0.6692)
2022-06-25 14:48:10 - Precision with Cosine-Similarity:          68.36
2022-06-25 14:48:10 - Recall with Cosine-Similarity:             96.80
2022-06-25 14:48:10 - Average Precision with Cosine-Similarity:  80.14

2022-06-25 14:48:10 - Accuracy with Manhatten-Distance:           70.85	(Threshold: 592.8418)
2022-06-25 14:48:10 - F1 with Manhatten-Distance:                 80.79	(Threshold: 603.7699)
2022-06-25 14:48:10 - Precision with Manhatten-Distance:          68.93
2022-06-25 14:48:10 - Recall with Manhatten-Distance:             97.60
2022-06-25 14:48:10 - Average Precision with Manhatten-Distance:  80.35

2022-06-25 14:48:10 - Accuracy with Euclidean-Distance:           70.35	(Threshold: 24.0450)
2022-06-25

Iteration:   0%|          | 0/46 [00:00<?, ?it/s]

2022-06-25 14:48:17 - Binary Accuracy Evaluation of the model on sts-dev dataset after epoch 7:
2022-06-25 14:48:17 - Accuracy with Cosine-Similarity:           71.36	(Threshold: 0.6704)
2022-06-25 14:48:17 - F1 with Cosine-Similarity:                 80.94	(Threshold: 0.6704)
2022-06-25 14:48:17 - Precision with Cosine-Similarity:          69.54
2022-06-25 14:48:17 - Recall with Cosine-Similarity:             96.80
2022-06-25 14:48:17 - Average Precision with Cosine-Similarity:  78.50

2022-06-25 14:48:17 - Accuracy with Manhatten-Distance:           69.85	(Threshold: 596.6606)
2022-06-25 14:48:17 - F1 with Manhatten-Distance:                 80.39	(Threshold: 648.5768)
2022-06-25 14:48:17 - Precision with Manhatten-Distance:          67.96
2022-06-25 14:48:17 - Recall with Manhatten-Distance:             98.40
2022-06-25 14:48:17 - Average Precision with Manhatten-Distance:  78.42

2022-06-25 14:48:17 - Accuracy with Euclidean-Distance:           71.36	(Threshold: 25.2994)
2022-06-25

Iteration:   0%|          | 0/46 [00:00<?, ?it/s]

2022-06-25 14:48:24 - Binary Accuracy Evaluation of the model on sts-dev dataset after epoch 8:
2022-06-25 14:48:24 - Accuracy with Cosine-Similarity:           70.35	(Threshold: 0.6866)
2022-06-25 14:48:24 - F1 with Cosine-Similarity:                 79.86	(Threshold: 0.6866)
2022-06-25 14:48:24 - Precision with Cosine-Similarity:          69.64
2022-06-25 14:48:24 - Recall with Cosine-Similarity:             93.60
2022-06-25 14:48:24 - Average Precision with Cosine-Similarity:  77.69

2022-06-25 14:48:24 - Accuracy with Manhatten-Distance:           68.84	(Threshold: 630.8164)
2022-06-25 14:48:24 - F1 with Manhatten-Distance:                 79.49	(Threshold: 683.0304)
2022-06-25 14:48:24 - Precision with Manhatten-Distance:          66.31
2022-06-25 14:48:24 - Recall with Manhatten-Distance:             99.20
2022-06-25 14:48:24 - Average Precision with Manhatten-Distance:  77.91

2022-06-25 14:48:24 - Accuracy with Euclidean-Distance:           70.35	(Threshold: 24.7897)
2022-06-25

Iteration:   0%|          | 0/46 [00:00<?, ?it/s]

2022-06-25 14:48:31 - Binary Accuracy Evaluation of the model on sts-dev dataset after epoch 9:
2022-06-25 14:48:31 - Accuracy with Cosine-Similarity:           70.35	(Threshold: 0.6602)
2022-06-25 14:48:31 - F1 with Cosine-Similarity:                 80.40	(Threshold: 0.6501)
2022-06-25 14:48:31 - Precision with Cosine-Similarity:          68.75
2022-06-25 14:48:31 - Recall with Cosine-Similarity:             96.80
2022-06-25 14:48:31 - Average Precision with Cosine-Similarity:  78.19

2022-06-25 14:48:31 - Accuracy with Manhatten-Distance:           69.35	(Threshold: 654.9673)
2022-06-25 14:48:31 - F1 with Manhatten-Distance:                 80.13	(Threshold: 664.1875)
2022-06-25 14:48:31 - Precision with Manhatten-Distance:          67.58
2022-06-25 14:48:31 - Recall with Manhatten-Distance:             98.40
2022-06-25 14:48:31 - Average Precision with Manhatten-Distance:  78.28

2022-06-25 14:48:31 - Accuracy with Euclidean-Distance:           70.85	(Threshold: 26.2027)
2022-06-25

Iteration:   0%|          | 0/46 [00:00<?, ?it/s]

2022-06-25 14:48:37 - Binary Accuracy Evaluation of the model on sts-dev dataset after epoch 10:
2022-06-25 14:48:38 - Accuracy with Cosine-Similarity:           67.84	(Threshold: 0.6477)
2022-06-25 14:48:38 - F1 with Cosine-Similarity:                 79.37	(Threshold: 0.5293)
2022-06-25 14:48:38 - Precision with Cosine-Similarity:          65.79
2022-06-25 14:48:38 - Recall with Cosine-Similarity:             100.00
2022-06-25 14:48:38 - Average Precision with Cosine-Similarity:  77.24

2022-06-25 14:48:38 - Accuracy with Manhatten-Distance:           67.84	(Threshold: 644.2015)
2022-06-25 14:48:38 - F1 with Manhatten-Distance:                 79.37	(Threshold: 725.1140)
2022-06-25 14:48:38 - Precision with Manhatten-Distance:          65.79
2022-06-25 14:48:38 - Recall with Manhatten-Distance:             100.00
2022-06-25 14:48:38 - Average Precision with Manhatten-Distance:  77.18

2022-06-25 14:48:38 - Accuracy with Euclidean-Distance:           68.34	(Threshold: 26.4855)
2022-06

Iteration:   0%|          | 0/46 [00:00<?, ?it/s]

2022-06-25 14:48:44 - Binary Accuracy Evaluation of the model on sts-dev dataset after epoch 11:
2022-06-25 14:48:44 - Accuracy with Cosine-Similarity:           68.34	(Threshold: 0.5525)
2022-06-25 14:48:44 - F1 with Cosine-Similarity:                 79.74	(Threshold: 0.5525)
2022-06-25 14:48:44 - Precision with Cosine-Similarity:          66.67
2022-06-25 14:48:44 - Recall with Cosine-Similarity:             99.20
2022-06-25 14:48:44 - Average Precision with Cosine-Similarity:  77.27

2022-06-25 14:48:44 - Accuracy with Manhatten-Distance:           68.84	(Threshold: 645.3147)
2022-06-25 14:48:44 - F1 with Manhatten-Distance:                 79.74	(Threshold: 645.3147)
2022-06-25 14:48:44 - Precision with Manhatten-Distance:          67.40
2022-06-25 14:48:44 - Recall with Manhatten-Distance:             97.60
2022-06-25 14:48:44 - Average Precision with Manhatten-Distance:  76.85

2022-06-25 14:48:44 - Accuracy with Euclidean-Distance:           68.34	(Threshold: 26.5242)
2022-06-2

Iteration:   0%|          | 0/46 [00:00<?, ?it/s]

2022-06-25 14:48:51 - Binary Accuracy Evaluation of the model on sts-dev dataset after epoch 12:
2022-06-25 14:48:51 - Accuracy with Cosine-Similarity:           68.84	(Threshold: 0.6763)
2022-06-25 14:48:51 - F1 with Cosine-Similarity:                 79.74	(Threshold: 0.5380)
2022-06-25 14:48:51 - Precision with Cosine-Similarity:          66.67
2022-06-25 14:48:51 - Recall with Cosine-Similarity:             99.20
2022-06-25 14:48:51 - Average Precision with Cosine-Similarity:  77.61

2022-06-25 14:48:51 - Accuracy with Manhatten-Distance:           68.84	(Threshold: 641.4353)
2022-06-25 14:48:51 - F1 with Manhatten-Distance:                 79.49	(Threshold: 701.6359)
2022-06-25 14:48:51 - Precision with Manhatten-Distance:          66.31
2022-06-25 14:48:51 - Recall with Manhatten-Distance:             99.20
2022-06-25 14:48:51 - Average Precision with Manhatten-Distance:  77.44

2022-06-25 14:48:51 - Accuracy with Euclidean-Distance:           68.84	(Threshold: 25.3202)
2022-06-2

Iteration:   0%|          | 0/46 [00:00<?, ?it/s]

2022-06-25 14:48:58 - Binary Accuracy Evaluation of the model on sts-dev dataset after epoch 13:
2022-06-25 14:48:58 - Accuracy with Cosine-Similarity:           68.84	(Threshold: 0.6441)
2022-06-25 14:48:58 - F1 with Cosine-Similarity:                 79.62	(Threshold: 0.5169)
2022-06-25 14:48:58 - Precision with Cosine-Similarity:          66.14
2022-06-25 14:48:58 - Recall with Cosine-Similarity:             100.00
2022-06-25 14:48:58 - Average Precision with Cosine-Similarity:  76.70

2022-06-25 14:48:58 - Accuracy with Manhatten-Distance:           68.34	(Threshold: 628.8334)
2022-06-25 14:48:58 - F1 with Manhatten-Distance:                 79.49	(Threshold: 684.9885)
2022-06-25 14:48:58 - Precision with Manhatten-Distance:          66.31
2022-06-25 14:48:58 - Recall with Manhatten-Distance:             99.20
2022-06-25 14:48:58 - Average Precision with Manhatten-Distance:  76.53

2022-06-25 14:48:58 - Accuracy with Euclidean-Distance:           68.84	(Threshold: 26.5013)
2022-06-

Iteration:   0%|          | 0/46 [00:00<?, ?it/s]

2022-06-25 14:49:05 - Binary Accuracy Evaluation of the model on sts-dev dataset after epoch 14:
2022-06-25 14:49:05 - Accuracy with Cosine-Similarity:           68.84	(Threshold: 0.6492)
2022-06-25 14:49:05 - F1 with Cosine-Similarity:                 79.62	(Threshold: 0.5005)
2022-06-25 14:49:05 - Precision with Cosine-Similarity:          66.14
2022-06-25 14:49:05 - Recall with Cosine-Similarity:             100.00
2022-06-25 14:49:05 - Average Precision with Cosine-Similarity:  76.71

2022-06-25 14:49:05 - Accuracy with Manhatten-Distance:           68.84	(Threshold: 637.6531)
2022-06-25 14:49:05 - F1 with Manhatten-Distance:                 79.74	(Threshold: 646.8878)
2022-06-25 14:49:05 - Precision with Manhatten-Distance:          67.40
2022-06-25 14:49:05 - Recall with Manhatten-Distance:             97.60
2022-06-25 14:49:05 - Average Precision with Manhatten-Distance:  76.50

2022-06-25 14:49:05 - Accuracy with Euclidean-Distance:           68.84	(Threshold: 26.1983)
2022-06-

Iteration:   0%|          | 0/46 [00:00<?, ?it/s]

2022-06-25 14:49:11 - Binary Accuracy Evaluation of the model on sts-dev dataset after epoch 15:
2022-06-25 14:49:12 - Accuracy with Cosine-Similarity:           68.34	(Threshold: 0.5617)
2022-06-25 14:49:12 - F1 with Cosine-Similarity:                 79.74	(Threshold: 0.5379)
2022-06-25 14:49:12 - Precision with Cosine-Similarity:          66.67
2022-06-25 14:49:12 - Recall with Cosine-Similarity:             99.20
2022-06-25 14:49:12 - Average Precision with Cosine-Similarity:  77.54

2022-06-25 14:49:12 - Accuracy with Manhatten-Distance:           68.84	(Threshold: 656.0486)
2022-06-25 14:49:12 - F1 with Manhatten-Distance:                 79.61	(Threshold: 656.0486)
2022-06-25 14:49:12 - Precision with Manhatten-Distance:          67.60
2022-06-25 14:49:12 - Recall with Manhatten-Distance:             96.80
2022-06-25 14:49:12 - Average Precision with Manhatten-Distance:  77.07

2022-06-25 14:49:12 - Accuracy with Euclidean-Distance:           68.34	(Threshold: 29.4185)
2022-06-2

Iteration:   0%|          | 0/46 [00:00<?, ?it/s]

2022-06-25 14:49:18 - Binary Accuracy Evaluation of the model on sts-dev dataset after epoch 16:
2022-06-25 14:49:18 - Accuracy with Cosine-Similarity:           68.34	(Threshold: 0.5902)
2022-06-25 14:49:18 - F1 with Cosine-Similarity:                 79.49	(Threshold: 0.5343)
2022-06-25 14:49:18 - Precision with Cosine-Similarity:          66.31
2022-06-25 14:49:18 - Recall with Cosine-Similarity:             99.20
2022-06-25 14:49:18 - Average Precision with Cosine-Similarity:  77.02

2022-06-25 14:49:18 - Accuracy with Manhatten-Distance:           68.84	(Threshold: 648.6454)
2022-06-25 14:49:18 - F1 with Manhatten-Distance:                 79.61	(Threshold: 648.6454)
2022-06-25 14:49:18 - Precision with Manhatten-Distance:          67.60
2022-06-25 14:49:18 - Recall with Manhatten-Distance:             96.80
2022-06-25 14:49:18 - Average Precision with Manhatten-Distance:  76.89

2022-06-25 14:49:18 - Accuracy with Euclidean-Distance:           68.34	(Threshold: 27.3563)
2022-06-2

Iteration:   0%|          | 0/46 [00:00<?, ?it/s]

2022-06-25 14:49:25 - Binary Accuracy Evaluation of the model on sts-dev dataset after epoch 17:
2022-06-25 14:49:25 - Accuracy with Cosine-Similarity:           67.84	(Threshold: 0.6526)
2022-06-25 14:49:25 - F1 with Cosine-Similarity:                 79.49	(Threshold: 0.5336)
2022-06-25 14:49:25 - Precision with Cosine-Similarity:          66.31
2022-06-25 14:49:25 - Recall with Cosine-Similarity:             99.20
2022-06-25 14:49:25 - Average Precision with Cosine-Similarity:  76.88

2022-06-25 14:49:25 - Accuracy with Manhatten-Distance:           67.84	(Threshold: 640.6236)
2022-06-25 14:49:25 - F1 with Manhatten-Distance:                 79.35	(Threshold: 670.8201)
2022-06-25 14:49:25 - Precision with Manhatten-Distance:          66.49
2022-06-25 14:49:25 - Recall with Manhatten-Distance:             98.40
2022-06-25 14:49:25 - Average Precision with Manhatten-Distance:  76.64

2022-06-25 14:49:25 - Accuracy with Euclidean-Distance:           67.84	(Threshold: 26.1675)
2022-06-2

Iteration:   0%|          | 0/46 [00:00<?, ?it/s]

2022-06-25 14:49:32 - Binary Accuracy Evaluation of the model on sts-dev dataset after epoch 18:
2022-06-25 14:49:32 - Accuracy with Cosine-Similarity:           67.84	(Threshold: 0.6517)
2022-06-25 14:49:32 - F1 with Cosine-Similarity:                 79.49	(Threshold: 0.5324)
2022-06-25 14:49:32 - Precision with Cosine-Similarity:          66.31
2022-06-25 14:49:32 - Recall with Cosine-Similarity:             99.20
2022-06-25 14:49:32 - Average Precision with Cosine-Similarity:  77.18

2022-06-25 14:49:32 - Accuracy with Manhatten-Distance:           67.84	(Threshold: 642.7328)
2022-06-25 14:49:32 - F1 with Manhatten-Distance:                 79.23	(Threshold: 709.7477)
2022-06-25 14:49:32 - Precision with Manhatten-Distance:          65.96
2022-06-25 14:49:32 - Recall with Manhatten-Distance:             99.20
2022-06-25 14:49:32 - Average Precision with Manhatten-Distance:  76.72

2022-06-25 14:49:32 - Accuracy with Euclidean-Distance:           67.84	(Threshold: 26.2034)
2022-06-2

Iteration:   0%|          | 0/46 [00:00<?, ?it/s]

2022-06-25 14:49:39 - Binary Accuracy Evaluation of the model on sts-dev dataset after epoch 19:
2022-06-25 14:49:39 - Accuracy with Cosine-Similarity:           67.84	(Threshold: 0.6506)
2022-06-25 14:49:39 - F1 with Cosine-Similarity:                 79.49	(Threshold: 0.5303)
2022-06-25 14:49:39 - Precision with Cosine-Similarity:          66.31
2022-06-25 14:49:39 - Recall with Cosine-Similarity:             99.20
2022-06-25 14:49:39 - Average Precision with Cosine-Similarity:  77.10

2022-06-25 14:49:39 - Accuracy with Manhatten-Distance:           68.34	(Threshold: 651.1418)
2022-06-25 14:49:39 - F1 with Manhatten-Distance:                 79.37	(Threshold: 745.2222)
2022-06-25 14:49:39 - Precision with Manhatten-Distance:          65.79
2022-06-25 14:49:39 - Recall with Manhatten-Distance:             100.00
2022-06-25 14:49:39 - Average Precision with Manhatten-Distance:  76.70

2022-06-25 14:49:39 - Accuracy with Euclidean-Distance:           67.84	(Threshold: 26.2521)
2022-06-

In [28]:
# Task A validity run with multiple-negatives ranking loss: 3 epochs, batch size 8.
train_model(
    taska_training_df,
    taska_valid_df,
    '../data/output/',
    'sentence-transformers/nli-roberta-large',
    num_epochs=3,
    train_batch_size=8,
    model_suffix='ranking-loss',
    max_seq_length=256,
    special_tokens=[],
    loss='MultipleNegativesRankingLoss',
    sentence_transformer=False,
)

2022-06-26 19:16:36 - Use pytorch device: cuda
2022-06-26 19:16:36 - Read Triplet train dataset
Len of training: 401
Len of Dev: 199


Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Iteration:   0%|          | 0/50 [00:00<?, ?it/s]

2022-06-26 19:16:39 - Binary Accuracy Evaluation of the model on sts-dev dataset in epoch 0 after 20 steps:
2022-06-26 19:16:39 - Accuracy with Cosine-Similarity:           71.86	(Threshold: 0.7185)
2022-06-26 19:16:39 - F1 with Cosine-Similarity:                 79.45	(Threshold: 0.6203)
2022-06-26 19:16:39 - Precision with Cosine-Similarity:          69.46
2022-06-26 19:16:39 - Recall with Cosine-Similarity:             92.80
2022-06-26 19:16:39 - Average Precision with Cosine-Similarity:  83.44

2022-06-26 19:16:39 - Accuracy with Manhatten-Distance:           74.87	(Threshold: 598.0983)
2022-06-26 19:16:39 - F1 with Manhatten-Distance:                 80.92	(Threshold: 611.8794)
2022-06-26 19:16:39 - Precision with Manhatten-Distance:          77.37
2022-06-26 19:16:39 - Recall with Manhatten-Distance:             84.80
2022-06-26 19:16:39 - Average Precision with Manhatten-Distance:  83.71

2022-06-26 19:16:39 - Accuracy with Euclidean-Distance:           72.36	(Threshold: 22.8899

Iteration:   0%|          | 0/50 [00:00<?, ?it/s]

2022-06-26 19:17:10 - Binary Accuracy Evaluation of the model on sts-dev dataset in epoch 1 after 20 steps:
2022-06-26 19:17:10 - Accuracy with Cosine-Similarity:           66.83	(Threshold: 0.4866)
2022-06-26 19:17:10 - F1 with Cosine-Similarity:                 79.11	(Threshold: 0.4671)
2022-06-26 19:17:10 - Precision with Cosine-Similarity:          65.45
2022-06-26 19:17:10 - Recall with Cosine-Similarity:             100.00
2022-06-26 19:17:10 - Average Precision with Cosine-Similarity:  80.12

2022-06-26 19:17:10 - Accuracy with Manhatten-Distance:           67.34	(Threshold: 700.8440)
2022-06-26 19:17:10 - F1 with Manhatten-Distance:                 79.11	(Threshold: 803.3531)
2022-06-26 19:17:10 - Precision with Manhatten-Distance:          65.45
2022-06-26 19:17:10 - Recall with Manhatten-Distance:             100.00
2022-06-26 19:17:10 - Average Precision with Manhatten-Distance:  81.33

2022-06-26 19:17:10 - Accuracy with Euclidean-Distance:           66.83	(Threshold: 30.51

Iteration:   0%|          | 0/50 [00:00<?, ?it/s]

2022-06-26 19:17:18 - Binary Accuracy Evaluation of the model on sts-dev dataset in epoch 2 after 20 steps:
2022-06-26 19:17:18 - Accuracy with Cosine-Similarity:           66.33	(Threshold: 0.6001)
2022-06-26 19:17:18 - F1 with Cosine-Similarity:                 78.86	(Threshold: 0.4173)
2022-06-26 19:17:18 - Precision with Cosine-Similarity:          65.10
2022-06-26 19:17:18 - Recall with Cosine-Similarity:             100.00
2022-06-26 19:17:18 - Average Precision with Cosine-Similarity:  80.45

2022-06-26 19:17:18 - Accuracy with Manhatten-Distance:           67.84	(Threshold: 586.1477)
2022-06-26 19:17:18 - F1 with Manhatten-Distance:                 79.11	(Threshold: 820.2714)
2022-06-26 19:17:18 - Precision with Manhatten-Distance:          65.45
2022-06-26 19:17:18 - Recall with Manhatten-Distance:             100.00
2022-06-26 19:17:18 - Average Precision with Manhatten-Distance:  81.52

2022-06-26 19:17:18 - Accuracy with Euclidean-Distance:           66.83	(Threshold: 27.34

### Using the auto-generated conclusions:

In [None]:
# FIXME: '../data/' is a directory, not a CSV file. The file name of the training set with
# auto-generated conclusions is missing, so this read_csv call cannot succeed as written.
taska_training_df = pd.read_csv('../data/')
# Dev data is read from the Task A dev CSV.
taska_valid_df = pd.read_csv('../data/TaskA_dev.csv')

In [None]:
# Mapping labels
# Drop rows whose Validity is 0. `.copy()` makes each result an independent frame, so the
# column assignments below do not write into a slice of the loaded data
# (the pattern that triggers pandas' SettingWithCopyWarning).
taska_training_df = taska_training_df[taska_training_df.Validity != 0].copy()
taska_valid_df = taska_valid_df[taska_valid_df.Validity != 0].copy()

# Binary target: 1 when Validity == 1, otherwise 0 (after the filter above that means -1 -> 0).
taska_training_df['label'] = taska_training_df.Validity.eq(1).astype('int64')
taska_valid_df['label'] = taska_valid_df.Validity.eq(1).astype('int64')

# Prefix every premise with its topic ("<topic> : <premise>").
# NOTE: this overwrites 'Premise' in place, so re-running this cell without re-running the
# loading cell first prepends the topic a second time.
taska_training_df['Premise'] = taska_training_df.apply(lambda row: row['topic'] + ' : ' + row['Premise'], axis=1)
taska_valid_df['Premise'] = taska_valid_df.apply(lambda row: row['topic'] + ' : ' + row['Premise'], axis=1)