# Textual Entailment

In [1]:
# Print the GPU status (nvidia-smi) and the CPU description (lscpu) of this runtime.
!nvidia-smi
!lscpu

Wed Aug  4 13:49:31 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.42.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   71C    P8    32W / 149W |      0MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
!pip install hazm==0.7.0
!pip install seqeval==1.2.2
!pip install sentencepiece==0.1.96
!pip install transformers==4.7.0
!pip install sentence-transformers==2.0.0

Collecting hazm==0.7.0
  Downloading hazm-0.7.0-py3-none-any.whl (316 kB)
[K     |████████████████████████████████| 316 kB 4.1 MB/s 
[?25hCollecting nltk==3.3
  Downloading nltk-3.3.0.zip (1.4 MB)
[K     |████████████████████████████████| 1.4 MB 7.3 MB/s 
[?25hCollecting libwapiti>=0.2.1
  Downloading libwapiti-0.2.1.tar.gz (233 kB)
[K     |████████████████████████████████| 233 kB 18.2 MB/s 
Building wheels for collected packages: nltk, libwapiti
  Building wheel for nltk (setup.py) ... [?25l[?25hdone
  Created wheel for nltk: filename=nltk-3.3-py3-none-any.whl size=1394490 sha256=0c8c9124afdeea24b8d35cac17a04789385683f08a0713fb3fdcc323adc13c06
  Stored in directory: /root/.cache/pip/wheels/9b/fd/0c/d92302c876e5de87ebd7fc0979d82edb93e2d8d768bf71fac4
  Building wheel for libwapiti (setup.py) ... [?25l[?25hdone
  Created wheel for libwapiti: filename=libwapiti-0.2.1-cp37-cp37m-linux_x86_64.whl size=154221 sha256=c309a719ade78c4d63e67fd0d27998c3863d6d7f101307f5c640123bcdad4804
  

In [3]:
# Install PyDrive (unpinned) and build a Google Drive client for this Colab session.
!pip install PyDrive
import os
import IPython.display as ipd
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
# Authenticate the Colab user, then hand the application-default credentials to PyDrive.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
# `drive` is the PyDrive client built from the authenticated `gauth` object.
drive = GoogleDrive(gauth)



In [4]:
# Import required packages
import os
import gc
import re
import hazm
import time
import json
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader

import transformers
from transformers import AutoConfig, AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import MT5Config, MT5ForConditionalGeneration, MT5Tokenizer

from sentence_transformers import models, SentenceTransformer, util

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Report the versions of the core libraries, framed by one blank line on each side.
print(
    '',
    f'numpy {np.__version__}',
    f'pandas {pd.__version__}',
    f'transformers {transformers.__version__}',
    f'torch {torch.__version__}',
    '',
    sep='\n',
)

# Pick the compute device: fall back to the CPU unless CUDA is usable.
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    # Tell PyTorch to use the GPU and report which one it is.
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))



numpy 1.19.5
pandas 1.1.5
transformers 4.7.0
torch 1.9.0+cu102

There are 1 GPU(s) available.
We will use the GPU: Tesla K80


In [5]:
class TextualEntailmentDataset(torch.utils.data.Dataset):
    """PyTorch dataset over ``premise<sep>hypothesis`` strings.

    Every sample is tokenized lazily in ``__getitem__`` and padded/truncated to
    ``max_length``. Labels are mapped to integer indices through ``label_list``.
    """

    def __init__(self, premises_hypotheses, targets, label_list, tokenizer, model_type, max_length):
        self.premises_hypotheses = premises_hypotheses
        self.targets = targets
        self.tokenizer = tokenizer
        self.model_type = model_type
        self.max_length = max_length
        # Lookup tables between label names and indices; they stay empty
        # when `label_list` is not a list.
        self.label2index = {}
        if isinstance(label_list, list):
            for position, name in enumerate(label_list):
                self.label2index[name] = position
        self.index2label = {position: name for name, position in self.label2index.items()}

    def __len__(self):
        return len(self.premises_hypotheses)

    def __getitem__(self, item):
        pair = self.premises_hypotheses[item]
        encoding = self.tokenizer(
            pair,
            add_special_tokens=True,
            max_length=self.max_length,
            truncation=True,
            padding='max_length',
            return_tensors="pt"
        )
        # Index 0 of the split is the premise, index 1 the hypothesis.
        pieces = pair.split('<sep>')
        gold = self.targets[item]
        sample = {
            'premise': pieces[0],
            'hypothesis': pieces[1],
            'pair': pair,
            'targets': torch.tensor(self.label2index[gold], dtype=torch.long),
            'original_targets': gold,
            'input_ids': encoding.input_ids.flatten(),
            'attention_mask': encoding.attention_mask.flatten(),
        }
        # Segment ids are only exposed for the non-mt5 model types.
        if self.model_type != "mt5":
            sample['token_type_ids'] = encoding['token_type_ids'].flatten()
        return sample


class TextualEntailmentDataset2(torch.utils.data.Dataset):
    """PyTorch dataset over separate premise and hypothesis sequences.

    Every ``(premise, hypothesis)`` pair is tokenized lazily in ``__getitem__``
    and padded/truncated to ``max_length``. Labels are mapped to integer
    indices through ``label_list``.
    """

    def __init__(self, premises, hypotheses, targets, label_list, tokenizer, model_type, max_length):
        self.premises = premises
        self.hypotheses = hypotheses
        self.targets = targets
        self.tokenizer = tokenizer
        self.model_type = model_type
        self.max_length = max_length
        # Lookup tables between label names and indices; they stay empty
        # when `label_list` is not a list.
        self.label2index = {}
        if isinstance(label_list, list):
            for position, name in enumerate(label_list):
                self.label2index[name] = position
        self.index2label = {position: name for name, position in self.label2index.items()}

    def __len__(self):
        return len(self.premises)

    def __getitem__(self, item):
        # The tokenizer receives a batch holding exactly one (premise, hypothesis) pair.
        encoding = self.tokenizer(
            [(self.premises[item], self.hypotheses[item])],
            add_special_tokens=True,
            max_length=self.max_length,
            truncation=True,
            padding='max_length',
            return_tensors="pt"
        )
        sample = {
            'premise': self.premises[item],
            'hypothesis': self.hypotheses[item],
            'targets': torch.tensor(self.label2index[self.targets[item]], dtype=torch.long),
            'original_targets': self.targets[item],
            'input_ids': encoding.input_ids.flatten(),
            'attention_mask': encoding.attention_mask.flatten(),
        }
        # Segment ids are only exposed for the non-mt5 model types.
        if self.model_type != "mt5":
            sample['token_type_ids'] = encoding['token_type_ids'].flatten()
        return sample


class TextualEntailmentSimilarityDataset(torch.utils.data.Dataset):
    """PyTorch dataset that serves aligned (premise, hypothesis, target) triples.

    Items are returned untouched together with their position, so a caller can
    map a batched element back to its source index.
    """

    def __init__(self, premises, hypotheses, targets):
        self.premises, self.hypotheses, self.targets = premises, hypotheses, targets

    def __len__(self):
        return len(self.premises)

    def __getitem__(self, item):
        return dict(
            item=item,
            premise=self.premises[item],
            hypothesis=self.hypotheses[item],
            targets=self.targets[item],
        )


class TextualEntailment:
    def __init__(self, model_name, model_type, label_list):
        self.normalizer = hazm.Normalizer()
        self.model_name = model_name
        self.model_type = model_type.lower()
        self.label_list = label_list
        if self.model_type == "mt5":
            self.tokenizer = MT5Tokenizer.from_pretrained(model_name)
            self.model = MT5ForConditionalGeneration.from_pretrained(model_name)
            self.config = MT5Config.from_pretrained(self.model_name)
        elif self.model_type == "sentence-transformer":
            word_embedding_model = models.Transformer(model_name)
            pooling_model = models.Pooling(
                word_embedding_model.get_word_embedding_dimension(),
                pooling_mode_mean_tokens=True,
                pooling_mode_cls_token=False,
                pooling_mode_max_tokens=False
            )
            self.model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
            self.config = AutoConfig.from_pretrained(self.model_name)
        else:
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
            self.model = AutoModelForSequenceClassification.from_pretrained(self.model_name)
            self.config = AutoConfig.from_pretrained(self.model_name)
            self.id2label = self.config.id2label
            self.label2id = self.config.label2id

    def load_dataset_test_file(self, dataset_name, dataset_file, **kwargs):
        if dataset_name.lower() in ["parsinlu-natural", "parsinlu-mnli", "parsinlu-farstail"]:
            if not os.path.exists(dataset_file):
                print(f'{dataset_file} not exists!')
                return
            data = pd.read_csv(dataset_file, delimiter="\t", names=['premise_hypothesis', 'label'], header=None)

            # cleaning labels
            valid_labels = ['e', 'n', 'c']
            data['label'] = data['label'].apply(lambda r: r if r in valid_labels else None)
            data = data.dropna(subset=['label'])
            data = data.reset_index(drop=True)

            if 'label_map' in kwargs:
                data['label'] = data['label'].apply(lambda l: kwargs['label_map'][l])

            premise_hypothesis, labels = data['premise_hypothesis'].values.tolist(), data['label'].values.tolist()
            print(f'test part:\n #premise_hypothesis: {len(premise_hypothesis)}, #label: {len(labels)}')
            return premise_hypothesis, labels
        if dataset_name.lower() == "farstail":
            if not os.path.exists(dataset_file):
                print(f'{dataset_file} not exists!')
                return
            data = pd.read_csv(dataset_file, sep='\t')

            if 'label_map' in kwargs:
                data['label'] = data['label'].apply(lambda l: kwargs['label_map'][l])

            premises = data['premise'].values.tolist()
            hypotheses = data['hypothesis'].values.tolist()
            labels = data['label'].values.tolist()
            print(f'test part:\n #premise: {len(premises)}, #hypothesis: {len(hypotheses)}, #label: {len(labels)}')
            return premises, hypotheses, labels

    def textual_entailment_inference(self, premises, hypotheses, device, max_length):
        if not self.model or not self.tokenizer or not self.id2label:
            print('Something wrong has been happened!')
            return

        new_input = []
        for p, h in zip(premises, hypotheses):
            new_input.append((p, h))

        tokenized_batch = self.tokenizer(
            new_input,
            padding=True,
            truncation=True,
            max_length=max_length,
            return_tensors="pt"
        )

        gc.collect()
        torch.cuda.empty_cache()
        # Tell pytorch to run this model on the GPU.
        if device.type != 'cpu':
            self.model.cuda()
        self.model.eval()

        tokenized_batch = tokenized_batch.to(device)
        outputs = self.model(**tokenized_batch)
        pt_predictions = torch.argmax(F.softmax(outputs.logits, dim=1), dim=1)

        output_predictions = []
        for i, premise in enumerate(premises):
            output_predictions.append(
                (premise, hypotheses[i], pt_predictions[i].item(), self.label_list[pt_predictions[i].item()])
            )
        return output_predictions

    def mt5_textual_entailment_inference(self, premises, hypotheses, device):
        if not self.model or not self.tokenizer:
            print('Something wrong has been happened!')
            return

        new_input = []
        for p, h in zip(premises, hypotheses):
            new_input.append(p + "<sep>" + h)

        tokenized_batch = self.tokenizer(
            new_input,
            padding=True,
            return_tensors="pt"
        )

        gc.collect()
        torch.cuda.empty_cache()
        # Tell pytorch to run this model on the GPU.
        if device.type != 'cpu':
            self.model.cuda()
        self.model.eval()

        input_ids = tokenized_batch.input_ids.to(device)
        attention_mask = tokenized_batch.attention_mask.to(device)
        outputs = self.model.generate(input_ids=input_ids,
                                      attention_mask=attention_mask)
        predictions = self.tokenizer.batch_decode(outputs, skip_special_tokens=True)
        return predictions

    def sentence_transformer_textual_entailment_inference(self, premises, hypotheses, device, label_map=None):
        if not self.model or not self.tokenizer:
            print('Something wrong has been happened!')
            return
        if label_map is None:
            label_map = {"0<=score<0.4": "contradiction", "0.4<=score<=0.6": "neutral", "0.6<score<=1": "entailment"}
            print(f"Setting default value for label map: {label_map}")
        elif not all('score' in cond for cond in label_map.keys()):
            print(f"All the key of label_map must contain a condition on `score` variable.\n"
                  f"For example: label_map ={label_map}")
            return

        gc.collect()
        torch.cuda.empty_cache()
        # Tell pytorch to run this model on the GPU.
        if device.type != 'cpu':
            self.model.cuda()
        self.model.eval()

        premises_embeddings = self.model.encode(premises, convert_to_tensor=True, show_progress_bar=True)
        hypotheses_embeddings = self.model.encode(hypotheses, convert_to_tensor=True, show_progress_bar=True)

        # Compute the pair-wise cosine similarities
        similarity_scores, predicted_labels = [], []
        for i in range(len(premises)):
            cosine_score = \
                util.pytorch_cos_sim(premises_embeddings[i], hypotheses_embeddings[i]).cpu().detach().numpy()[0][0]
            similarity_scores.append(cosine_score)
            predicted_label = ''
            for exp in label_map:
                if eval(exp.replace('score', str(cosine_score))):
                    predicted_label = label_map[exp]
                    break
            predicted_labels.append(predicted_label)

        output_predictions = []
        for i, premise in enumerate(premises):
            output_predictions.append(
                (premise, hypotheses[i], similarity_scores[i], predicted_labels[i])
            )
        return output_predictions

    def evaluation(self, premise_hypothesis, labels, device, max_length, batch_size=4):
        if not self.model or not self.tokenizer or not self.id2label:
            print('Something wrong has been happened!')
            return
        label_count = {label: labels.count(label) for label in labels}
        print("label_count:", label_count)
        dataset = TextualEntailmentDataset(premises_hypotheses=premise_hypothesis, targets=labels,
                                           label_list=self.label_list, tokenizer=self.tokenizer,
                                           model_type=self.model_type, max_length=max_length)
        data_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size)
        print(f'#premise_hypothesis:{len(premise_hypothesis)}, #labels:{len(labels)}')
        print("#batch:", len(data_loader))

        gc.collect()
        torch.cuda.empty_cache()
        # Tell pytorch to run this model on the GPU.
        if device.type != 'cpu':
            self.model.cuda()
        self.model.eval()

        total_loss, total_time = 0, 0
        output_predictions = []
        golden_labels, predicted_labels = [], []
        print("Start to evaluate test data ...")
        for step, batch in enumerate(data_loader):
            b_premises = batch['premise']
            b_input_ids = batch['input_ids']
            b_attention_mask = batch['attention_mask']
            b_token_type_ids = batch['token_type_ids']
            b_targets = batch['targets']

            # move tensors to GPU if CUDA is available
            b_input_ids = b_input_ids.to(device)
            b_attention_mask = b_attention_mask.to(device)
            b_token_type_ids = b_token_type_ids.to(device)
            b_targets = b_targets.to(device)

            # This will return the loss (rather than the model output) because we have provided the `labels`.
            with torch.no_grad():
                start = time.monotonic()
                b_outputs = self.model(input_ids=b_input_ids, attention_mask=b_attention_mask,
                                       token_type_ids=b_token_type_ids, labels=b_targets)
                end = time.monotonic()
                total_time += end - start
                print(f'inference time for step {step}: {end - start}')
            # get the loss
            total_loss += b_outputs.loss.item()

            b_original_targets = batch['original_targets']
            golden_labels.extend(b_original_targets)

            b_predictions = torch.argmax(F.softmax(b_outputs.logits, dim=1), dim=1)
            b_predictions = b_predictions.cpu().detach().numpy().tolist()
            b_predictions = [dataset.index2label[label] for label in b_predictions]
            predicted_labels.extend(b_predictions)

            for i, premise in enumerate(b_premises):
                output_predictions.append((
                    premise,
                    batch['hypothesis'][i],
                    b_original_targets[i],
                    b_predictions[i]
                ))

        # Calculate the average loss over the training data.
        avg_train_loss = total_loss / len(data_loader)
        print("average loss:", avg_train_loss)
        print("total inference time:", total_time)
        print("total inference time / #samples:", total_time / len(premise_hypothesis))

        # evaluate
        print("Test Accuracy: {}".format(accuracy_score(golden_labels, predicted_labels)))
        print("Test Precision: {}".format(precision_score(golden_labels, predicted_labels, average="weighted")))
        print("Test Recall: {}".format(recall_score(golden_labels, predicted_labels, average="weighted")))
        print("Test F1-Score(weighted average): {}".format(
            f1_score(golden_labels, predicted_labels, average="weighted")))
        print("Test classification Report:\n{}".format(classification_report(
            golden_labels, predicted_labels, digits=10)))
        return output_predictions

    def evaluation_2(self, premises, hypotheses, labels, device, max_length, batch_size=4):
        if not self.model or not self.tokenizer or not self.id2label:
            print('Something wrong has been happened!')
            return
        label_count = {label: labels.count(label) for label in labels}
        print("label_count:", label_count)
        dataset = TextualEntailmentDataset2(premises=premises, hypotheses=hypotheses, targets=labels,
                                            label_list=self.label_list, tokenizer=self.tokenizer,
                                            model_type=self.model_type, max_length=max_length)
        data_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size)
        print(f'#premises:{len(premises)}, #hypotheses:{len(hypotheses)}, #labels:{len(labels)}')
        print("#batch:", len(data_loader))

        gc.collect()
        torch.cuda.empty_cache()
        # Tell pytorch to run this model on the GPU.
        if device.type != 'cpu':
            self.model.cuda()
        self.model.eval()

        total_loss, total_time = 0, 0
        output_predictions = []
        golden_labels, predicted_labels = [], []
        print("Start to evaluate test data ...")
        for step, batch in enumerate(data_loader):
            b_premises = batch['premise']
            b_input_ids = batch['input_ids']
            b_attention_mask = batch['attention_mask']
            b_token_type_ids = batch['token_type_ids']
            b_targets = batch['targets']

            # move tensors to GPU if CUDA is available
            b_input_ids = b_input_ids.to(device)
            b_attention_mask = b_attention_mask.to(device)
            b_token_type_ids = b_token_type_ids.to(device)
            b_targets = b_targets.to(device)

            # This will return the loss (rather than the model output) because we have provided the `labels`.
            with torch.no_grad():
                start = time.monotonic()
                b_outputs = self.model(input_ids=b_input_ids, attention_mask=b_attention_mask,
                                       token_type_ids=b_token_type_ids, labels=b_targets)
                end = time.monotonic()
                total_time += end - start
                print(f'inference time for step {step}: {end - start}')
            # get the loss
            total_loss += b_outputs.loss.item()

            b_original_targets = batch['original_targets']
            golden_labels.extend(b_original_targets)

            b_predictions = torch.argmax(F.softmax(b_outputs.logits, dim=1), dim=1)
            b_predictions = b_predictions.cpu().detach().numpy().tolist()
            b_predictions = [dataset.index2label[label] for label in b_predictions]
            predicted_labels.extend(b_predictions)

            for i, premise in enumerate(b_premises):
                output_predictions.append((
                    premise,
                    batch['hypothesis'][i],
                    b_original_targets[i],
                    b_predictions[i]
                ))

        # Calculate the average loss over the training data.
        avg_train_loss = total_loss / len(data_loader)
        print("average loss:", avg_train_loss)
        print("total inference time:", total_time)
        print("total inference time / #samples:", total_time / len(premises))

        # evaluate
        print("Test Accuracy: {}".format(accuracy_score(golden_labels, predicted_labels)))
        print("Test Precision: {}".format(precision_score(golden_labels, predicted_labels, average="weighted")))
        print("Test Recall: {}".format(recall_score(golden_labels, predicted_labels, average="weighted")))
        print("Test F1-Score(weighted average): {}".format(
            f1_score(golden_labels, predicted_labels, average="weighted")))
        print("Test classification Report:\n{}".format(classification_report(
            golden_labels, predicted_labels, digits=10)))
        return output_predictions

    def mt5_evaluation(self, premise_hypothesis, labels, device, max_length, batch_size=4):
        if not self.model or not self.tokenizer:
            print('Something wrong has been happened!')
            return
        if len(premise_hypothesis) != len(labels):
            print('length of inputs and labels is not equal!!')
            return

        dataset = TextualEntailmentDataset(premises_hypotheses=premise_hypothesis, targets=labels,
                                           label_list=self.label_list, tokenizer=self.tokenizer,
                                           model_type=self.model_type, max_length=max_length)
        data_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size)
        print(f'#premise_hypothesis:{len(premise_hypothesis)}, #labels:{len(labels)}')
        print("#batch:", len(data_loader))

        gc.collect()
        torch.cuda.empty_cache()
        # Tell pytorch to run this model on the GPU.
        if device.type != 'cpu':
            self.model.cuda()
        self.model.eval()

        total_time = 0
        output_predictions = []
        golden_labels, predicted_labels = [], []
        print("Start to evaluate test data ...")
        for step, batch in enumerate(data_loader):
            b_premises = batch['premise']
            b_input_ids = batch['input_ids']
            b_attention_mask = batch['attention_mask']

            # move tensors to GPU if CUDA is available
            b_input_ids = b_input_ids.to(device)
            b_attention_mask = b_attention_mask.to(device)

            # This will return the loss (rather than the model output) because we have provided the `labels`.
            with torch.no_grad():
                start = time.monotonic()
                b_outputs = self.model.generate(input_ids=b_input_ids, attention_mask=b_attention_mask)
                end = time.monotonic()
                total_time += end - start
                print(f'inference time for step {step}: {end - start}')

            b_original_targets = batch['original_targets']
            golden_labels.extend(b_original_targets)

            b_predictions = self.tokenizer.batch_decode(b_outputs, skip_special_tokens=True)
            predicted_labels.extend(b_predictions)

            for i, premise in enumerate(b_premises):
                output_predictions.append((
                    premise,
                    batch['hypothesis'][i],
                    b_original_targets[i],
                    b_predictions[i]
                ))

        print("total inference time:", total_time)
        print("total inference time / #samples:", total_time / len(premise_hypothesis))

        # evaluate
        print("Test Accuracy: {}".format(accuracy_score(golden_labels, predicted_labels)))
        print("Test Precision: {}".format(precision_score(golden_labels, predicted_labels, average="weighted")))
        print("Test Recall: {}".format(recall_score(golden_labels, predicted_labels, average="weighted")))
        print("Test F1-Score(weighted average): {}".format(
            f1_score(golden_labels, predicted_labels, average="weighted")))
        print("Test classification Report:\n{}".format(
            classification_report(golden_labels, predicted_labels, digits=10)))
        return output_predictions

    def evaluation_pair_similarity(self, premises, hypotheses, labels, device, label_map=None, batch_size=4):
        if not self.model or not self.tokenizer:
            print('Something wrong has been happened!')
            return
        if label_map is None:
            label_map = {"0<=score<0.4": "contradiction", "0.4<=score<=0.6": "neutral",
                         "0.6<score<=1": "entailment"}
            print(f"Setting default value for label map: {label_map}")
        elif not all('score' in cond for cond in label_map.keys()):
            print(f"All the key of label_map must contain a condition on `score` variable.\n"
                  f"For example: label_map ={label_map}")
            return

        label_count = {label: labels.count(label) for label in labels}
        print("label_count:", label_count)
        print(f'#premises:{len(premises)}, #hypotheses:{len(hypotheses)}, #labels:{len(labels)}')

        gc.collect()
        torch.cuda.empty_cache()
        # Tell pytorch to run this model on the GPU.
        if device.type != 'cpu':
            self.model.cuda()
        self.model.eval()

        total_time = 0
        print("Start to evaluate test data ...")

        # Compute the sentence embeddings
        start = time.monotonic()
        premises_embeddings = self.model.encode(premises, convert_to_tensor=True, show_progress_bar=True,
                                                batch_size=batch_size)
        hypotheses_embeddings = self.model.encode(hypotheses, convert_to_tensor=True, show_progress_bar=True,
                                                  batch_size=batch_size)
        end = time.monotonic()
        print(f'time for computing embeddings of premises and hypotheses: {end - start}')
        total_time += end - start

        dataset = TextualEntailmentSimilarityDataset(premises=premises_embeddings, hypotheses=hypotheses_embeddings,
                                                     targets=labels)
        data_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size)
        print("#batch:", len(data_loader))

        output_predictions = []
        golden_labels, predicted_labels = [], []
        for step, batch in enumerate(data_loader):
            b_premises = batch['premise']
            b_hypotheses = batch['hypothesis']

            # move tensors to GPU if CUDA is available
            b_premises = b_premises.to(device)
            b_hypotheses = b_hypotheses.to(device)

            # Compute the pair-wise cosine similarities
            start = time.monotonic()
            cos_similarity_scores, b_predictions = [], []
            for i in range(len(b_premises)):
                cosine_score = util.pytorch_cos_sim(b_premises[i], b_hypotheses[i]).cpu().detach().numpy()[0][0]
                cos_similarity_scores.append(cosine_score)
                predicted_label = ''
                for exp in label_map:
                    if eval(exp.replace('score', str(cosine_score))):
                        predicted_label = label_map[exp]
                        break
                b_predictions.append(predicted_label)

            end = time.monotonic()
            total_time += end - start
            print(f'time for calculating cosine similarity in step {step}: {end - start}')

            golden_labels.extend(batch['targets'])
            predicted_labels.extend(b_predictions)

            for i, item in enumerate(batch['item']):
                output_predictions.append((
                    premises[item],
                    hypotheses[item],
                    cos_similarity_scores[i],
                    batch['targets'][i],
                    b_predictions[i]
                ))

        print("total inference time:", total_time)
        print("total inference time / #samples:", total_time / len(premises))

        # evaluate
        print("Test Accuracy: {}".format(accuracy_score(golden_labels, predicted_labels)))
        print("Test Precision: {}".format(precision_score(golden_labels, predicted_labels, average="weighted")))
        print("Test Recall: {}".format(recall_score(golden_labels, predicted_labels, average="weighted")))
        print("Test F1-Score(weighted average): {}".format(
            f1_score(golden_labels, predicted_labels, average="weighted")))
        print("Test classification Report:\n{}".format(classification_report(
            golden_labels, predicted_labels, digits=10)))
        return output_predictions


In [6]:
# Build the sentence-transformer wrapper for the FarsTail mean-tokens checkpoint
# and show the configuration that was loaded for it.
model_name = 'm3hrdadfi/bert-fa-base-uncased-farstail-mean-tokens'
te_model = TextualEntailment(
    model_name=model_name,
    model_type="sentence-transformer",
    label_list=['contradiction', 'neutral', 'entailment'],
)
print(te_model.config)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=519.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=651450094.0, style=ProgressStyle(descri…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1198122.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=112.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=267.0, style=ProgressStyle(description_…


BertConfig {
  "_name_or_path": "HooshvareLab/bert-fa-base-uncased",
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.7.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 100000
}



## Sample Inference

In [7]:
# Three premise/hypothesis pairs (matched by position) for a quick smoke test.
premise_list = [
    "این مسابقات بین آوریل و دسامبر در هیپودروم ولیفندی در نزدیکی باکرکی ، ۱۵ کیلومتری (۹ مایل) غرب استانبول برگزار می شود.",
    "آیا کودکانی وجود دارند که نیاز به سرگرمی دارند؟",
    "ما به سفرهایی رفته ایم که در نهرهایی شنا کرده ایم",
]
hypothesis_list = [
    "در ولیفندی هیپودروم، مسابقاتی از آوریل تا دسامبر وجود دارد.",
    "هیچ کودکی هرگز نمی خواهد سرگرم شود.",
    "علاوه بر استحمام در نهرها ، ما به اسپا ها و سونا ها نیز رفته ایم.",
]
# label_map: range expression over `score` -> label to assign.
# NOTE(review): none of the ranges covers score < 0 — confirm how the method labels such pairs.
te_model.sentence_transformer_textual_entailment_inference(
    premise_list,
    hypothesis_list,
    device,
    label_map={
        "0<=score<0.4": "contradiction",
        "0.4<=score<=0.6": "neutral",
        "0.6<score<=1": "entailment",
    },
)

HBox(children=(FloatProgress(value=0.0, description='Batches', max=1.0, style=ProgressStyle(description_width=…




HBox(children=(FloatProgress(value=0.0, description='Batches', max=1.0, style=ProgressStyle(description_width=…




[('این مسابقات بین آوریل و دسامبر در هیپودروم ولیفندی در نزدیکی باکرکی ، ۱۵ کیلومتری (۹ مایل) غرب استانبول برگزار می شود.',
  'در ولیفندی هیپودروم، مسابقاتی از آوریل تا دسامبر وجود دارد.',
  0.7888473,
  'entailment'),
 ('آیا کودکانی وجود دارند که نیاز به سرگرمی دارند؟',
  'هیچ کودکی هرگز نمی خواهد سرگرم شود.',
  0.6563661,
  'entailment'),
 ('ما به سفرهایی رفته ایم که در نهرهایی شنا کرده ایم',
  'علاوه بر استحمام در نهرها ، ما به اسپا ها و سونا ها نیز رفته ایم.',
  0.8605703,
  'entailment')]

## ParsiNLU Dataset

In [8]:
# Fetch the ParsiNLU repository, then list its top level and the merged
# entailment data folder whose test files are read below.
!git clone https://github.com/persiannlp/parsinlu
!ls parsinlu
!ls parsinlu/data/entailment/merged_with_farstail

Cloning into 'parsinlu'...
remote: Enumerating objects: 1434, done.[K
remote: Counting objects: 100% (182/182), done.[K
remote: Compressing objects: 100% (98/98), done.[K
remote: Total 1434 (delta 110), reused 139 (delta 82), pack-reused 1252[K
Receiving objects: 100% (1434/1434), 27.81 MiB | 16.59 MiB/s, done.
Resolving deltas: 100% (913/913), done.
data  LICENSE  README.md  requirements.txt  scripts  src
dev.tsv  test_farstail.tsv  test_natural.tsv  test_translation.tsv  train.tsv


### ParsiNLU natural subset

In [9]:
# Load the "natural" test split; each returned sample is "premise<sep>hypothesis".
test_natural, test_labels = te_model.load_dataset_test_file(
    dataset_name="parsinlu-natural",
    dataset_file="./parsinlu/data/entailment/merged_with_farstail/test_natural.tsv",
    label_map={"c": "contradiction", "n": "neutral", "e": "entailment"},
)
# Separate each joined sample into the text before <sep> (premise) and after it (hypothesis).
test_natural_premises, test_natural_hypotheses = (
    [sample.split('<sep>')[0] for sample in test_natural],
    [sample.split('<sep>')[1] for sample in test_natural],
)
print(len(test_natural_premises), len(test_natural_hypotheses))

test part:
 #premise_hypothesis: 850, #label: 850
850 850


In [10]:
# Snapshot GPU status (nvidia-smi) and CPU details (lscpu) before the evaluation run.
!nvidia-smi
!lscpu

Wed Aug  4 13:52:39 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.42.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   73C    P0    73W / 149W |   1133MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [11]:
# label_map keys are range expressions over the cosine-similarity `score`; they are
# tried in insertion order and the first one that holds decides the predicted label.
# NOTE(review): no range covers score < 0, so such pairs end up with an empty
# predicted label — confirm this is intended.
evaluation_output = te_model.evaluation_pair_similarity(
    test_natural_premises,
    test_natural_hypotheses,
    test_labels,
    device,
    label_map={
        "0<=score<0.4": "contradiction",
        "0.4<=score<=0.6": "neutral",
        "0.6<score<=1": "entailment",
    },
    batch_size=128,
)

label_count: {'contradiction': 257, 'neutral': 274, 'entailment': 319}
#premises:850, #hypotheses:850, #labels:850
Start to evaluate test data ...


HBox(children=(FloatProgress(value=0.0, description='Batches', max=7.0, style=ProgressStyle(description_width=…




HBox(children=(FloatProgress(value=0.0, description='Batches', max=7.0, style=ProgressStyle(description_width=…


time for computing embeddings of premises and hypotheses: 7.521014508999997
#batch: 7
time for calculating cosine similarity in step 0: 0.15708214000005682
time for calculating cosine similarity in step 1: 0.04139429399992878
time for calculating cosine similarity in step 2: 0.03812994299994443
time for calculating cosine similarity in step 3: 0.038955784999984644
time for calculating cosine similarity in step 4: 0.038914477000048464
time for calculating cosine similarity in step 5: 0.038100299000007
time for calculating cosine similarity in step 6: 0.024460723999936818
total inference time: 7.898052170999904
total inference time / #samples: 0.0092918260835293
Test Accuracy: 0.4082352941176471
Test Precision: 0.36308743929226367
Test Recall: 0.4082352941176471
Test F1-Score(weighted average): 0.318604416176877
Test classification Report:
               precision    recall  f1-score   support

contradiction  0.1904761905 0.0155642023 0.0287769784       257
   entailment  0.4000000000 0

In [12]:
# Preview the first 25 rows, tab-separated:
# premise, hypothesis, similarity score, gold label, predicted label.
for premise, hypothesis, similarity_score, true_label, predicted_label in evaluation_output[:25]:
    print(f'{premise}\t{hypothesis}\t{similarity_score}\t{true_label}\t{predicted_label}')

او را نیز بکشتند و پس از او، هیچ‌کس را نیافتند که شایسته پادشاهی باشد.	او را نیز بکشتند و پس از او پسرش را شایسته ترین فرد برای پادشاهی یافتند.	0.9187102317810059	contradiction	entailment
نعمت‌زاده همچنین با اشاره به ارتقاء کیفیت خودروهای داخلی طی چند سال گذشته، گفت: با اقدامات صورت گرفته کیفیت خودروهای داخلی رو به بهبود است، اما تا وضع مطلوب هنوز فاصله داریم.	به گفته‌ی وی تولید خودرو در مقایسه با سال‌های گذشته افزایش یافته است.	0.8101886510848999	neutral	entailment
صابر دین‌پژوه‌ رتبه اول علوم ریاضی و فنی کنکور سال 96 که از شهر تبریز حائز این رتبه شده بود در مورد موفقیت خود اظهار کرد: پیش از اعلام نتایج و با احتساب درصد‌های کنکور حدس می‌زدم که یکی از رتبه‌های برتر و تک رقمی کنکور را کسب کنم زیرا در کنکورهای‌ آزمایشی در طول سال تحصیلی همواره‌ رتبه‌های تک رقمی و برتر را کسب می‌کردم.	رتبه‌ی اول علوم ریاضی و فنی کنکور از نتیجه‌ی کنکور خود متعجب بود. 	0.7171745896339417	contradiction	entailment
وی پس از رسیدن به کامپیوتر اصلی وارد ماتریکس شده و نبرد سختی بین او و اسمیت رخ می‌دهد.	در این فی

In [13]:
# Write every evaluated pair as one tab-separated line; '/' in the model name is
# replaced so the name is a valid file name.
output_file_name = f"textual_entailment-parsinlu-natural_{model_name.replace('/', '-')}_outputs.txt"
with open(output_file_name, "w", encoding='utf8') as output_file:
    for premise, hypothesis, similarity_score, true_label, predicted_label in evaluation_output:
        output_file.write(
            f'{premise}\t{hypothesis}\t{similarity_score}\t{true_label}\t{predicted_label}\n'
        )

# Authenticate, build a Drive client from the default application credentials,
# and upload the file under the same title.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
upload = drive.CreateFile({'title': output_file_name})
upload.SetContentFile(output_file_name)
upload.Upload()

### ParsiNLU MNLI subset

In [14]:
# Load the translated (MNLI) test split; each returned sample is "premise<sep>hypothesis".
test_mnli, test_labels = te_model.load_dataset_test_file(
    dataset_name="parsinlu-mnli",
    dataset_file="./parsinlu/data/entailment/merged_with_farstail/test_translation.tsv",
    label_map={"c": "contradiction", "n": "neutral", "e": "entailment"},
)
# Separate each joined sample into the text before <sep> (premise) and after it (hypothesis).
test_mnli_premises, test_mnli_hypotheses = (
    [sample.split('<sep>')[0] for sample in test_mnli],
    [sample.split('<sep>')[1] for sample in test_mnli],
)
print(len(test_mnli_premises), len(test_mnli_hypotheses))

test part:
 #premise_hypothesis: 823, #label: 823
823 823


In [15]:
# Snapshot GPU status (nvidia-smi) and CPU details (lscpu) before the evaluation run.
!nvidia-smi
!lscpu

Wed Aug  4 13:53:02 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.42.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   73C    P0    72W / 149W |   1863MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [16]:
# label_map keys are range expressions over the cosine-similarity `score`; they are
# tried in insertion order and the first one that holds decides the predicted label.
# NOTE(review): no range covers score < 0, so such pairs end up with an empty
# predicted label — confirm this is intended.
evaluation_output = te_model.evaluation_pair_similarity(
    test_mnli_premises,
    test_mnli_hypotheses,
    test_labels,
    device,
    label_map={
        "0<=score<0.4": "contradiction",
        "0.4<=score<=0.6": "neutral",
        "0.6<score<=1": "entailment",
    },
    batch_size=128,
)

label_count: {'neutral': 228, 'entailment': 291, 'contradiction': 304}
#premises:823, #hypotheses:823, #labels:823
Start to evaluate test data ...


HBox(children=(FloatProgress(value=0.0, description='Batches', max=7.0, style=ProgressStyle(description_width=…




HBox(children=(FloatProgress(value=0.0, description='Batches', max=7.0, style=ProgressStyle(description_width=…


time for computing embeddings of premises and hypotheses: 7.100854757999969
#batch: 7
time for calculating cosine similarity in step 0: 0.09465016100000412
time for calculating cosine similarity in step 1: 0.04304189599997699
time for calculating cosine similarity in step 2: 0.039554578999968726
time for calculating cosine similarity in step 3: 0.04279513400001633
time for calculating cosine similarity in step 4: 0.03818522999995366
time for calculating cosine similarity in step 5: 0.03897138599995742
time for calculating cosine similarity in step 6: 0.016437776999964626
total inference time: 7.41449092099981
total inference time / #samples: 0.009009101969623099
Test Accuracy: 0.37424058323207776
Test Precision: 0.35177668933282014
Test Recall: 0.37424058323207776
Test F1-Score(weighted average): 0.2643068268883787
Test classification Report:
               precision    recall  f1-score   support

contradiction  0.3333333333 0.0164473684 0.0313479624       304
   entailment  0.3809523

In [17]:
# Preview the first 25 rows, tab-separated:
# premise, hypothesis, similarity score, gold label, predicted label.
for premise, hypothesis, similarity_score, true_label, predicted_label in evaluation_output[:25]:
    print(f'{premise}\t{hypothesis}\t{similarity_score}\t{true_label}\t{predicted_label}')

آنها به عنوان (الف) رسید جبران خسارت طبقه بندی می شوند (یعنی	چیز خوبی است که آنها در این دسته قرار می گیرند.	0.5867058634757996	neutral	neutral
مورد دیگری که باید ببینید ، فیلم خوابیدن با دشمن است.	باید خوابیدن با دشمن را ببینی.	0.48379793763160706	entailment	neutral
داستانهای افسانه ای ارزش خود را دارند ، اما پنهان کاری کثیف است.	پنهان کردن چیزها فقط کثیف است ، در حالی که در داستان افسانه ای، جلال وجود دارد	0.528892457485199	entailment	neutral
او یک دستش را به سمت سمندر آورد ، آن را به آرامی نوازش کرد و آن را دوباره روی سینه دیو گذاشت.	سمندر پذیرای حرکات دوستانه او بود.	0.6463320255279541	neutral	entailment
بنابراین ، با دقت بسیار زیاد ، افشاگری نهایی فیلم Zapruder این است که ابراهیم Zapruder خود توطئه گر بود.	در فیلم Zapruder فاش می شود که او توطئه گر بود ، این برای عموم شوک آور بود.	0.8478922247886658	neutral	entailment
ایده خوبی هست	بهترین ایده ای بود که من شنیده ام.	0.44334161281585693	neutral	neutral
جان ، در حالی که هنوز هم به چشمان آدرین نگاه می کرد ، دید که یک تیغه بسیار تیز ق

In [18]:
# Write every evaluated pair as one tab-separated line; '/' in the model name is
# replaced so the name is a valid file name.
output_file_name = f"textual_entailment-parsinlu-mnli_{model_name.replace('/', '-')}_outputs.txt"
with open(output_file_name, "w", encoding='utf8') as output_file:
    for premise, hypothesis, similarity_score, true_label, predicted_label in evaluation_output:
        output_file.write(
            f'{premise}\t{hypothesis}\t{similarity_score}\t{true_label}\t{predicted_label}\n'
        )

# Authenticate, build a Drive client from the default application credentials,
# and upload the file under the same title.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
upload = drive.CreateFile({'title': output_file_name})
upload.SetContentFile(output_file_name)
upload.Upload()

### ParsiNLU FarsTail subset

In [19]:
# Load the FarsTail test split shipped with ParsiNLU; each returned sample is
# "premise<sep>hypothesis".
test_farstail, test_labels = te_model.load_dataset_test_file(
    dataset_name="parsinlu-farstail",
    dataset_file="./parsinlu/data/entailment/merged_with_farstail/test_farstail.tsv",
    label_map={"c": "contradiction", "n": "neutral", "e": "entailment"},
)
# Separate each joined sample into the text before <sep> (premise) and after it (hypothesis).
test_farstail_premises, test_farstail_hypotheses = (
    [sample.split('<sep>')[0] for sample in test_farstail],
    [sample.split('<sep>')[1] for sample in test_farstail],
)
print(len(test_farstail_premises), len(test_farstail_hypotheses))

test part:
 #premise_hypothesis: 1564, #label: 1564
1564 1564


In [20]:
# Snapshot GPU status (nvidia-smi) and CPU details (lscpu) before the evaluation run.
!nvidia-smi
!lscpu

Wed Aug  4 13:53:16 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.42.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   74C    P0    77W / 149W |   2351MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [21]:
# label_map keys are range expressions over the cosine-similarity `score`; they are
# tried in insertion order and the first one that holds decides the predicted label.
# NOTE(review): no range covers score < 0, so such pairs end up with an empty
# predicted label — confirm this is intended.
evaluation_output = te_model.evaluation_pair_similarity(
    test_farstail_premises,
    test_farstail_hypotheses,
    test_labels,
    device,
    label_map={
        "0<=score<0.4": "contradiction",
        "0.4<=score<=0.6": "neutral",
        "0.6<score<=1": "entailment",
    },
    batch_size=128,
)

label_count: {'contradiction': 510, 'neutral': 535, 'entailment': 519}
#premises:1564, #hypotheses:1564, #labels:1564
Start to evaluate test data ...


HBox(children=(FloatProgress(value=0.0, description='Batches', max=13.0, style=ProgressStyle(description_width…




HBox(children=(FloatProgress(value=0.0, description='Batches', max=13.0, style=ProgressStyle(description_width…


time for computing embeddings of premises and hypotheses: 14.389207769999985
#batch: 13
time for calculating cosine similarity in step 0: 0.0827750340000648
time for calculating cosine similarity in step 1: 0.03956600999993043
time for calculating cosine similarity in step 2: 0.039392539000004945
time for calculating cosine similarity in step 3: 0.043558637000046474
time for calculating cosine similarity in step 4: 0.039447646999974495
time for calculating cosine similarity in step 5: 0.04363478799996301
time for calculating cosine similarity in step 6: 0.04005284800007303
time for calculating cosine similarity in step 7: 0.03888273999996272
time for calculating cosine similarity in step 8: 0.04243731100007153
time for calculating cosine similarity in step 9: 0.040579937999950744
time for calculating cosine similarity in step 10: 0.04108679500006929
time for calculating cosine similarity in step 11: 0.040462133999994876
time for calculating cosine similarity in step 12: 0.008280935000

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [22]:
# Preview the first 25 rows, tab-separated:
# premise, hypothesis, similarity score, gold label, predicted label.
for premise, hypothesis, similarity_score, true_label, predicted_label in evaluation_output[:25]:
    print(f'{premise}\t{hypothesis}\t{similarity_score}\t{true_label}\t{predicted_label}')

دوران امامت امام صادق علیه السلام، مصادف است با ترجمه آثار یونانی و گسترش مبارزات فکری و ایدئولوژیکی و نیز ظهور مذاهب و مکتب های انحرافی.	امام سجاد (ع) در دورانی امامت کردند که همزمان با ترجمه آثار یونانی، ظهور مذاهب و مکتب های انحرافی بود.	0.8055014610290527	contradiction	entailment
دوران امامت امام صادق علیه السلام، مصادف است با ترجمه آثار یونانی و گسترش مبارزات فکری و ایدئولوژیکی و نیز ظهور مذاهب و مکتب های انحرافی.	دستگاه فاسد حکومتی با صرف هزینه های هنگفت، سعی در جعل احادیث و ایجاد انحراف در مکتب تشیع کرده است.	0.6192481517791748	neutral	entailment
با شهادت امام رضا(ع) مرحله جدیدی از تلاش ائمه آغاز شد که «دوران محنت اهل بیت» نام دارد.	دوران محنت اهل بیت پس از شهادت امام رضا(ع) آغاز گردید.	0.7567191123962402	entailment	entailment
با شهادت امام رضا(ع) مرحله جدیدی از تلاش ائمه آغاز شد که «دوران محنت اهل بیت» نام دارد.	بعد از به شهادت رسیدن امام هادی(ع) دوران محنت اهل بیت شروع شد.	0.860140860080719	contradiction	entailment
با شهادت امام رضا(ع) مرحله جدیدی از تلاش ائمه آغاز شد که «دورا

In [23]:
# Write every evaluated pair as one tab-separated line; '/' in the model name is
# replaced so the name is a valid file name.
output_file_name = f"textual_entailment-parsinlu-farstail_{model_name.replace('/', '-')}_outputs.txt"
with open(output_file_name, "w", encoding='utf8') as output_file:
    for premise, hypothesis, similarity_score, true_label, predicted_label in evaluation_output:
        output_file.write(
            f'{premise}\t{hypothesis}\t{similarity_score}\t{true_label}\t{predicted_label}\n'
        )

# Authenticate, build a Drive client from the default application credentials,
# and upload the file under the same title.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
upload = drive.CreateFile({'title': output_file_name})
upload.SetContentFile(output_file_name)
upload.Upload()

## FarsTail Dataset

In [24]:
# Fetch the original FarsTail repository, then list its top level and its data
# folder, which holds the Test-word.csv file read below.
!git clone https://github.com/dml-qom/FarsTail
!ls FarsTail
!ls FarsTail/data/

Cloning into 'FarsTail'...
remote: Enumerating objects: 125, done.[K
remote: Counting objects: 100% (125/125), done.[K
remote: Compressing objects: 100% (115/115), done.[K
remote: Total 125 (delta 34), reused 18 (delta 4), pack-reused 0[K
Receiving objects: 100% (125/125), 43.46 MiB | 22.14 MiB/s, done.
Resolving deltas: 100% (34/34), done.
data  farstail.png  LICENSE  README.md
Indexed-FarsTail.npz  Test-word.csv  Train-word.csv  Val-word.csv


In [25]:
# For the "farstail" dataset the loader returns premises, hypotheses and labels
# as three separate sequences, so no <sep> splitting is needed here.
(
    test_farstail_premises,
    test_farstail_hypotheses,
    test_labels,
) = te_model.load_dataset_test_file(
    dataset_name="farstail",
    dataset_file="./FarsTail/data/Test-word.csv",
    label_map={"c": "contradiction", "n": "neutral", "e": "entailment"},
)

test part:
 #premise: 1564, #hypothesis: 1564, #label: 1564


In [26]:
# Snapshot GPU status (nvidia-smi) and CPU details (lscpu) before the evaluation run.
!nvidia-smi
!lscpu

Wed Aug  4 13:53:39 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.42.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   71C    P0    72W / 149W |   2007MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [27]:
# label_map keys are range expressions over the cosine-similarity `score`; they are
# tried in insertion order and the first one that holds decides the predicted label.
# NOTE(review): no range covers score < 0, so such pairs end up with an empty
# predicted label — confirm this is intended.
evaluation_output = te_model.evaluation_pair_similarity(
    test_farstail_premises,
    test_farstail_hypotheses,
    test_labels,
    device,
    label_map={
        "0<=score<0.4": "contradiction",
        "0.4<=score<=0.6": "neutral",
        "0.6<score<=1": "entailment",
    },
    batch_size=128,
)

label_count: {'contradiction': 510, 'neutral': 535, 'entailment': 519}
#premises:1564, #hypotheses:1564, #labels:1564
Start to evaluate test data ...


HBox(children=(FloatProgress(value=0.0, description='Batches', max=13.0, style=ProgressStyle(description_width…




HBox(children=(FloatProgress(value=0.0, description='Batches', max=13.0, style=ProgressStyle(description_width…


time for computing embeddings of premises and hypotheses: 14.402881769999908
#batch: 13
time for calculating cosine similarity in step 0: 0.07884484500004874
time for calculating cosine similarity in step 1: 0.03837863700005073
time for calculating cosine similarity in step 2: 0.03945958799999971
time for calculating cosine similarity in step 3: 0.0485020449999638
time for calculating cosine similarity in step 4: 0.04017391000002135
time for calculating cosine similarity in step 5: 0.03934771499996259
time for calculating cosine similarity in step 6: 0.04004948399995101
time for calculating cosine similarity in step 7: 0.03863102600007551
time for calculating cosine similarity in step 8: 0.043066573000032804
time for calculating cosine similarity in step 9: 0.04147529600004418
time for calculating cosine similarity in step 10: 0.04187504399999398
time for calculating cosine similarity in step 11: 0.03914293599996199
time for calculating cosine similarity in step 12: 0.0086457969999855

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [28]:
# Preview the first 25 rows, tab-separated:
# premise, hypothesis, similarity score, gold label, predicted label.
for premise, hypothesis, similarity_score, true_label, predicted_label in evaluation_output[:25]:
    print(f'{premise}\t{hypothesis}\t{similarity_score}\t{true_label}\t{predicted_label}')

دوران امامت امام صادق علیه السلام، مصادف است با ترجمه آثار یونانی و گسترش مبارزات فکری و ایدئولوژیکی و نیز ظهور مذاهب و مکتب های انحرافی.	امام سجاد (ع) در دورانی امامت کردند که همزمان با ترجمه آثار یونانی، ظهور مذاهب و مکتب های انحرافی بود.	0.8055014610290527	contradiction	entailment
دوران امامت امام صادق علیه السلام، مصادف است با ترجمه آثار یونانی و گسترش مبارزات فکری و ایدئولوژیکی و نیز ظهور مذاهب و مکتب های انحرافی.	دستگاه فاسد حکومتی با صرف هزینه های هنگفت، سعی در جعل احادیث و ایجاد انحراف در مکتب تشیع کرده است.	0.6192481517791748	neutral	entailment
با شهادت امام رضا(ع) مرحله جدیدی از تلاش ائمه آغاز شد که «دوران محنت اهل بیت» نام دارد.	دوران محنت اهل بیت پس از شهادت امام رضا(ع) آغاز گردید.	0.7567191123962402	entailment	entailment
با شهادت امام رضا(ع) مرحله جدیدی از تلاش ائمه آغاز شد که «دوران محنت اهل بیت» نام دارد.	بعد از به شهادت رسیدن امام هادی(ع) دوران محنت اهل بیت شروع شد.	0.860140860080719	contradiction	entailment
با شهادت امام رضا(ع) مرحله جدیدی از تلاش ائمه آغاز شد که «دورا

In [29]:
# Write every evaluated pair as one tab-separated line; '/' in the model name is
# replaced so the name is a valid file name.
output_file_name = f"textual_entailment-farstail_{model_name.replace('/', '-')}_outputs.txt"
with open(output_file_name, "w", encoding='utf8') as output_file:
    for premise, hypothesis, similarity_score, true_label, predicted_label in evaluation_output:
        output_file.write(
            f'{premise}\t{hypothesis}\t{similarity_score}\t{true_label}\t{predicted_label}\n'
        )

# Authenticate, build a Drive client from the default application credentials,
# and upload the file under the same title.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
upload = drive.CreateFile({'title': output_file_name})
upload.SetContentFile(output_file_name)
upload.Upload()