# Textual Entailment

In [1]:
# Show the GPU (nvidia-smi) and CPU (lscpu) of the machine running this notebook.
!nvidia-smi
!lscpu

Wed Aug  4 08:06:25 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.42.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   56C    P8    10W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
!pip install hazm==0.7.0
!pip install seqeval==1.2.2
!pip install sentencepiece==0.1.96
!pip install transformers==4.7.0
!pip install clean-text[gpl]==0.4.0
!pip install sacrebleu==1.5.1

Collecting hazm==0.7.0
  Downloading hazm-0.7.0-py3-none-any.whl (316 kB)
[K     |████████████████████████████████| 316 kB 8.6 MB/s 
[?25hCollecting nltk==3.3
  Downloading nltk-3.3.0.zip (1.4 MB)
[K     |████████████████████████████████| 1.4 MB 59.0 MB/s 
[?25hCollecting libwapiti>=0.2.1
  Downloading libwapiti-0.2.1.tar.gz (233 kB)
[K     |████████████████████████████████| 233 kB 47.5 MB/s 
Building wheels for collected packages: nltk, libwapiti
  Building wheel for nltk (setup.py) ... [?25l[?25hdone
  Created wheel for nltk: filename=nltk-3.3-py3-none-any.whl size=1394486 sha256=3ea3c1af2905f1c2c54083a7e08f7fa98bc34414b2ea692038fd77d2638675c3
  Stored in directory: /root/.cache/pip/wheels/9b/fd/0c/d92302c876e5de87ebd7fc0979d82edb93e2d8d768bf71fac4
  Building wheel for libwapiti (setup.py) ... [?25l[?25hdone
  Created wheel for libwapiti: filename=libwapiti-0.2.1-cp37-cp37m-linux_x86_64.whl size=154224 sha256=ab75e16f0d10831404ad7cc1eb0d903bbd2679ac9e76f89cb6efbc2501718ca8
 

In [3]:
# Install PyDrive and build a Google Drive client for this Colab session.
!pip install PyDrive
import os
import IPython.display as ipd
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
# Authenticate the Colab user, then hand the application-default credentials to PyDrive.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
# `drive` is the PyDrive client built from those credentials.
drive = GoogleDrive(gauth)



In [4]:
# Import required packages
import os
import gc
import re
import hazm
import time
import json
import sacrebleu
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader

import transformers
from transformers import AutoConfig, AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import MT5Config, MT5ForConditionalGeneration, MT5Tokenizer
from transformers.data.processors.squad import SquadV2Processor
from transformers.data.metrics.squad_metrics import normalize_answer, compute_exact, compute_f1, merge_eval, \
    make_eval_dict, apply_no_ans_threshold, find_all_best_thresh, squad_evaluate

from cleantext import clean

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Log the versions of the core libraries, framed by one blank line above and below.
print(
    "",
    f"numpy {np.__version__}",
    f"pandas {pd.__version__}",
    f"transformers {transformers.__version__}",
    f"torch {torch.__version__}",
    "",
    sep="\n",
)

# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')



numpy 1.19.5
pandas 1.1.5
transformers 4.7.0
torch 1.9.0+cu102

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


In [5]:
class TextualEntailmentDataset(torch.utils.data.Dataset):
    """Dataset over premise/hypothesis pairs stored as one ``premise<sep>hypothesis`` string.

    Every sample is tokenized when it is requested, padded/truncated to ``max_length`` and
    returned together with the raw texts and the index of its label in ``label_list``.
    """

    def __init__(self, premises_hypotheses, targets, label_list, tokenizer, model_type, max_length):
        self.premises_hypotheses = premises_hypotheses
        self.tokenizer = tokenizer
        self.model_type = model_type
        self.targets = targets
        self.max_length = max_length
        # Only a genuine list yields a mapping; any other type leaves both lookups empty.
        self.label2index = {}
        if isinstance(label_list, list):
            for position, label in enumerate(label_list):
                self.label2index[label] = position
        self.index2label = {position: label for label, position in self.label2index.items()}

    def __len__(self):
        return len(self.premises_hypotheses)

    def __getitem__(self, item):
        pair_text = self.premises_hypotheses[item]
        encoding = self.tokenizer(
            pair_text,
            add_special_tokens=True,
            max_length=self.max_length,
            truncation=True,
            padding='max_length',
            return_tensors="pt"
        )
        segments = pair_text.split('<sep>')
        gold_label = self.targets[item]
        sample = {
            'premise': segments[0],
            'hypothesis': segments[1],
            'pair': pair_text,
            'targets': torch.tensor(self.label2index[gold_label], dtype=torch.long),
            'original_targets': gold_label,
            'input_ids': encoding.input_ids.flatten(),
            'attention_mask': encoding.attention_mask.flatten()
        }
        # Segment ids are added for every model type except mt5.
        if self.model_type != "mt5":
            sample['token_type_ids'] = encoding['token_type_ids'].flatten()
        return sample


class TextualEntailmentDataset2(torch.utils.data.Dataset):
    """Dataset over premises and hypotheses supplied as two parallel sequences.

    Each pair is tokenized together when requested, padded/truncated to ``max_length`` and
    returned with the raw texts and the index of its label in ``label_list``.
    """

    def __init__(self, premises, hypotheses, targets, label_list, tokenizer, model_type, max_length):
        self.premises = premises
        self.hypotheses = hypotheses
        self.tokenizer = tokenizer
        self.model_type = model_type
        self.targets = targets
        self.max_length = max_length
        # Only a genuine list yields a mapping; any other type leaves both lookups empty.
        self.label2index = {}
        if isinstance(label_list, list):
            for position, label in enumerate(label_list):
                self.label2index[label] = position
        self.index2label = {position: label for label, position in self.label2index.items()}

    def __len__(self):
        return len(self.premises)

    def __getitem__(self, item):
        # The tokenizer receives a one-element batch holding the (premise, hypothesis) tuple.
        encoding = self.tokenizer(
            [(self.premises[item], self.hypotheses[item])],
            add_special_tokens=True,
            max_length=self.max_length,
            truncation=True,
            padding='max_length',
            return_tensors="pt"
        )
        sample = {
            'premise': self.premises[item],
            'hypothesis': self.hypotheses[item],
            'targets': torch.tensor(self.label2index[self.targets[item]], dtype=torch.long),
            'original_targets': self.targets[item],
            'input_ids': encoding.input_ids.flatten(),
            'attention_mask': encoding.attention_mask.flatten()
        }
        # Segment ids are added for every model type except mt5.
        if self.model_type != "mt5":
            sample['token_type_ids'] = encoding['token_type_ids'].flatten()
        return sample


class TextualEntailmentSimilarityDataset(torch.utils.data.Dataset):
    """Index-aligned view over premises, hypotheses and targets for similarity scoring.

    Samples carry their own position (``item``) so callers can map a batch element back to
    the source sequences.
    """

    def __init__(self, premises, hypotheses, targets):
        self.premises, self.hypotheses, self.targets = premises, hypotheses, targets

    def __len__(self):
        return len(self.premises)

    def __getitem__(self, item):
        return dict(
            item=item,
            premise=self.premises[item],
            hypothesis=self.hypotheses[item],
            targets=self.targets[item],
        )


class TextualEntailment:
    def __init__(self, model_name, model_type, label_list):
        self.normalizer = hazm.Normalizer()
        self.model_name = model_name
        self.model_type = model_type.lower()
        self.label_list = label_list
        if self.model_type == "mt5":
            self.tokenizer = MT5Tokenizer.from_pretrained(model_name)
            self.model = MT5ForConditionalGeneration.from_pretrained(model_name)
            self.config = MT5Config.from_pretrained(self.model_name)
        elif self.model_type == "sentence-transformer":
            word_embedding_model = models.Transformer(model_name)
            pooling_model = models.Pooling(
                word_embedding_model.get_word_embedding_dimension(),
                pooling_mode_mean_tokens=True,
                pooling_mode_cls_token=False,
                pooling_mode_max_tokens=False
            )
            self.model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
            self.config = AutoConfig.from_pretrained(self.model_name)
        else:
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
            self.model = AutoModelForSequenceClassification.from_pretrained(self.model_name)
            self.config = AutoConfig.from_pretrained(self.model_name)
            self.id2label = self.config.id2label
            self.label2id = self.config.label2id

    def load_dataset_test_file(self, dataset_name, dataset_file, **kwargs):
        if dataset_name.lower() in ["parsinlu-natural", "parsinlu-mnli", "parsinlu-farstail"]:
            if not os.path.exists(dataset_file):
                print(f'{dataset_file} not exists!')
                return
            data = pd.read_csv(dataset_file, delimiter="\t", names=['premise_hypothesis', 'label'], header=None)

            # cleaning labels
            valid_labels = ['e', 'n', 'c']
            data['label'] = data['label'].apply(lambda r: r if r in valid_labels else None)
            data = data.dropna(subset=['label'])
            data = data.reset_index(drop=True)

            if 'label_map' in kwargs:
                data['label'] = data['label'].apply(lambda l: kwargs['label_map'][l])

            premise_hypothesis, labels = data['premise_hypothesis'].values.tolist(), data['label'].values.tolist()
            print(f'test part:\n #premise_hypothesis: {len(premise_hypothesis)}, #label: {len(labels)}')
            return premise_hypothesis, labels
        if dataset_name.lower() == "farstail":
            if not os.path.exists(dataset_file):
                print(f'{dataset_file} not exists!')
                return
            data = pd.read_csv(dataset_file, sep='\t')

            if 'label_map' in kwargs:
                data['label'] = data['label'].apply(lambda l: kwargs['label_map'][l])

            premises = data['premise'].values.tolist()
            hypotheses = data['hypothesis'].values.tolist()
            labels = data['label'].values.tolist()
            print(f'test part:\n #premise: {len(premises)}, #hypothesis: {len(hypotheses)}, #label: {len(labels)}')
            return premises, hypotheses, labels

    def textual_entailment_inference(self, premises, hypotheses, device, max_length):
        if not self.model or not self.tokenizer or not self.id2label:
            print('Something wrong has been happened!')
            return

        new_input = []
        for p, h in zip(premises, hypotheses):
            new_input.append((p, h))

        tokenized_batch = self.tokenizer(
            new_input,
            padding=True,
            truncation=True,
            max_length=max_length,
            return_tensors="pt"
        )

        gc.collect()
        torch.cuda.empty_cache()
        # Tell pytorch to run this model on the GPU.
        if device.type != 'cpu':
            self.model.cuda()
        self.model.eval()

        tokenized_batch = tokenized_batch.to(device)
        outputs = self.model(**tokenized_batch)
        pt_predictions = torch.argmax(F.softmax(outputs.logits, dim=1), dim=1)

        output_predictions = []
        for i, premise in enumerate(premises):
            output_predictions.append(
                (premise, hypotheses[i], pt_predictions[i].item(), self.label_list[pt_predictions[i].item()])
            )
        return output_predictions

    def mt5_textual_entailment_inference(self, premises, hypotheses, device):
        if not self.model or not self.tokenizer:
            print('Something wrong has been happened!')
            return

        new_input = []
        for p, h in zip(premises, hypotheses):
            new_input.append(p + "<sep>" + h)

        tokenized_batch = self.tokenizer(
            new_input,
            padding=True,
            return_tensors="pt"
        )

        gc.collect()
        torch.cuda.empty_cache()
        # Tell pytorch to run this model on the GPU.
        if device.type != 'cpu':
            self.model.cuda()
        self.model.eval()

        input_ids = tokenized_batch.input_ids.to(device)
        attention_mask = tokenized_batch.attention_mask.to(device)
        outputs = self.model.generate(input_ids=input_ids,
                                      attention_mask=attention_mask)
        predictions = self.tokenizer.batch_decode(outputs, skip_special_tokens=True)
        return predictions

    def sentence_transformer_textual_entailment_inference(self, premises, hypotheses, device, label_map=None):
        if not self.model or not self.tokenizer:
            print('Something wrong has been happened!')
            return
        if label_map is None:
            label_map = {"0<=score<0.4": "contradiction", "0.4<=score<=0.6": "neutral", "0.6<score<=1": "entailment"}
            print(f"Setting default value for label map: {label_map}")
        elif not all('score' in cond for cond in label_map.keys()):
            print(f"All the key of label_map must contain a condition on `score` variable.\n"
                  f"For example: label_map ={label_map}")
            return

        gc.collect()
        torch.cuda.empty_cache()
        # Tell pytorch to run this model on the GPU.
        if device.type != 'cpu':
            self.model.cuda()
        self.model.eval()

        premises_embeddings = self.model.encode(premises, convert_to_tensor=True, show_progress_bar=True)
        hypotheses_embeddings = self.model.encode(hypotheses, convert_to_tensor=True, show_progress_bar=True)

        # Compute the pair-wise cosine similarities
        similarity_scores, predicted_labels = [], []
        for i in range(len(premises)):
            cosine_score = \
                util.pytorch_cos_sim(premises_embeddings[i], hypotheses_embeddings[i]).cpu().detach().numpy()[0][0]
            similarity_scores.append(cosine_score)
            predicted_label = ''
            for exp in label_map:
                if eval(exp.replace('score', str(cosine_score))):
                    predicted_label = label_map[exp]
                    break
            predicted_labels.append(predicted_label)

        output_predictions = []
        for i, premise in enumerate(premises):
            output_predictions.append(
                (premise, hypotheses[i], similarity_scores[i], predicted_labels[i])
            )
        return output_predictions

    def evaluation(self, premise_hypothesis, labels, device, max_length, batch_size=4):
        if not self.model or not self.tokenizer or not self.id2label:
            print('Something wrong has been happened!')
            return
        label_count = {label: labels.count(label) for label in labels}
        print("label_count:", label_count)
        dataset = TextualEntailmentDataset(premises_hypotheses=premise_hypothesis, targets=labels,
                                           label_list=self.label_list, tokenizer=self.tokenizer,
                                           model_type=self.model_type, max_length=max_length)
        data_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size)
        print(f'#premise_hypothesis:{len(premise_hypothesis)}, #labels:{len(labels)}')
        print("#batch:", len(data_loader))

        gc.collect()
        torch.cuda.empty_cache()
        # Tell pytorch to run this model on the GPU.
        if device.type != 'cpu':
            self.model.cuda()
        self.model.eval()

        total_loss, total_time = 0, 0
        output_predictions = []
        golden_labels, predicted_labels = [], []
        print("Start to evaluate test data ...")
        for step, batch in enumerate(data_loader):
            b_premises = batch['premise']
            b_input_ids = batch['input_ids']
            b_attention_mask = batch['attention_mask']
            b_token_type_ids = batch['token_type_ids']
            b_targets = batch['targets']

            # move tensors to GPU if CUDA is available
            b_input_ids = b_input_ids.to(device)
            b_attention_mask = b_attention_mask.to(device)
            b_token_type_ids = b_token_type_ids.to(device)
            b_targets = b_targets.to(device)

            # This will return the loss (rather than the model output) because we have provided the `labels`.
            with torch.no_grad():
                start = time.monotonic()
                b_outputs = self.model(input_ids=b_input_ids, attention_mask=b_attention_mask,
                                       token_type_ids=b_token_type_ids, labels=b_targets)
                end = time.monotonic()
                total_time += end - start
                print(f'inference time for step {step}: {end - start}')
            # get the loss
            total_loss += b_outputs.loss.item()

            b_original_targets = batch['original_targets']
            golden_labels.extend(b_original_targets)

            b_predictions = torch.argmax(F.softmax(b_outputs.logits, dim=1), dim=1)
            b_predictions = b_predictions.cpu().detach().numpy().tolist()
            b_predictions = [dataset.index2label[label] for label in b_predictions]
            predicted_labels.extend(b_predictions)

            for i, premise in enumerate(b_premises):
                output_predictions.append((
                    premise,
                    batch['hypothesis'][i],
                    b_original_targets[i],
                    b_predictions[i]
                ))

        # Calculate the average loss over the training data.
        avg_train_loss = total_loss / len(data_loader)
        print("average loss:", avg_train_loss)
        print("total inference time:", total_time)
        print("total inference time / #samples:", total_time / len(premise_hypothesis))

        # evaluate
        print("Test Accuracy: {}".format(accuracy_score(golden_labels, predicted_labels)))
        print("Test Precision: {}".format(precision_score(golden_labels, predicted_labels, average="weighted")))
        print("Test Recall: {}".format(recall_score(golden_labels, predicted_labels, average="weighted")))
        print("Test F1-Score(weighted average): {}".format(
            f1_score(golden_labels, predicted_labels, average="weighted")))
        print("Test classification Report:\n{}".format(classification_report(
            golden_labels, predicted_labels, digits=10)))
        return output_predictions

    def evaluation_2(self, premises, hypotheses, labels, device, max_length, batch_size=4):
        if not self.model or not self.tokenizer or not self.id2label:
            print('Something wrong has been happened!')
            return
        label_count = {label: labels.count(label) for label in labels}
        print("label_count:", label_count)
        dataset = TextualEntailmentDataset2(premises=premises, hypotheses=hypotheses, targets=labels,
                                            label_list=self.label_list, tokenizer=self.tokenizer,
                                            model_type=self.model_type, max_length=max_length)
        data_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size)
        print(f'#premises:{len(premises)}, #hypotheses:{len(hypotheses)}, #labels:{len(labels)}')
        print("#batch:", len(data_loader))

        gc.collect()
        torch.cuda.empty_cache()
        # Tell pytorch to run this model on the GPU.
        if device.type != 'cpu':
            self.model.cuda()
        self.model.eval()

        total_loss, total_time = 0, 0
        output_predictions = []
        golden_labels, predicted_labels = [], []
        print("Start to evaluate test data ...")
        for step, batch in enumerate(data_loader):
            b_premises = batch['premise']
            b_input_ids = batch['input_ids']
            b_attention_mask = batch['attention_mask']
            b_token_type_ids = batch['token_type_ids']
            b_targets = batch['targets']

            # move tensors to GPU if CUDA is available
            b_input_ids = b_input_ids.to(device)
            b_attention_mask = b_attention_mask.to(device)
            b_token_type_ids = b_token_type_ids.to(device)
            b_targets = b_targets.to(device)

            # This will return the loss (rather than the model output) because we have provided the `labels`.
            with torch.no_grad():
                start = time.monotonic()
                b_outputs = self.model(input_ids=b_input_ids, attention_mask=b_attention_mask,
                                       token_type_ids=b_token_type_ids, labels=b_targets)
                end = time.monotonic()
                total_time += end - start
                print(f'inference time for step {step}: {end - start}')
            # get the loss
            total_loss += b_outputs.loss.item()

            b_original_targets = batch['original_targets']
            golden_labels.extend(b_original_targets)

            b_predictions = torch.argmax(F.softmax(b_outputs.logits, dim=1), dim=1)
            b_predictions = b_predictions.cpu().detach().numpy().tolist()
            b_predictions = [dataset.index2label[label] for label in b_predictions]
            predicted_labels.extend(b_predictions)

            for i, premise in enumerate(b_premises):
                output_predictions.append((
                    premise,
                    batch['hypothesis'][i],
                    b_original_targets[i],
                    b_predictions[i]
                ))

        # Calculate the average loss over the training data.
        avg_train_loss = total_loss / len(data_loader)
        print("average loss:", avg_train_loss)
        print("total inference time:", total_time)
        print("total inference time / #samples:", total_time / len(premises))

        # evaluate
        print("Test Accuracy: {}".format(accuracy_score(golden_labels, predicted_labels)))
        print("Test Precision: {}".format(precision_score(golden_labels, predicted_labels, average="weighted")))
        print("Test Recall: {}".format(recall_score(golden_labels, predicted_labels, average="weighted")))
        print("Test F1-Score(weighted average): {}".format(
            f1_score(golden_labels, predicted_labels, average="weighted")))
        print("Test classification Report:\n{}".format(classification_report(
            golden_labels, predicted_labels, digits=10)))
        return output_predictions

    def mt5_evaluation(self, premise_hypothesis, labels, device, max_length, batch_size=4):
        if not self.model or not self.tokenizer:
            print('Something wrong has been happened!')
            return
        if len(premise_hypothesis) != len(labels):
            print('length of inputs and labels is not equal!!')
            return

        dataset = TextualEntailmentDataset(premises_hypotheses=premise_hypothesis, targets=labels,
                                           label_list=self.label_list, tokenizer=self.tokenizer,
                                           model_type=self.model_type, max_length=max_length)
        data_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size)
        print(f'#premise_hypothesis:{len(premise_hypothesis)}, #labels:{len(labels)}')
        print("#batch:", len(data_loader))

        gc.collect()
        torch.cuda.empty_cache()
        # Tell pytorch to run this model on the GPU.
        if device.type != 'cpu':
            self.model.cuda()
        self.model.eval()

        total_time = 0
        output_predictions = []
        golden_labels, predicted_labels = [], []
        print("Start to evaluate test data ...")
        for step, batch in enumerate(data_loader):
            b_premises = batch['premise']
            b_input_ids = batch['input_ids']
            b_attention_mask = batch['attention_mask']

            # move tensors to GPU if CUDA is available
            b_input_ids = b_input_ids.to(device)
            b_attention_mask = b_attention_mask.to(device)

            # This will return the loss (rather than the model output) because we have provided the `labels`.
            with torch.no_grad():
                start = time.monotonic()
                b_outputs = self.model.generate(input_ids=b_input_ids, attention_mask=b_attention_mask)
                end = time.monotonic()
                total_time += end - start
                print(f'inference time for step {step}: {end - start}')

            b_original_targets = batch['original_targets']
            golden_labels.extend(b_original_targets)

            b_predictions = self.tokenizer.batch_decode(b_outputs, skip_special_tokens=True)
            predicted_labels.extend(b_predictions)

            for i, premise in enumerate(b_premises):
                output_predictions.append((
                    premise,
                    batch['hypothesis'][i],
                    b_original_targets[i],
                    b_predictions[i]
                ))

        print("total inference time:", total_time)
        print("total inference time / #samples:", total_time / len(premise_hypothesis))

        # evaluate
        print("Test Accuracy: {}".format(accuracy_score(golden_labels, predicted_labels)))
        print("Test Precision: {}".format(precision_score(golden_labels, predicted_labels, average="weighted")))
        print("Test Recall: {}".format(recall_score(golden_labels, predicted_labels, average="weighted")))
        print("Test F1-Score(weighted average): {}".format(
            f1_score(golden_labels, predicted_labels, average="weighted")))
        print("Test classification Report:\n{}".format(
            classification_report(golden_labels, predicted_labels, digits=10)))
        return output_predictions

    def evaluation_pair_similarity(self, premises, hypotheses, labels, device, label_map=None, batch_size=4):
        if not self.model or not self.tokenizer:
            print('Something wrong has been happened!')
            return
        if label_map is None:
            label_map = {"0<=score<0.4": "contradiction", "0.4<=score<=0.6": "neutral",
                         "0.6<score<=1": "entailment"}
            print(f"Setting default value for label map: {label_map}")
        elif not all('score' in cond for cond in label_map.keys()):
            print(f"All the key of label_map must contain a condition on `score` variable.\n"
                  f"For example: label_map ={label_map}")
            return

        label_count = {label: labels.count(label) for label in labels}
        print("label_count:", label_count)
        print(f'#premises:{len(premises)}, #hypotheses:{len(hypotheses)}, #labels:{len(labels)}')

        gc.collect()
        torch.cuda.empty_cache()
        # Tell pytorch to run this model on the GPU.
        if device.type != 'cpu':
            self.model.cuda()
        self.model.eval()

        total_time = 0
        print("Start to evaluate test data ...")

        # Compute the sentence embeddings
        start = time.monotonic()
        premises_embeddings = self.model.encode(premises, convert_to_tensor=True, show_progress_bar=True,
                                                batch_size=batch_size)
        hypotheses_embeddings = self.model.encode(hypotheses, convert_to_tensor=True, show_progress_bar=True,
                                                  batch_size=batch_size)
        end = time.monotonic()
        print(f'time for computing embeddings of premises and hypotheses: {end - start}')
        total_time += end - start

        dataset = TextualEntailmentSimilarityDataset(premises=premises_embeddings, hypotheses=hypotheses_embeddings,
                                                     targets=labels)
        data_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size)
        print("#batch:", len(data_loader))

        output_predictions = []
        golden_labels, predicted_labels = [], []
        for step, batch in enumerate(data_loader):
            b_premises = batch['premise']
            b_hypotheses = batch['hypothesis']

            # move tensors to GPU if CUDA is available
            b_premises = b_premises.to(device)
            b_hypotheses = b_hypotheses.to(device)

            # Compute the pair-wise cosine similarities
            start = time.monotonic()
            cos_similarity_scores, b_predictions = [], []
            for i in range(len(b_premises)):
                cosine_score = util.pytorch_cos_sim(b_premises[i], b_hypotheses[i]).cpu().detach().numpy()[0][0]
                cos_similarity_scores.append(cosine_score)
                predicted_label = ''
                for exp in label_map:
                    if eval(exp.replace('score', str(cosine_score))):
                        predicted_label = label_map[exp]
                        break
                b_predictions.append(predicted_label)

            end = time.monotonic()
            total_time += end - start
            print(f'time for calculating cosine similarity in step {step}: {end - start}')

            golden_labels.extend(batch['targets'])
            predicted_labels.extend(b_predictions)

            for i, item in enumerate(batch['item']):
                output_predictions.append((
                    premises[item],
                    hypotheses[item],
                    cos_similarity_scores[i],
                    batch['targets'][i],
                    b_predictions[i]
                ))

        print("total inference time:", total_time)
        print("total inference time / #samples:", total_time / len(premises))

        # evaluate
        print("Test Accuracy: {}".format(accuracy_score(golden_labels, predicted_labels)))
        print("Test Precision: {}".format(precision_score(golden_labels, predicted_labels, average="weighted")))
        print("Test Recall: {}".format(recall_score(golden_labels, predicted_labels, average="weighted")))
        print("Test F1-Score(weighted average): {}".format(
            f1_score(golden_labels, predicted_labels, average="weighted")))
        print("Test classification Report:\n{}".format(classification_report(
            golden_labels, predicted_labels, digits=10)))
        return output_predictions


In [6]:
# Instantiate the TextualEntailment wrapper for the mT5-small ParsiNLU
# checkpoint, using the three label strings 'e', 'c' and 'n'.
model_name = 'persiannlp/mt5-small-parsinlu-snli-entailment'
te_model = TextualEntailment(
    model_name=model_name,
    model_type="mt5",
    label_list=['e', 'c', 'n'],
)
# Show the configuration attached to the wrapper.
print(te_model.config)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=4309802.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=65.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=383.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=608.0, style=ProgressStyle(description_…

You are using a model of type t5 to instantiate a model of type mt5. This is not supported for all configurations of models and can yield errors.





HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1200773595.0, style=ProgressStyle(descr…




You are using a model of type t5 to instantiate a model of type mt5. This is not supported for all configurations of models and can yield errors.


MT5Config {
  "architectures": [
    "MT5ForConditionalGeneration"
  ],
  "d_ff": 1024,
  "d_kv": 64,
  "d_model": 512,
  "decoder_start_token_id": 0,
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "gated-gelu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "mt5",
  "num_decoder_layers": 8,
  "num_heads": 6,
  "num_layers": 8,
  "pad_token_id": 0,
  "relative_attention_num_buckets": 32,
  "tie_word_embeddings": false,
  "tokenizer_class": "T5Tokenizer",
  "transformers_version": "4.7.0",
  "use_cache": true,
  "vocab_size": 250112
}



## Sample Inference

In [7]:
# Three hand-picked Persian premises for a quick smoke test of the model.
premise_list = [
    "این مسابقات بین آوریل و دسامبر در هیپودروم ولیفندی در نزدیکی باکرکی ، ۱۵ کیلومتری (۹ مایل) غرب استانبول برگزار می شود.",
    "آیا کودکانی وجود دارند که نیاز به سرگرمی دارند؟",
    "ما به سفرهایی رفته ایم که در نهرهایی شنا کرده ایم"
]
# One hypothesis per premise, in the same order
# (presumably paired by index inside the inference method; confirm against its definition).
hypothesis_list = [
    "در ولیفندی هیپودروم، مسابقاتی از آوریل تا دسامبر وجود دارد.",
    "هیچ کودکی هرگز نمی خواهد سرگرم شود.",
    "علاوه بر استحمام در نهرها ، ما به اسپا ها و سونا ها نیز رفته ایم."
]
# The call is the last expression of the cell, so its return value is displayed as the cell output.
# NOTE(review): `device` is not defined in this cell; it is expected to come from an earlier cell.
te_model.mt5_textual_entailment_inference(premise_list, hypothesis_list, device)

['e', 'c', 'c']

## ParsiNLU Dataset

In [8]:
# Clone the ParsiNLU repository into ./parsinlu, then list the repository root
# and the entailment data directory that the cells below read from.
!git clone https://github.com/persiannlp/parsinlu
!ls parsinlu
!ls parsinlu/data/entailment/merged_with_farstail

Cloning into 'parsinlu'...
remote: Enumerating objects: 1434, done.[K
remote: Counting objects: 100% (182/182), done.[K
remote: Compressing objects: 100% (98/98), done.[K
remote: Total 1434 (delta 110), reused 139 (delta 82), pack-reused 1252[K
Receiving objects: 100% (1434/1434), 27.81 MiB | 20.68 MiB/s, done.
Resolving deltas: 100% (913/913), done.
data  LICENSE  README.md  requirements.txt  scripts  src
dev.tsv  test_farstail.tsv  test_natural.tsv  test_translation.tsv  train.tsv


### ParsiNLU natural subset (`test_natural.tsv`)

In [9]:
# Read the "natural" test split from the cloned parsinlu repository.
test_natural, test_labels = te_model.load_dataset_test_file(
    dataset_name="parsinlu-natural",
    dataset_file="./parsinlu/data/entailment/merged_with_farstail/test_natural.tsv",
)

test part:
 #premise_hypothesis: 850, #label: 850


In [10]:
# Record GPU status (nvidia-smi) and CPU details (lscpu) right before the evaluation run.
!nvidia-smi
!lscpu

Wed Aug  4 08:09:15 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.42.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   50C    P0    26W /  70W |   2276MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [11]:
# Evaluate the mT5 model on the natural subset; the returned rows are
# printed and saved by the following cells.
evaluation_output = te_model.mt5_evaluation(
    test_natural,
    test_labels,
    device,
    max_length=512,
    batch_size=128,
)

#premise_hypothesis:850, #labels:850
#batch: 7
Start to evaluate test data ...
inference time for step 0: 1.4330816120000236
inference time for step 1: 1.345115394000004
inference time for step 2: 1.3485367920000044
inference time for step 3: 1.349104058999984
inference time for step 4: 1.3504447070000083
inference time for step 5: 1.3522648209999772
inference time for step 6: 0.8796992229999887
total inference time: 9.05824660799999
total inference time / #samples: 0.010656760715294106
Test Accuracy: 0.5305882352941177
Test Precision: 0.5232000800320128
Test Recall: 0.5305882352941177
Test F1-Score(weighted average): 0.5143378926133252
Test classification Report:
              precision    recall  f1-score   support

           c  0.4880952381 0.3190661479 0.3858823529       257
           e  0.5275862069 0.4796238245 0.5024630542       319
           n  0.5510204082 0.7883211679 0.6486486486       274

    accuracy                      0.5305882353       850
   macro avg  0.522233951

In [12]:
# Preview the first 25 evaluated rows as tab-separated text:
# premise, hypothesis, true label, predicted label.
for premise, hypothesis, true_label, predicted_label in evaluation_output[:25]:
    print(f'{premise}\t{hypothesis}\t{true_label}\t{predicted_label}')

او را نیز بکشتند و پس از او، هیچ‌کس را نیافتند که شایسته پادشاهی باشد.	او را نیز بکشتند و پس از او پسرش را شایسته ترین فرد برای پادشاهی یافتند.	c	e
نعمت‌زاده همچنین با اشاره به ارتقاء کیفیت خودروهای داخلی طی چند سال گذشته، گفت: با اقدامات صورت گرفته کیفیت خودروهای داخلی رو به بهبود است، اما تا وضع مطلوب هنوز فاصله داریم.	به گفته‌ی وی تولید خودرو در مقایسه با سال‌های گذشته افزایش یافته است.	n	e
صابر دین‌پژوه‌ رتبه اول علوم ریاضی و فنی کنکور سال 96 که از شهر تبریز حائز این رتبه شده بود در مورد موفقیت خود اظهار کرد: پیش از اعلام نتایج و با احتساب درصد‌های کنکور حدس می‌زدم که یکی از رتبه‌های برتر و تک رقمی کنکور را کسب کنم زیرا در کنکورهای‌ آزمایشی در طول سال تحصیلی همواره‌ رتبه‌های تک رقمی و برتر را کسب می‌کردم.	رتبه‌ی اول علوم ریاضی و فنی کنکور از نتیجه‌ی کنکور خود متعجب بود. 	c	n
وی پس از رسیدن به کامپیوتر اصلی وارد ماتریکس شده و نبرد سختی بین او و اسمیت رخ می‌دهد.	در این فیلم شبکه های اینترنتی که در آن زمان پدیدۀ جدیدی بود بخوبی تصویر شده است.	n	n
به‌طور کلی سلسه مراتب و تقسیم‌بندی حیا

In [13]:
# Build the output file name from the model name ('/' is replaced so it is a valid file name).
output_file_name = "textual_entailment-parsinlu-natural_{}_outputs.txt".format(
    model_name.replace('/', '-')
)

# Write one tab-separated line per evaluated pair.
with open(output_file_name, "w", encoding='utf8') as output_file:
    for premise, hypothesis, true_label, predicted_label in evaluation_output:
        output_file.write(f'{premise}\t{hypothesis}\t{true_label}\t{predicted_label}\n')

# Authenticate the user and build a Google Drive client from the default credentials.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# Create a Drive file with the same title, attach the local file, and upload it.
upload = drive.CreateFile({'title': output_file_name})
upload.SetContentFile(output_file_name)
upload.Upload()

### ParsiNLU MNLI subset (`test_translation.tsv`)

In [14]:
# Read the MNLI test split (stored as test_translation.tsv) from the cloned parsinlu repository.
test_mnli, test_labels = te_model.load_dataset_test_file(
    dataset_name="parsinlu-mnli",
    dataset_file="./parsinlu/data/entailment/merged_with_farstail/test_translation.tsv",
)

test part:
 #premise_hypothesis: 823, #label: 823


In [15]:
# Record GPU status (nvidia-smi) and CPU details (lscpu) right before the evaluation run.
!nvidia-smi
!lscpu

Wed Aug  4 08:09:27 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.42.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   55C    P0    30W /  70W |   5592MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [16]:
# Evaluate the mT5 model on the MNLI subset; the returned rows are
# printed and saved by the following cells.
evaluation_output = te_model.mt5_evaluation(
    test_mnli,
    test_labels,
    device,
    max_length=512,
    batch_size=128,
)

#premise_hypothesis:823, #labels:823
#batch: 7
Start to evaluate test data ...
inference time for step 0: 1.3579768939999894
inference time for step 1: 1.352114026999999
inference time for step 2: 1.353775240999994
inference time for step 3: 1.356374116000012
inference time for step 4: 1.3598289269999952
inference time for step 5: 1.3586798629999919
inference time for step 6: 0.605689936999994
total inference time: 8.744439004999975
total inference time / #samples: 0.01062507777035234
Test Accuracy: 0.5625759416767923
Test Precision: 0.5739776693284128
Test Recall: 0.5625759416767923
Test F1-Score(weighted average): 0.5619107225828178
Test classification Report:
              precision    recall  f1-score   support

           c  0.5755102041 0.4638157895 0.5136612022       304
           e  0.6441947566 0.5910652921 0.6164874552       291
           n  0.4823151125 0.6578947368 0.5565862709       228

    accuracy                      0.5625759417       823
   macro avg  0.5673400244 

In [17]:
# Preview the first 25 evaluated rows as tab-separated text:
# premise, hypothesis, true label, predicted label.
for premise, hypothesis, true_label, predicted_label in evaluation_output[:25]:
    print(f'{premise}\t{hypothesis}\t{true_label}\t{predicted_label}')

آنها به عنوان (الف) رسید جبران خسارت طبقه بندی می شوند (یعنی	چیز خوبی است که آنها در این دسته قرار می گیرند.	n	n
مورد دیگری که باید ببینید ، فیلم خوابیدن با دشمن است.	باید خوابیدن با دشمن را ببینی.	e	e
داستانهای افسانه ای ارزش خود را دارند ، اما پنهان کاری کثیف است.	پنهان کردن چیزها فقط کثیف است ، در حالی که در داستان افسانه ای، جلال وجود دارد	e	c
او یک دستش را به سمت سمندر آورد ، آن را به آرامی نوازش کرد و آن را دوباره روی سینه دیو گذاشت.	سمندر پذیرای حرکات دوستانه او بود.	n	n
بنابراین ، با دقت بسیار زیاد ، افشاگری نهایی فیلم Zapruder این است که ابراهیم Zapruder خود توطئه گر بود.	در فیلم Zapruder فاش می شود که او توطئه گر بود ، این برای عموم شوک آور بود.	n	n
ایده خوبی هست	بهترین ایده ای بود که من شنیده ام.	n	n
جان ، در حالی که هنوز هم به چشمان آدرین نگاه می کرد ، دید که یک تیغه بسیار تیز قفسه سینه کال را شکافت .	تیغی از سینه کال عبور کرد.	e	e
برای به دست آوردن یک تجربه‌ی کامل ، در غروب آفتاب یا خارج از فصل بازدید کنید.	بهتر است در یکشنبه آفتابی یا در فصل کم بازدید کنید.	e	n
می توانید 

In [18]:
# Build the output file name from the model name ('/' is replaced so it is a valid file name).
output_file_name = "textual_entailment-parsinlu-mnli_{}_outputs.txt".format(
    model_name.replace('/', '-')
)

# Write one tab-separated line per evaluated pair.
with open(output_file_name, "w", encoding='utf8') as output_file:
    for premise, hypothesis, true_label, predicted_label in evaluation_output:
        output_file.write(f'{premise}\t{hypothesis}\t{true_label}\t{predicted_label}\n')

# Authenticate the user and build a Google Drive client from the default credentials.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# Create a Drive file with the same title, attach the local file, and upload it.
upload = drive.CreateFile({'title': output_file_name})
upload.SetContentFile(output_file_name)
upload.Upload()

### ParsiNLU FarsTail subset (`test_farstail.tsv`)

In [19]:
# Read the FarsTail test split from the cloned parsinlu repository.
test_farstail, test_labels = te_model.load_dataset_test_file(
    dataset_name="parsinlu-farstail",
    dataset_file="./parsinlu/data/entailment/merged_with_farstail/test_farstail.tsv",
)

test part:
 #premise_hypothesis: 1564, #label: 1564


In [20]:
# Record GPU status (nvidia-smi) and CPU details (lscpu) right before the evaluation run.
!nvidia-smi
!lscpu

Wed Aug  4 08:09:38 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.42.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   59C    P0    36W /  70W |   5464MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [21]:
# Evaluate the mT5 model on the FarsTail subset; the returned rows are
# printed and saved by the following cells.
evaluation_output = te_model.mt5_evaluation(
    test_farstail,
    test_labels,
    device,
    max_length=512,
    batch_size=128,
)

#premise_hypothesis:1564, #labels:1564
#batch: 13
Start to evaluate test data ...
inference time for step 0: 1.3688085420000107
inference time for step 1: 1.369701985000006
inference time for step 2: 1.3718044309999868
inference time for step 3: 1.375699910999998
inference time for step 4: 1.3768119589999799
inference time for step 5: 1.3787808790000042
inference time for step 6: 1.3806280709999896
inference time for step 7: 1.38046392199999
inference time for step 8: 1.3804366909999999
inference time for step 9: 1.3828019009999934
inference time for step 10: 1.3841827370000033
inference time for step 11: 1.3856586249999623
inference time for step 12: 0.3189237849999813
total inference time: 16.854703438999906
total inference time / #samples: 0.010776664602941115
Test Accuracy: 0.760230179028133
Test Precision: 0.7586410932684996
Test Recall: 0.760230179028133
Test F1-Score(weighted average): 0.7592587785818321
Test classification Report:
              precision    recall  f1-score   s

In [22]:
# Preview the first 25 evaluated rows as tab-separated text:
# premise, hypothesis, true label, predicted label.
for premise, hypothesis, true_label, predicted_label in evaluation_output[:25]:
    print(f'{premise}\t{hypothesis}\t{true_label}\t{predicted_label}')

دوران امامت امام صادق علیه السلام، مصادف است با ترجمه آثار یونانی و گسترش مبارزات فکری و ایدئولوژیکی و نیز ظهور مذاهب و مکتب های انحرافی.	امام سجاد (ع) در دورانی امامت کردند که همزمان با ترجمه آثار یونانی، ظهور مذاهب و مکتب های انحرافی بود.	c	c
دوران امامت امام صادق علیه السلام، مصادف است با ترجمه آثار یونانی و گسترش مبارزات فکری و ایدئولوژیکی و نیز ظهور مذاهب و مکتب های انحرافی.	دستگاه فاسد حکومتی با صرف هزینه های هنگفت، سعی در جعل احادیث و ایجاد انحراف در مکتب تشیع کرده است.	n	n
با شهادت امام رضا(ع) مرحله جدیدی از تلاش ائمه آغاز شد که «دوران محنت اهل بیت» نام دارد.	دوران محنت اهل بیت پس از شهادت امام رضا(ع) آغاز گردید.	e	e
با شهادت امام رضا(ع) مرحله جدیدی از تلاش ائمه آغاز شد که «دوران محنت اهل بیت» نام دارد.	بعد از به شهادت رسیدن امام هادی(ع) دوران محنت اهل بیت شروع شد.	c	c
با شهادت امام رضا(ع) مرحله جدیدی از تلاش ائمه آغاز شد که «دوران محنت اهل بیت» نام دارد.	حضرت جواد(ع) در سال ۱۹۵ هجری در مدینه ولادت یافت.	n	n
توحید اَفعالی، باور به اینکه اراده واحدی بر جهان حاکم است. یعنی جز یک 

In [23]:
# Build the output file name from the model name ('/' is replaced so it is a valid file name).
output_file_name = "textual_entailment-parsinlu-farstail_{}_outputs.txt".format(
    model_name.replace('/', '-')
)

# Write one tab-separated line per evaluated pair.
with open(output_file_name, "w", encoding='utf8') as output_file:
    for premise, hypothesis, true_label, predicted_label in evaluation_output:
        output_file.write(f'{premise}\t{hypothesis}\t{true_label}\t{predicted_label}\n')

# Authenticate the user and build a Google Drive client from the default credentials.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# Create a Drive file with the same title, attach the local file, and upload it.
upload = drive.CreateFile({'title': output_file_name})
upload.SetContentFile(output_file_name)
upload.Upload()