In [1]:
# Print the GPU (nvidia-smi) and CPU (lscpu) details of the current runtime.
!nvidia-smi
!lscpu

Tue Jul 27 10:21:54 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.42.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   60C    P8    10W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
# Install the third-party packages at exact (pinned) versions.
!pip install hazm==0.7.0
!pip install seqeval==1.2.2
!pip install sentencepiece==0.1.96
!pip install sentence-transformers==2.0.0
!pip install transformers==4.7.0

Collecting hazm==0.7.0
  Downloading hazm-0.7.0-py3-none-any.whl (316 kB)
[?25l[K     |█                               | 10 kB 24.0 MB/s eta 0:00:01[K     |██                              | 20 kB 29.3 MB/s eta 0:00:01[K     |███                             | 30 kB 34.0 MB/s eta 0:00:01[K     |████▏                           | 40 kB 36.0 MB/s eta 0:00:01[K     |█████▏                          | 51 kB 37.9 MB/s eta 0:00:01[K     |██████▏                         | 61 kB 40.8 MB/s eta 0:00:01[K     |███████▎                        | 71 kB 23.8 MB/s eta 0:00:01[K     |████████▎                       | 81 kB 24.4 MB/s eta 0:00:01[K     |█████████▎                      | 92 kB 25.1 MB/s eta 0:00:01[K     |██████████▍                     | 102 kB 26.1 MB/s eta 0:00:01[K     |███████████▍                    | 112 kB 26.1 MB/s eta 0:00:01[K     |████████████▍                   | 122 kB 26.1 MB/s eta 0:00:01[K     |█████████████▌                  | 133 kB 26.1 MB/s eta 

In [3]:
# PyDrive is used further down to download the dataset archive from Google Drive.
# NOTE(review): unlike the other packages, PyDrive is installed without a version pin.
!pip install PyDrive
import os
import IPython.display as ipd
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
# Authenticate the Colab user, then give PyDrive the application-default credentials.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
# `drive` is the PyDrive client object built from those credentials.
drive = GoogleDrive(gauth)



In [4]:
import re
import gc
import os
import hazm
import time
import json
import logging
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader

import transformers
from transformers import AutoTokenizer, AutoConfig
from transformers import AutoModelForSequenceClassification

from sentence_transformers import models, SentenceTransformer, util, evaluation

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Print the versions of the core libraries, framed by blank lines.
print()
print('numpy', np.__version__)
print('pandas', pd.__version__)
print('transformers', transformers.__version__)
print('torch', torch.__version__)
print()

# Choose the compute device once: CUDA when PyTorch can see a GPU, the CPU otherwise.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Report which device was picked.
if device.type == "cuda":
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')



numpy 1.19.5
pandas 1.1.5
transformers 4.7.0
torch 1.9.0+cu102

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


In [5]:
class TextualThematicSimilarityDataset(torch.utils.data.Dataset):
    """PyTorch dataset of sentence pairs for Textual Thematic Similarity.

    The layout of each item depends on ``model_architecture``:

    * ``"BertForSequenceClassification"``: the pair is tokenized (padded and
      truncated to ``max_length``) and the item holds both raw sentences, the
      target and the flattened ``input_ids`` / ``attention_mask`` /
      ``token_type_ids`` tensors.
    * ``"sentence-transformer"``: nothing is tokenized; the item holds the index,
      both entries of the pair and the target.
    * anything else: an empty dict.
    """

    def __init__(self, sentences_1, sentences_2, targets, tokenizer, model_architecture, max_length):
        self.sentences_1, self.sentences_2 = sentences_1, sentences_2
        self.targets = targets
        self.tokenizer = tokenizer
        self.model_architecture = model_architecture
        self.max_length = max_length

    def __len__(self):
        # The first column alone defines the dataset size.
        return len(self.sentences_1)

    def __getitem__(self, item):
        architecture = self.model_architecture
        if architecture not in ("BertForSequenceClassification", "sentence-transformer"):
            return {}

        first, second = self.sentences_1[item], self.sentences_2[item]

        if architecture == "sentence-transformer":
            return {
                'item': item,
                'sentence_1': first,
                'sentence_2': second,
                'targets': self.targets[item]
            }

        # Classification: encode the pair as a batch of one, then drop the batch axis.
        encoded = self.tokenizer(
            [(first, second)],
            add_special_tokens=True,
            max_length=self.max_length,
            truncation=True,
            padding='max_length',
            return_tensors="pt"
        )
        return {
            'sentence_1': first,
            'sentence_2': second,
            'targets': self.targets[item],
            'input_ids': encoded.input_ids.flatten(),
            'attention_mask': encoded.attention_mask.flatten(),
            'token_type_ids': encoded['token_type_ids'].flatten()
        }


class TextualThematicSimilarity:
    def __init__(self, model_name, model_architecture, label2id=None):
        self.normalizer = hazm.Normalizer()
        self.model_name = model_name
        self.model_architecture = model_architecture
        if self.model_architecture == "BertForSequenceClassification":
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
            self.model = AutoModelForSequenceClassification.from_pretrained(self.model_name)
            self.config = AutoConfig.from_pretrained(self.model_name)
            self.label2id = label2id
            self.id2label = {i: l for l, i in label2id.items()}
        elif self.model_architecture == "sentence-transformer":
            word_embedding_model = models.Transformer(self.model_name)
            pooling_model = models.Pooling(
                word_embedding_model.get_word_embedding_dimension(),
                pooling_mode_mean_tokens=True,
                pooling_mode_cls_token=False,
                pooling_mode_max_tokens=False)
            self.model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
            self.config = AutoConfig.from_pretrained(self.model_name)

    def load_dataset_test_file(self, dataset_name, dataset_file, **kwargs):
        if dataset_name.lower() == "wiki-d-similar":
            if not os.path.exists(dataset_file):
                print(f'{dataset_file} not exists!')
                return
            data = pd.read_csv(dataset_file, delimiter="\t")

            # cleaning labels
            valid_labels = ['dissimilar', 'similar']
            data['Label'] = data['Label'].apply(lambda r: r if r in valid_labels else None)
            data = data.dropna(subset=['Label'])
            data = data.reset_index(drop=True)

            sentence1_list, sentence2_list = data['Sentence1'].values.tolist(), data['Sentence2'].values.tolist()
            labels = data['Label'].values.tolist()
            print(f'test part:\n #sentence1: {len(sentence1_list)}, #sentence2: {len(sentence2_list)}, '
                  f'#labels: {len(labels)}')
            return sentence1_list, sentence2_list, labels
        if dataset_name.lower() == "wiki-triplet":
            if not os.path.exists(dataset_file):
                print(f'{dataset_file} not exists!')
                return
            data = pd.read_csv(dataset_file, delimiter="\t")
            sentence1_list = data['Sentence1'].values.tolist()
            sentence2_list = data['Sentence2'].values.tolist()
            sentence3_list = data['Sentence3'].values.tolist()
            print(f'test part:\n #sentence1: {len(sentence1_list)}, #sentence2: {len(sentence2_list)}, '
                  f'#sentence3: {len(sentence3_list)}')
            return sentence1_list, sentence2_list, sentence3_list

    def thematic_similarity_inference_seq_classification(self, sentences_1, sentences_2, device, max_length):
        if not self.model or not self.tokenizer or not self.id2label:
            print('Something wrong has been happened!')
            return

        new_input = []
        for s1, s2 in zip(sentences_1, sentences_2):
            new_input.append((s1, s2))

        tokenized_batch = self.tokenizer(
            new_input,
            padding=True,
            truncation=True,
            max_length=max_length,
            return_tensors="pt"
        )

        gc.collect()
        torch.cuda.empty_cache()
        # Tell pytorch to run this model on the GPU.
        if device.type != 'cpu':
            self.model.cuda()

        tokenized_batch = tokenized_batch.to(device)
        outputs = self.model(**tokenized_batch)
        pt_predictions = torch.argmax(F.softmax(outputs.logits, dim=1), dim=1)
        pt_predictions = pt_predictions.cpu().detach().numpy().tolist()

        output_predictions = []
        for i, sent1 in enumerate(sentences_1):
            output_predictions.append(
                (sent1, sentences_2[i], pt_predictions[i], self.id2label[pt_predictions[i]])
            )
        return output_predictions

    def thematic_similarity_inference_pair_similarity(self, sentences_1, sentences_2, device, label_list,
                                                      similarity_threshold=0.5):
        if not self.model:
            print('Something wrong has been happened!')
            return

        gc.collect()
        torch.cuda.empty_cache()
        # Tell pytorch to run this model on the GPU.
        if device.type != 'cpu':
            self.model.cuda()

        # Compute the sentence embeddings
        sent1_embeddings = self.model.encode(sentences_1, convert_to_tensor=True, show_progress_bar=True)
        sent2_embeddings = self.model.encode(sentences_2, convert_to_tensor=True, show_progress_bar=True)

        # Compute the pair-wise cosine similarities
        similarity_scores, predicted_labels = [], []
        for i in range(len(sentences_1)):
            cos_scores = util.pytorch_cos_sim(sent1_embeddings[i], sent2_embeddings[i]).cpu().detach().numpy()
            similarity_scores.append(cos_scores[0][0])
            if cos_scores[0][0] >= similarity_threshold:
                predicted_labels.append(label_list[1])
            else:
                predicted_labels.append(label_list[0])

        output_predictions = []
        for i, sent1 in enumerate(sentences_1):
            output_predictions.append(
                (sent1, sentences_2[i], similarity_scores[i], predicted_labels[i])
            )
        return output_predictions

    def evaluation_seq_classification(self, sentence1_list, sentence2_list, labels, device, max_length, batch_size=4):
        if not self.model or not self.tokenizer or not self.id2label:
            print('Something wrong has been happened!')
            return
        label_count = {label: labels.count(label) for label in labels}
        print("label_count:", label_count)

        # convert labels
        new_labels = [self.label2id[_] for _ in labels]
        dataset = TextualThematicSimilarityDataset(sentences_1=sentence1_list, sentences_2=sentence2_list,
                                                   targets=new_labels, tokenizer=self.tokenizer,
                                                   model_architecture=self.model_architecture, max_length=max_length)
        data_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size)
        print(f'#sentence1:{len(sentence1_list)}, #sentence2:{len(sentence2_list)}, #labels:{len(labels)}')
        print("#batch:", len(data_loader))

        gc.collect()
        torch.cuda.empty_cache()
        # Tell pytorch to run this model on the GPU.
        if device.type != 'cpu':
            self.model.cuda()

        total_loss, total_time = 0, 0
        output_predictions = []
        golden_labels, predicted_labels = [], []
        print("Start to evaluate test data ...")
        for step, batch in enumerate(data_loader):
            b_sentence_1 = batch['sentence_1']
            b_input_ids = batch['input_ids']
            b_attention_mask = batch['attention_mask']
            b_token_type_ids = batch['token_type_ids']
            b_targets = batch['targets']

            # move tensors to GPU if CUDA is available
            b_input_ids = b_input_ids.to(device)
            b_attention_mask = b_attention_mask.to(device)
            b_token_type_ids = b_token_type_ids.to(device)
            b_targets = b_targets.to(device)

            # This will return the loss (rather than the model output) because we have provided the `labels`.
            with torch.no_grad():
                start = time.monotonic()
                b_outputs = self.model(input_ids=b_input_ids, attention_mask=b_attention_mask,
                                       token_type_ids=b_token_type_ids, labels=b_targets)
                end = time.monotonic()
                total_time += end - start
                print(f'inference time for step {step}: {end - start}')
            # get the loss
            total_loss += b_outputs.loss.item()

            golden_labels.extend([self.id2label[_.item()] for _ in b_targets])

            b_predictions = torch.argmax(F.softmax(b_outputs.logits, dim=1), dim=1)
            b_predictions = b_predictions.cpu().detach().numpy().tolist()
            b_predictions = [self.id2label[_] for _ in b_predictions]
            predicted_labels.extend(b_predictions)

            for i, sent1 in enumerate(b_sentence_1):
                output_predictions.append((
                    sent1,
                    batch['sentence_2'][i],
                    self.id2label[b_targets[i].item()],
                    b_predictions[i]
                ))

        # Calculate the average loss over the training data.
        avg_train_loss = total_loss / len(data_loader)
        print("average loss:", avg_train_loss)
        print("total inference time:", total_time)
        print("total inference time / #samples:", total_time / len(sentence1_list))

        # evaluate
        print("Test Accuracy: {}".format(accuracy_score(golden_labels, predicted_labels)))
        print("Test Precision: {}".format(precision_score(golden_labels, predicted_labels, average="weighted")))
        print("Test Recall: {}".format(recall_score(golden_labels, predicted_labels, average="weighted")))
        print("Test F1-Score(weighted average): {}".format(
            f1_score(golden_labels, predicted_labels, average="weighted")))
        print("Test classification Report:\n{}".format(classification_report(
            golden_labels, predicted_labels, digits=10)))
        return output_predictions

    def evaluation_pair_similarity(self, sentence1_list, sentence2_list, labels, device, max_length, label_list,
                                   batch_size=4, similarity_threshold=0.5):
        if not self.model:
            print('Something wrong has been happened!')
            return
        label_count = {label: labels.count(label) for label in labels}
        print("label_count:", label_count)

        gc.collect()
        torch.cuda.empty_cache()
        # Tell pytorch to run this model on the GPU.
        if device.type != 'cpu':
            self.model.cuda()

        total_time = 0
        print("Start to evaluate test data ...")

        # Compute the sentence embeddings
        start = time.monotonic()
        sent1_embeddings = self.model.encode(sentence1_list, convert_to_tensor=True, show_progress_bar=True,
                                             batch_size=batch_size)
        sent2_embeddings = self.model.encode(sentence2_list, convert_to_tensor=True, show_progress_bar=True,
                                             batch_size=batch_size)

        end = time.monotonic()
        total_time += end - start
        print(f'time for computing sentence embeddings: {end - start}')

        # # convert labels
        # new_labels = [self.label2id[_] for _ in labels]
        dataset = TextualThematicSimilarityDataset(sentences_1=sent1_embeddings, sentences_2=sent2_embeddings,
                                                   targets=labels, tokenizer=None,
                                                   model_architecture=self.model_architecture, max_length=max_length)
        data_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size)
        print(f'#sentence1:{len(sentence1_list)}, #sentence2:{len(sentence2_list)}, #labels:{len(labels)}')
        print("#batch:", len(data_loader))

        output_predictions = []
        golden_labels, predicted_labels = [], []
        for step, batch in enumerate(data_loader):
            b_sentence_1 = batch['sentence_1']
            b_sentence_2 = batch['sentence_2']

            # move tensors to GPU if CUDA is available
            b_sentence_1 = b_sentence_1.to(device)
            b_sentence_2 = b_sentence_2.to(device)

            # Compute the pair-wise cosine similarities
            # similarity_scores, predicted_labels = [], []
            start = time.monotonic()
            cos_similarity_scores, b_predictions = [], []
            for i in range(len(b_sentence_1)):
                cos_scores = util.pytorch_cos_sim(b_sentence_1[i], b_sentence_2[i]).cpu().detach().numpy()
                cos_similarity_scores.append(cos_scores[0][0])
                if cos_scores[0][0] >= similarity_threshold:
                    b_predictions.append(label_list[1])
                else:
                    b_predictions.append(label_list[0])
            end = time.monotonic()
            total_time += end - start
            print(f'time for calculating cosine similarity in step {step}: {end - start}')

            golden_labels.extend(batch['targets'])
            predicted_labels.extend(b_predictions)

            for i, item in enumerate(batch['item']):
                output_predictions.append((
                    sentence1_list[item],
                    sentence2_list[item],
                    cos_similarity_scores[i],
                    batch['targets'][i],
                    b_predictions[i]
                ))

        print("total inference time:", total_time)
        print("total inference time / #samples:", total_time / len(sentence1_list))

        # evaluate
        print("Test Accuracy: {}".format(accuracy_score(golden_labels, predicted_labels)))
        print("Test Precision: {}".format(precision_score(golden_labels, predicted_labels, average="weighted")))
        print("Test Recall: {}".format(recall_score(golden_labels, predicted_labels, average="weighted")))
        print("Test F1-Score(weighted average): {}".format(
            f1_score(golden_labels, predicted_labels, average="weighted")))
        print("Test classification Report:\n{}".format(classification_report(
            golden_labels, predicted_labels, digits=10)))
        return output_predictions

    def evaluation_pair_similarity_2(self, sentence1_list, sentence2_list, labels, device, label_list, batch_size=4):
        if not self.model:
            print('Something wrong has been happened!')
            return

        label_count = {label: labels.count(label) for label in labels}
        print("label_count:", label_count)
        new_labels = [label_list.index(l) for l in labels]

        gc.collect()
        torch.cuda.empty_cache()
        # Tell pytorch to run this model on the GPU.
        if device.type != 'cpu':
            self.model.cuda()

        print("Start to evaluate test data ...")
        start = time.monotonic()
        evaluator = evaluation.BinaryClassificationEvaluator(
            sentences1=sentence1_list, sentences2=sentence2_list, labels=new_labels, name="Wiki d/similar",
            batch_size=batch_size, show_progress_bar=True, write_csv=True
        )
        output_scores = evaluator.compute_metrices(self.model)
        end = time.monotonic()
        print(f'total time: {end - start}')

        return output_scores

    def evaluation_triplet_similarity(self, sentence1_list, sentence2_list, sentence3_list, device, batch_size=4):
        """
        Given (sentence, positive_example, negative_example), checks if
        distance(sentence,positive_example) < distance(sentence, negative_example).
        """
        logging.basicConfig(level=logging.DEBUG)
        if not self.model:
            print('Something wrong has been happened!')
            return

        gc.collect()
        torch.cuda.empty_cache()
        # Tell pytorch to run this model on the GPU.
        if device.type != 'cpu':
            self.model.cuda()

        print("Start to evaluate test data ...")
        start = time.monotonic()
        evaluator = evaluation.TripletEvaluator(
            anchors=sentence1_list, positives=sentence2_list, negatives=sentence3_list, name="wiki triplet",
            batch_size=batch_size, show_progress_bar=True, write_csv=True
        )
        output_scores = evaluator(self.model, output_path='.')
        end = time.monotonic()
        print(f'total time: {end - start}')

        return output_scores


In [7]:
# Load the checkpoint through the "sentence-transformer" branch of TextualThematicSimilarity
# (transformer encoder + mean pooling over token embeddings), then print its config.
model_name = 'm3hrdadfi/bert-fa-base-uncased-wikitriplet-mean-tokens'
tts_model = TextualThematicSimilarity(model_name=model_name, model_architecture="sentence-transformer")
print(tts_model.config)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=112.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=403.0, style=ProgressStyle(description_…


BertConfig {
  "_name_or_path": "HooshvareLab/bert-fa-base-uncased",
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.7.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 100000
}



## Wiki Triplet dataset v1.0.0

In [8]:
# Authenticate and build the PyDrive client (the same four steps as the PyDrive setup cell).
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
# Look the archive up by its Drive file id and save it as wiki-triplet.zip in the working directory.
download = drive.CreateFile({'id': '1-lfrhHZwleYR4s0xGkXZPXxTeF25Q4C3'})
download.GetContentFile('wiki-triplet.zip')
!ls

adc.json  sample_data  wiki-triplet.zip


In [9]:
!unzip wiki-triplet.zip
!ls
!ls wiki-triplet

Archive:  wiki-triplet.zip
   creating: wiki-triplet/
  inflating: wiki-triplet/dev.csv    
  inflating: wiki-triplet/train.csv  
  inflating: wiki-triplet/wiki-triplet.csv  
  inflating: wiki-triplet/test.csv   
adc.json  sample_data  wiki-triplet  wiki-triplet.zip
dev.csv  test.csv  train.csv  wiki-triplet.csv


In [10]:
# Read the triplet test split into three aligned lists.
sentences_1, sentences_2, sentences_3 = tts_model.load_dataset_test_file(
    dataset_name="wiki-triplet",
    dataset_file="./wiki-triplet/test.csv",
)

# Show the sizes, then the first triplet (one sentence per line).
print(len(sentences_1), len(sentences_2), len(sentences_3))
print(sentences_1[0], sentences_2[0], sentences_3[0], sep="\n")

test part:
 #sentence1: 5758, #sentence2: 5758, #sentence3: 5758
5758 5758 5758
چندین سال پس از زلزله ، شیخ جنید رازی که قصد سفر از ری به مشهد داشت با عبور از منطقه خالی از سکنه پیشوا ، با مزار امامزاده جعفر برخورد کرد و بر آن شد تا اطلاعاتی در مورد امامزاده جعفر بدست آورد و هنگامی که با مقامات و کرامات امامزاده جعفر آشنا شد تصمیم گرفت خادم آن امامزاده شود ، بنابراین ساکن این منطقه شد ، دارایی هایش را فروخت و پنج قنات و یک میلیون و ۵۵۵ هزار متر مربع ملک را در اطراف آرامگاه خریداری کرد و آباد ساخت و در سال ۸۷۳ هجری قمری وقف نمود و فرزندانش را موظف نمود که خادم امامزاده و متولی موقوفات باشند .
در عصر صفوی به فقه شیعه و هنر و معماری اسلامی و ایرانی بسیار بها داده می شد ، شهر امامزاده جعفر نیز به دلیل وجود امامزاده رونق ویژه ای پیدا کرد .
طی این مراسم ابتدا فردی در نقش زین العابدین (امام چهارم شیعیان) ، کشته شدگان را برای حاضرین معرفی می کند و از دلاوری های آنان می گوید و سپس مرثیه آنان را می خواند و در پایان هر بخش این کشته شدگان از در ورودی اصلی صحن تا ایوان بر روی دست افراد پوشیده با لب

In [11]:
# Print the GPU and CPU details again, right before the evaluation run.
!nvidia-smi
!lscpu

Tue Jul 27 10:24:19 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.42.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   52C    P8    10W /  70W |      3MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [12]:
# Run the triplet evaluation on the test split with batches of 256 sentences.
evaluation_scores_wikitriplet = tts_model.evaluation_triplet_similarity(
    sentence1_list=sentences_1,
    sentence2_list=sentences_2,
    sentence3_list=sentences_3,
    device=device,
    batch_size=256,
)

INFO:sentence_transformers.evaluation.TripletEvaluator:TripletEvaluator: Evaluating the model on wiki triplet dataset:


Start to evaluate test data ...


HBox(children=(FloatProgress(value=0.0, description='Batches', max=23.0, style=ProgressStyle(description_width…




HBox(children=(FloatProgress(value=0.0, description='Batches', max=23.0, style=ProgressStyle(description_width…




HBox(children=(FloatProgress(value=0.0, description='Batches', max=23.0, style=ProgressStyle(description_width…

INFO:sentence_transformers.evaluation.TripletEvaluator:Accuracy Cosine Distance:   	93.33
INFO:sentence_transformers.evaluation.TripletEvaluator:Accuracy Manhatten Distance:	93.40
INFO:sentence_transformers.evaluation.TripletEvaluator:Accuracy Euclidean Distance:	93.31




total time: 51.06341564000002


In [13]:
# Display the value returned by the triplet evaluation.
evaluation_scores_wikitriplet

0.9340048627995832

In [17]:
# List the working directory; the evaluator's CSV report should now be present.
!ls

 adc.json     'triplet_evaluation_wiki triplet_results.csv'   wiki-triplet.zip
 sample_data   wiki-triplet


In [18]:
# Print the CSV report written during the triplet evaluation.
!cat 'triplet_evaluation_wiki triplet_results.csv'

epoch,steps,accuracy_cosinus,accuracy_manhatten,accuracy_euclidean
-1,-1,0.933310177144842,0.9340048627995832,0.9331365057311567
