In [1]:
!gdown --id '1cAthveg1d3MjrKJtMKGzfX3eH8HJ-dQp'
!gdown --id '1yKyuDtXy1a2NJ-I7H4LpR-LpWSdw09P0'
!unzip MedNLI_dataset.zip
!unzip MedNLI_fa_dataset.zip
!pip install fastparquet

Downloading...
From: https://drive.google.com/uc?id=1cAthveg1d3MjrKJtMKGzfX3eH8HJ-dQp
To: /content/MedNLI_dataset.zip
100% 681k/681k [00:00<00:00, 86.1MB/s]
Downloading...
From: https://drive.google.com/uc?id=1yKyuDtXy1a2NJ-I7H4LpR-LpWSdw09P0
To: /content/MedNLI_fa_dataset.zip
100% 1.88M/1.88M [00:00<00:00, 188MB/s]
Archive:  MedNLI_dataset.zip
replace MedNLI_dataset/valid-00000-of-00001-cc552de6d1a6fa4b.parquet? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: MedNLI_dataset/valid-00000-of-00001-cc552de6d1a6fa4b.parquet  
replace MedNLI_dataset/train-00000-of-00001-210cfe9263b99806.parquet? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: MedNLI_dataset/train-00000-of-00001-210cfe9263b99806.parquet  
replace MedNLI_dataset/test-00000-of-00001-47685aa42db61e77.parquet? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: MedNLI_dataset/test-00000-of-00001-47685aa42db61e77.parquet  
Archive:  MedNLI_fa_dataset.zip
   creating: MedNLI_fa_dataset/
  inflating: MedNLI_fa_dataset/me

In [2]:
import pandas as pd
import torch
from torch.utils.data import Dataset,DataLoader
from sklearn.metrics import accuracy_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [4]:
# English MedNLI splits are stored as parquet shards.
en_train_data = pd.read_parquet(
    'MedNLI_dataset/train-00000-of-00001-210cfe9263b99806.parquet',
    engine='fastparquet',
)
en_valid_data = pd.read_parquet(
    'MedNLI_dataset/valid-00000-of-00001-cc552de6d1a6fa4b.parquet',
    engine='fastparquet',
)
en_test_data = pd.read_parquet(
    'MedNLI_dataset/test-00000-of-00001-47685aa42db61e77.parquet',
    engine='fastparquet',
)

# Persian MedNLI splits are stored as Excel workbooks.
fa_train_data = pd.read_excel('MedNLI_fa_dataset/mednlitrain_fa.xlsx')
fa_valid_data = pd.read_excel('MedNLI_fa_dataset/mednlidev_fa.xlsx')
fa_test_data = pd.read_excel('MedNLI_fa_dataset/mednlitest_fa.xlsx')

In [5]:
def find_pre_and_hyp(query):
    """Extract the premise and hypothesis from a prompt string.

    The premise is the text between "[PRE]" and the next "[HYP]"; the
    hypothesis is the text between that "[HYP]" and the next "OUTPUT:".
    Both are returned with surrounding whitespace stripped.

    Args:
        query: Prompt string containing the "[PRE]" and "[HYP]" markers,
            optionally followed by an "OUTPUT:" marker.

    Returns:
        A ``(premise, hypothesis)`` tuple of strings.

    Raises:
        ValueError: If "[PRE]" is missing, or no "[HYP]" follows it.
    """
    pre_tag, hyp_tag, out_tag = "[PRE]", "[HYP]", "OUTPUT:"

    pre_at = query.find(pre_tag)
    if pre_at == -1:
        # str.find returns -1 here; using it as an offset would slice garbage.
        raise ValueError("query does not contain the '[PRE]' marker")
    start_pre = pre_at + len(pre_tag)

    # Search forward from the premise so the markers are matched in order.
    end_pre = query.find(hyp_tag, start_pre)
    if end_pre == -1:
        raise ValueError("query does not contain a '[HYP]' marker after '[PRE]'")
    start_hyp = end_pre + len(hyp_tag)

    end_hyp = query.find(out_tag, start_hyp)
    if end_hyp == -1:
        # Without a trailing "OUTPUT:" the hypothesis runs to the end of the
        # string; slicing with -1 would silently drop its last character.
        end_hyp = len(query)

    premise = query[start_pre:end_pre].strip()
    hypothesis = query[start_hyp:end_hyp].strip()

    return premise, hypothesis

In [10]:
# Checkpoint identifier passed to from_pretrained for both tokenizer and model.
model_name = "MoritzLaurer/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7"

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
model = model.to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [11]:
def get_max_indx(dictionary):
    """Return the key whose value is largest, or None for an empty/falsy input.

    When several keys share the largest value, the one that comes first in
    iteration order is returned.
    """
    if dictionary:
        return max(dictionary, key=dictionary.get)
    return None

In [12]:
def inference(premise, hypothesis):
    """Classify a premise/hypothesis pair with the module-level NLI model.

    Uses the notebook-level ``tokenizer``, ``model`` and ``device``.

    Args:
        premise: Premise text.
        hypothesis: Hypothesis text.

    Returns:
        The label name ("entailment", "neutral" or "contradiction") with the
        highest predicted probability.
    """
    # Order must match the model's output indices.
    # NOTE(review): taken as given from the original cell — confirm against
    # model.config.id2label.
    label_names = ["entailment", "neutral", "contradiction"]

    # Keep the whole encoding (attention mask included) and move it to the
    # model's device, instead of forwarding input_ids alone.
    encoded = tokenizer(premise, hypothesis, truncation=True, return_tensors="pt").to(device)
    with torch.no_grad():
        output = model(**encoded)

    probabilities = torch.softmax(output["logits"][0], -1).tolist()
    # Compare the raw probabilities: rounding them first can turn two
    # near-equal classes into a tie that is then resolved by list order.
    scores = {name: float(prob) for prob, name in zip(probabilities, label_names)}
    return get_max_indx(scores)

In [13]:
def get_model_acc(model, datas):
    """Compute classification accuracy over a DataFrame of prompts.

    For each row, the premise and hypothesis are parsed from the 'query'
    column, classified with ``inference``, and the prediction is compared
    with the 'answer' column.

    Args:
        model: Kept for interface compatibility; not used here.
            NOTE(review): ``inference`` is called without it, so the model it
            uses is whatever ``inference`` resolves on its own.
        datas: pandas DataFrame with 'query' and 'answer' columns.

    Returns:
        Fraction of rows whose prediction equals the ground truth, as a
        float; NaN when ``datas`` has no rows (accuracy is undefined there).
    """
    total = len(datas)
    if total == 0:
        # Avoid ZeroDivisionError on an empty split.
        return float("nan")

    correct = 0
    for _, row in datas.iterrows():
        pre, hyp = find_pre_and_hyp(row['query'])
        if inference(pre, hyp) == row['answer']:
            correct += 1
    return correct / total

In [19]:
# Put the model in evaluation mode, then score the English test split.
model.eval()
print(f'Accuracy on english is: {get_model_acc(model, en_test_data)}')

Accuracy on english is: 0.670182841068917


In [20]:
# Put the model in evaluation mode, then score the Persian test split.
model.eval()
print(f'Accuracy on persian is: {get_model_acc(model, fa_test_data)}')

Accuracy on persian is: 0.6385372714486639
