In [1]:
!gdown --id '1cAthveg1d3MjrKJtMKGzfX3eH8HJ-dQp'
!gdown --id '1yKyuDtXy1a2NJ-I7H4LpR-LpWSdw09P0'
#!gdown --id '1iubn7mjrG_weZYBlKnStH3vG4mzuCB80'
!gdown --id '1sVD2LU44sOCBcsk0jXi2I09-Nw4KYcQm'
!unzip MedNLI_dataset.zip
!unzip MedNLI_fa_dataset.zip
!pip install fastparquet
!pip install datasets

Downloading...
From: https://drive.google.com/uc?id=1cAthveg1d3MjrKJtMKGzfX3eH8HJ-dQp
To: /content/MedNLI_dataset.zip
100% 681k/681k [00:00<00:00, 10.1MB/s]
Downloading...
From: https://drive.google.com/uc?id=1yKyuDtXy1a2NJ-I7H4LpR-LpWSdw09P0
To: /content/MedNLI_fa_dataset.zip
100% 1.88M/1.88M [00:00<00:00, 15.6MB/s]
Downloading...
From (original): https://drive.google.com/uc?id=1sVD2LU44sOCBcsk0jXi2I09-Nw4KYcQm
From (redirected): https://drive.google.com/uc?id=1sVD2LU44sOCBcsk0jXi2I09-Nw4KYcQm&confirm=t&uuid=d4408faa-fad1-4c26-81ed-89fe722c2f49
To: /content/mnlimodel.pth
100% 1.12G/1.12G [00:09<00:00, 120MB/s]
Archive:  MedNLI_dataset.zip
replace MedNLI_dataset/valid-00000-of-00001-cc552de6d1a6fa4b.parquet? [y]es, [n]o, [A]ll, [N]one, [r]ename: A
  inflating: MedNLI_dataset/valid-00000-of-00001-cc552de6d1a6fa4b.parquet  
  inflating: MedNLI_dataset/train-00000-of-00001-210cfe9263b99806.parquet  
  inflating: MedNLI_dataset/test-00000-of-00001-47685aa42db61e77.parquet  
Archive:  MedNL

In [2]:
import pandas as pd
import torch
from torch.utils.data import Dataset,DataLoader
from sklearn.metrics import accuracy_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification,DebertaV2Tokenizer, DebertaV2ForSequenceClassification, Trainer, TrainingArguments
import pandas as pd
import torch
from datasets import Dataset as DS

In [3]:
# English MedNLI splits, stored as parquet files.
en_train_data = pd.read_parquet(
    'MedNLI_dataset/train-00000-of-00001-210cfe9263b99806.parquet',
    engine='fastparquet',
)
en_valid_data = pd.read_parquet(
    'MedNLI_dataset/valid-00000-of-00001-cc552de6d1a6fa4b.parquet',
    engine='fastparquet',
)
en_test_data = pd.read_parquet(
    'MedNLI_dataset/test-00000-of-00001-47685aa42db61e77.parquet',
    engine='fastparquet',
)

# Persian MedNLI splits, stored as Excel workbooks.
fa_train_data = pd.read_excel('MedNLI_fa_dataset/mednlitrain_fa.xlsx')
fa_valid_data = pd.read_excel('MedNLI_fa_dataset/mednlidev_fa.xlsx')
fa_test_data = pd.read_excel('MedNLI_fa_dataset/mednlitest_fa.xlsx')

In [11]:
def get_label_from_str(st):
    """Convert an NLI label name into a one-hot list.

    The positions are ordered (entailment, neutral, contradiction).

    Args:
        st: One of 'entailment', 'neutral' or 'contradiction'.

    Returns:
        A new three-element list of ints with a single 1 at the label's position.

    Raises:
        ValueError: If ``st`` is not one of the three known label names.
            Failing loudly keeps a missing/None label from slipping into the
            training data unnoticed.
    """
    one_hot_by_label = {
        'entailment': [1, 0, 0],
        'neutral': [0, 1, 0],
        'contradiction': [0, 0, 1],
    }
    try:
        # Copy so callers can mutate the result without affecting later calls.
        return list(one_hot_by_label[st])
    except (KeyError, TypeError):
        # TypeError covers unhashable inputs, which are just as invalid.
        raise ValueError(f"unknown NLI label: {st!r}") from None

In [12]:
def find_pre_and_hyp(query):
    """Extract the premise and hypothesis from a tagged query string.

    The premise is the text between "[PRE]" and the following "[HYP]"; the
    hypothesis is the text between that "[HYP]" and the following "OUTPUT:".
    Both are returned with surrounding whitespace stripped.

    Args:
        query: String containing "[PRE]" and "[HYP]" markers, optionally
            followed by an "OUTPUT:" marker.

    Returns:
        A ``(premise, hypothesis)`` tuple of strings.

    Raises:
        ValueError: If "[PRE]" is missing, or no "[HYP]" follows it.
    """
    pre_tag, hyp_tag, out_tag = "[PRE]", "[HYP]", "OUTPUT:"

    pre_at = query.find(pre_tag)
    if pre_at == -1:
        # str.find returns -1 here; slicing with it would yield garbage text.
        raise ValueError("query has no [PRE] marker")
    start_pre = pre_at + len(pre_tag)

    # Search each marker after the previous one so the sections stay ordered.
    hyp_at = query.find(hyp_tag, start_pre)
    if hyp_at == -1:
        raise ValueError("query has no [HYP] marker after [PRE]")
    start_hyp = hyp_at + len(hyp_tag)

    out_at = query.find(out_tag, start_hyp)
    # Without a trailing "OUTPUT:" the hypothesis runs to the end of the string.
    end_hyp = out_at if out_at != -1 else len(query)

    premise = query[start_pre:hyp_at].strip()
    hypothesis = query[start_hyp:end_hyp].strip()
    return premise, hypothesis

In [14]:
# Collect training records from the Persian frame first, then the English one,
# so the resulting dataset mixes both languages.
train_data = []
for source_frame in (fa_train_data, en_train_data):
    for _, sample in source_frame.iterrows():
        one_hot = get_label_from_str(sample['answer'])
        premise, hypothesis = find_pre_and_hyp(sample['query'])
        train_data.append({
            'premise': premise,
            'hypothesis': hypothesis,
            'label': one_hot,
        })

train_df = pd.DataFrame(train_data)
train_ds = DS.from_pandas(train_df)

In [15]:
# Collect validation records from the Persian frame first, then the English one.
val_data = []
for source_frame in (fa_valid_data, en_valid_data):
    for _, sample in source_frame.iterrows():
        one_hot = get_label_from_str(sample['answer'])
        premise, hypothesis = find_pre_and_hyp(sample['query'])
        val_data.append({
            'premise': premise,
            'hypothesis': hypothesis,
            'label': one_hot,
        })

val_df = pd.DataFrame(val_data)
val_ds = DS.from_pandas(val_df)

In [17]:
# Use the GPU when one is available, otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Pretrained multilingual NLI checkpoint that gets fine-tuned below.
model_name = "MoritzLaurer/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
model = model.to(device)


def tokenize_function(examples):
    """Tokenize a batch of premise/hypothesis pairs with max-length padding and truncation."""
    premises = examples['premise']
    hypotheses = examples['hypothesis']
    return tokenizer(premises, hypotheses, padding="max_length", truncation=True)


# Tokenize both splits in batches (validation first, then training).
tokenized_val_ds = val_ds.map(tokenize_function, batched=True)
tokenized_train_ds = train_ds.map(tokenize_function, batched=True)

# Hyper-parameters for a single fine-tuning epoch, evaluated once per epoch.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,
    evaluation_strategy="epoch",
    per_device_train_batch_size=7,
    per_device_eval_batch_size=7,
    learning_rate=2e-5,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_ds,
    eval_dataset=tokenized_val_ds,
)

# Run fine-tuning, then write the resulting model to disk.
trainer.train()
trainer.save_model("./fine_tuned_mednli_model")

Map:   0%|          | 0/2790 [00:00<?, ? examples/s]

Map:   0%|          | 0/22464 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss
1,0.4883,0.477


In [6]:
def get_max_indx(dictionary):
    """Return the key whose value is largest, or None for an empty/falsy mapping.

    When several keys share the largest value, the first one in iteration
    order is returned.
    """
    return max(dictionary, key=lambda k: dictionary[k]) if dictionary else None

In [7]:
def inference(premise, hypothesis):
    """Predict the NLI label for one premise/hypothesis pair.

    Uses the notebook-level ``tokenizer``, ``model`` and ``device``. The caller
    is responsible for putting the model in evaluation mode.

    Args:
        premise: Premise text.
        hypothesis: Hypothesis text.

    Returns:
        'entailment', 'neutral' or 'contradiction' -- whichever class has the
        highest predicted probability.
    """
    label_names = ["entailment", "neutral", "contradiction"]

    encoded = tokenizer(premise, hypothesis, truncation=True, return_tensors="pt")
    # Forward every tensor the tokenizer produced (not only input_ids) so the
    # model receives the attention mask it was tokenized with.
    encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

    with torch.no_grad():
        logits = model(**encoded)["logits"][0]

    # Rank on the raw probabilities: rounding them first could turn a narrow
    # win into a tie and pick the wrong class.
    probabilities = torch.softmax(logits, -1).tolist()
    scores = dict(zip(label_names, probabilities))
    return get_max_indx(scores)

In [8]:
def get_model_acc(model, datas):
    """Compute the fraction of rows whose predicted label equals the gold label.

    Args:
        model: Unused. Kept for interface compatibility; predictions come from
            ``inference``, which reads the notebook-level ``model``.
        datas: DataFrame with a 'query' column (tagged premise/hypothesis
            text) and an 'answer' column (gold label name).

    Returns:
        Accuracy as a float in [0, 1], or NaN when ``datas`` has no rows
        (accuracy is undefined there, and this avoids a ZeroDivisionError).
    """
    total = len(datas)
    if total == 0:
        return float('nan')

    correct = 0
    for _, row in datas.iterrows():
        premise, hypothesis = find_pre_and_hyp(row['query'])
        if inference(premise, hypothesis) == row['answer']:
            correct += 1
    return correct / total

In [26]:
# Put the model in evaluation mode, then report accuracy on each English split.
model.eval()
print(f'Accuracy on english test_set is: {get_model_acc(model, en_test_data)}')
print(f'Accuracy on english valid_set is: {get_model_acc(model, en_valid_data)}')
# Label typo fixed: "trian_set" -> "train_set".
print(f'Accuracy on english train_set is: {get_model_acc(model, en_train_data)}')

Accuracy on english test_set is: 0.8129395218002813
Accuracy on english valid_set is: 0.8458781362007168
Accuracy on english trian_set is: 0.9001958689458689


In [27]:
# Put the model in evaluation mode, then report accuracy on each Persian split.
model.eval()
print(f'Accuracy on persian test_set is: {get_model_acc(model, fa_test_data)}')
print(f'Accuracy on persian valid_set is: {get_model_acc(model, fa_valid_data)}')
# Label typo fixed: "trian_set" -> "train_set".
print(f'Accuracy on persian train_set is: {get_model_acc(model, fa_train_data)}')

Accuracy on persian test_set is: 0.7946554149085795
Accuracy on persian valid_set is: 0.8093189964157707
Accuracy on persian trian_set is: 0.8682336182336182


In [21]:
import shutil
from google.colab import drive

# Local checkpoint file and its destination on the mounted Google Drive.
# Note: despite its name, destination_folder is a full file path.
source_file_path = "mnlimodel.pth"
destination_folder = "/content/drive/My Drive/mnlimodel.pth"

drive.mount('/content/drive')
# Serialize the model object locally, then copy the file onto Drive.
torch.save(model, source_file_path)
shutil.copy(source_file_path, destination_folder)

Mounted at /content/drive


'/content/drive/My Drive/mnlimodel.pth'