### Load the Dependencies

In [None]:
!pip install sentence_transformers
!pip install datasets


### Load the Data from huggingface

In [2]:
from datasets import load_dataset


In [3]:
dataset = load_dataset('medical_questions_pairs')


Downloading builder script:   0%|          | 0.00/2.83k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.22k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.98k [00:00<?, ?B/s]

Downloading and preparing dataset medical_questions_pairs/default to /root/.cache/huggingface/datasets/medical_questions_pairs/default/0.0.0/db30a35b934dceb7abed5ef6b73a432bb59682d00e26f9a1acd960635333bc80...


Downloading data:   0%|          | 0.00/174k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3048 [00:00<?, ? examples/s]

Dataset medical_questions_pairs downloaded and prepared to /root/.cache/huggingface/datasets/medical_questions_pairs/default/0.0.0/db30a35b934dceb7abed5ef6b73a432bb59682d00e26f9a1acd960635333bc80. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [6]:
dataset = dataset['train']


In [7]:
dataset[0]

{'dr_id': 1,
 'question_1': 'After how many hour from drinking an antibiotic can I drink alcohol?',
 'question_2': 'I have a party tonight and I took my last dose of Azithromycin this morning. Can I have a few drinks?',
 'label': 1}

In [13]:
len(dataset)

3048

In [15]:
num_examples = len(dataset)
column_names = dataset.column_names

### Finetune the model

In [8]:

from sentence_transformers import SentenceTransformer, InputExample, losses
import pandas as pd
import numpy as np

# Load the SBERT model
model = SentenceTransformer('bert-base-nli-mean-tokens')

Downloading (…)821d1/.gitattributes:   0%|          | 0.00/391 [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)8d01e821d1/README.md:   0%|          | 0.00/3.95k [00:00<?, ?B/s]

Downloading (…)d1/added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading (…)01e821d1/config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)821d1/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/399 [00:00<?, ?B/s]

Downloading (…)8d01e821d1/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)1e821d1/modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

In [16]:
# Create a list of InputExample objects
train_examples = []
for i in range(0,int(0.8*num_examples),1):
  row = dataset[i]
  labelf = float(row['label'])
  train_examples.append(InputExample(texts=[row['question_1'], row['question_2']], label=labelf))

test_examples = []
for i in range(int(0.8*num_examples),int(num_examples),1):
  row = dataset[i]
  labelf = float(row['label'])
  test_examples.append(InputExample(texts=[row['question_1'], row['question_2']], label=labelf))



In [17]:
print(len(train_examples),len(test_examples))

2438 610


In [18]:
# Define the training loss function
train_loss = losses.CosineSimilarityLoss(model)

In [27]:
from torch.utils.data import DataLoader
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)
val_dataloader = DataLoader(test_examples, shuffle=True, batch_size=16)


In [20]:
# Define your fine-tuning parameters
num_epochs = 3
warmup_steps = 100
learning_rate = 2e-5
# Define your loss function
train_loss = losses.CosineSimilarityLoss(model=model)

In [28]:
model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=2, warmup_steps=100)

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/153 [00:00<?, ?it/s]

Iteration:   0%|          | 0/153 [00:00<?, ?it/s]

In [30]:
from sentence_transformers import evaluation
from sentence_transformers import util

In [33]:
preds_new = []
for i in range(int(0.8*num_examples),int(num_examples),1):
  row = dataset[i]
  labelf = float(row['label'])
  sentences1 = row['question_1']
  sentences2 = row['question_2']
  #test_examples.append(InputExample(texts=[row['question_1'], row['question_2']], label=labelf))
  #Compute embedding for both lists
  embeddings1 = model.encode(sentences1, convert_to_tensor=True)
  embeddings2 = model.encode(sentences2, convert_to_tensor=True)

  #Compute cosine-similarities
  cosine_scores = util.cos_sim(embeddings1, embeddings2)
  preds_new.append(cosine_scores.item())

### Compare performance with the original model

In [34]:
model_old = SentenceTransformer('bert-base-nli-mean-tokens')

In [35]:
preds_old = []
labels = []
for i in range(int(0.8*num_examples),int(num_examples),1):
  row = dataset[i]
  labelf = float(row['label'])
  sentences1 = row['question_1']
  sentences2 = row['question_2']
  #test_examples.append(InputExample(texts=[row['question_1'], row['question_2']], label=labelf))
  #Compute embedding for both lists
  embeddings1 = model_old.encode(sentences1, convert_to_tensor=True)
  embeddings2 = model_old.encode(sentences2, convert_to_tensor=True)

  #Compute cosine-similarities
  cosine_scores = util.cos_sim(embeddings1, embeddings2)
  preds_old.append(cosine_scores.item())
  labels.append(labelf)

In [37]:
labels[0]

1.0

In [38]:
### Generally we shuld have improvement in cosine similarity for positive labels 
### And decrease in performance for negative or zero labels
check_improvement = 0
for i in range(len(preds_old)):
  if labels[i]>0.5:
    check_improvement += preds_new[i] - preds_old[i]
  else:
    check_improvement += preds_old[i] - preds_new[i] 



In [39]:
### Check percentage improvement
check_improvement/len(labels)

0.13154137970299506