In [None]:
!pip install datasets nltk transformers torch PyTorch

clear_output()

In [1]:
from IPython.display import clear_output
import numpy as np
import pandas as pd
import nltk
from nltk.tokenize import RegexpTokenizer
import warnings
from transformers import AutoTokenizer
from datasets import load_dataset
from transformers import BertTokenizer, BertModel
import torch

# Suppress all warnings in cells printouts for clear output
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


# Load dataset

In [2]:
dataset = load_dataset("medical_questions_pairs")

for split in dataset:
    print(split)



train


# Parse and clean dataset


In [3]:
df = pd.DataFrame(dataset['train'])
df.head()

Unnamed: 0,dr_id,question_1,question_2,label
0,1,After how many hour from drinking an antibioti...,I have a party tonight and I took my last dose...,1
1,1,After how many hour from drinking an antibioti...,I vomited this morning and I am not sure if it...,0
2,1,Am I over weight (192.9) for my age (39)?,I am a 39 y/o male currently weighing about 19...,1
3,1,Am I over weight (192.9) for my age (39)?,What diet is good for losing weight? Keto or v...,0
4,1,Aspirin allergy - is it worth getting a bracelet?,How much Aspirin can I take for my headache wi...,0


In [4]:
question_1 = 'question_1'
question_2 = 'question_2' # column names as vars for convenienct

In [5]:
df.drop('dr_id', axis=1, inplace=True)  # keep only question pairs and labels. Label 1 means match

In [6]:
df.shape

(3048, 3)

Properties of a good set:
1. No paired duplicates (no rows with same q1 and q2 values as a pair). Ensure removal of inverse duplication:
   q1,q2 and q2,q1
2. No NaN
3. Many to many relation is possible

In [7]:
# Check for NaN and remove
nan_cols = df.isnull().any()
print(nan_cols)

question_1    False
question_2    False
label         False
dtype: bool


In [8]:
# Remove paired duplicates 
df['isometric_pair'] = df.apply(lambda x: tuple(sorted([x[question_1], x[question_2]])), axis=1)

# Remove duplicates based on the normalized pairs
df = df.drop_duplicates(subset=['isometric_pair'])

# Drop the auxiliary column
df = df.drop(columns=['isometric_pair'])

In [9]:
df.shape  # see if something was removed

(3048, 3)

Outcome: clean dataset without pairs of duplicates

# Embedding: transform each question into vector

    1. Tokenize questions to words
    2. Embed words to vectors with BERT
    3. Make an average vectors to represent the sentences (or weighted average: like TF-IDF scores, giving more importance to certain words.)    
    4. Collect vectorized Q1 and Q2 questions sets into two dataframes
    5. Precalculate table of distances for each q1 to each q2
    6. For specific q1, get its distances vector from all Q2 and select 5 closest
    7. Based on indices, return the original Q2 questions


In [10]:
def tokenize(question):
    """ Break question sentence to word tokens without punctuation """
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(question) # split
    tokens = [token.lower() for token in tokens] # decapitalize
     
    #tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # native BERT format preserves punctuation as well, not needed
    #tokens = tokenizer.tokenize(question)
    
    return tokens


In [11]:
def embed(tokens):
    """ Embed tokenized sentence to one vector """
    bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    
    # Convert custom tokens into a string for BERT special format
    custom_text = ' '.join(tokens)
    
    # Tokenize this text using BERT's tokenizer
    bert_inputs = bert_tokenizer(custom_text, return_tensors="pt", padding=True, truncation=True)
    
    # Pass the tokenized inputs through the BERT model
    bert_model = BertModel.from_pretrained("bert-base-uncased")
    with torch.no_grad():
        bert_outputs = bert_model(**bert_inputs)
    
    # Extracting embeddings
    embeddings = bert_outputs.last_hidden_state
    embedded_question = embeddings.mean(dim=1)[0]  # single vector representing the entire question.
    return embedded_question.detach().numpy()
    

In [105]:
def calculate_distances_matrix(v1, v2):
   
    """ Calculate cosine distance between a vector v1 and each vector in df_v2 """
    # Compute the norms of each row vector
    v1_norm = np.sqrt(np.sum(v1 ** 2, axis=1)).reshape(-1, 1)
    v2_norm = np.sqrt(np.sum(v2 ** 2, axis=1))

    # Compute the dot product between v1 and the transpose of v2
    dot_product = np.dot(v1, v2.T)

    # Compute the cosine distances
    cosine_distances = dot_product / (v1_norm * v2_norm)
   

    return cosine_distances

    

In [42]:
def tokenize_embed(x):
    return embed(tokenize(x))

In [100]:
def return_x_matches(df, question_index, table_of_distances, x=1):
    list_of_distances = table_of_distances.loc[question_index].sort_values().head(x)  
    indices = list_of_distances.index
    matches = df.loc[indices, question_2].astype(str)
    print('Labels of matches: ', df.loc[indices, 'label'].values)
    return matches    

In [110]:
pd.set_option('display.max_colwidth', None)

In [111]:
# Test out on a slice of the original dataframe
SLICE_SIZE = 5

embedded_Q1_df=pd.DataFrame(df.head(SLICE_SIZE)[question_1].apply(lambda x: tokenize_embed(str(x))))
embedded_Q2_df=pd.DataFrame(df.head(SLICE_SIZE)[question_2].apply(lambda x: tokenize_embed(str(x))))

v1 = np.vstack(embedded_Q1_df.values)
v2 = np.vstack(embedded_Q2_df.values)

print(embedded_Q1_df.head(3))
print('STACKED')
print(v1[0:2])

# DataFrame of 1D DataFrames : each entry is a vector of distances of Q1_i question from Q2 questions

distances_matrix = calculate_distances_matrix(v1, v2)

distances_Q1_Q2_df = pd.DataFrame(distances_matrix, index = embedded_Q1_df.index, columns = embedded_Q2_df.index)

distances_Q1_Q2_df.head()



                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        

TypeError: loop of ufunc does not support argument 0 of type numpy.ndarray which has no callable sqrt method

In [104]:
index = 3
question1 = df.loc[index, question_1]
matches = return_x_matches(df, index, distances_Q1_Q2_df, 5)
print(f'Matches for "{question1}" are:')
print(matches.to_string(index=False))

Labels of matches:  [0 0 0 1 0]
Matches for "Am I over weight (192.9) for my age (39)?" are:
                   Can CBT be used to treat anxiety?
           What are the side effects of Doxycycline?
What are the side effects of the treatment for ADHD?
    What are the symptoms of depression and anxiety?
         Which is the best antibiotic for sinusitis?
