In [None]:
!pip install datasets nltk transformers torch PyTorch scipy tabulate

clear_output()

In [1]:
from IPython.display import clear_output
import numpy as np
import pandas as pd
from tabulate import tabulate
import nltk
from nltk.tokenize import RegexpTokenizer
import warnings
from transformers import AutoTokenizer
from datasets import load_dataset
from transformers import BertTokenizer, BertModel
import torch
from scipy.spatial.distance import cdist
from sklearn.metrics.pairwise import cosine_similarity

# Suppress all warnings in cells printouts for clear output
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


# Load dataset

In [2]:
dataset = load_dataset("medical_questions_pairs")

for split in dataset:
    print(split)



train


# Parse and clean dataset


In [3]:
df = pd.DataFrame(dataset['train'])
df.head()

Unnamed: 0,dr_id,question_1,question_2,label
0,1,After how many hour from drinking an antibioti...,I have a party tonight and I took my last dose...,1
1,1,After how many hour from drinking an antibioti...,I vomited this morning and I am not sure if it...,0
2,1,Am I over weight (192.9) for my age (39)?,I am a 39 y/o male currently weighing about 19...,1
3,1,Am I over weight (192.9) for my age (39)?,What diet is good for losing weight? Keto or v...,0
4,1,Aspirin allergy - is it worth getting a bracelet?,How much Aspirin can I take for my headache wi...,0


In [4]:
question_1 = 'question_1'
question_2 = 'question_2' # column names as vars for convenienct

In [5]:
df.drop('dr_id', axis=1, inplace=True)  # keep only question pairs and labels. Label 1 means match

In [6]:
df.shape

(3048, 3)

Properties of a good set:
1. No paired duplicates (no rows with same q1 and q2 values as a pair). Ensure removal of inverse duplication:
   q1,q2 and q2,q1
2. Its ok to have duplicates in question_1 as they can have multiple matches
3. No NaN
4. Many to many relation is possible

In [7]:
# Check for NaN and remove
nan_cols = df.isnull().any()
print(nan_cols)

question_1    False
question_2    False
label         False
dtype: bool


In [8]:
# Remove paired duplicates 
df['isometric_pair'] = df.apply(lambda x: tuple(sorted([x[question_1], x[question_2]])), axis=1)

# Remove duplicates based on the normalized pairs
df = df.drop_duplicates(subset=['isometric_pair'])

# Drop the auxiliary column
df = df.drop(columns=['isometric_pair'])

In [9]:
df.shape  # see if something was removed

(3048, 3)

Outcome: clean dataset without pairs of duplicates

In [10]:
# TEST LABLES CORRECTNESS

matches_and_labels = df[df[question_1] == df.loc[100, question_1]]
print('question: ', df.loc[3, question_1])
print(matches_and_labels)

question:  Am I over weight (192.9) for my age (39)?
                                            question_1  \
100  What does a doctor do in externsl cronic anal ...   
101  What does a doctor do in externsl cronic anal ...   

                                            question_2  label  
100  I have bloody diarrhea for 2 days. Should I go...      0  
101  I think I have anal fissures. I have been havi...      1  


# Embedding: transform each question into vector

    1. Tokenize questions to words
    2. Embed words to vectors with BERT
    3. Make an average vectors to represent the sentences (or weighted average: like TF-IDF scores, giving more importance to certain words.)    
    4. Collect vectorized Q1 and Q2 questions sets into two dataframes
    5. Precalculate table of distances for each q1 to each q2
    6. For specific q1, get its distances vector from all Q2 and select 5 closest
    7. Based on indices, return the original Q2 questions


In [11]:
def tokenize(question):
    """ Break question sentence to word tokens without punctuation """
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(question) # split
    tokens = [token.lower() for token in tokens] # decapitalize
    
    return tokens


In [12]:
def embed(tokens):
    """ Embed tokenized sentence to one vector """
    bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    
    # Convert custom tokens into a string for BERT special format
    custom_text = ' '.join(tokens)
    
    # Tokenize this text using BERT's tokenizer
    bert_inputs = bert_tokenizer(custom_text, return_tensors="pt", padding=True, truncation=True)
    
    # Pass the tokenized inputs through the BERT model
    bert_model = BertModel.from_pretrained("bert-base-uncased")
    with torch.no_grad():
        bert_outputs = bert_model(**bert_inputs)
    
    # Extracting embeddings
    embeddings = bert_outputs.last_hidden_state
    embedded_question = embeddings.mean(dim=1)[0]  # single vector representing the entire question.
    return embedded_question.detach().numpy()
    

In [13]:
def tokenize_embed(x):
    return embed(tokenize(x))

In [19]:
def return_x_matches_labels(df, question_index, table_of_distances, x):
    print(question_index)
    list_of_distances = table_of_distances.loc[question_index].sort_values(ignore_index=False, ascending=False) # max similarity
    print(f"Indices for question index {question_index}: {list_of_distances.index}")  # Debugging
    indices = list_of_distances.index    
    matches = df.loc[indices][question_2].head(x)
    matches_labels = {}
    matches_list = matches.to_list()
    matches_labels['matches'] = '\n'.join(matches_list)
    matches_labels['labels'] = df.loc[indices]['label'].head(x)
    return matches_labels

In [15]:
pd.set_option('display.max_colwidth', None)

In [22]:
def cosine_similarity_df(df1, df2):
    cosine_sim = cosine_similarity(df1, df2)
    cosine_sim_df = pd.DataFrame(cosine_sim, index=df1.index, columns=df2.index)
    return cosine_sim_df

In [17]:
# Test out on a slice of the original dataframe
SLICE_SIZE_q1 = 10
SLICE_SIZE_q2 = 50

In [23]:

# Since question 1 repeats, we cannot use the same slice size for question 2, it does not take all of the matches


embedded_Q1_df = pd.DataFrame(df.head(SLICE_SIZE_q1)[question_1].apply(tokenize_embed).tolist(), index=df.head(SLICE_SIZE_q1).index)
embedded_Q2_df = pd.DataFrame(df.head(SLICE_SIZE_q2)[question_2].apply(tokenize_embed).tolist(), index=df.head(SLICE_SIZE_q2).index)

distances_q1_q2 = cosine_similarity_df(embedded_Q1_df, embedded_Q2_df)

print(distances_q1_q2)




         0         1         2         3         4         5         6   \
0  0.722900  0.734532  0.686056  0.760159  0.822871  0.737057  0.812205   
1  0.722900  0.734532  0.686056  0.760159  0.822871  0.737057  0.812205   
2  0.580462  0.567667  0.806252  0.696317  0.627978  0.648982  0.692248   
3  0.580462  0.567667  0.806252  0.696317  0.627978  0.648982  0.692248   
4  0.718209  0.678154  0.582673  0.749557  0.774964  0.816553  0.768269   
5  0.718209  0.678154  0.582673  0.749557  0.774964  0.816553  0.768269   
6  0.731136  0.748978  0.676139  0.713212  0.731802  0.809167  0.801188   
7  0.731136  0.748978  0.676139  0.713212  0.731802  0.809167  0.801188   
8  0.588106  0.544555  0.619409  0.627632  0.544783  0.685619  0.656305   
9  0.588106  0.544555  0.619409  0.627632  0.544783  0.685619  0.656305   

         7         8         9   ...        40        41        42        43  \
0  0.717160  0.778693  0.796559  ...  0.749014  0.702434  0.704772  0.659949   
1  0.717160  0

In [27]:
question_index = 9
question1 = df.loc[question_index, question_1]
print(question1)
matches_labels = return_x_matches_labels(df, question_index, distances_q1_q2, 5)
print(matches_labels['matches'])
print(matches_labels['labels'])
accuracy_at_5 = matches_labels['labels'].sum() / len(matches_labels['labels'])
print('Accuracy@5: ', accuracy_at_5)

Been on antibiotics 4 5wks top high tooth dentist cld not get needle 2 freeze 2 extract in gum really hurt she said its the tissues hve 2 go bk? Plz
9
Indices for question index 9: Index([ 8, 21, 25,  7, 27, 13, 37, 19, 31,  5, 12, 46, 28, 36, 15, 42,  6, 29,
       39, 48, 26, 16, 30, 24, 35,  3,  9, 49,  2, 18, 20, 17, 34, 23,  0, 10,
       41, 47, 22, 32, 33, 11, 40,  4,  1, 38, 44, 43, 14, 45],
      dtype='int64')
I am unable to get tooth extraction, my dentist is unable to give me anesthesia. Wouldn't antibiotics help with this since it has been going on for about 4-5 weeks? Should I go to ER?
My sister sent me Dr. Reckeweg R53 Acne Vulgaris And Pimples Drop for my zits. Are you aware if this works?
My husband is a Type 2 Diabetic on Lantus Insulin and Actrapid (to be taken as per his blood sugar readings. About an hour back, he took his usual 30 units of Lantus and vomited. He was sweating as well. Should we be worried?
Today morning, I had an appointment with the doctor. After