In [1]:
# Load the sentence-embedding model (the "multi-qa-distilbert-cos-v1" checkpoint).
# The same `model` object is used later to encode both the dataset rows and the
# search query, so the two sets of vectors are comparable.
from sentence_transformers import SentenceTransformer
model = SentenceTransformer("multi-qa-distilbert-cos-v1")

  from tqdm.autonotebook import tqdm, trange


In [15]:
import pandas as pd
# Load the processed drug/disease/symptom table; the path is relative to the
# notebook's working directory.
# NOTE(review): the preview of this frame shows "Unnamed: 0"-style columns, which
# look like row indexes written by an earlier to_csv call — confirm and consider
# dropping them at the source.
doctor = pd.read_csv('data/processed/doctor.csv')

In [26]:
# Length of the vectors the model produces; the Pinecone index created later
# in this notebook is given the same dimension.
model.get_sentence_embedding_dimension()

768

In [16]:
# Preview the first rows to check the column names used in the cells below.
doctor.head()

Unnamed: 0.1,Unnamed: 0,drug_id,drug_name,type,disease,adult_dosage (times/day),dosage (num_days),name,price_per_tablet,symptom
0,0,1,Amoxycillin,Penicillin,Bacterial Infections,2,7.0,Augmentin 625 Duo Tablet,22.342,"fever, cough"
1,1,2,Ibuprofen,NSAID,Pain Relief,3,5.0,Adiflam Plus 400 mg/500 mg Tablet,1.95,"inflammation, joint pain"
2,2,3,Metformin,Biguanide,Diabetes,2,30.0,Afoglip M 500 Tablet ER,17.053333,"high blood sugar, high insulin"
3,3,5,Cetirizine,Antihistamine,Allergy,1,3.0,Alerid Syrup,0.728,runny nose
4,4,6,Paracetamol,Analgesic,Fever,3,3.0,Acton-OR Tablet SR,5.055,headache


In [18]:
# Attach a metadata dict to every row so that each search hit can be inspected
# and the relevance of the results judged.

# Metadata key -> name of the `doctor` column that supplies its value.
_METADATA_SOURCES = {
    "disease": 'disease',
    "symptoms": 'symptom',
    "treatment_drug": "drug_name",
    "medicine_name": 'name',
    "type_of_drug": 'type',
    "daily_adult_dosage": 'adult_dosage (times/day)',
    "num_days": 'dosage (num_days)',
    "price/unit": 'price_per_tablet',
}


def _build_metadata(row):
    """Return the metadata dict for a single row of `doctor`."""
    return {key: row[column] for key, column in _METADATA_SOURCES.items()}


doctor['metadata'] = doctor.apply(_build_metadata, axis=1)

In [19]:
# Spot-check that the new column holds one dict per row.
doctor['metadata'].head(2)

0    {'disease': 'Bacterial Infections', 'symptoms'...
1    {'disease': 'Pain Relief', 'symptoms': 'inflam...
Name: metadata, dtype: object

In [20]:
# Weighted semantic search: each feature's embedding contributes to the row
# vector in proportion to the weight assigned here.
weight_symptom = 6
weight_disease = 4

def create_embeddings(row):
    """Return a single vector for `row`: the weighted mean of its text embeddings.

    The 'symptom' and 'disease' fields are each encoded with `model`, scaled by
    their weight, summed, and divided by the total weight.
    """
    scaled = [
        model.encode(row[field], show_progress_bar=False) * weight
        for field, weight in (('symptom', weight_symptom), ('disease', weight_disease))
    ]
    return (scaled[0] + scaled[1]) / (weight_symptom + weight_disease)

In [32]:
# Encode every row with create_embeddings and store the resulting vector in a
# new column.
doctor['embedding'] = doctor.apply(create_embeddings, axis=1)

In [41]:
# Confirm the type of each stored embedding; the upsert step later calls
# .tolist() on these values.
print(doctor['embedding'].apply(type))

0     <class 'numpy.ndarray'>
1     <class 'numpy.ndarray'>
2     <class 'numpy.ndarray'>
3     <class 'numpy.ndarray'>
4     <class 'numpy.ndarray'>
               ...           
81    <class 'numpy.ndarray'>
82    <class 'numpy.ndarray'>
83    <class 'numpy.ndarray'>
84    <class 'numpy.ndarray'>
85    <class 'numpy.ndarray'>
Name: embedding, Length: 86, dtype: object


In [29]:
# Display the frame to inspect the added `metadata` and `embedding` columns.
doctor

Unnamed: 0,drug_id,drug_name,type,disease,adult_dosage (times/day),dosage (num_days),name,price_per_tablet,symptom,metadata,embedding
0,1,Amoxycillin,Penicillin,Bacterial Infections,2,7.0,Augmentin 625 Duo Tablet,22.342000,"fever, cough","{'disease': 'Bacterial Infections', 'symptoms'...","[0.04866177588701248, 0.040710460394620895, -0..."
1,2,Ibuprofen,NSAID,Pain Relief,3,5.0,Adiflam Plus 400 mg/500 mg Tablet,1.950000,"inflammation, joint pain","{'disease': 'Pain Relief', 'symptoms': 'inflam...","[-0.06187807396054268, 0.06776778399944305, -0..."
2,3,Metformin,Biguanide,Diabetes,2,30.0,Afoglip M 500 Tablet ER,17.053333,"high blood sugar, high insulin","{'disease': 'Diabetes', 'symptoms': 'high bloo...","[-0.04151008278131485, 0.011696846224367619, -..."
3,5,Cetirizine,Antihistamine,Allergy,1,3.0,Alerid Syrup,0.728000,runny nose,"{'disease': 'Allergy', 'symptoms': 'runny nose...","[-0.004205550067126751, 0.04382673650979996, 0..."
4,6,Paracetamol,Analgesic,Fever,3,3.0,Acton-OR Tablet SR,5.055000,headache,"{'disease': 'Fever', 'symptoms': 'headache', '...","[-0.03125862032175064, 0.08302132785320282, 0...."
...,...,...,...,...,...,...,...,...,...,...,...
81,96,Olmesartan,Angiotensin II Receptor Blocker,Hypertension,1,30.0,Asomex-OH Tablet,17.205000,high blood pressure,"{'disease': 'Hypertension', 'symptoms': 'high ...","[0.001667603151872754, 0.03967957943677902, -0..."
82,97,Colchicine,Anti-Gout Agent,Gout,1,7.0,Colochicine 0.5mg Tablet,1.058000,inflammation,"{'disease': 'Gout', 'symptoms': 'inflammation'...","[-0.09234916418790817, 0.028246885165572166, -..."
83,98,Hydroxyzine,Antihistamine,Anxiety,1,7.0,Atarax 25mg Tablet,5.700000,sedation,"{'disease': 'Anxiety', 'symptoms': 'sedation',...","[0.004015962593257427, 0.026121145114302635, -..."
84,99,Orlistat,Lipase Inhibitor,Obesity,1,30.0,AM Slim 120mg Capsule,45.000000,weight loss,"{'disease': 'Obesity', 'symptoms': 'weight los...","[-0.013387642800807953, 0.021259373053908348, ..."


In [30]:
# Persist the enriched table. index=False keeps the row index out of the file;
# otherwise it is written as an extra unnamed column that shows up as
# "Unnamed: 0" the next time the CSV is read.
# NOTE(review): CSV stores the `metadata` dicts and `embedding` arrays as their
# text representation, so they will not load back as dicts/arrays without
# extra parsing — confirm whether this file is meant to be read back.
doctor.to_csv('data/processed/embeddings.csv', index=False)

In [34]:
# Connect to Pinecone: import the client and load credentials from a .env file.
import os
import pinecone
from pinecone import Pinecone, ServerlessSpec
from dotenv import load_dotenv, find_dotenv

# find_dotenv() locates the .env file; override=True lets its values replace
# variables that are already set in the process environment.
load_dotenv(find_dotenv(),override=True)

True

In [35]:
# Build the Pinecone client from environment variables. os.environ.get returns
# None when a variable is missing, so a missing key only surfaces inside the client.
# NOTE(review): the ServerlessSpec-style client used in this notebook may not need
# (or may ignore) `environment` — confirm against the installed client version.
pc = Pinecone( api_key = os.environ.get("PINECONE_API_KEY"), environment = os.environ.get("PINECONE_ENV"))

In [36]:
# Start from a clean slate by dropping any previous copy of the index.
# The existence check keeps a fresh run (no index yet) from failing, since
# deleting an index that does not exist raises an error.
if 'medi-sense-ai' in pc.list_indexes().names():
    pc.delete_index('medi-sense-ai')

In [37]:
# Create the index that stores the embeddings (if it is missing) and open a
# handle to it.
embedding_dimension = 768 #model.get_sentence_embedding_dimension()
index_name = 'medi-sense-ai'
# list_indexes() yields index description objects rather than plain strings,
# so a bare `index_name not in pc.list_indexes()` is always true and would try
# to re-create an existing index. Compare against the list of names instead.
if index_name not in pc.list_indexes().names():
    pc.create_index(index_name,
                    dimension=embedding_dimension,
                    metric='cosine',
                    spec=ServerlessSpec(
                        cloud = "aws",
                        region = "us-east-1"
                    ))
index = pc.Index(index_name)

In [48]:
# Build one (id, values, metadata) tuple per row: the drug_id as a string, the
# embedding as a plain list of floats, and the metadata dict.
vectors_to_upsert = [
    (str(drug_id), embedding.tolist(), metadata)
    for drug_id, embedding, metadata in zip(
        doctor['drug_id'], doctor['embedding'], doctor['metadata']
    )
]

In [49]:
# Send all prepared vectors to the index in a single request, then print a
# confirmation once the call returns.
index.upsert(vectors = vectors_to_upsert)
print("data upserted")

data upserted


In [54]:
# Smoke-test retrieval: encode a free-text symptom list, fetch the nearest
# records, and print the metadata of every hit that clears the threshold.
query = 'fever, headache, cough, nausea, vomiting'
score_threshold = 0.3  # hits scoring below this are not printed
# .tolist() already yields one flat list of floats, which is the shape the
# `vector` argument is documented to take, so it is passed as-is rather than
# wrapped in another list.
query_embedding = model.encode(query, show_progress_bar=False).tolist()
query_results = index.query(
    vector=query_embedding,
    top_k=2,
    include_metadata=True
)
for match in query_results['matches']:
    if match['score'] < score_threshold:
        continue

    # Every field falls back to 'N/A' when it is absent from the stored metadata.
    details = match.get('metadata', {})
    possible_disease = details.get('disease', 'N/A')
    medicine_name = details.get('medicine_name', 'N/A')
    treatment_drug = details.get('treatment_drug', 'N/A')
    type_of_drug = details.get('type_of_drug', 'N/A')
    symptoms = details.get('symptoms', 'N/A')
    daily_adult_dosage = details.get('daily_adult_dosage', 'N/A')
    num_days = details.get('num_days', 'N/A')
    price = details.get('price/unit', 'N/A')

    print(f"Matched item ID: {match['id']}, Score: {match['score']}")
    print(f"possible_disease: {possible_disease} \n treatment_drug: {treatment_drug} \n type_of_drug: {type_of_drug} \n medicine_name: {medicine_name} \n symptoms: {symptoms} \n daily_adult_dosage: {daily_adult_dosage} \n num_days: {num_days} \n price: {price}")

Matched item ID: 1, Score: 0.671745598
possible_disease: Bacterial Infections 
 treatment_drug: Amoxycillin 
 type_of_drug: Penicillin 
 medicine_name: Augmentin 625 Duo Tablet 
 symptoms: fever, cough 
 daily_adult_dosage: 2.0 
 num_days: 7.0 
 price: 22.342
Matched item ID: 20, Score: 0.614263475
possible_disease: Nausea 
 treatment_drug: Ondansetron 
 type_of_drug: Antiemetic 
 medicine_name: Anset 4mg Tablet MD 
 symptoms: vomiting 
 daily_adult_dosage: 1.0 
 num_days: 5.0 
 price: 4.9
