In [41]:
import os
from dotenv import load_dotenv
load_dotenv()

os.environ["GROQ_API_KEY"] = os.getenv("GROQ_API_KEY")
os.environ["GOOGLE_API_KEY"] = os.getenv("GOOGLE_API_KEY")

In [20]:
from langchain_groq import ChatGroq

llm = ChatGroq(model="llama3-8b-8192")

#Dataset Loading And Preparation

In [21]:
from datasets import load_dataset
import pandas as pd

# Load the healthcare conversations dataset
healthcare_conversations_dataset = load_dataset(
    "lavita/ChatDoctor-HealthCareMagic-100k", streaming=True, split="train"
)

healthcare_conversations_dataset = healthcare_conversations_dataset.take(1000)

In [22]:
# Load Drug Review dataset
drug_reviews_dataset = load_dataset(
    "Reboot87/drugs_reviews_dataset", streaming=True, split="train"
)

drug_reviews_dataset = drug_reviews_dataset.take(1000)

In [23]:
# Convert to pandas DataFrame
healthcare_conversations_dataset = pd.DataFrame(healthcare_conversations_dataset)
drug_reviews_dataset = pd.DataFrame(drug_reviews_dataset)

In [24]:
healthcare_conversation_dataset = healthcare_conversations_dataset.drop(
 columns=["instruction"]
)

In [25]:
drug_reviews_dataset = drug_reviews_dataset.drop(
 columns=["patientId", "date", "usefulCount", "review_length"]
)

In [26]:
healthcare_conversation_dataset.head()

Unnamed: 0,input,output
0,I woke up this morning feeling the whole room ...,"Hi, Thank you for posting your query. The most..."
1,My baby has been pooing 5-6 times a day for a ...,Hi... Thank you for consulting in Chat Doctor....
2,"Hello, My husband is taking Oxycodone due to a...","Hello, and I hope I can help you today.First, ..."
3,lump under left nipple and stomach pain (male)...,HI. You have two different problems. The lump ...
4,I have a 5 month old baby who is very congeste...,Thank you for using Chat Doctor. I would sugge...


In [27]:
drug_reviews_dataset.head()

Unnamed: 0,drugName,condition,review,rating
0,cyclosporine,keratoconjunctivitis sicca,"""I have used Restasis for about a year now and...",2
1,etonogestrel,birth control,"""My experience has been somewhat mixed. I have...",7
2,implanon,birth control,"""This is my second Implanon would not recommen...",1
3,hydroxyzine,anxiety,"""I recommend taking as prescribed, and the bot...",10
4,dalfampridine,multiple sclerosis,"""I have been on Ampyra for 5 days and have bee...",9


#HuggingFace Embedding generation

In [31]:
from langchain_google_genai import  GoogleGenerativeAIEmbeddings

def get_embedding(text, task_prefix="document"):
    """
    Generate embeddings for a text string with a task-specific prefix using th
    Parameters:
    text (str): The input text to be embedded.
    task_prefix (str): A prefix describing the task; this is prepended to 
    Returns:
    list: The generated embeddings as a list of floats.
    """
    if not text.strip():
        print("Attempted to get embedding for empty text.")
        return []

    embedding_model = GoogleGenerativeAIEmbeddings(
    model="models/embedding-001",
    )
    return embedding_model.embed_query(text)


In [32]:
def generate_embedding_for_healthcare_dataset(row):
    """
    Generate an embedding for a conversation by concatenating the patient's input
    and the medical practitioner's response.
    Parameters:
    row (pd.Series): A row from the healthcare conversation dataset containing
    - 'input': The patient's message.
    - 'output': The practitioner's response.
    Returns:
    embedding: The embedding vector generated from the concatenated conversation.
    """
    # Concatenate the input and output with descriptive text.
    conversation_text = (
    f"This is the input from the patient: {row['input']}. "
    f"This is the response from the medical practitioner: {row['output']}"
    )
    # Generate and return the embedding using the get_embedding function.
    return get_embedding(conversation_text)

In [33]:
from tqdm import tqdm

tqdm.pandas(desc="Generating healthcare embeddings")
healthcare_conversation_dataset["embedding"] = healthcare_conversation_dataset.progress_apply(
    generate_embedding_for_healthcare_dataset, axis=1
)

Generating healthcare embeddings: 100%|██████████| 1000/1000 [13:36<00:00,  1.22it/s]


In [35]:
len(healthcare_conversation_dataset["embedding"][0])

768

In [36]:
healthcare_conversation_dataset.head()

Unnamed: 0,input,output,embedding
0,I woke up this morning feeling the whole room ...,"Hi, Thank you for posting your query. The most...","[0.0020050371531397104, -0.05553682520985603, ..."
1,My baby has been pooing 5-6 times a day for a ...,Hi... Thank you for consulting in Chat Doctor....,"[-0.003632515901699662, -0.011232027783989906,..."
2,"Hello, My husband is taking Oxycodone due to a...","Hello, and I hope I can help you today.First, ...","[-0.03484662249684334, -0.014810346066951752, ..."
3,lump under left nipple and stomach pain (male)...,HI. You have two different problems. The lump ...,"[0.004324246663600206, -0.006830697879195213, ..."
4,I have a 5 month old baby who is very congeste...,Thank you for using Chat Doctor. I would sugge...,"[0.008385895751416683, -0.05430954694747925, -..."


In [37]:
drug_reviews_dataset["embedding"] = drug_reviews_dataset["review"].progress_apply(
    get_embedding
)

Generating healthcare embeddings: 100%|██████████| 1000/1000 [12:38<00:00,  1.32it/s]


In [38]:
drug_reviews_dataset.head()

Unnamed: 0,drugName,condition,review,rating,embedding
0,cyclosporine,keratoconjunctivitis sicca,"""I have used Restasis for about a year now and...",2,"[0.057198747992515564, -0.03319098800420761, -..."
1,etonogestrel,birth control,"""My experience has been somewhat mixed. I have...",7,"[-0.007726987358182669, -0.022331252694129944,..."
2,implanon,birth control,"""This is my second Implanon would not recommen...",1,"[0.005060719326138496, -0.060500070452690125, ..."
3,hydroxyzine,anxiety,"""I recommend taking as prescribed, and the bot...",10,"[0.018926484510302544, -0.03552955016493797, -..."
4,dalfampridine,multiple sclerosis,"""I have been on Ampyra for 5 days and have bee...",9,"[0.03307104483246803, 0.04259384423494339, -0...."


#MongoDB Creation

In [81]:
os.environ["MONGODB_URI"] = os.getenv("MONGODB_URI")
os.environ["MONGODB_DATABASE"] = os.getenv("MONGODB_DATABASE")

In [55]:
import nest_asyncio
nest_asyncio.apply()
import asyncio

In [56]:
import pymongo

async def get_mongo_client(mongo_uri=None):
   """Establish a connection to the MongoDB database."""
   client = await pymongo.AsyncMongoClient(
      mongo_uri, app_name="Agentic RAG Medical Bot"
    )
   ping_result = client.admin.command("ping")
   if ping_result.get("ok") == 1.0:
        return "Connection to MongoDB is Successful"
        return client
   else:
      return "Connection to MongoDB Failed"
   return None

In [79]:
import getpass
from urllib.parse import quote_plus

#username_safe = quote_plus(getpass.getpass("USER_NAME: "))
password_safe = quote_plus(getpass.getpass("PASSWORD: "))

In [82]:
MONGODB_URI = os.environ.get("MONGODB_URI")
MONGODB_URI

'mongodb+srv://Gayaz2000:Gayaz@2000@cluster0.mongodb.net/test?retryWrites=true&w=majority'

In [77]:
from pymongo.errors import CollectionInvalid

mongo_client = asyncio.run(get_mongo_client(MONGODB_URI))
mongo_client

InvalidURI: Username and password must be escaped according to RFC 3986, use urllib.parse.quote_plus

In [50]:
DB_NAME = "virtual_primary_care_assistant"
DRUG_REVIEW_COLLECTION_NAME = "drug_reviews"
CONVERSATION_COLLECTION_NAME = "conversations"

db = mongo_client[DB_NAME]
db

TypeError: 'coroutine' object is not subscriptable