In [1]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_pinecone import PineconeVectorStore
from langchain_community.document_loaders import DirectoryLoader,PyPDFLoader
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_google_genai import ChatGoogleGenerativeAI,GoogleGenerativeAIEmbeddings
from langchain_core.runnables import RunnableParallel,RunnablePassthrough,RunnableLambda
from langchain_core.output_parsers import StrOutputParser
from langchain_core.messages import HumanMessage
from langchain.retrievers import EnsembleRetriever,ContextualCompressionRetriever
from langchain_community.retrievers import BM25Retriever
from langchain_cohere import CohereRerank
from langchain_groq import ChatGroq
from dotenv import load_dotenv
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
load_dotenv()

  from .autonotebook import tqdm as notebook_tqdm


True

### Model Loading

In [3]:

callback = [StreamingStdOutCallbackHandler()]

In [4]:
embedding_model = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
model = ChatGroq(model="llama-3.1-8b-instant",streaming=True,callbacks=callback)

In [5]:
model.invoke("pakistan 5 fact")

Here are five interesting facts about Pakistan:

1. **Strategic Location**: Pakistan is located at the crossroads of Asia, bordering four countries: India to the east, Afghanistan to the west, Iran to the southwest, and China to the north. Its unique location makes it a vital trade and transportation hub between Central Asia and South Asia.

2. **Mountainous Terrain**: Pakistan is home to some of the world's highest and most rugged mountains, including the Karakoram and Himalayan ranges. K2, the second-highest mountain in the world, is located in the Karakoram range in Pakistan. These mountains have created several valleys and mountain passes, which are home to a diverse range of flora and fauna.

3. **Cultural Heritage**: Pakistan has a rich cultural heritage, with a history dating back over 4,000 years. The country is home to several UNESCO World Heritage Sites, including the ancient cities of Mohenjo-Daro and Harappa, which were once part of the Indus Valley Civilization. Pakistan i

AIMessage(content="Here are five interesting facts about Pakistan:\n\n1. **Strategic Location**: Pakistan is located at the crossroads of Asia, bordering four countries: India to the east, Afghanistan to the west, Iran to the southwest, and China to the north. Its unique location makes it a vital trade and transportation hub between Central Asia and South Asia.\n\n2. **Mountainous Terrain**: Pakistan is home to some of the world's highest and most rugged mountains, including the Karakoram and Himalayan ranges. K2, the second-highest mountain in the world, is located in the Karakoram range in Pakistan. These mountains have created several valleys and mountain passes, which are home to a diverse range of flora and fauna.\n\n3. **Cultural Heritage**: Pakistan has a rich cultural heritage, with a history dating back over 4,000 years. The country is home to several UNESCO World Heritage Sites, including the ancient cities of Mohenjo-Daro and Harappa, which were once part of the Indus Valley

### Streaming Live Result

In [None]:
for chunk in model.stream("tell me about Pakistan in 7 lines"):
    print(chunk.content, end="")

PakistanPakistan, officially the, officially the Islamic Republic of Pakistan, is a country in South Asia. It's the fifth Islamic Republic of Pakistan, is a country in South Asia. It's the fifth-most populous country globally and has a diverse landscape, including deserts, mountains, and-most populous country globally and has a diverse landscape, including deserts, mountains, and fertile plains.  Its history is intertwined with the partition of India in 1947, creating a separate nation for Muslims.  Islam is the state religion fertile plains.  Its history is intertwined with the partition of India in 1947, creating a separate nation for Muslims.  Islam is the state religion, and the country has a rich cultural heritage influenced by Persian, Central Asian, and Indian traditions.  Pakistan faces ongoing challenges related to political instability, economic development, and the country has a rich cultural heritage influenced by Persian, Central Asian, and Indian traditions.  Pakistan fac

### Document Loading

In [6]:

def doc_load(path):
    return DirectoryLoader(path,glob="*.pdf",loader_cls=PyPDFLoader).load()


path_to_dentist = "data/Dentist"
path_to_biology = "data/school(9th,10th,11th)"

dentist_document = doc_load(path_to_dentist)
biology_document = doc_load(path_to_biology)

print("len of biology ",len(biology_document))
print("len of dentist ",len(dentist_document))

len of biology  565
len of dentist  1449


### Text Chunking

In [7]:
def text_splitter(doc):
    return RecursiveCharacterTextSplitter(chunk_size=700,chunk_overlap=140).split_documents(doc)

biology_chunks = text_splitter(biology_document)
dentist_chunks = text_splitter(dentist_document)

print("len of biology chunnks:",len(dentist_chunks))
print("len of dentist chunnks:",len(biology_chunks))

len of biology chunnks: 8621
len of dentist chunnks: 1040


### Creating or Loading Vector store Index(Pinecone)

In [10]:
from pinecone import Pinecone,ServerlessSpec
import os

api = os.getenv("PINECONE_API_KEY")
env = os.getenv("PINECONE_ENV")

pc = Pinecone(api_key=api)

if "multimodal-rag" not in pc.list_indexes().names():
    pc.create_index(
        dimension=768,
        name="multimodal-rag",
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
    )
print("connected to pinecone database")

connected to pinecone database


### Vector Store

In [None]:
from tqdm import tqdm     # tqdm is a progress bar library that helps you visualize loops.

batch_size=50
for i in tqdm(range(0,len(biology_chunks),batch_size)):       # as size of chunks is larger than 4mb we embedd the data in batches
    batch = biology_chunks[i:i+batch_size]
    Biology_vector_store = PineconeVectorStore.from_documents(
        documents=batch,
        embedding=embedding_model,
        index_name="multimodal-rag"
    )


for i in tqdm(range(0,len(dentist_chunks),batch_size)):       
    batch = dentist_chunks[i:i+batch_size]
    dentist_vector_store = PineconeVectorStore.from_documents(
        documents=batch,
        embedding=embedding_model,
        index_name="multimodal-rag"
    )

100%|██████████| 42/42 [06:41<00:00,  9.57s/it]


In [19]:
vector_store = PineconeVectorStore(
    index_name="multimodal-rag",
    embedding=embedding_model,
)

### Retriever

### Sparse Retriever

In [69]:
bm25_retriever = BM25Retriever.from_documents(chunks)
bm25_retriever.k=3

### Dense Retriever

In [71]:
dense_retriever = vector_store.as_retriever(search_kwargs={"k":3})

In [72]:
hybrid_retrieval = EnsembleRetriever(
    retrievers=[bm25_retriever,dense_retriever],
    weights=[0.5,0.5]
)

In [74]:
reranker = CohereRerank(model = "rerank-english-v3.0")

In [77]:
final_retriever = ContextualCompressionRetriever(
    base_retriever=hybrid_retrieval,
    base_compressor=reranker
)

In [21]:
parser = StrOutputParser()
prompt_text = PromptTemplate(
    template="""You are a highly accurate medical assistant.
        Use ONLY the given context to answer the user's question.
        If the context does not contain the information needed, simply reply:
        "I don't know based on the given context."
        CONTEXT:
        {context}
        QUESTION:
        {question}
        Your Answer:""",
input_variables=["context", "question"])

In [None]:
# model_image = ChatGoogleGenerativeAI(model = "gemini-1.5-flash")

model_image = ChatGroq(model=)


### Text Generation Chain

In [86]:

def context_format(context):
    return "\n\n".join(doc.page_content for doc in context)

parallel_chain = RunnableParallel({
      "context": final_retriever | RunnableLambda(context_format),
      "question":RunnablePassthrough()
  })

text_chain = parallel_chain | prompt_text | model | parser

In [87]:
text_chain.invoke("summary of the text in 7 lines")

Here's a summary of the text in 7 lines:

The patient report is used to seek medical help with personal and health information.
It includes the patient's name, age, and location.
The report describes the main sickness or problem, its duration, and onset.
Normal conditions mentioned include mask of pregnancy and menstrual period.
Abnormal conditions include mastitis (breast abscess) and menopause.
The report also mentions micro-organisms as part of general health information.
It's published by Hesperian Health Guides for health guides.

"Here's a summary of the text in 7 lines:\n\nThe patient report is used to seek medical help with personal and health information.\nIt includes the patient's name, age, and location.\nThe report describes the main sickness or problem, its duration, and onset.\nNormal conditions mentioned include mask of pregnancy and menstrual period.\nAbnormal conditions include mastitis (breast abscess) and menopause.\nThe report also mentions micro-organisms as part of general health information.\nIt's published by Hesperian Health Guides for health guides."

# Vision Model

### Loading Image From Link

In [None]:
import requests
import os
from PIL import Image   # pillow

def get_image(url,file_name,extension):
    os.makedirs("content",exist_ok=True)
    content = requests.get(url).content

    #save image to gile
    file_path = f"content/{file_name}.{extension}"
    with open(file_path,"wb") as f:
        f.write(content)

    image = Image.open(file_path)
    image.show()
    return image

image_url = "https://earthshotprize.org/wp-content/uploads/2023/05/bee-on-flower.jpg"
pil_image = get_image(image_url,"cat","png")

### Converting Image to text to feed in to Model

In [None]:
# Base64 is a way of encoding binary data (like images, files, or videos) into a text format using only ASCII characters (letters, numbers, and a few symbols)
# Base64 converts an image (or any file) into text.

import base64
from io import BytesIO

# model except image in base64 format
def extract_text_from_image(pil_image,format="png"):
    buffered = BytesIO()
    pil_image.save(buffered,format=format)
    img_base64 = base64.b64encode(buffered.getvalue()).decode()
    return f"data:image/{format.lower()};base64,{img_base64}"  # data : It tells the browser or application that the content which follows is not a normal URL, but actual data

pil = Image.open("images/2.png")
image = extract_text_from_image(pil)



In [None]:
message = HumanMessage(content=[{
    "type":"text",
    "text":"Explain the image based on  context only"},
    {
        "type":"image_url",
        "image_url":image
    }
    ])


img_chain = RunnablePassthrough() | model_image | parser | text_chain

In [57]:
img_chain.invoke([message])

'WRAP THE BABY WELL. LEA VE HIM NAKED.\nBUT IN HOT WEATHER (OR \nWHEN THE BABY HAS A FEVER)'

## Record Audio

In [None]:
import speech_recogination as sr
from pydub import AudioSegment
from io import BytesIO
import logging

SyntaxError: invalid syntax (2850803702.py, line 1)

In [None]:
logging.basicConfig(level=logging.info,format = "{ascitime}s-{levelname}s-{message}s")

In [None]:
def record_audio(save_path):
    recognizer = sr.Recognizer()
    try:
        with sr.microphone as source:
            logging.info("adjusting ambient noise....")
            recognizer.adjust_ambient_noise(duration=1,source)
            logging.info("Start speaking now....")

            audio = recognizer.listen(source=source,timeout=15)
            logging.info("Recording completed")

            wav_data = audio.get_wav_data()
            audio_segment = AudioSegment.from_file(BytesIO(wav_data),format="wav")

            audio_segment.export(save_path,format="mp3",bitrate="128k")
            logging.info(f"Audio File saved to {save_path}")
    except Exception as e:
        logging.error(f"Error : {e}")


save_path = os.makedirs("Audio/Input_audio/user_input.mp3",exist_ok=True)
record_audio(save_path=save_path)

## Voice to Text

In [None]:
client = Groq()
model_stt = "whisper-large-v3-turbo"
audio_file = open(save_path,"rb")

transcription = client.audio.transcription.creat(
    model = model_stt,
    file = audio_file,
    language = "en"
)

print(transcription.text)
