In [1]:
# Show the kernel's current working directory before it is changed.
%pwd

'/workspaces/codespaces-blank/research'

In [2]:
import os
# Step up from the notebook folder (research/) to the project root so that
# relative paths such as 'Data/' resolve. The guard keeps this cell idempotent:
# an unconditional chdir("../") climbs one more level on every re-run.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [3]:
# Confirm the working directory after the chdir above.
%pwd

'/workspaces/codespaces-blank'

In [4]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [5]:
def load_pdf_file(data):
    """Load the PDF files found in a directory.

    Args:
        data: Path of the directory to scan. Files matching the ``*.pdf``
            glob are read with ``PyPDFLoader``.

    Returns:
        Whatever ``DirectoryLoader.load()`` returns for those files.
    """
    pdf_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()

In [6]:
# Read the PDFs under 'Data/' (the path is relative to the current working directory).
extracted_data=load_pdf_file('Data/')

In [7]:
# Number of loaded documents.
len(extracted_data)

637

In [8]:
# Print one loaded document (index 48) to inspect the extracted text.
print(extracted_data[48])

page_content='Description
Actinomycosis is a relatively rare infection occurring
in one out of 300,000(1/300,000) people per year. It is
characterized by the presence of a lump or mass that often
forms, draining sinus tracts to the skin surface. Fifty per-
cent of actinomycosis cases are of the head and neck region
(also called “lumpy jaw” and “cervicofacial actinomyco-
sis”), 15% are in the chest, 20% are in the abdomen, and
the rest are in the pelvis, heart, and brain. Men are three
times more likely to develop actinomycosis than women.
Causes and symptoms
Actinomycosis is usually caused by the bacterium
Actinomyces israelii. This bacterium is normally present
in the mouth but can cause disease if it enters tissues fol-
lowing an injury. Actinomyces israelii is an anaerobic
bacterium which means it dislikes oxygen but grows very
well in deep tissues where oxygen levels are low. Tooth
extraction , tooth disease, root canal treatment ,j a w
surgery, or poor dental hygiene can allow Act

In [9]:
# Keep only the first 50 loaded documents for the steps that follow.
# NOTE(review): presumably done to limit embedding/indexing cost — confirm.
extracted_data_new=extracted_data[0:50]

In [10]:
def text_split(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into smaller chunks.

    Args:
        extracted_data: Documents to split (e.g. the output of
            ``load_pdf_file``).
        chunk_size: ``chunk_size`` handed to ``RecursiveCharacterTextSplitter``.
            Defaults to 500, the value that used to be hard-coded.
        chunk_overlap: ``chunk_overlap`` handed to the splitter. Defaults to
            20, the value that used to be hard-coded.

    Returns:
        The result of the splitter's ``split_documents`` call.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(extracted_data)

In [11]:
# Chunk the reduced document set and report how many chunks were produced.
text_chunks=text_split(extracted_data_new)
print("Length of Text Chunks: ", len(text_chunks))

Length of Text Chunks:  394


In [12]:
from dotenv import load_dotenv

# Load variables from a .env file so the API keys read below are available.
load_dotenv()

True

In [13]:
# Fail early with a clear message when the key is missing. Assigning the None
# returned by os.getenv() to os.environ raises an opaque
# "TypeError: str expected, not NoneType".
mistral_api_key = os.getenv("MISTRAL_API_KEY")
if not mistral_api_key:
    raise RuntimeError(
        "MISTRAL_API_KEY is not set; add it to your .env file or environment."
    )
os.environ["MISTRAL_API_KEY"] = mistral_api_key

In [14]:
from langchain_mistralai import MistralAIEmbeddings

# Embedding client for the "mistral-embed" model.
embeddings = MistralAIEmbeddings(model="mistral-embed")

  from .autonotebook import tqdm as notebook_tqdm


In [15]:
# Embed a short sample query and print the first 100 characters of the
# vector's string form as a quick sanity check.
single_vector = embeddings.embed_query("hi")
print(str(single_vector)[:100])

[-0.02178955078125, -0.02880859375, 0.018035888671875, 0.006084442138671875, 0.039947509765625, 0.05


In [16]:
# Length of one embedding vector; used as the Pinecone index dimension.
vector_length=len(single_vector)

In [17]:
from pinecone import Pinecone, ServerlessSpec

# Pinecone client; the API key is read from the PINECONE_API_KEY environment variable.
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

In [18]:
index_name = "medical"

# Creating an index that already exists fails, which breaks
# "Restart Kernel -> Run All". Create it only when it is missing.
if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        dimension=vector_length,  # must equal the embedding vector length
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

# Display the index description whether it was just created or already existed.
pc.describe_index(index_name)

{
    "name": "medical",
    "metric": "cosine",
    "host": "medical-2rz7ocg.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 1024,
    "deletion_protection": "disabled",
    "tags": null
}

In [19]:
from langchain_pinecone import PineconeVectorStore

# Embed each chunk and store it in the Pinecone index named by index_name.
# NOTE(review): re-running this cell presumably inserts the same chunks
# again — confirm before executing it more than once.
docsearch = PineconeVectorStore.from_documents(
    embedding=embeddings,
    index_name=index_name,
    documents=text_chunks,
)

In [20]:
# Similarity-search retriever configured with k=3 results per query.
retriever = docsearch.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},
)

In [21]:
# Smoke-test retrieval with a sample question and print the returned documents.
retrieved_docs=retriever.invoke("What is acne?")

print(retrieved_docs)

[Document(id='691e51ff-3a65-4c58-b871-ba6b4110612f', metadata={'creationdate': '2004-12-18T17:00:02-05:00', 'creator': 'PyPDF', 'moddate': '2004-12-18T16:15:31-06:00', 'page': 37.0, 'page_label': '38', 'producer': 'PDFlib+PDI 5.0.0 (SunOS)', 'source': 'Data/Medical_book.pdf', 'total_pages': 637.0}, page_content='Acidosis see Respiratory acidosis; Renal\ntubular acidosis; Metabolic acidosis\nAcne\nDefinition\nAcne is a common skin disease characterized by\npimples on the face, chest, and back. It occurs when the\npores of the skin become clogged with oil, dead skin\ncells, and bacteria.\nDescription\nAcne vulgaris, the medical term for common acne, is\nthe most common skin disease. It affects nearly 17 million\npeople in the United States. While acne can arise at any'), Document(id='cb52e9fd-2fde-4bc2-90de-790425bfd00d', metadata={'creationdate': '2004-12-18T17:00:02-05:00', 'creator': 'PyPDF', 'moddate': '2004-12-18T16:15:31-06:00', 'page': 38.0, 'page_label': '39', 'producer': 'PDFlib

In [22]:
from langchain_mistralai import ChatMistralAI

# Chat model used to generate the answers.
llm = ChatMistralAI(model="mistral-small-latest", temperature=0.6, max_retries=3)

In [None]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

# System message template; the {context} placeholder is where the retrieved
# documents are inserted when the chain runs.
system_prompt = """
    Use the following pieces of retrieved content to assist in question answering task. If you don't know the answer, say that you don't know. Answer concisely.
    \n\n
    {context}
    """

# Chat prompt: the system instructions followed by the user's question ({input}).
prompt = ChatPromptTemplate.from_messages(
    [("system", system_prompt), ("human", "{input}")]
)

In [24]:
# Build the RAG pipeline: a documents-into-prompt chain around the LLM,
# wrapped by a retrieval chain that takes the retriever as its first argument.
question_answer_chain=create_stuff_documents_chain(llm, prompt)
rag_chain=create_retrieval_chain(retriever,question_answer_chain)

In [25]:
# Ask a sample question and print the 'answer' field of the chain's result.
response=rag_chain.invoke({"input":"What is Acoustic Neuroma?"})
print(response['answer'])

An acoustic neuroma is a benign tumor involving cells of the myelin sheath that surrounds the vestibulocochlear nerve (eighth cranial nerve).


In [27]:
# Ask a question that is presumably outside the indexed material, to check
# that the chain declines to answer instead of inventing one.
response2=rag_chain.invoke({"input":"What is llm?"})
print(response2['answer'])

I don't know.
