In [None]:
! pip install pypdf

In [21]:
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_community.llms import HuggingFaceEndpoint
from langchain_community.embeddings import HuggingFaceEmbeddings

from langchain.text_splitter import CharacterTextSplitter,RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS, Chroma
from langchain.chains import RetrievalQA, LLMChain

from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser
from langchain.prompts import ChatPromptTemplate

import pathlib
import os
import textwrap
from dotenv import load_dotenv
from IPython.display import display
from IPython.display import Markdown


In [5]:
def to_markdown(text):
    """Wrap ``text`` in a Markdown block quote for rich notebook display.

    Every bullet character ('•') is rewritten as an indented Markdown list
    marker ('  *'), after which each line -- blank lines included -- is
    prefixed with '> '. The result is returned as an IPython ``Markdown``
    object.
    """
    def _every_line(_line):
        # Quote blank lines too, so the block quote is not broken up.
        return True

    bulleted = text.replace('•', '  *')
    quoted = textwrap.indent(bulleted, '> ', predicate=_every_line)
    return Markdown(quoted)

In [6]:
# import documents
# Build a loader over the relative "data" directory and load its PDF files into `docs`.
# NOTE(review): presumably one Document per PDF page (retrieved documents further down
# carry 'page' and 'source' metadata) — confirm against the loader's documentation.
loader = PyPDFDirectoryLoader("data")
docs = loader.load()

In [7]:
# document handling: text splitting - embeddings - vector stores
# text splitting
# The splitter is created with the library's default settings (no chunk size or overlap is given).
# NOTE(review): confirm the default chunk size fits the input limit of the embedding model below.
text_splitter = RecursiveCharacterTextSplitter()
chunks = text_splitter.split_documents(docs)

# embeddings
# embeddings = GPT4AllEmbeddings()    # fast
# NOTE(review): GPT4AllEmbeddings is not among the names imported in the import cell, so the
# alternative above needs its own import before it can be re-enabled.
embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5")

# vector stores
# Embed every chunk and index it in a Chroma store; no persist directory is passed here.
# NOTE(review): presumably re-running this cell in the same kernel can add the chunks again — verify.
vectorstore = Chroma.from_documents(chunks, embeddings)

In [8]:
# retriever + example query

# example query: direct similarity search against the vector store
query = "What is a cause of heart disease?"
search = vectorstore.similarity_search(query)

# Only the last expression of a cell is rendered automatically, so the best hit
# must be shown with an explicit display() call; otherwise its Markdown object
# is built and silently discarded.
display(to_markdown(search[0].page_content))

# retriever: configured to return the 5 most similar chunks per query
retriever = vectorstore.as_retriever(search_kwargs={"k": 5})
# invoke() is the Runnable entry point (the same interface the RAG chain relies on)
# and replaces the deprecated get_relevant_documents().
retriever.invoke(query)

[Document(page_content='brain (includes stroke)\nRenal artery stenosis\nAortic aneurysm\nThere are also many cardiovascular diseases that involve the heart.\nCardiomyopathy – diseases of cardiac muscle\nHypertensive heart disease – diseases of the heart secondary to high blood\npressure or hypertension\nHeart failure - a clinical syndrome caused by the inability of the heart to supply\nsufficient blood to the tissues to meet their metabolic requirementsTypes', metadata={'page': 0, 'source': 'data\\Cardiovascular_disease.pdf'}),
 Document(page_content="Cardiovascular disease\nMicrograph of a heart with fibrosis\n(yellow) and amyloidosis (brown).\nMovat's stain.\nSpecialty Cardiology\nSymptoms Chest pain,\nshortness of breath,\nfatigue, loss of\nconsciousness\nComplicationsHeart failure, heart\nattack, stroke,\naneurysm,\nperipheral artery\ndisease, sudden\ncardiac arrest.[1]\nUsual onset Older adults[2]\nTypes Coronary artery\ndiseases, stroke,\nheart failure,\nhypertensive heart\ndisea

In [67]:
# LLM
# Load variables from a local .env file (if one exists) so the cell does not depend on
# state left over from an earlier kernel session; load_dotenv is imported at the top of
# the notebook but was never called. It does not override variables that are already set.
# NOTE(review): presumably the Hugging Face token is read from the environment
# (HUGGINGFACEHUB_API_TOKEN) — confirm the variable name used in your .env file.
load_dotenv()

# Hosted inference endpoint for the instruction-tuned Mistral model.
llm = HuggingFaceEndpoint(
    repo_id="mistralai/Mistral-7B-Instruct-v0.2")

# mistralai/Mistral-7B-Instruct-v0.2    very nice

Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to C:\Users\moolhuijsenns\.cache\huggingface\token
Login successful


In [68]:
# RAG chain

# template: {context} receives the retrieved text, {question} the user's query
template = """
You are an AI assistant that follows instruction extremely well.
Please be truthful and give direct answers based on the context:
{context}


Question: {question}

"""
# prompt
prompt = ChatPromptTemplate.from_template(template)


def format_docs(documents):
    """Join the text of the retrieved documents into one context string.

    Args:
        documents: iterable of objects exposing a ``page_content`` string
            (the Documents returned by the retriever).

    Returns:
        The page contents separated by blank lines.
    """
    return "\n\n".join(document.page_content for document in documents)


# chain
# The retriever output is piped through format_docs so the prompt receives plain text.
# Feeding the retriever in directly would interpolate the repr of a list of Document
# objects (escaped newlines, metadata dicts) into {context}.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)


In [69]:
# Run the RAG chain on a sample question and keep the answer in `response`.
response = rag_chain.invoke("what are the risk factors for heart disease?")
# Last expression of the cell, so the block-quoted Markdown answer is rendered as the cell output.
to_markdown(response)

>  Heart disease, also known as cardiovascular disease, has several risk factors. Here are the most common ones:
> 
>  1. High Blood Pressure: If your blood pressure is consistently high, it puts extra strain on your heart and blood vessels.
> 
>  2. High Cholesterol: Cholesterol builds up in and around your arteries, narrowing them and making it harder for blood to flow through.
> 
>  3. Smoking: Smoking damages the lining of your arteries, making them less flexible and more prone to blockages.
> 
>  4. Diabetes: Diabetes damages the blood vessels and the smaller blood vessels that a heart needs.
> 
>  5. Obesity and Overweight: Having too much body weight, especially if you carry it around your waist, can put extra stress on your heart.
> 
>  6. Physical Inactivity: Lack of regular physical activity can increase the risk of heart disease and stroke.
> 
>  7. Unhealthy Diet: A diet high in saturated fat, trans fat, sodium, and added sugars can contribute to heart disease.
> 
>  8. Family History: If heart disease runs in your family, you may be more likely to develop it.
> 
>  9. Age: The older you get, the higher your risk of heart disease.
> 
>  10. Stress: Chronic stress can damage your arteries and worsen other heart disease risk factors such as obesity and smoking.
> 
>  11. Excessive Alcohol: Drinking too much alcohol can lead to high blood pressure, obesity, and other conditions that increase the risk of heart disease.
> 
>  12. Sedentary Lifestyle: Spending most of your time sitting or lying down can increase your risk of heart disease.
> 
>  Remember, it's important to discuss any concerns or potential risk factors with your healthcare provider.