In [1]:
import langchain
from langchain.document_loaders import TextLoader

In [1]:
OPENAI_API_KEY="OPENAI API KEY"

### Loading the document

In [3]:
loader = TextLoader('/home/manivarsh/Generative_AI/wellskyrag/MedicalReport.txt')

In [4]:
documents = loader.load()

In [5]:
documents

[Document(page_content='## Page 1\n\n**Section: Client Information**\nClient: MOUSE, MINNIE\nAddress: 21234 MAIN ST ANY CITY, USA 12345\nMR No:\nLegacy MR No:\nAdmission Date: 1/10/2022\n\n**Section: Insurance Information**\nInsured ID: 222222\nPrimary Payor: COMMERCIAL\n\n**Section: Physician Information**\nPhysician: JOHN DOE, MD\nAddress: 14444 MAIN ST ANY CITY USA, 12345\nPhone: (123)456-7890\n\n**Section: Diagnoses**\n| Order | Code | Description | Onset/Exac. | O/E Date | Type |\n|---|---|---|---|---|---|\n| 1 | 195.1 | ORTHOSTATIC HYPOTENSION | ONSET | 01/04/2022 | DIAGNOSIS |\n| 2 | R65.21 | SEVERE SEPSIS WITH SEPTIC SHOCK | EXACERBATION | 12/29/2021 | DIAGNOSIS |\n\n**Section: Allergies**\n* COW MILK\n* MIRIPIN\n*denotes Non-Visit Ql Reporting Collection\n\n**Section: Medications**\n| Start Date / End Date | Classification | Medication | Dose | Amount | Frequency | Route | Purpose | Directions for use | Side Effects/ Interactions | New/Changed | Financial Resp. | Administered 

### Chunking the doc using Markdown splittrer

In [6]:
from langchain.text_splitter import MarkdownHeaderTextSplitter

In [7]:
headers_to_split_on = [
    ("**Section: Client Information**", "Header 1"),
    ("**Section: Insurance Information**", "Header 2"),
    ("**Section: Diagnoses**", "Header 3"),
    ("**Section: Allergies**", "Header 4"),
    ("**Section: Medications**", "Header 5")
]

In [8]:
markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)

In [9]:
md_header_splits = markdown_splitter.split_text(documents[0].page_content)

In [10]:
md_header_splits

[Document(page_content='## Page 1'),
 Document(page_content='Client: MOUSE, MINNIE\nAddress: 21234 MAIN ST ANY CITY, USA 12345\nMR No:\nLegacy MR No:\nAdmission Date: 1/10/2022', metadata={'Header 1': ''}),
 Document(page_content='Insured ID: 222222\nPrimary Payor: COMMERCIAL  \n**Section: Physician Information**\nPhysician: JOHN DOE, MD\nAddress: 14444 MAIN ST ANY CITY USA, 12345\nPhone: (123)456-7890', metadata={'Header 2': ''}),
 Document(page_content='| Order | Code | Description | Onset/Exac. | O/E Date | Type |\n|---|---|---|---|---|---|\n| 1 | 195.1 | ORTHOSTATIC HYPOTENSION | ONSET | 01/04/2022 | DIAGNOSIS |\n| 2 | R65.21 | SEVERE SEPSIS WITH SEPTIC SHOCK | EXACERBATION | 12/29/2021 | DIAGNOSIS |', metadata={'Header 3': ''}),
 Document(page_content='* COW MILK\n* MIRIPIN\n*denotes Non-Visit Ql Reporting Collection', metadata={'Header 4': ''}),
 Document(page_content='| Start Date / End Date | Classification | Medication | Dose | Amount | Frequency | Route | Purpose | Directions

### Chunking with CharacterTestSplitter

In [11]:
from langchain.text_splitter import CharacterTextSplitter
text_splitter = CharacterTextSplitter(chunk_size=30, chunk_overlap=10)
chunks = text_splitter.split_documents(documents)

Created a chunk of size 143, which is longer than the specified 30
Created a chunk of size 79, which is longer than the specified 30
Created a chunk of size 122, which is longer than the specified 30
Created a chunk of size 273, which is longer than the specified 30
Created a chunk of size 86, which is longer than the specified 30


### Embeddings

In [12]:
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Weaviate
import weaviate
from weaviate.embedded import EmbeddedOptions

In [13]:
client = weaviate.Client(
  embedded_options = EmbeddedOptions()
)

vectorstore = Weaviate.from_documents(
    client = client,    
    documents = chunks,
    embedding = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY),
    by_text = False
)

            Consider upgrading to the new and improved v4 client instead!
            See here for usage: https://weaviate.io/developers/weaviate/client-libraries/python
            
{"action":"startup","default_vectorizer_module":"none","level":"info","msg":"the default vectorizer modules is set to \"none\", as a result all new schema classes without an explicit vectorizer setting, will use this vectorizer","time":"2024-05-06T22:42:47+05:30"}
{"action":"startup","auto_schema_enabled":true,"level":"info","msg":"auto schema enabled setting is set to \"true\"","time":"2024-05-06T22:42:47+05:30"}


Started /home/manivarsh/.cache/weaviate-embedded: process ID 52038


{"level":"info","msg":"No resource limits set, weaviate will use all available memory and CPU. To limit resources, set LIMIT_RESOURCES=true","time":"2024-05-06T22:42:47+05:30"}
{"action":"grpc_startup","level":"info","msg":"grpc server listening at [::]:50060","time":"2024-05-06T22:42:47+05:30"}
{"action":"restapi_management","level":"info","msg":"Serving weaviate at http://127.0.0.1:8079","time":"2024-05-06T22:42:47+05:30"}
{"level":"info","msg":"Created shard langchain_7301eafe2316495eac3011aeb4fb41b2_vNP9kRDtXfyg in 1.08215ms","time":"2024-05-06T22:42:47+05:30"}
{"action":"hnsw_vector_cache_prefill","count":1000,"index_id":"main","level":"info","limit":1000000000000,"msg":"prefilled vector cache","time":"2024-05-06T22:42:47+05:30","took":76461}
{"level":"info","msg":"Completed loading shard langchain_b1a294838c2a40c6997468572f2df2ca_Z0O5XYTDBGDJ in 8.036739ms","time":"2024-05-06T22:42:48+05:30"}
{"action":"hnsw_vector_cache_prefill","count":3000,"index_id":"main","level":"info","lim

In [14]:
retriever = vectorstore.as_retriever()

In [15]:
from langchain.prompts import ChatPromptTemplate


In [16]:
template = """You are a medical assitant for question-answering tasks. 
Use the following piece of retrieved context to answer the question.
If you don't know the answer, just say that you don't know.
Respond i am not aware, idf the information in not in priovded context.
Use three sentence maximum and keep the answer concise.
Question: {question}
Context: {context}
Answer:
"""

In [17]:
prompt = ChatPromptTemplate.from_template(template)

In [18]:
from langchain.chat_models import ChatOpenAI
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser

In [19]:
llm = ChatOpenAI(openai_api_key=OPENAI_API_KEY, model_name="gpt-3.5-turbo", temperature=0)

rag_chain = (
    {"context": retriever,  "question": RunnablePassthrough()} 
    | prompt 
    | llm
    | StrOutputParser() 
)

In [37]:
query = "I spateint taking Biotin?"
rag_chain.invoke(query)

'The patient is taking Biotin 1 mg capsule daily.'

In [38]:
retrived_doc = retriever.invoke(query)

In [39]:
retrived_doc[:2]

[Document(page_content='**Section: Medications**\n| Start Date / End Date | Classification | Medication | Dose | Amount | Frequency | Route | Purpose | Directions for use | Side Effects/ Interactions | New/Changed | Financial Resp. | Administered | Agency | PRN | Entered By | Date |\n|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|\n| 1/10/2022 | GENITOURINARY THERAPY | BETHANECHOL CHLORIDE 10 MG TABLET | 1 tablet | | 3 TIMES DAILY | ORAL | N | N | N | OTH | N | N | | N | NANCY NURSE, RN | 01/10/2022 |\n| 1/10/2022 | ELECTROLYTE BALANCE- NUTRITIONAL PRODUCTS | BIOTIN 1 MG CAPSULE | 1 capsule | | DAILY | ORAL | N | N | N | OTH | N | N | | N | NANCY NURSE, RN | 01/10/2022 |\n| 1/10/2022 | CARDIOVASCULAR THERAPY AGENTS | IRBESARTAN 150 MG TABLET | 0.5 tablet | | DAILY | ORAL | N | N | N | OTH | N | N | | N | NANCY NURSE, RN | 01/10/2022 |\n| 1/10/2022 | CARDIOVASCULAR THERAPY AGENTS | LABETALOL 100 MG TABLET | 1 tablet | | 3 TIMES DAILY | ORAL | N | N | N | OTH | N | 