In [3]:
from langchain import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Pinecone
import pinecone
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.llms import CTransformers

  from tqdm.autonotebook import tqdm


In [4]:
from dotenv import load_dotenv
import os

In [5]:
load_dotenv()

True

In [6]:
PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY')
PINECONE_API_ENV = os.environ.get('PINECONE_API_ENV')

In [41]:
PINECONE_API_ENV

'us-east-1'

## Load pdf

In [42]:
def load_pdf(data):
    loader = DirectoryLoader(
        data,
        glob='*.pdf',
        loader_cls=PyPDFLoader
    )
    documents = loader.load()
    return documents

In [43]:
extracted_data = load_pdf('../data/')

In [44]:
# extracted_data

In [45]:
len(extracted_data[1].page_content)

46

## Create text chunks

In [46]:
# Create text chunks
def text_split(extracted_data):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
    text_chunks = text_splitter.split_documents(extracted_data)
    return text_chunks

In [47]:
text_chunks = text_split(extracted_data)
print("length of my chunk: ", len(text_chunks))

length of my chunk:  7486


In [48]:
# text_chunks

In [49]:
len(text_chunks[7].page_content)

136

## Download embedding model

In [10]:
def download_hugging_face_embeddings():
    embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
    return embeddings

In [11]:
embeddings = download_hugging_face_embeddings()



In [12]:
embeddings

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

In [54]:
query_result = embeddings.embed_query('Hello world')
print('Length', len(query_result))

Length 384


In [60]:
# query_result

## Initializaing the Pinecone

In [7]:
pinecone.Pinecone(api_key=PINECONE_API_KEY)

<pinecone.control.pinecone.Pinecone at 0x7f010fb91760>

In [8]:
index_name = 'medical-chatbot'

In [69]:
# Creating embeddings from each of the text chunks & storing
docsearch = Pinecone.from_texts([t.page_content for t in text_chunks], embeddings, index_name=index_name)

In [13]:
vector_db = Pinecone.from_existing_index(index_name, embeddings)

In [72]:
vector_db

<langchain_community.vectorstores.pinecone.Pinecone at 0x7fede83f4530>

In [73]:
docsearch

<langchain_community.vectorstores.pinecone.Pinecone at 0x7feeb5f41970>

In [75]:
query = 'What are Allergies'

docs = vector_db.similarity_search(query, k=3)
print(docs)

[Document(page_content='by harmless, everyday substancessuch as pollen, dust, and animal danders. When thisoccurs, an allergy develops against the offending sub-stance (an allergen.)'), Document(page_content='Richard Robinson\nAllergies\nDefinition\nAllergies are abnormal reactions of the immune sys-\ntem that occur in response to otherwise harmless sub-stances.\nGALE ENCYCLOPEDIA OF MEDICINE 2 114AllergiesGEM - 0001 to 0432 - A  10/22/03 1:42 PM  Page 114'), Document(page_content='Mygund and R. M. Naclerio. Philadelphia: W. B. Saun-ders Co., 1993.\nLawlor, G. J. Jr., T. J. Fischer, and D. C. Adelman. Manual of\nAllergy and Immunology. Boston: Little, Brown and Co.,\n1995.\nNovick, N. L. You Can Do Something About Your Allergies.\nNew York: Macmillan, 1994.\nWeil, A. Natural Health, Natural Medicine: A Comprehensive\nManual for Wellness and Self-Care. New York: Houghton\nMifflin, 1995.\nRichard Robinson\nAllergies\nDefinition\nAllergies are abnormal reactions of the immune sys-')]


## LLM

In [14]:
prompt_template="""
Use the following pieces of information to answer the user's question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

Context: {context}
Question: {question}

Only return the helpful answer below and nothing else.
Helpful answer:
"""

In [15]:
PROMPT = PromptTemplate(template=prompt_template, input_variables=['context', 'question'])
chain_type_kwargs = {'prompt': PROMPT}

In [16]:
llm = CTransformers(model='../model/llama-2-7b-chat.ggmlv3.q4_0.bin', 
                    model_type='llama',
                    config={'max_new_tokens': 512,
                            'temperature': 0.8})

In [17]:
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type='stuff',
    retriever=vector_db.as_retriever(search_kwargs={'k': 2}),
    return_source_documents=True,
    chain_type_kwargs=chain_type_kwargs
)

In [18]:
while True:
    user_input = input(f"Input Prompt: ")
    result = qa({'query': user_input})
    print('Response: ', result['result'])

  warn_deprecated(
