In [16]:
import hashlib
import os
import warnings

import pinecone
from langchain import PromptTemplate
from langchain.chains import RetrievalQA #from chat
from langchain.document_loaders import PyPDFLoader,DirectoryLoader  #loads pdf
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import CTransformers #for using quantized model
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter #convert intercorpus to chunks
from langchain.vectorstores import Pinecone



In [5]:
# Pinecone credentials are read from the process environment so that no secret
# is stored in the notebook (or in its version-control history).
# NOTE(review): the key that was previously committed in this cell must be
# treated as leaked -- revoke/rotate it in the Pinecone console.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY", "")
# The environment name is not a secret; keep the previous value as the default.
PINECONE_API_ENV = os.environ.get("PINECONE_API_ENV", "gcp-starter")

if not PINECONE_API_KEY:
    # Surface the misconfiguration here, next to the config, instead of as an
    # opaque authentication failure when Pinecone is first contacted.
    warnings.warn(
        "PINECONE_API_KEY is not set in the environment; "
        "Pinecone calls later in this notebook will fail to authenticate."
    )

In [6]:
# Read the PDF files of a directory into documents
def load_pdf(data):
    """Load the PDFs found in a directory.

    Args:
        data: Directory path handed to ``DirectoryLoader``; only entries
            matching the ``*.pdf`` glob are loaded, each via ``PyPDFLoader``.

    Returns:
        The result of ``DirectoryLoader.load()`` (presumably a list of
        LangChain ``Document`` objects -- confirm against the installed version).
    """
    pdf_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()

In [7]:
# Load every PDF under the relative "data/" directory.
extracted_data = load_pdf("data/")

In [8]:
#Create text chunks from corpus
def text_split(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into overlapping text chunks.

    Args:
        extracted_data: Documents to split; passed unchanged to
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: Forwarded to the splitter as ``chunk_size``. Defaults to
            500, the value that used to be hard-coded here.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.
            Defaults to 20, the value that used to be hard-coded here.

    Returns:
        Whatever ``split_documents`` returns for ``extracted_data``
        (presumably a list of chunk ``Document`` objects).
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(extracted_data)

In [9]:
# Chunk the loaded documents and report how many chunks were produced.
text_chunks = text_split(extracted_data)
print("length of my chunk: ", len(text_chunks))

length of my chunk:  2407


In [10]:
# Build the embedding model used to vectorise text
def download_hugging_face_embedding():
    """Create the HuggingFace embedding wrapper used by this notebook.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with the
        ``sentence-transformers/all-MiniLM-L6-v2`` model name. (Presumably the
        model weights are fetched on first use if not cached -- not shown here.)
    """
    model_id = "sentence-transformers/all-MiniLM-L6-v2"
    return HuggingFaceEmbeddings(model_name=model_id)

In [11]:
# Instantiate the embedding model once; later cells reuse this object.
embeddings = download_hugging_face_embedding()

In [12]:
# Bare expression: shows the embedding object's repr as the cell output.
embeddings  

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={})

In [13]:
# Sanity check: embed a sample query and print the length of the returned vector.
query_result = embeddings.embed_query("hello")
print(len(query_result))

384


In [17]:
#Initializing the Pinecone
pinecone.init(api_key=PINECONE_API_KEY,
              environment=PINECONE_API_ENV)


index_name="medchatbot"


# Creating embeddings for each of the text chunks & storing them in the index.
# Each vector id is derived from the chunk's text, so re-running this cell
# overwrites the same vectors instead of inserting a second copy of every
# chunk under newly generated ids. Chunks with identical text share one id;
# nothing is lost because only the text (no metadata) is stored here.
chunk_texts = [t.page_content for t in text_chunks]
chunk_ids = [hashlib.sha256(text.encode("utf-8")).hexdigest() for text in chunk_texts]
docsearch = Pinecone.from_texts(chunk_texts, embeddings, ids=chunk_ids, index_name=index_name)

In [18]:
 # Attach to the already-populated index and run a sample similarity query.
 docsearch = Pinecone.from_existing_index(index_name, embeddings)
 query = "what is asthma"
 # Fetch the 3 closest chunks for the query and print them.
 docs = docsearch.similarity_search(query, k=3)
 print(docs)

[Document(page_content='■Pearl\nAll that wheezes is not asthma; remember conditions like heart failureand vocal cord dysfunction in patients with “steroid-resistant” asthma.\nReferenceNi Chroinin M, Greenstone I, Lasserson TJ, Ducharme FM. Addition of inhaled\nlong-acting beta2-agonists to inhaled steroids as ﬁrst line therapy for persist-ent asthma in steroid-naive adults and children. Cochrane Database Syst Rev2009;4:CD005307. [PMID: 19821344]', metadata={}), Document(page_content='•Treatment of exacerbations: Oxygen, inhaled bronchodilators ( β2-\nagonists > anticholinergics), systemic corticosteroids (5 days)\n•Leukotriene modiﬁers (eg, montelukast) may provide a secondoption for long-term therapy in mild to moderate disease\n•Nedocromil/cromolyn is effective for exercise-induced asthma\n•For difﬁcult-to-control asthma, consider exacerbating factors suchas gastroesophageal reﬂux disease and chronic sinusitis\n■Pearl', metadata={}), Document(page_content='Chapter 2 Pulmonary Disease

In [19]:
# Prompt text for the question-answering step. {context} and {question} are
# template placeholders (presumably filled through PromptTemplate with the
# retrieved chunks and the user's query -- confirm in the cell that consumes this).
prompt_template="""
Use the following pieces of information to answer the user's question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

Context: {context}
Question: {question}

Only return the helpful answer below and nothing else.
Helpful answer:
"""