In [54]:
import os
import warnings

import pinecone
import torch
from langchain import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import CTransformers
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_pinecone import PineconeVectorStore
from pinecone import Pinecone, ServerlessSpec
from transformers import AutoModel, AutoTokenizer

warnings.filterwarnings('ignore')


In [29]:
# Create the Pinecone client. The API key is read from the PINECONE_API_KEY
# environment variable so that no secret (or empty placeholder) is stored in
# the notebook; a missing variable fails immediately with a KeyError.
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])

In [30]:
# Create the serverless index only when it does not exist yet, so this cell
# can be re-run (e.g. Restart & Run All) without failing on a duplicate name.
if "mchatbot" not in pc.list_indexes().names():
    pc.create_index(
        name="mchatbot",
        dimension=384,  # must equal the embedding length (all-MiniLM-L6-v2 -> 384)
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1",
        ),
    )

In [31]:
def load_data(data):
    """Load all PDF files matching ``*.pdf`` in the given directory.

    Args:
        data: Path of the directory handed to ``DirectoryLoader``.

    Returns:
        The result of ``DirectoryLoader.load()``, with each matched file
        parsed by ``PyPDFLoader``.
    """
    return DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader).load()

In [32]:
# Read the PDFs from the local data/ directory; this is the raw corpus for the pipeline.
extracted_data = load_data("data/")

In [33]:
def text_splitter(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into overlapping chunks.

    Args:
        extracted_data: Documents to split (passed to ``split_documents``).
        chunk_size: Maximum chunk size given to
            ``RecursiveCharacterTextSplitter``. Defaults to 500.
        chunk_overlap: Overlap between consecutive chunks. Defaults to 20.

    Returns:
        The chunks produced by ``RecursiveCharacterTextSplitter.split_documents``.
    """
    # Named `splitter` (not `text_splitter`) so the local does not shadow
    # this function's own name.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(extracted_data)

In [34]:
# Split the loaded documents into chunks and report how many were produced.
text_chunk = text_splitter(extracted_data)
print("Length of chunks is: ", len(text_chunk))

Length of chunks is:  7020


In [35]:
def download_hugging_face_embedding():
    """Return a ``HuggingFaceEmbeddings`` wrapper for all-MiniLM-L6-v2.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with the
        ``sentence-transformers/all-MiniLM-L6-v2`` model name.
    """
    return HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )

In [36]:
# Instantiate the embedding wrapper once; it is inspected and smoke-tested in the following cells.
embedding = download_hugging_face_embedding()

In [37]:
# Display the wrapper's repr to inspect the underlying model configuration.
embedding

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

In [38]:
# Smoke test: embed a short query and print the vector length, which has to
# match the dimension the Pinecone index was created with (384).
query_result = embedding.embed_query("Hello world")
print("Length of embedding: ",len(query_result))

Length of embedding:  384


In [39]:
# Re-create the Pinecone client with the key taken from the environment.
# SECURITY: a literal API key used to be committed on this line; treat that
# key as compromised and rotate it in the Pinecone console.
# NOTE(review): `region` is kept from the original call — confirm the client
# actually uses it (the index's region is set via ServerlessSpec at creation).
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"], region="us-east-1")

In [40]:
# Load the raw Hugging Face tokenizer/model used to embed the document chunks,
# and open a handle on the Pinecone index that will store them.
model_name = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
index = pc.Index("mchatbot")


def get_embedding(text):
    """Embed ``text`` the same way the query-side sentence-transformers model does.

    Queries are embedded through ``HuggingFaceEmbeddings``, whose repr shows
    ``max_seq_length=256``, mean-token pooling and a ``Normalize()`` stage.
    This function reproduces those three steps so stored vectors and query
    vectors come from the same pipeline.

    Args:
        text: String to embed (a list of strings yields one vector per item).

    Returns:
        The embedding as a list of floats (a list of such lists for batch input).
    """
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=256,  # same cap as the sentence-transformers pipeline
    )
    with torch.no_grad():
        outputs = model(**inputs)

    token_states = outputs.last_hidden_state
    # Average only over real tokens: padded positions have attention_mask == 0
    # and must not dilute the mean.
    mask = inputs["attention_mask"].unsqueeze(-1).to(token_states.dtype)
    pooled = (token_states * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9)
    # L2-normalise, mirroring the Normalize() stage; the index uses the cosine
    # metric, so this does not change similarity rankings.
    pooled = torch.nn.functional.normalize(pooled, p=2, dim=1)
    return pooled.squeeze().tolist()


# Create embeddings for each chunk
embeddings = [get_embedding(chunk.page_content) for chunk in text_chunk]

# Prepare data for upserting into Pinecone. The chunk text is stored under the
# "text" metadata key so it can be read back at query time.
pinecone_data = [
    {
        "id": str(i),
        "values": vector,
        "metadata": {"text": chunk.page_content},
    }
    for i, (chunk, vector) in enumerate(zip(text_chunk, embeddings))
]

In [41]:
# Send the vectors to Pinecone in fixed-size slices rather than one huge
# request; lower batch_size if a request is rejected for being too large.
batch_size = 100
for start in range(0, len(pinecone_data), batch_size):
    index.upsert(vectors=pinecone_data[start:start + batch_size])

In [47]:
# Wrap the populated index in a LangChain vector store; queries are embedded
# with the model identified by `model_name`.
embedding_model = HuggingFaceEmbeddings(model_name=model_name)
docsearch = PineconeVectorStore.from_existing_index(
    index_name="mchatbot",
    embedding=embedding_model,
)

# Retrieval sanity check: fetch the three closest chunks for a sample question.
query_text = "What is acne"
docs = docsearch.similarity_search(query_text, k=3)

print(docs)

[Document(page_content='Acidosis seeRespiratory acidosis; Renal\ntubular acidosis; Metabolic acidosis\nAcne\nDefinition\nAcne is a common skin disease characterized by\npimples on the face, chest, and back. It occurs when thepores of the skin become clogged with oil, dead skincells, and bacteria.\nDescription\nAcne vulgaris, the medical term for common acne, is'), Document(page_content='GALE ENCYCLOPEDIA OF MEDICINE 2 25Acne\nAcne vulgaris affecting a woman’s face. Acne is the general\nname given to a skin disorder in which the sebaceousglands become inflamed. (Photograph by Biophoto Associ-\nates, Photo Researchers, Inc. Reproduced by permission.)GEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 25'), Document(page_content='ent purposes. For example, lotions, soaps, gels, andcreams containing benzoyl peroxide or tretinoin may beused to clear up mild to moderately severe acne.Isotretinoin (Accutane) is prescribed only for verysevere, disfiguring acne.\nAcne is a skin condition that occurs

In [48]:
# Show the text of the second retrieved chunk on its own for easier reading.
print(docs[1].page_content)

GALE ENCYCLOPEDIA OF MEDICINE 2 25Acne
Acne vulgaris affecting a woman’s face. Acne is the general
name given to a skin disorder in which the sebaceousglands become inflamed. (Photograph by Biophoto Associ-
ates, Photo Researchers, Inc. Reproduced by permission.)GEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 25


In [49]:
# Prompt text with two placeholders, {context} and {question}. It tells the
# model to answer from the supplied context and to say it does not know
# rather than invent an answer.
prompt_template="""
Use the following pieces of information to answer the user's question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

Context: {context}
Question: {question}

Only return the helpful answer below and nothing else.
Helpful answer:
"""

In [50]:
# Bind the template text to its two input variables, then package it under
# the "prompt" key for use as chain_type_kwargs.
PROMPT = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)
chain_type_kwargs = {"prompt": PROMPT}

In [51]:
# Llama-2 7B chat GGML weights loaded from a local file through CTransformers.
llm = CTransformers(
    model="model/llama-2-7b-chat.ggmlv3.q4_0.bin",
    model_type="llama",
    config={
        "max_new_tokens": 512,
        "temperature": 0.8,
    },
)

In [52]:
# Assemble the retrieval-QA chain: the two nearest chunks are passed to the
# "stuff" chain together with the custom prompt, and the source documents
# are returned alongside the answer.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=docsearch.as_retriever(search_kwargs={"k": 2}),
    chain_type="stuff",
    chain_type_kwargs=chain_type_kwargs,
    return_source_documents=True,
)

In [55]:
# Ask for a question interactively, run it through the QA chain, and print
# only the generated answer.
user_input = input("Input Prompt:")
result = qa.invoke({"query": user_input})
print("Response : ", result["result"])

Response :  Bursitis is a type of inflammatory condition that affects the bursae, which are small fluid-filled sacs located between moving parts of the body to reduce friction. It can be caused by repetitive motion or injury and can lead to pain, swelling, and limited mobility in the affected area.
