In [2]:
from langchain.document_loaders import PyPDFLoader,DirectoryLoader
from langchain_text_splitters  import RecursiveCharacterTextSplitter

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
#Extract text from pdf files
def load_pdf_files(data):
    loader = DirectoryLoader(
        data,
        glob="*.pdf",
        loader_cls=PyPDFLoader
    )
    
    documents = loader.load()
    return documents

In [4]:
extracted_data = load_pdf_files("data")

In [None]:
extracted_data

: 

In [12]:
#Split the documents in smaller chunks
def text_split(extracted_data):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size = 500,
        chunk_overlap = 20,
    )
    texts_chunk = text_splitter.split_documents(extracted_data)
    return texts_chunk

In [11]:
texts_chunk = text_split(extracted_data)
print(f"Number of chunks:  {len(texts_chunk)}")

NameError: name 'extracted_data' is not defined

In [8]:
texts_chunk

[Document(metadata={'producer': 'calibre (5.44.0) [http://calibre-ebook.com]', 'creator': 'calibre (5.44.0) [http://calibre-ebook.com]', 'creationdate': '2022-08-27T17:40:22+00:00', 'author': 'Edmund J. Bourne', 'keywords': 'anxiety relief;anxiety and phobia workbook;phobias', 'moddate': '2022-08-27T17:40:24+00:00', 'title': 'The Anxiety and Phobia Workbook', 'source': 'data\\The-Anxiety-and-Phobia-Workbook-Edmund-J.-Bourne.pdf', 'total_pages': 625, 'page': 1, 'page_label': '2'}, page_content='“Several generations of counselors and physicians have had the benefit of\nEdmund Bourne’s intensely researched and extremely handy The Anxiety and\nPhobia Workbook. I know few colleagues who do not have a copy on their\nbookshelves that they consult regularly when treating patients suffering with\nanxiety-related disorders. This book is especially important today as climate\nchange continues to inflame the anxieties of our already-stressed social fabric.'),
 Document(metadata={'producer': 'calib

In [9]:
from langchain.embeddings import HuggingFaceEmbeddings

def download_embeddings():
    """
    Download and return the huggingface embedding model."""
    model_name = "sentence-transformers/all-MiniLM-L6-v2"
    embeddings = HuggingFaceEmbeddings(
        model_name = model_name
        )
    return embeddings

embedding = download_embeddings()

  embeddings = HuggingFaceEmbeddings(


In [10]:
embedding

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

In [11]:
vector = embedding.embed_query("Hello World")
vector

[-0.034477293491363525,
 0.031023185700178146,
 0.006734919268637896,
 0.026108955964446068,
 -0.03936200588941574,
 -0.16030244529247284,
 0.06692398339509964,
 -0.00644145580008626,
 -0.047450482845306396,
 0.014758873730897903,
 0.07087531685829163,
 0.05552761256694794,
 0.019193336367607117,
 -0.026251327246427536,
 -0.010109526105225086,
 -0.026940451934933662,
 0.02230745740234852,
 -0.02222668007016182,
 -0.14969263970851898,
 -0.017492998391389847,
 0.007676251698285341,
 0.05435226485133171,
 0.003254401497542858,
 0.031725890934467316,
 -0.08462139964103699,
 -0.029405971989035606,
 0.051595598459243774,
 0.04812406003475189,
 -0.003314854810014367,
 -0.05827920511364937,
 0.04196922481060028,
 0.022210687398910522,
 0.1281888335943222,
 -0.02233893983066082,
 -0.011656275019049644,
 0.06292839348316193,
 -0.032876357436180115,
 -0.0912260189652443,
 -0.03117534890770912,
 0.05269956961274147,
 0.04703487083315849,
 -0.08420306444168091,
 -0.030056191608309746,
 -0.020744830

In [12]:
print("Vector length : " , len(vector))

Vector length :  384


In [13]:
from dotenv import load_dotenv
import os
load_dotenv()

True

In [14]:
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")


os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY
os.environ["GEMINI_API_KEY"] = GEMINI_API_KEY

In [15]:
from pinecone import Pinecone
Pinecone_api_key = PINECONE_API_KEY

pc = Pinecone(api_key=Pinecone_api_key)

In [16]:
pc

<pinecone.pinecone.Pinecone at 0x1efb2e66950>

In [17]:
from pinecone import ServerlessSpec
index_name = "medical-chatbot"

if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        dimension=384,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws",region="us-east-1")
    )
    
    
    index = pc.Index(index_name)

In [18]:
from langchain_pinecone import PineconeVectorStore

docsearch= PineconeVectorStore.from_documents(
    documents=texts_chunk,
    embedding=embedding,
    index_name=index_name
)

In [19]:
#Load Existing index

from langchain_pinecone import PineconeVectorStore
#Embed each chunk and upsert the embeddings into pinecone index
docsearch = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embedding
)

In [20]:
retriever = docsearch.as_retriever(search_type = "similarity", search_kwargs={"k":3})

In [21]:
retrieved_docs = retriever.invoke("What is anxiety?")
retrieved_docs

[Document(id='a71d6681-6780-407d-9d5d-db2e42bed43c', metadata={'author': 'Edmund J. Bourne', 'creationdate': '2022-08-27T17:40:22+00:00', 'creator': 'calibre (5.44.0) [http://calibre-ebook.com]', 'keywords': 'anxiety relief;anxiety and phobia workbook;phobias', 'moddate': '2022-08-27T17:40:24+00:00', 'page': 15.0, 'page_label': '16', 'producer': 'calibre (5.44.0) [http://calibre-ebook.com]', 'source': 'data\\The-Anxiety-and-Phobia-Workbook-Edmund-J.-Bourne.pdf', 'title': 'The Anxiety and Phobia Workbook', 'total_pages': 625.0}, page_content='to experience a sense of stability or consistency in their lives. Anxiety disorders\nare simply one outcome of a diminished ability to cope with the resulting stress,\nas are addictive disorders, depression, the falling life expectancy in the US, and\nthe increased incidence of suicide among teenagers.\nMany good books on anxiety disorders have appeared during the past twenty\nyears. Most of these popular books tend to be primarily descriptive. Alt

In [27]:
import os
from langchain_google_genai import ChatGoogleGenerativeAI
os.environ["GEMINI_API_KEY"]
chatModel = ChatGoogleGenerativeAI(model = "gemini-1.5-flash")
response = model.invoke("Hello")
print(response.content)

ModuleNotFoundError: No module named 'langchain_google_genai'

In [None]:
#Add more data to the existing pinecode index
dswith = Document(
    page_content = "dswithjanhvi is a youtube channel that provide info on various topics",
    metadata={"source" : "Youtube"}
)

NameError: name 'Document' is not defined

In [9]:
len(extracted_data)

625

In [None]:
from typing import List
from langchain.schema import Document

def filter_to_minimal_docs(docs:List[Document]) -> List[Document]:
    """"
    Given a list of document objects , return a new list of document objects containing only
    'source' in metadata and the original page content  """
    
    minimal_docs: List[Document] = []
    for doc in docs:
        src = doc.metadata.get("source")
        minimal_docs.append(
            Document(
                page_content=doc.page_content,
                metadata = {"source" : src}
            )
        )
        return minimal_docs

In [29]:
minimal_docs = filter_to_minimal_docs(extracted_data)

In [30]:
minimal_docs

[Document(metadata={'source': 'data\\The-Anxiety-and-Phobia-Workbook-Edmund-J.-Bourne.pdf'}, page_content='')]

In [3]:
%pwd

'c:\\Users\\hp\\Documents\\Projects\\MentalHealthChatbot'

In [1]:
import os
os.chdir("../")

In [1]:
print("ok")

ok
