#### Importing Libraries

In [1]:
import os
import openai
import langchain
import pinecone 
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Pinecone
from langchain.llms import OpenAI
from langchain_openai import OpenAIEmbeddings

  from tqdm.autonotebook import tqdm


In [2]:
# Reading the Dataset
def read_doc(directory):
    print(f"Files in directory: {len(os.listdir(directory))}")  # List all files
    file_loader = PyPDFDirectoryLoader(directory)
    documents = file_loader.load()
    return documents

path = '../data/'
doc = read_doc(path)
print(f"Number of documents loaded: {len(doc)}")

Files in directory: 20
Number of documents loaded: 382


In [3]:
## Divide the docs into chunks
### https://api.python.langchain.com/en/latest/text_splitter/langchain.text_splitter.RecursiveCharacterTextSplitter.html#
def chunk_data(docs,chunk_size=800,chunk_overlap=50):
    text_splitter=RecursiveCharacterTextSplitter(chunk_size=chunk_size,chunk_overlap=chunk_overlap)
    doc=text_splitter.split_documents(docs)
    return docs

documents=chunk_data(docs=doc)
len(documents)
# documents[381]

382

In [None]:
from dotenv import load_dotenv

# Load the .env file
load_dotenv()

# Set your OpenAI API key
api_key = os.environ.get("OPENAI_API_KEY")


In [6]:
## Embedding Technique Of OPENAI
embeddings = OpenAIEmbeddings(openai_api_key=api_key)
# embeddings

In [7]:
# Length of vector embedding
vectors=embeddings.embed_query("Hello, world")
len(vectors)

1536

In [8]:
# Loading the data chunks to the pinecone
from pinecone import Pinecone, ServerlessSpec
from langchain_pinecone import PineconeVectorStore

pc = Pinecone(api_key="pcsk_6hKpf2_MvwLm4V9pu54N9nmDmUY8MFNuaHrP43JBwLD2FEEMPjC1GhKNVc5AUmhHzobyMP")

cloud = os.environ.get('PINECONE_CLOUD') or 'aws'
region = os.environ.get('PINECONE_REGION') or 'us-east-1'
spec = ServerlessSpec(cloud=cloud, region=region)
index_name="medicalqabot"

print(pc.Index(index_name).describe_index_stats())


{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}


In [9]:
index = pc.Index(index_name)

for i, chunk in enumerate(documents):
    
    chunk_text = chunk.page_content
    metadata = chunk.metadata  
    # print(chunk_text, metadata)
    embedding = embeddings.embed_query(chunk_text)
    
    # Store in Pinecone
    index.upsert([(f"doc-{i}", embedding, metadata)])


In [None]:
# Checking data into the vector database after inserting the data
print(pc.Index(index_name).describe_index_stats())

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 399}},
 'total_vector_count': 399}
