#### Importing Necessary Libraries

In [25]:
import os
import openai
import langchain
import pinecone 
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import OpenAI
from langchain_openai import OpenAIEmbeddings
from dotenv import load_dotenv
from pinecone import Pinecone, ServerlessSpec

#### Data Splitting and Chunking

In [26]:
# Reading the Dataset
def read_doc(directory):
    print(f"Files in directory: {len(os.listdir(directory))}")  # List all files
    file_loader = PyPDFDirectoryLoader(directory)
    documents = file_loader.load()
    return documents

path = '../../data/'
doc = read_doc(path)
print(f"Number of documents loaded: {len(doc)}")

Files in directory: 20
Number of documents loaded: 382


In [27]:
## Divide the docs into chunks
### https://api.python.langchain.com/en/latest/text_splitter/langchain.text_splitter.RecursiveCharacterTextSplitter.html#
def chunk_data(docs,chunk_size=800,chunk_overlap=50):
    text_splitter=RecursiveCharacterTextSplitter(chunk_size=chunk_size,chunk_overlap=chunk_overlap)
    doc=text_splitter.split_documents(docs)
    return docs

documents=chunk_data(docs=doc)
len(documents)
# documents[381]

382

In [28]:
# Load the .env file
load_dotenv()

# Set your OpenAI API key
api_key = os.environ.get("OPENAI_API_KEY")
pinecone_api_key = os.environ.get("PINECONE_API_KEY")


#### Embedding

In [29]:
## Embedding Technique Of OPENAI
embeddings = OpenAIEmbeddings(openai_api_key=api_key)

# Length of vector embedding for setting of vector database dimension
vectors=embeddings.embed_query("Hello, world")
len(vectors)

1536

#### Pinecone Vector Database

In [30]:
# Initializing of Pinecone Database
from pinecone import Pinecone, ServerlessSpec

pc = Pinecone(api_key=pinecone_api_key)

cloud = os.environ.get('PINECONE_CLOUD') or 'aws'
region = os.environ.get('PINECONE_REGION') or 'us-east-1'
spec = ServerlessSpec(cloud=cloud, region=region)
index_name="medicalqabot"

# Checking Index before insertion
print(pc.Index(index_name).describe_index_stats())

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}


#### Insertion Into Pinecone Database

In [31]:
# Inserting the embedding in the database
index = pc.Index(index_name)

for i, chunk in enumerate(documents):
    # Get the text content and metadata
    chunk_text = chunk.page_content
    metadata = chunk.metadata
        
    # Add the text content to metadata
    metadata['text'] = chunk_text
        
    # Generate embedding
    embedding = embeddings.embed_query(chunk_text)
    
    # Add vector to the batch
    index.upsert([(f"doc-{i}", embedding, metadata)])

#### Checking the Pinecone Database after insertion

In [32]:
# Checking data into the vector database after inserting the data
print(pc.Index(index_name).describe_index_stats())

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 386}},
 'total_vector_count': 386}
