### Import Libraries

In [None]:
import os
import warnings
warnings.filterwarnings("ignore")
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
import sentence_transformers
from pinecone import Pinecone, ServerlessSpec
from langchain_pinecone import PineconeVectorStore
import time
from tqdm.auto import tqdm
from dotenv import load_dotenv
load_dotenv(override=True)

In [None]:
# check the cwd
%pwd

In [None]:
# go back up to the main (parent) directory
os.chdir("..")
%pwd

### Work with the data

In [4]:
# function to extract the data from the pdf files
def extracted_data(data_dir):
    """Load the PDF files found in ``data_dir``.

    Args:
        data_dir: Directory that is searched for files matching ``*.pdf``.

    Returns:
        Whatever ``DirectoryLoader.load()`` returns for the matched files
        (presumably a list of LangChain documents -- confirm against the
        installed LangChain version).
    """
    pdf_loader = DirectoryLoader(path=data_dir, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()

# Load the PDFs from DataSet/ (a path relative to the current working directory).
# NOTE: this rebinds the name `extracted_data` from the function to its result,
# so the function is no longer callable until its definition is executed again.
extracted_data = extracted_data("DataSet/")

In [5]:
# extracted_data


In [None]:
# function to split the data into chunks
def chunking(extracted_data):
    """Break the loaded documents into overlapping text chunks.

    Args:
        extracted_data: Documents to split; handed to ``split_documents``.

    Returns:
        The chunks produced by
        ``RecursiveCharacterTextSplitter.split_documents``.
    """
    # Chunk size 1000 with an overlap of 200 between neighbouring chunks;
    # separator candidates are listed from coarsest (blank line) to finest.
    splitter_options = {
        "chunk_size": 1000,
        "chunk_overlap": 200,
        "separators": ["\n\n", "\n", ".", " ", ""],
        "is_separator_regex": False,
    }
    chunker = RecursiveCharacterTextSplitter(**splitter_options)
    return chunker.split_documents(extracted_data)

# Split the loaded documents into chunks and report how many were produced.
text_chunks = chunking(extracted_data=extracted_data)
print(f"Text Chunks: {len(text_chunks)}")

### Model Embedding

In [None]:
# function to download the embedding model
def download_embedding_model():
    """Build the HuggingFace embedding wrapper used by this notebook.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured for the
        ``ibm-granite/granite-embedding-125m-english`` model.
    """
    return HuggingFaceEmbeddings(
        model_name="ibm-granite/granite-embedding-125m-english"
    )

# Instantiate the embedding model once; the cells below reuse this object.
embedding_model = download_embedding_model() 

In [None]:
embedding_model

In [None]:
# Sanity check: embed a short query and print the length of the resulting vector.
# Compare the printed value with the `dimension` passed to `pc.create_index`.
embed = embedding_model.embed_query("Hello Word")
print(f"Dimension: {len(embed)}")

### Create the Pinecone vector database

In [None]:
# Read the Pinecone API key from the environment and fail early with a clear
# message; assigning None into os.environ would raise an opaque TypeError.
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
if not PINECONE_API_KEY:
    raise RuntimeError(
        "PINECONE_API_KEY is not set; define it in the environment or in the .env file."
    )
os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY

pc = Pinecone(api_key=PINECONE_API_KEY)
index_name = "question-answer-chatbot"

# Create the index only when it is missing so this cell can be re-run safely;
# calling create_index for an index that already exists raises an error.
if index_name not in pc.list_indexes().names():
    pc.create_index(name=index_name,
                    dimension=768,  # must equal the embedding vector length
                    metric="cosine",
                    spec=ServerlessSpec(cloud="aws", region="us-east-1"))

In [None]:
def initialize_pinecone_vector_store(documents, index_name, embedding, retries=3, delay=5, batch_size=32):
    """Connect to a Pinecone index and upload ``documents``, retrying on failure.

    Upload progress is kept across attempts, so a retry resumes at the first
    batch that has not been confirmed instead of re-sending every document
    (which would insert the earlier documents a second time).

    Args:
        documents: Sequence of documents to upload; must support ``len()``
            and slicing (e.g. a list).
        index_name: Name of the Pinecone index to write to.
        embedding: Embedding object handed to ``PineconeVectorStore``.
        retries: Maximum number of attempts; must be at least 1.
        delay: Seconds to wait after a failed attempt before the next one.
        batch_size: Number of documents sent per ``add_documents`` call;
            must be at least 1.

    Returns:
        The ``PineconeVectorStore`` the documents were uploaded to.

    Raises:
        ValueError: If ``retries`` or ``batch_size`` is smaller than 1.
        Exception: The error of the final attempt, re-raised unchanged.
    """
    if retries < 1:
        raise ValueError("retries must be at least 1")
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    total = len(documents)
    docsearch = None
    uploaded = 0  # survives failed attempts so a retry resumes, not restarts

    for attempt in range(retries):
        try:
            # Only (re)connect while no store has been created yet.
            if docsearch is None:
                docsearch = PineconeVectorStore.from_documents([], index_name=index_name, embedding=embedding)
                print("✅ Pinecone Vector Store created.")

            with tqdm(total=total, initial=uploaded, desc="Uploading documents", unit="doc") as progress:
                while uploaded < total:
                    batch = list(documents[uploaded:uploaded + batch_size])
                    docsearch.add_documents(batch)
                    # Advance only after the call returned without error.
                    # NOTE(review): a batch that fails part-way is re-sent as
                    # a whole on the next attempt -- confirm this is acceptable.
                    uploaded += len(batch)
                    progress.update(len(batch))

            print("✅ All documents uploaded successfully.")
            return docsearch

        except Exception as e:
            print(f"\n❌ Attempt {attempt + 1} failed: {e}")
            if attempt == retries - 1:
                # Out of attempts: propagate with the original traceback.
                raise
            time.sleep(delay)


# Upload every text chunk to the Pinecone index, relying on the function's
# default retry settings.
docsearch = initialize_pinecone_vector_store(
    documents=text_chunks,
    index_name=index_name,
    embedding=embedding_model
)