### Import Libraries

In [None]:
import os
import warnings
warnings.filterwarnings("ignore")
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
import sentence_transformers
from dotenv import load_dotenv
load_dotenv()


In [None]:
# Move the working directory up one level, to the parent of the current directory
# (presumably so relative paths such as "Data/" resolve from the project root — confirm).
# NOTE: the path is relative, so re-running this cell climbs one more level each time.
os.chdir("..")
%pwd

In [None]:
# Load every PDF matched in a directory
def load_pdf_file(data):
    """Load the PDF files found under *data* and return the loaded documents.

    Args:
        data: Directory path handed to ``DirectoryLoader``; files are
            selected with the glob pattern ``*.pdf``.

    Returns:
        Whatever ``DirectoryLoader.load()`` returns when each matched file
        is read with ``PyPDFLoader``.
    """
    pdf_loader = DirectoryLoader(path=data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()


extracted_data = load_pdf_file("Data/")

In [None]:
# extracted_data

In [None]:
# Split the data into chunks
def text_split(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into smaller overlapping chunks.

    Args:
        extracted_data: Documents to split (as returned by ``load_pdf_file``).
        chunk_size: Maximum size of each chunk, passed straight to
            ``RecursiveCharacterTextSplitter``. Defaults to 500, the value
            this notebook has always used.
        chunk_overlap: Overlap between consecutive chunks, passed straight
            to ``RecursiveCharacterTextSplitter``. Defaults to 20.

    Returns:
        The result of ``RecursiveCharacterTextSplitter.split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(extracted_data)

text_chunks = text_split(extracted_data)
print(f"Length of the text chunks: {len(text_chunks)}")

In [None]:
# Build the HuggingFace embedding model used by the later cells
def download_hugginface_embedding():
    """Create and return a ``HuggingFaceEmbeddings`` instance.

    The model name is fixed to ``sentence-transformers/all-MiniLM-L6-v2``.
    """
    return HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

embeddings = download_hugginface_embedding()

In [None]:
embeddings

In [None]:
# Smoke-test the embedding model: embed a short query, then show the vector
# followed by its length on a separate line.
emb_qu = embeddings.embed_query("Hello World")
print(emb_qu, f"Length {len(emb_qu)}", sep="\n")

In [None]:
# Read the Pinecone API key from the environment (load_dotenv() was called in the
# imports cell). os.getenv returns None when the variable is not set.
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")

In [None]:
# Initialize Pinecone and create the index only if it does not exist yet
# Fail early with a readable message: assigning None to os.environ would
# otherwise raise an opaque "str expected, not NoneType" TypeError.
if not PINECONE_API_KEY:
    raise RuntimeError(
        "PINECONE_API_KEY is not set; define it in the environment or in the .env file."
    )
os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY
# Alternative gRPC client, kept for reference:
# from pinecone.grpc import PineconeGRPC as Pinecone
# from pinecone import ServerlessSpec
from pinecone import Pinecone, ServerlessSpec


pc = Pinecone(api_key=PINECONE_API_KEY)
index_name = "medicalbot"
# create_index fails when the index already exists, so guard it to keep this
# cell safe to re-run.
if index_name not in pc.list_indexes().names():
    # NOTE(review): dimension must equal the embedding vector length printed by
    # the embedding test cell (expected 384 for the configured model — confirm).
    pc.create_index(name=index_name,
                    dimension=384,
                    metric="cosine",
                    spec=ServerlessSpec(cloud="aws", region="us-east-1"))

In [None]:
index_name

In [None]:
# Export the key through the environment again (same assignment as in the
# index-creation cell above); presumably for libraries that read
# PINECONE_API_KEY from os.environ — confirm whether this repeat is needed.
os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY

In [None]:
# # Initialize the Pinecone index with retry logic
# from langchain_pinecone import PineconeVectorStore
# import time

# def initialize_pinecone_vector_store(documents, index_name, embedding, retries=3, delay=5):
#     for attempt in range(retries):
#         try:
#             docsearch = PineconeVectorStore.from_documents(
#                 documents=documents, 
#                 index_name=index_name,
#                 embedding=embedding
#             )
#             print("Pinecone Vector Store initialized successfully.")
#             return docsearch
#         except Exception as e:
#             print(f"Attempt {attempt + 1} failed: {e}")
#             if attempt < retries - 1:
#                 time.sleep(delay)
#             else:
#                 raise e

# docsearch = initialize_pinecone_vector_store(
#     documents=text_chunks, 
#     index_name=index_name,
#     embedding=embeddings
# )

In [None]:
from langchain_pinecone import PineconeVectorStore
from tqdm import tqdm
import time

def initialize_pinecone_vector_store_with_progress(documents, index_name, embedding, batch_size=32, retries=3, delay=5):
    """Upload *documents* to a Pinecone index in batches, showing a progress bar.

    Each batch is sent with ``PineconeVectorStore.from_documents`` and is
    retried on its own. A failure therefore never restarts the whole upload,
    which would re-send batches that were already stored.

    Args:
        documents: Sliceable sequence of documents to upload.
        index_name: Name of the target Pinecone index.
        embedding: Embedding model forwarded to ``from_documents``.
        batch_size: Number of documents per upload call; must be >= 1.
        retries: Maximum number of attempts per batch; must be >= 1.
        delay: Seconds to sleep between attempts for the same batch.

    Returns:
        True once every batch has been uploaded.

    Raises:
        ValueError: If ``batch_size`` or ``retries`` is smaller than 1.
        Exception: Re-raises the last error of a batch that failed on all
            of its attempts.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")
    if retries < 1:
        raise ValueError("retries must be at least 1")

    print("Starting to upload documents to Pinecone with progress bar...")

    for start in tqdm(range(0, len(documents), batch_size), desc="Uploading batches"):
        batch = documents[start:start + batch_size]

        for attempt in range(1, retries + 1):
            try:
                PineconeVectorStore.from_documents(
                    documents=batch,
                    index_name=index_name,
                    embedding=embedding
                )
                break
            except Exception as e:
                print(f"❌ Attempt {attempt} failed: {e}")
                if attempt == retries:
                    # Out of attempts for this batch: propagate with the
                    # original traceback intact.
                    raise
                # NOTE(review): a failed call may have stored part of the
                # batch before raising; the retry re-sends the whole batch.
                print(f"⏳ Retrying after {delay} seconds...")
                time.sleep(delay)

    print("✅ Pinecone Vector Store initialized successfully.")
    return True

# Use the function: upload all text chunks to the index
success = initialize_pinecone_vector_store_with_progress(
    documents=text_chunks, 
    index_name=index_name,
    embedding=embeddings,
    batch_size=32 
)
