In [3]:
import os
from langchain_groq import ChatGroq
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pinecone import Pinecone, ServerlessSpec
from langchain_pinecone import PineconeVectorStore
from langchain_community.document_loaders import PDFPlumberLoader
from transformers import AutoTokenizer, AutoModel
from dotenv import load_dotenv

# Load key/value pairs from a .env file into the process environment
# (presumably where the API keys for the clients used below live — confirm).
load_dotenv()


True

In [4]:
# --- Embedding configuration -------------------------------------------------
# Hugging Face model id passed to get_embeddings().
model_name = "BAAI/bge-large-en"

# Local folder used both as the existence check and as the cache folder
# for the embedding model.
EMBEDDINGS_PATH = "./huggingface_bge_embeddings"

# Legacy setting kept for reference (not used by the live cells shown here):
#INDEX_FILE_PATH = "./faiss_index"

def get_embeddings(EMBEDDINGS_PATH, model_name):
    """Return a HuggingFaceBgeEmbeddings instance backed by a local cache folder.

    On the first run (when ``EMBEDDINGS_PATH`` does not exist yet) the tokenizer
    and model weights for ``model_name`` are fetched and written to
    ``EMBEDDINGS_PATH`` with ``save_pretrained``. On later runs nothing is
    loaded here: the previous implementation re-loaded the full tokenizer and
    model from disk and immediately discarded them, which only cost time and
    memory.

    Args:
        EMBEDDINGS_PATH: Directory used as the existence check for the local
            copy and passed to the embeddings class as ``cache_folder``.
        model_name: Hugging Face model identifier (e.g. ``"BAAI/bge-large-en"``).

    Returns:
        A ``HuggingFaceBgeEmbeddings`` object built with
        ``cache_folder=EMBEDDINGS_PATH`` and ``model_name=model_name``.
    """
    if not os.path.exists(EMBEDDINGS_PATH):
        print("Downloading HuggingFace embeddings...")
        # Fetch both artifacts before writing anything, so a failed download
        # does not leave a half-populated folder that later looks like a cache.
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModel.from_pretrained(model_name)
        tokenizer.save_pretrained(EMBEDDINGS_PATH)
        model.save_pretrained(EMBEDDINGS_PATH)
    else:
        # The folder already exists; the embeddings class below is handed the
        # same folder as its cache, so there is nothing to pre-load here.
        print("Loading HuggingFace embeddings from disk...")
    # NOTE(review): the embeddings class is given model_name (not the local
    # path), so it presumably resolves the model through its own cache layout
    # inside cache_folder — confirm that the save_pretrained copy is reused.
    return HuggingFaceBgeEmbeddings(cache_folder=EMBEDDINGS_PATH, model_name=model_name)


# get_embeddings() takes two required arguments; calling it with only the cache
# path raises TypeError on a fresh kernel, so pass the model name as well.
embeddings = get_embeddings(EMBEDDINGS_PATH, model_name)

Loading HuggingFace embeddings from disk...




In [8]:
# Module-level Pinecone client, built with no explicit arguments.
# NOTE(review): create_pc_index() below constructs its own client by default,
# so this instance is not referenced by any live code visible in this notebook.
pc = Pinecone()

def create_pc_index(index_name, dimension=1024, metric="cosine",
                    cloud="aws", region="us-east-1", client=None):
    """Create a serverless Pinecone index if one with this name does not exist.

    The call is a no-op when ``index_name`` is already listed by the client,
    so it is safe to re-run the cell.

    Args:
        index_name: Name of the index to look up / create.
        dimension: Vector dimension of the index (default 1024).
        metric: Similarity metric passed to ``create_index`` (default "cosine").
        cloud: Cloud provider for the serverless spec (default "aws").
        region: Cloud region for the serverless spec (default "us-east-1").
        client: Optional existing ``Pinecone`` client to reuse. When omitted a
            new ``Pinecone()`` is constructed with no arguments, exactly as the
            previous implementation did.

    Returns:
        None.
    """
    if client is None:
        client = Pinecone()

    # Nothing to do when the index is already there.
    if index_name in client.list_indexes().names():
        return

    client.create_index(
        name=index_name,
        dimension=dimension,
        metric=metric,
        spec=ServerlessSpec(cloud=cloud, region=region),
    )

# Name of the Pinecone index used by this notebook; it is created on first run.
# 1024 is passed as the vector dimension (presumably the output size of the
# embedding model configured earlier — confirm).
index_name = "genai-library"
create_pc_index(index_name, dimension=1024)

In [9]:
#Adding docs to PINECONE index
# Folder holding the source PDFs. The location can be overridden through the
# PDFS_FOLDER_PATH environment variable so the notebook is not tied to a single
# machine; the previous hard-coded path stays as the default.
PDFs_folder_path = os.environ.get(
    "PDFS_FOLDER_PATH",
    "D:\\GITHUB\\ChatBot_with_MultipleSources_OpenSource_LLMs\\data",
)

def vector_store(PDFs_folder_path, index_name, embeddings,
                 chunk_size=1000, chunk_overlap=200, max_pages=20):
    """Load the PDFs in a folder, split them into chunks and add them to Pinecone.

    Only regular files whose name ends in ``.pdf`` (case-insensitive) are
    processed, in sorted name order; other files and sub-folders are ignored
    instead of being handed to the PDF loader.

    Args:
        PDFs_folder_path: Directory that contains the PDF files.
        index_name: Name of the existing Pinecone index to write to.
        embeddings: Embedding object passed to ``PineconeVectorStore``.
        chunk_size: ``chunk_size`` for ``RecursiveCharacterTextSplitter``
            (default 1000).
        chunk_overlap: ``chunk_overlap`` for the splitter (default 200).
        max_pages: Only the first ``max_pages`` documents returned by the
            loader for each PDF are indexed (presumably one document per
            page — confirm). Default 20, the previously hard-coded limit;
            pass ``None`` to index everything.

    Returns:
        The ``PineconeVectorStore`` the chunks were added to.
    """
    print("Adding Docs to PINECONE index...")
    vectorstore = PineconeVectorStore(index_name=index_name, embedding=embeddings)

    # Sorted for a reproducible processing order across runs and platforms.
    candidates = [
        os.path.join(PDFs_folder_path, entry)
        for entry in sorted(os.listdir(PDFs_folder_path))
        if entry.lower().endswith(".pdf")
    ]
    pdf_paths = [path for path in candidates if os.path.isfile(path)]

    # The splitter is stateless with respect to the input, so build it once.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )

    for count, pdf_path in enumerate(pdf_paths, start=1):
        print(f"Adding doc# {count}")
        pages = PDFPlumberLoader(pdf_path).load()
        # pages[:None] keeps every element, so max_pages=None disables the cap.
        chunks = text_splitter.split_documents(pages[:max_pages])
        if chunks:
            # Skip PDFs that produced no chunks rather than sending an empty
            # batch to the vector store.
            vectorstore.add_documents(chunks)
    return vectorstore

In [10]:
# vector_store() has three required parameters; calling it with none raises
# TypeError on a fresh kernel, so pass the folder, index name and embeddings
# defined in the earlier cells.
vectorstore = vector_store(PDFs_folder_path, index_name, embeddings)

Adding Docs to PINECONE index...
Adding doc# 1
Adding doc# 2
Adding doc# 3
Adding doc# 4


In [None]:
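# NOTE: everything in this cell is commented out and never runs. It is an
# earlier draft of get_embeddings() / vector_store(), both of which are defined
# in live form in the cells above. The draft creates the index with
# dimension=1536, whereas the live cell creates it with 1024.
#Loading HuggingFace Embeddings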
# def get_embeddings():
#     if not os.path.exists(EMBEDDINGS_MODEL_PATH):
#         print("Downloading HuggingFace embeddings...")
#         tokenizer = AutoTokenizer.from_pretrained(model_name)
#         model = AutoModel.from_pretrained(model_name)
#         tokenizer.save_pretrained(EMBEDDINGS_MODEL_PATH)
#         model.save_pretrained(EMBEDDINGS_MODEL_PATH)
#     else:
#         print("Loading HuggingFace embeddings from disk...")
#         tokenizer = AutoTokenizer.from_pretrained(EMBEDDINGS_MODEL_PATH)
#         model = AutoModel.from_pretrained(EMBEDDINGS_MODEL_PATH)
#     return HuggingFaceBgeEmbeddings(cache_folder=EMBEDDINGS_MODEL_PATH, model_name=model_name)


#INDEX_FILE_PATH = "./faiss_index"
# def vector_store():
#     print("Creating PINECONE index...")
#     if index_name not in pc.list_indexes().names():
#         pc.create_index(
#             name=index_name,
#             dimension=1536,
#             metric="cosine",
#             spec=ServerlessSpec(    
#                 cloud='aws', 
#                 region='us-east-1'
#             ) 
#         ) 
#     pdf_files = [f for f in os.listdir(PDFs_folder_path)]
#     loaders = [PDFPlumberLoader(os.path.join(PDFs_folder_path, file)) for file in pdf_files]
#     docs = []
#     for loader in loaders:
#         docs.extend(loader.load())
#     text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
#     docs = text_splitter.split_documents(docs)
#     vectorstore = PineconeVectorStore.from_documents(
#         docs,
#         index_name=index_name,
#         embedding=embeddings
#     )
#     return vectorstore
