In [None]:
!pip install "pinecone[grpc]"
!pip install langchain
!pip install -U langchain-community
!pip install pypdf
!pip install tiktoken

from IPython.display import clear_output
clear_output()

In [None]:
# Import the Pinecone library
import os
import time
from getpass import getpass

from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec

# Initialize a Pinecone client.
# The key is read from the PINECONE_API_KEY environment variable when it is set;
# otherwise it is prompted for (input hidden), so the secret never has to be
# written into the notebook and saved/committed with it.
pc = Pinecone(api_key=os.environ.get("PINECONE_API_KEY") or getpass("Pinecone API key: "))

In [None]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
# NOTE(review): this import rebinds the name `Pinecone`. An earlier cell imports
# `Pinecone` from `pinecone.grpc` (the client class used to build `pc`); once this
# cell has run, `Pinecone` refers to the LangChain vector-store class instead.
from langchain.vectorstores import Chroma, Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
from tqdm.autonotebook import tqdm

In [None]:
import os
from getpass import getpass

# Make the OpenAI key available through the OPENAI_API_KEY environment variable.
# A key that is already present in the environment is left untouched (assigning
# unconditionally would clobber it); only when it is missing or empty is it
# prompted for (input hidden), so the secret is never stored in the notebook.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

In [None]:
# Embedding model used for every chunk that gets stored in the index.
EMBEDDING_MODEL_NAME = "text-embedding-ada-002"
embeddings_model = OpenAIEmbeddings(model=EMBEDDING_MODEL_NAME)

  embeddings_model = OpenAIEmbeddings(model="text-embedding-ada-002")


### Create the index if it does not already exist

In [None]:
# Create a serverless index (only when it does not exist yet)
index_name = "insurance"
EMBEDDING_DIMENSION = 1536   # must equal the length of the vectors that are upserted
INDEX_READY_TIMEOUT_S = 300  # upper bound on how long to wait for the index

if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        dimension=EMBEDDING_DIMENSION,
        metric="cosine",
        spec=ServerlessSpec(
            cloud='aws',
            region='us-east-1'
        )
    )

# Wait for the index to be ready. The wait is bounded so that a failed or stuck
# index creation surfaces as an error instead of hanging the notebook forever.
ready_deadline = time.monotonic() + INDEX_READY_TIMEOUT_S
while not pc.describe_index(index_name).status['ready']:
    if time.monotonic() > ready_deadline:
        raise TimeoutError(
            f"Index '{index_name}' was not ready after {INDEX_READY_TIMEOUT_S} seconds"
        )
    time.sleep(1)


In [None]:
# Handle to the index; the upsert and stats calls further down go through it.
index = pc.Index(name=index_name)

### Step 1.1 - Load PDFs

### Step: split the PDF data into smaller pieces, embed them, and upsert them into the Pinecone index

### For PDFs

In [None]:
import os
import glob
from tqdm.autonotebook import tqdm  # Import tqdm for progress bar

def list_pdf_files(folder_path):
    """Return the paths of the ``*.pdf`` files located directly in ``folder_path``.

    Sub-folders are not searched. The result is an empty list when nothing matches.
    """
    pdf_pattern = os.path.join(folder_path, '*.pdf')
    return glob.glob(pdf_pattern)

import hashlib

# Folder that holds the PDFs to ingest
folder_path = '/content/pdf'
pdfs = list_pdf_files(folder_path)

print(pdfs)

BATCH_SIZE = 50  # Number of vectors sent per upsert request

# The splitter settings are the same for every file, so build it once.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=50)

for pdf in tqdm(pdfs, desc="Processing PDFs"):  # Progress bar over PDFs
    # One Document per page, then every page is split into small chunks.
    document = PyPDFLoader(pdf).load()
    chunked_pdf_data = text_splitter.split_documents(document)
    if not chunked_pdf_data:
        # Nothing to embed for this file (no chunks were produced).
        continue

    texts = [chunk.page_content for chunk in chunked_pdf_data]
    embeddings = embeddings_model.embed_documents(texts)

    # Vector IDs must be unique across *all* ingested files: an upsert with an
    # existing ID overwrites that vector, so plain "doc_<i>" IDs would make each
    # file replace the chunks of the previous one. A short digest of the file
    # path namespaces the IDs per file, stays ASCII for any file name, and is
    # stable across runs (re-ingesting a file updates its own vectors).
    file_key = hashlib.sha1(pdf.encode("utf-8")).hexdigest()[:12]
    ids = [f"doc_{file_key}_{i}" for i in range(len(embeddings))]

    # Store only essential metadata (source file, chunk text, chunk index)
    metadata = [
        {"source": pdf, "text": chunk_text, "chunk_index": i}
        for i, chunk_text in enumerate(texts)
    ]

    # Upsert in batches
    for start in tqdm(range(0, len(ids), BATCH_SIZE), desc="Upserting Batches", leave=False):
        stop = start + BATCH_SIZE
        index.upsert(vectors=list(zip(ids[start:stop], embeddings[start:stop], metadata[start:stop])))

['/question/question and Answers in the 3 batches.pdf']


Processing PDFs:   0%|          | 0/1 [00:00<?, ?it/s]

Upserting Batches:   0%|          | 0/1 [00:00<?, ?it/s]

### For XLSX

In [None]:
import os
import glob
from tqdm.autonotebook import tqdm  # Import tqdm for progress bar
import pandas as pd  # For loading .xlsx files
from langchain.docstore.document import Document # Import Document class

def list_xlsx_files(folder_path):
    """Return the paths of the ``*.xlsx`` files located directly in ``folder_path``.

    Sub-folders are not searched. The result is an empty list when nothing matches.
    """
    xlsx_pattern = os.path.join(folder_path, '*.xlsx')
    return glob.glob(xlsx_pattern)

import hashlib

# Folder that holds the Excel workbooks to ingest
folder_path = '/content/xl'
xlsx_files = list_xlsx_files(folder_path)

print(xlsx_files)

BATCH_SIZE = 50  # Number of vectors sent per upsert request

# The splitter settings are the same for every file, so build it once.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=50)

for xlsx_file in tqdm(xlsx_files, desc="Processing XLSX files"):  # Progress bar over workbooks
    # NOTE: with default arguments read_excel loads only the first sheet.
    df = pd.read_excel(xlsx_file)
    # One string per row: all cells joined by a space.
    # NOTE: astype(str) renders missing cells as the literal text "nan".
    text_data = df.astype(str).agg(' '.join, axis=1).tolist()

    # Wrap every row in a Document so the splitter can process it, then split
    # each row into chunks (rows are split independently of one another).
    row_documents = [
        Document(page_content=row_text, metadata={"source": xlsx_file})
        for row_text in text_data
    ]
    chunked_xlsx_data = text_splitter.split_documents(row_documents)
    if not chunked_xlsx_data:
        # Nothing to embed for this file (no chunks were produced).
        continue

    texts = [chunk.page_content for chunk in chunked_xlsx_data]
    embeddings = embeddings_model.embed_documents(texts)

    # Vector IDs must be unique across *all* ingested files: an upsert with an
    # existing ID overwrites that vector, so plain "doc_<i>" IDs would make each
    # workbook replace the chunks of the previous one. A short digest of the file
    # path namespaces the IDs per file, stays ASCII for any file name, and is
    # stable across runs (re-ingesting a file updates its own vectors).
    file_key = hashlib.sha1(xlsx_file.encode("utf-8")).hexdigest()[:12]
    ids = [f"doc_{file_key}_{i}" for i in range(len(embeddings))]

    # Store only essential metadata (source file, chunk text, chunk index)
    metadata = [
        {"source": xlsx_file, "text": chunk_text, "chunk_index": i}
        for i, chunk_text in enumerate(texts)
    ]

    # Upsert in batches
    for start in tqdm(range(0, len(ids), BATCH_SIZE), desc="Upserting Batches", leave=False):
        stop = start + BATCH_SIZE
        index.upsert(vectors=list(zip(ids[start:stop], embeddings[start:stop], metadata[start:stop])))

['/content/xl/PF_A_Expenses.xlsx', '/content/xl/PF_A_BalanceSheet.xlsx', '/content/xl/GQ_Own_Funds.xlsx', '/content/xl/PF_A_Members.xlsx', '/content/xl/eiopa-qa.xlsx', '/content/xl/GA_Balance_Sheet.xlsx', '/content/xl/PF_A_ContributionsBenefitsTransfers.xlsx', '/content/xl/EIO_data_2024.xlsx', '/content/xl/SQ_Own_Funds.xlsx', '/content/xl/PF_Q_BalanceSheet.xlsx', '/content/xl/FS_Indicators.xlsx', '/content/xl/SA_LTG.xlsx', '/content/xl/Data appendix to report on cross border IORPs 2023.xlsx', '/content/xl/SA_Own_Funds.xlsx', '/content/xl/SQ_Balance_Sheet.xlsx', '/content/xl/PF_A_Exposures.xlsx', '/content/xl/GQ_Balance_Sheet.xlsx', '/content/xl/Statistical_update_on_the_use_of_capital_add-ons_2023.xlsx', '/content/xl/SQ_Exposures.xlsx', '/content/xl/SA_Balance_Sheet.xlsx', '/content/xl/PF_Q_Exposures.xlsx', '/content/xl/GA_Premiums_Claims_Expenses.xlsx', '/content/xl/SA_Cross_Border.xlsx', '/content/xl/SA_Premiums_Claims_Expenses.xlsx', '/content/xl/GA_Own_Funds.xlsx', '/content/xl/GQ_

Processing XLSX files:   0%|          | 0/26 [00:00<?, ?it/s]

Upserting Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Upserting Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Upserting Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Upserting Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Upserting Batches:   0%|          | 0/323 [00:00<?, ?it/s]

Upserting Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Upserting Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Upserting Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Upserting Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Upserting Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Upserting Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Upserting Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Upserting Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Upserting Batches:   0%|          | 0/18 [00:00<?, ?it/s]

Upserting Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Upserting Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Upserting Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Upserting Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Upserting Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Upserting Batches:   0%|          | 0/57 [00:00<?, ?it/s]

Upserting Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Upserting Batches:   0%|          | 0/3 [00:00<?, ?it/s]

Upserting Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Upserting Batches:   0%|          | 0/66 [00:00<?, ?it/s]

Upserting Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Upserting Batches:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Show a summary of the index contents (dimension and vector counts).
index_stats = index.describe_index_stats()
print(index_stats)

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 16572}},
 'total_vector_count': 16572}


In [None]:
# Clean-up: drop the index. `index_name` is used rather than repeating the
# literal, and the call is guarded so that running this cell when the index is
# already gone does not raise.
if pc.has_index(index_name):
    pc.delete_index(index_name)

In [None]:
# The index may already have been deleted by the time this cell runs (the cell
# right before it deletes the same index), so only delete it if it still exists.
if pc.has_index(index_name):
    pc.delete_index(index_name)