In [2]:
# Imports (the packages below -- langchain, langchain-pinecone, pinecone, unstructured -- must already be installed)

from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Pinecone
import getpass
from pinecone import Pinecone
import os
from pinecone import ServerlessSpec
from langchain_pinecone import PineconeVectorStore
from unstructured.partition.pdf import partition_pdf
import glob

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Placeholder index name; the index-creation cell further down rebinds
# `index_name` before it is first used.
index_name = "test-for-reference"
# Embedding model handed to the vector store (used for both indexing and querying).
embeddings = OpenAIEmbeddings()

  embeddings = OpenAIEmbeddings()  #


In [4]:

# Pinecone client: take the API key from the environment, prompting for it
# (and storing it back into the environment) only when it is missing or empty.
pinecone_api_key = os.environ.get("PINECONE_API_KEY")
if not pinecone_api_key:
    pinecone_api_key = getpass.getpass("Enter your Pinecone API key: ")
    os.environ["PINECONE_API_KEY"] = pinecone_api_key
pc = Pinecone(api_key=pinecone_api_key)

In [5]:
# Create the serverless Pinecone index on first run; leave an existing one untouched.
index_name = "legal-acts-sections"
index_exists = pc.has_index(index_name)
if not index_exists:
    # NOTE(review): `dimension` presumably has to equal the vector length
    # produced by `embeddings` -- confirm 1536 for the configured model.
    pc.create_index(
        name=index_name,
        dimension=1536,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )


In [6]:
# Open a handle to the Pinecone index and wrap it in a LangChain vector store
# that embeds text with `embeddings`.
index = pc.Index(index_name)
vector_store = PineconeVectorStore(
    index=index,
    embedding=embeddings,
)


In [7]:

# Collect every PDF below the current directory; `**` together with
# recursive=True makes the search descend into subdirectories as well.
pdf_pattern = './**/*.pdf'
files = glob.glob(pdf_pattern, recursive=True)
# Sanity check: this count should equal the number of PDFs you expect to ingest.
print(len(files))

541


In [None]:
def _split_into_sections(elements):
    """Group the partitioned elements of ONE document into section chunks.

    A new section starts at every ``ListItem`` element; ``NarrativeText``
    elements are appended to the section currently being built. Elements of
    any other category are ignored.

    Args:
        elements: iterable of objects exposing ``category``, ``text`` and
            ``metadata`` (as returned by ``partition_pdf``).

    Returns:
        A list of ``{"page_content": str, "metadata": ...}`` dicts. The
        metadata is that of the element that opened the section, so a chunk is
        always attributed to the file it actually came from.
    """
    sections = []
    current_parts = []
    current_metadata = None

    def _close_section():
        # Never emit empty chunks: they would be embedded and stored as noise.
        if current_parts:
            sections.append(
                {"page_content": " ".join(current_parts), "metadata": current_metadata}
            )

    for element in elements:
        text = element.text.strip() if element.text else ""
        if element.category == "ListItem":
            # A list item marks the heading of a new section: flush the
            # previous one, then start over with the heading as first part.
            _close_section()
            current_parts = [text] if text else []
            current_metadata = element.metadata
        elif element.category == "NarrativeText":
            if current_metadata is None:
                # Text that precedes the first heading still belongs to this file.
                current_metadata = element.metadata
            if text:
                current_parts.append(text)

    # Flush the trailing section; without this the last section of every
    # document would leak into the next file or be lost altogether.
    _close_section()
    return sections


chunks = []
# `file_number` is only a progress counter. It deliberately does not reuse the
# name `index`, which holds the Pinecone index handle.
for file_number, file_path in enumerate(files, start=1):
    print("File Number:", file_number)
    # NOTE(review): "swe" requests Swedish OCR data alongside English --
    # confirm that this is intended for these documents.
    elements = partition_pdf(file_path, languages=["eng", "swe"], strategy="ocr_only")
    # Section state lives inside the helper, so nothing carries over between files.
    chunks.extend(_split_into_sections(elements))


File Number: 1


In [103]:
# Count the distinct `metadata.filename` values found across all chunks.
source_filenames = {chunk['metadata'].filename for chunk in chunks}
len(source_filenames)

256

In [104]:
from langchain_core.documents import Document

# Wrap every chunk in a LangChain Document, keeping only the originating
# filename (under the "source" key) as metadata.
docs = []
for chunk in chunks:
    source_name = chunk['metadata'].filename
    docs.append(Document(page_content=chunk['page_content'], metadata={"source": source_name}))

In [1]:
from uuid import uuid4

# One random UUID per document, generated up front so that ids and documents
# line up by position.
uuids = [str(uuid4()) for _ in docs]
batch_size = 200
# Push the documents to the vector store `batch_size` at a time.
for start in range(0, len(docs), batch_size):
    print("Current Batch Index is:", start)
    stop = start + batch_size
    vector_store.add_documents(docs[start:stop], ids=uuids[start:stop])

NameError: name 'docs' is not defined

In [84]:
# Smoke-test retrieval: fetch the single most similar chunk for the query "Definitions".
res = vector_store.similarity_search("Definitions", k=1)

In [85]:
# Display the raw search result (a list of Document objects).
res

[Document(id='e13c1237-f9c9-4b08-9ccc-180d1367e4dc', metadata={'source': 'THE ACTING AS AGENTS OF MOALLIMS (PROHIBITION) ORDINANCE, 1980.pdf'}, page_content='2. Definitions. In this Ordinance, unless, there is anything repugnant in the subject or context,—(a) “intending pilgrim” means a person intending to proceed to Saudi Arabia to perform Haj under Haj Policy announced every year by the Federal Government; and(b) “Moallim” means a Moallim or Daleel, including a person or corporation, nominated and approved by the Government of the Kingdom of Saudi Arabia to act as Kafeel for Hajis during Haj season.')]

In [90]:
# Text of the top hit.
res[0].page_content

'2. Definitions. In this Ordinance, unless, there is anything repugnant in the subject or context,—(a) “intending pilgrim” means a person intending to proceed to Saudi Arabia to perform Haj under Haj Policy announced every year by the Federal Government; and(b) “Moallim” means a Moallim or Daleel, including a person or corporation, nominated and approved by the Government of the Kingdom of Saudi Arabia to act as Kafeel for Hajis during Haj season.'

In [91]:
# Filename of the PDF the top hit came from (stored under "source" when the Documents were built).
res[0].metadata['source']

'THE ACTING AS AGENTS OF MOALLIMS (PROHIBITION) ORDINANCE, 1980.pdf'

In [None]:
# Print the text of every chunk for manual inspection.
# Each chunk is a plain dict, so the text has to be read with a key lookup;
# attribute access (`chunk.page_content`) raises AttributeError on a dict.
for chunk in chunks:
    print(chunk["page_content"])