## ElasticSearch index

In [2]:
from elasticsearch import Elasticsearch

es = Elasticsearch(
    cloud_id=cloud_id,
    api_key=api_key
)

In [3]:
# Specify the index name
# index_name = "test_index_rec"
index_name = "test_index_nlp"

In [4]:
# Define index settings and mappings
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "vector": {"type": "dense_vector", "dims": 1536},  # Assuming 1536 dimensions for the embedding
            "text": {"type": "text"}, 
            "metadata": {"type" : "object",
                "properties" : {
                    "id" : {"type" : "text"},
                    "source" : {"type" : "text"},
                    "page" : {"type" : "integer"}
                }
            }
        }
    }
}

# Create the index
es.indices.create(index=index_name, body=index_settings)

# Verify if the index is created
if es.indices.exists(index=index_name):
    print(f"Index '{index_name}' created successfully.")
else:
    print(f"Failed to create index '{index_name}'.")

Index 'test_index_nlp' created successfully.


## Embedding

In [6]:
from langchain.docstore.document import Document
import fitz  # PyMuPDF

def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    # get only file name out of the file path
    file_name = pdf_path.split('\\')[-1]
    docs = []
    for page_num in range(doc.page_count):
        page = doc[page_num]
        docs.append(Document(page_content=page.get_text(), metadata={'page': page_num, 'source': file_name}))
    doc.close()

    return docs

def process_pdfs_in_folder(folder_path):
    all_docs = []
    for filename in os.listdir(folder_path):
        file_path = os.path.join(folder_path, filename)
        print(file_path)
        if os.path.isdir(file_path):
            # If it's a folder, call the function recursively
            process_pdfs_in_folder(file_path)
        elif filename.endswith(".pdf"):
            pdf_path = os.path.join(folder_path, filename)
            docs = extract_text_from_pdf(pdf_path)
            all_docs.append(docs)
    return all_docs


In [7]:
# Set your folder path, chunk size, and overlap
folder_path = "sample"

loaded_documents = process_pdfs_in_folder(folder_path)
# convert list of lists to a single list
loaded_documents = [item for sublist in loaded_documents for item in sublist]

print(f"Length of loaded pages: {len(loaded_documents)}")

sample\AFM_annualreport_2022.pdf
sample\mckinsey-tech-trends-outlook-2022-full-report.pdf
sample\mgi-reinventing-construction-a-route-to-higher-productivity-full-report.pdf
sample\Procter&Gamble_annualreport_2023.pdf
sample\the-state-of-organizations-2023.pdf
Length of loaded pages: 724


In [8]:
from langchain.text_splitter import SpacyTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter

# split docs in chunks
text_splitter = SpacyTextSplitter(
# text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=512,
    chunk_overlap=64
)

nlp_texts = text_splitter.split_documents(loaded_documents)

Created a chunk of size 558, which is longer than the specified 512
Created a chunk of size 730, which is longer than the specified 512
Created a chunk of size 710, which is longer than the specified 512
Created a chunk of size 846, which is longer than the specified 512
Created a chunk of size 659, which is longer than the specified 512
Created a chunk of size 832, which is longer than the specified 512
Created a chunk of size 614, which is longer than the specified 512
Created a chunk of size 671, which is longer than the specified 512
Created a chunk of size 628, which is longer than the specified 512
Created a chunk of size 741, which is longer than the specified 512
Created a chunk of size 531, which is longer than the specified 512
Created a chunk of size 626, which is longer than the specified 512
Created a chunk of size 591, which is longer than the specified 512
Created a chunk of size 559, which is longer than the specified 512
Created a chunk of size 764, which is longer tha

In [9]:
from langchain.vectorstores import FAISS
from langchain.embeddings.openai import OpenAIEmbeddings

embedding= OpenAIEmbeddings()

In [10]:
embeddings_vectors = FAISS.from_documents(nlp_texts,embedding)

## Ingest the index

In [13]:
# Delete all documents
es.delete_by_query(index=index_name, body={"query": {"match_all": {}}})

ObjectApiResponse({'took': 56, 'timed_out': False, 'total': 237, 'deleted': 237, 'batches': 1, 'version_conflicts': 0, 'noops': 0, 'retries': {'bulk': 0, 'search': 0}, 'throttled_millis': 0, 'requests_per_second': -1.0, 'throttled_until_millis': 0, 'failures': []})

In [14]:
# add to the elastic search each chunk with their embeddings, text and metadata
from tqdm import tqdm

page_count = 0
curr_doc = None
for i, doc in tqdm(enumerate(nlp_texts), total=len(nlp_texts), desc='Indexing Documents'):
    if curr_doc != doc.metadata['source']:
        page_count = 0
        curr_doc = doc.metadata['source']
    else:
        page_count += 1
    es.index(index=index_name, body={"vector": embeddings_vectors.index.reconstruct_n(i, 1)[0], 
                                     "text": doc.page_content, 
                                     "metadata": {"id":(doc.metadata['source']+"_"+str(page_count)),"source": doc.metadata['source'], "page": doc.metadata['page']}
                                     })


Indexing Documents: 100%|██████████| 4372/4372 [09:17<00:00,  7.85it/s]
