In [1]:
from azure.search.documents import SearchClient
from azure.search.documents.indexes.models import *
from azure.storage.blob import BlobServiceClient
from azure.core.credentials import AzureKeyCredential
from azure.search.documents.indexes import SearchIndexClient

## Load configuration from environment variables

In [2]:
import os
from dotenv import load_dotenv

# Read the .env file that sits one directory above the notebook.
load_dotenv('../.env')

# Azure AI Search settings: service name, index name and API key.
# Each value is None when the corresponding variable is not defined.
searchservice, index, searchkey = (
    os.getenv(name) for name in ('searchservice', 'index', 'searchkey')
)

# OpenAI API key used by the embedding model.
openai_key = os.getenv('openai_key')

# Path of the document to ingest.
filepath = os.getenv('filepath')

# Wrap the search key in the credential object expected by the search client.
search_creds = AzureKeyCredential(key=searchkey)

## Define helper classes and functions

In [6]:
class PDFExtractor:
    """Extracts the text of a PDF and records at which token each page starts."""

    @staticmethod
    def get_document_text(pdf_path):
        """
        Extracts text from pdf and returns text and page map

        The text of all pages is concatenated without any separator. The page
        map keys are token indices into ``all_text.split(' ')``, which is the
        indexing a space-based chunker uses to look up the page of a chunk.

        :param pdf_path: Location of pdf file
        :return: all_text: String of concatenated page text
        :return: page_map: dictionary mapping the index of the token in which
            a page starts to that page's zero-based page number
        """
        from pypdf import PdfReader

        offset = 0
        page_texts = []
        page_map = {}
        for page_num, page in enumerate(PdfReader(pdf_path).pages):
            # Defensive: treat a missing extraction result as an empty page.
            page_text = page.extract_text() or ""
            page_texts.append(page_text)
            page_map[offset] = page_num
            # Pages are joined without a separator, so the token index at
            # which the next page starts equals the number of spaces seen so
            # far. Using len(page_text.split(' ')) would over-count by one per
            # page and make the offsets drift away from the real token indices.
            offset += page_text.count(' ')
        return "".join(page_texts), page_map
    
class Chunker():
    """Splits text into overlapping chunks of separator-delimited tokens."""

    def __init__(
        self,
        chunk_size = 500,
        overlap = 50,
        separator = " "
    ):
        """
        :param chunk_size: Maximum number of tokens per chunk
        :param overlap: Number of tokens shared by two consecutive chunks
        :param separator: String used to split the text into tokens and to
            join the tokens of a chunk back together
        """
        self.chunk_size = chunk_size
        self.overlap = overlap
        self.separator = separator

    def split_text(self, text):
        """
        Splits text into tokens on the configured separator

        :param text: String to split
        :return: list of tokens
        """
        return text.split(self.separator)

    def _chunk_indices(self, num_tokens):
        """Returns the (start, end) token ranges for a text of num_tokens tokens."""
        step = self.chunk_size - self.overlap
        if step <= 0:
            raise ValueError("overlap must be smaller than chunk_size")
        # Stopping `overlap` tokens before the end avoids a trailing chunk
        # that only repeats the previous chunk's overlap region. The lower
        # bound of 1 guarantees that a text with no more tokens than
        # `overlap` still yields one chunk instead of being dropped.
        stop = max(num_tokens - self.overlap, 1)
        return [(start, start + self.chunk_size) for start in range(0, stop, step)]

    def get_chunk_indices(self, text):
        """
        Computes the token ranges of all chunks of text

        :param text: String to chunk
        :return: list of (start, end) token index pairs; end is exclusive and
            may exceed the number of tokens for the last chunk
        :raises ValueError: if overlap is not smaller than chunk_size
        """
        if not text:
            return []
        return self._chunk_indices(len(self.split_text(text)))

    def create_chunks(self, text, page_map):
        """
        Splits text into overlapping chunks and assigns each a page number

        :param text: String to chunk
        :param page_map: dictionary mapping the token index at which a page
            starts to its page number
        :return: list of (chunk_text, page_num) tuples, where page_num is the
            page on which the chunk's first token is located
        :raises ValueError: if overlap is not smaller than chunk_size, or if
            no page offset is less than or equal to a chunk's start index
        """
        from bisect import bisect_right

        if not text:
            return []

        # Split once and reuse the tokens for every chunk.
        tokens = self.split_text(text)
        page_offsets = sorted(page_map)

        chunks = []
        for start_idx, end_idx in self._chunk_indices(len(tokens)):
            # Rightmost page offset that is <= start_idx.
            pos = bisect_right(page_offsets, start_idx)
            if pos == 0:
                raise ValueError(
                    f"page_map has no page offset at or before token {start_idx}"
                )
            page_num = page_map[page_offsets[pos - 1]]
            chunk = self.separator.join(tokens[start_idx:end_idx])
            chunks.append((chunk, page_num))

        return chunks
    
class Embedder:
    """Wrapper around langchain's OpenAIEmbeddings for embedding text."""

    def __init__(
        self,
        key=None,
        model="text-embedding-ada-002"
    ):
        """
        :param key: API key forwarded to OpenAIEmbeddings as openai_api_key
        :param model: Name of the embedding model
        """
        from langchain_openai import OpenAIEmbeddings
        self.embedder = OpenAIEmbeddings(openai_api_key=key, model=model)

    def embed_in_batches(self, chunks, batch_size=16):
        """
        Embeds a sequence of texts, sending at most batch_size texts per call

        :param chunks: Sliceable sequence of texts to embed
        :param batch_size: Maximum number of texts per embed_documents call
        :return: list with one embedding per input text, in input order
        :raises ValueError: if batch_size is smaller than 1
        """
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")

        embeddings = []
        # Stepping through the start indices removes the need for math.ceil,
        # which relied on a `math` import that this notebook never performs.
        for start in range(0, len(chunks), batch_size):
            batch = chunks[start:start + batch_size]
            embeddings.extend(self.embedder.embed_documents(batch))
        return embeddings

    def embed_single_document(self, text):
        """
        Embeds a single text

        :param text: String to embed
        :return: the result of embed_documents for a one-element list, i.e. a
            list holding a single embedding (callers take element [0])
        """
        return self.embedder.embed_documents([text])
    
    
def create_sections(chunks, embedding_client=None, source_path=None):
    """
    Lazily builds one search document per chunk

    :param chunks: Iterable of (text, page_num) tuples
    :param embedding_client: Object exposing embed_single_document(text);
        defaults to the module-level ``embedder``
    :param source_path: Path of the source document; defaults to the
        module-level ``filepath``
    :return: generator of dicts with the keys id, content, embedding,
        sourcepage and sourcefile
    """
    # Fall back to the notebook globals so existing one-argument calls keep
    # working, while allowing the dependencies to be passed in explicitly.
    client = embedder if embedding_client is None else embedding_client
    path = filepath if source_path is None else source_path

    # basename handles the platform's path separators, unlike split('/').
    source_file = os.path.basename(path)

    for i, (text, page_num) in enumerate(chunks):
        yield {
            "id": str(i),
            "content": text,
            # embed_single_document returns a one-element list
            "embedding": client.embed_single_document(text)[0],
            "sourcepage": str(page_num),
            "sourcefile": source_file,
        }
        
def upload_documents(search_client, sections, total=None):
    """
    Uploads every section to the search index, one document per request

    Can refactor to upload in batches

    :param search_client: Client exposing upload_documents(documents=[...])
    :param sections: Iterable of documents to upload
    :param total: Optional number of documents, used only for the progress
        bar. When omitted it is taken from len(sections) if available;
        otherwise the progress bar is shown without a total.
    """
    from tqdm import tqdm

    if total is None:
        # Generators have no len(); do not rely on an unrelated global
        # variable to size the progress bar.
        try:
            total = len(sections)
        except TypeError:
            total = None

    for section in tqdm(sections, total=total):
        search_client.upload_documents(documents=[section])
    print("Documents uploaded")

In [7]:
if __name__ == "__main__":
    # End-to-end ingestion: extract the PDF text, split it into overlapping
    # chunks, embed each chunk and upload the resulting documents to the
    # search index.
    
    # create sections
    extractor = PDFExtractor()
    chunker = Chunker(chunk_size=500, overlap=100, separator=" ")
    # NOTE: create_sections, as called below, looks up the module-level name
    # `embedder`, so this variable name must not change.
    embedder = Embedder(key=openai_key)

    text, page_map = extractor.get_document_text(filepath)
    chunks = chunker.create_chunks(text, page_map)
    # create_sections is a generator: the embeddings are computed lazily,
    # while upload_documents iterates over `sections`.
    sections = create_sections(chunks)        
    
    # initialize client
    search_client = SearchClient(
        endpoint = f"https://{searchservice}.search.windows.net",
        index_name=index, 
        credential = search_creds
    )
    
    upload_documents(search_client, sections)

100%|██████████| 34/34 [00:09<00:00,  3.46it/s]

Documents uploaded



