In [1]:
import os

from azure.core.credentials import AzureKeyCredential
from azure.storage.blob import BlobServiceClient
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import *
from azure.search.documents import SearchClient

# NOTE: the relative order of the star imports and pandas is kept as-is so that
# any name shadowing between them behaves exactly as before.
from config import *
from utils.data_utils import *
import pandas as pd

In [2]:
# Credentials used by the rest of the notebook.
# NOTE(review): `searchkey` and `storagekey` are not assigned in the visible
# cells — presumably they arrive via `from config import *`; confirm.
search_creds = AzureKeyCredential(searchkey)
storage_creds = storagekey
# Read the OpenAI key from the environment instead of pasting it into the
# notebook; falls back to the empty string, which is what this cell used before.
openai_api_key = os.environ.get("OPENAI_API_KEY", "")

In [33]:
class PDFExtractor:
    """Extracts the text of a PDF together with a token-offset to page lookup."""

    @staticmethod
    def get_document_text(pdf_path):
        """
        Extracts text from a pdf and returns the text and a page map.

        Page texts are concatenated with no separator, so the last token of one
        page and the first token of the next fuse into a single token once the
        result is split on ' '. The page map accounts for that: every key is the
        index, within ``all_text.split(' ')``, of the token in which a page begins.

        :param pdf_path: Location of pdf file (handed to ``pypdf.PdfReader``)
        :return: all_text: String of concatenated page text
        :return: page_map: dictionary mapping token offset to 0-based page number;
                 when several pages share an offset (a page that contributes no
                 space character, e.g. an empty page) the last of them wins
        """
        from pypdf import PdfReader

        reader = PdfReader(pdf_path)
        page_texts = []
        page_map = {}
        offset = 0
        for page_num, page in enumerate(reader.pages):
            # Treat a page without extractable text as empty instead of failing.
            page_text = page.extract_text() or ""
            page_map[offset] = page_num
            page_texts.append(page_text)
            # Every ' ' starts exactly one new token in the concatenated text, so
            # the space count (not the per-page token count) is the right advance:
            # the page's first token continues the previous page's last token.
            offset += page_text.count(' ')
        # Join once at the end rather than growing a string inside the loop.
        return "".join(page_texts), page_map
    
class Chunker():
    """
    Splits text into fixed-size, overlapping windows of separator-delimited tokens.

    :param chunk_size: Number of tokens per chunk
    :param overlap: Number of tokens shared by two consecutive chunks
    :param separator: String used both to tokenize the text and to re-join the
                      tokens of a chunk
    :raises ValueError: if chunk_size < 1, overlap is outside [0, chunk_size),
                        or separator is empty
    """

    def __init__(
        self,
        chunk_size = 500,
        overlap = 50,
        separator = " "
    ):
        if chunk_size < 1:
            raise ValueError("chunk_size must be at least 1")
        # overlap == chunk_size would give a zero step and a larger overlap a
        # negative one; neither can ever advance through the text.
        if not 0 <= overlap < chunk_size:
            raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")
        if not separator:
            raise ValueError("separator must be a non-empty string")
        self.chunk_size = chunk_size
        self.overlap = overlap
        self.separator = separator

    def split_text(self, text):
        """
        Tokenizes text on the configured separator.

        :param text: String to split
        :return: list of tokens
        """
        # Use the same separator as create_chunks so indices and slices agree.
        return text.split(self.separator)

    def _window_bounds(self, token_count):
        """Returns the (start, end) token index pairs for ``token_count`` tokens."""
        step = self.chunk_size - self.overlap
        # A window starting at or after token_count - overlap would only repeat
        # tokens already covered by the previous window, so stop before it; the
        # lower bound of 1 keeps the first window even for very short texts.
        stop = max(token_count - self.overlap, 1)
        return [(start, start + self.chunk_size) for start in range(0, stop, step)]

    def get_chunk_indices(self, text):
        """
        Computes the token index range of every chunk.

        :param text: String to chunk
        :return: list of (start, end) tuples; end is exclusive and may exceed the
                 number of tokens (slicing clamps it). Empty text yields [].
        """
        if not text:
            return []
        return self._window_bounds(len(self.split_text(text)))

    def create_chunks(self, text, page_map):
        """
        Cuts text into chunks and tags each chunk with the page it starts on.

        :param text: String to chunk
        :param page_map: dictionary mapping the token index at which a page
                         starts to its page number; offsets are expected to be
                         counted with the same separator as this chunker uses
        :return: list of (chunk_text, page_num) tuples
        :raises ValueError: if no page_map key is <= a chunk's start index
        """
        if not text:
            return []

        # Tokenize once instead of re-splitting the whole text for every chunk.
        tokens = self.split_text(text)
        chunks = []
        for start_idx, end_idx in self._window_bounds(len(tokens)):
            # The page of a chunk is the page with the greatest start offset
            # that does not lie beyond the chunk's first token.
            offset = max((key for key in page_map if key <= start_idx), default=None)
            if offset is None:
                raise ValueError(
                    f"page_map has no page starting at or before token {start_idx}"
                )
            chunk = self.separator.join(tokens[start_idx:end_idx])
            chunks.append((chunk, page_map[offset]))

        return chunks
    
class Embedder:
    """
    Thin wrapper around ``langchain_openai.OpenAIEmbeddings``.

    :param key: OpenAI API key, forwarded as ``openai_api_key``
    :param model: Name of the embedding model
    """

    def __init__(
        self,
        key=None,
        model="text-embedding-ada-002"
    ):
        from langchain_openai import OpenAIEmbeddings
        self.embedder = OpenAIEmbeddings(openai_api_key=key, model=model)

    def embed_in_batches(self, chunks, batch_size=16):
        """
        Embeds a sequence of texts, sending at most ``batch_size`` per request.

        :param chunks: Sliceable sequence of items accepted by
                       ``embed_documents`` (presumably plain strings — confirm
                       against the caller)
        :param batch_size: Maximum number of items per ``embed_documents`` call
        :return: list with one embedding per input item, in input order
        :raises ValueError: if batch_size is smaller than 1
        """
        if batch_size < 1:
            raise ValueError("batch_size must be at least 1")

        embeddings = []
        # Stepping the range by batch_size visits each batch start directly and
        # needs no ceil() — the `math` module is not imported in this notebook.
        for start in range(0, len(chunks), batch_size):
            batch = chunks[start:start + batch_size]
            embeddings.extend(self.embedder.embed_documents(batch))
        return embeddings

    def embed_single_document(self, text):
        """
        Embeds one text.

        :param text: String to embed
        :return: the result of ``embed_documents([text])`` — a list holding the
                 single embedding, so callers index ``[0]`` to get the vector
        """
        return self.embedder.embed_documents([text])
    
    
def create_sections(chunks, embedding_client=None, file_path=None):
    """
    Lazily turns chunks into section dictionaries, one per chunk.

    This is a generator: each chunk is embedded only when its section is
    requested by the consumer.

    :param chunks: Iterable of (text, page_num) tuples
    :param embedding_client: Object exposing ``embed_single_document(text)``
                             whose result's first element is used as the
                             embedding; defaults to the module-level ``embedder``
    :param file_path: Path of the source document; defaults to the module-level
                      ``FILE_PATH``. Only the part after the last '/' is stored.
    :return: generator of dicts with the keys ``id``, ``content``,
             ``embedding``, ``sourcepage`` and ``sourcefile``
    """
    # Fall back to the notebook globals only when nothing was passed in, so the
    # original one-argument call keeps working.
    client = embedder if embedding_client is None else embedding_client
    source_path = FILE_PATH if file_path is None else file_path
    # The file name is the same for every section; compute it once.
    source_file = source_path.split('/')[-1]

    for i, (text, page_num) in enumerate(chunks):
        yield {
            "id":str(i),
            "content":text,
            "embedding":client.embed_single_document(text)[0],
            "sourcepage":str(page_num),
            "sourcefile":source_file
        }

In [34]:
# Build the pipeline components. `create_sections` looks up a module-level
# `embedder`, so it has to be created here for a fresh-kernel run to work.
extractor = PDFExtractor()
chunker = Chunker(chunk_size=500, overlap=100, separator=" ")
# An empty key is passed as None rather than as "".
embedder = Embedder(key=openai_api_key or None)

# Extract -> chunk -> sections. `sections` is a generator, so no embedding
# request is made until it is iterated.
text, page_map = extractor.get_document_text(FILE_PATH)
chunks = chunker.create_chunks(text, page_map)
sections = create_sections(chunks)

In [39]:
def create_search_index(index):
    """
    Creates the search index named ``index`` unless the service already lists it.

    The index is defined with five fields (``id`` as key, ``content``,
    ``embedding``, ``sourcepage``, ``sourcefile``), an HNSW vector search
    profile and one semantic configuration that prioritizes ``content``.
    Progress is reported with ``print``; nothing is returned.

    :param index: Name of the search index
    """
    print(f"Ensuring search index {index} exists")
    client = SearchIndexClient(
        endpoint=f"https://{searchservice}.search.windows.net/",
        credential=search_creds
    )

    # Guard clause: nothing to do when the service already knows the index.
    if index in client.list_index_names():
        print(f"Search index {index} already exists")
        return

    # The embedding field and the vector search settings must agree on these.
    profile_name = "mlops-vector-profile"
    algorithm_name = "mlops-vector-search-algo"

    # --- field definitions ---
    id_field = SimpleField(
        name="id",
        type=SearchFieldDataType.String,
        key=True,
        sortable=True,
        filterable=True,
        facetable=True
    )
    content_field = SearchableField(
        name="content",
        type=SearchFieldDataType.String,
        analyzer_name="en.microsoft"
    )
    embedding_field = SearchField(
        name="embedding",
        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
        searchable=True,
        # presumably the output size of the embedding model in use — confirm
        # whenever the model changes
        vector_search_dimensions=1536,
        vector_search_profile_name=profile_name
    )
    page_field = SimpleField(
        name="sourcepage",
        type=SearchFieldDataType.String,
        filterable=True,
        facetable=True
    )
    file_field = SimpleField(
        name="sourcefile",
        type=SearchFieldDataType.String,
        filterable=False,
        facetable=False
    )

    # --- vector search: one profile bound to one HNSW algorithm configuration ---
    vector_settings = VectorSearch(
        profiles=[
            VectorSearchProfile(
                name=profile_name,
                algorithm_configuration_name=algorithm_name
            )
        ],
        algorithms=[HnswAlgorithmConfiguration(name=algorithm_name)]
    )

    # --- semantic search: rank on the content field ---
    semantic_settings = SemanticSearch(
        configurations=[
            SemanticConfiguration(
                name="mlops-semantic-config",
                prioritized_fields=SemanticPrioritizedFields(
                    content_fields=[SemanticField(field_name="content")]
                )
            )
        ]
    )

    # --- assemble and submit the index definition ---
    definition = SearchIndex(
        name=index,
        fields=[id_field, content_field, embedding_field, page_field, file_field],
        vector_search=vector_settings,
        semantic_search=semantic_settings
    )
    created = client.create_or_update_index(definition)
    print(f"{created.name} created")
    

In [40]:
# Create the search index if the service does not list it yet.
# NOTE(review): `index` is not assigned in any visible cell — presumably it
# comes from `from config import *`; confirm before a fresh-kernel run.
create_search_index(index)

Ensuring search index mlops-rag exists
mlops-rag created
