In [36]:
import torch
import json
import os 
import pdfplumber

from InstructorEmbedding import INSTRUCTOR
from azure.core.credentials import AzureKeyCredential
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import (
    SimpleField,
    SearchFieldDataType,
    SearchableField,
    SearchField,
    VectorSearch,
    HnswAlgorithmConfiguration,
    VectorSearchProfile,
    SemanticConfiguration,
    SemanticPrioritizedFields,
    SemanticField,
    SemanticSearch,
    SearchIndex
)

from azure.search.documents import SearchClient
from azure.search.documents.models import VectorizedQuery


### Getting AZ Search Credentials

In [3]:
# Connection settings for the Azure AI Search service, read from the environment
# so that no secret is stored in the notebook.
#
# The original cell built CREDENTIALS from "vs-url" as well, i.e. it passed the
# endpoint URL as the API key. The key has to come from its own variable.
# NOTE(review): "vs-key" is an assumed variable name -- align it with the name
# actually used in your environment.
_missing_settings = [name for name in ("vs-url", "vs-key") if not os.getenv(name)]
if _missing_settings:
    # Fail here with a readable message instead of deep inside the Azure SDK.
    raise EnvironmentError(
        "Missing environment variable(s): " + ", ".join(_missing_settings)
    )

ENDPOINT = os.getenv("vs-url") ## vector search url
CREDENTIALS = AzureKeyCredential(os.getenv("vs-key")) ## vector search primary key


### Reading PDF

In [19]:
def pars_pdf(pdf_path: str) -> dict:
    """Extract the text of a PDF and package it as a search document.

    Args:
        pdf_path: Path of the PDF file. Forward slashes and backslashes are
            both accepted as directory separators.

    Returns:
        A dict with three keys:
          * ``id``      - the file name up to its first ``.``, without directories
          * ``title``   - the first non-blank line of the extracted text, or an
                          empty string when no text could be extracted
          * ``content`` - the text of all pages, joined with newlines
    """
    with pdfplumber.open(pdf_path) as pdf:
        # Guard against pages that yield no text (None or empty), which would
        # otherwise make the join below raise.
        page_texts = [page.extract_text() or "" for page in pdf.pages]

    # Join with a newline so the last line of one page is not fused with the
    # first line of the next one.
    text = "\n".join(page_texts)

    # Strip the directory part regardless of which separator the caller used;
    # the caller builds paths with os.path.join, which is "/" outside Windows.
    file_name = pdf_path.replace("\\", "/").rsplit("/", 1)[-1]

    # First non-blank line, with a default so an empty PDF does not raise.
    title = next((line for line in text.splitlines() if line.strip()), "")

    return {
        "id": file_name.split(".")[0],  ## file name will be id
        "title": title,  ## title
        "content": text,
    }

### Getting Embedding of PDF

In [5]:
# Location of the locally cached embedding model (the whole model object, saved with torch.save).
EMBEDDING_MODEL_PATH = 'embedding_model.pth'

if os.path.exists(EMBEDDING_MODEL_PATH):
    # SECURITY: torch.load unpickles arbitrary Python objects -- only load a
    # file that you created yourself.
    # weights_only=False is spelled out because the file holds a full model
    # object rather than a state dict; newer PyTorch releases default to
    # weights_only=True and would refuse to load it.
    model = torch.load(EMBEDDING_MODEL_PATH, weights_only=False)
else:
    # First run: the Hugging Face embedding model is not saved locally yet,
    # so fetch it and cache it for later kernel restarts.
    model = INSTRUCTOR('hkunlp/instructor-large')
    torch.save(model, EMBEDDING_MODEL_PATH)

In [6]:
def get_embedding(content: str) -> list:
    """Embed one piece of text with the notebook-level ``model``.

    Args:
        content: The text to embed.

    Returns:
        The result of ``model.encode`` for a one-element batch, flattened and
        converted to a plain Python list.
    """
    single_item_batch = [content]
    encoded = model.encode(single_item_batch)
    flat_vector = encoded.flatten()
    return flat_vector.tolist()

### Creating Index

In [57]:
# Client used to create / update indexes on the search service
index_client = SearchIndexClient(endpoint=ENDPOINT, credential=CREDENTIALS)

# Index schema, one definition per field.
id_field = SimpleField(
    name="id",
    type=SearchFieldDataType.String,
    key=True,
    sortable=True,
    filterable=True,
    facetable=True,
)
title_field = SearchableField(name="title", type=SearchFieldDataType.String)
content_field = SearchableField(name="content", type=SearchFieldDataType.String)

## make sure vector_search_profile_name is the same as the name you define later in VectorSearchProfile
content_vector_field = SearchField(
    name="contentVector",
    type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
    searchable=True,
    vector_search_dimensions=768,
    vector_search_profile_name="pdfHnswProfile",
)

fields = [id_field, title_field, content_field, content_vector_field]

In [58]:
# Vector search setup: one HNSW algorithm and one profile that refers to it
hnsw_algorithm = HnswAlgorithmConfiguration(name="pdfHnsw")
hnsw_profile = VectorSearchProfile(
    name="pdfHnswProfile",
    algorithm_configuration_name="pdfHnsw",  ## this name should match with algorithm name
)

vector_search = VectorSearch(
    algorithms=[hnsw_algorithm],
    profiles=[hnsw_profile],
)

In [59]:

# Semantic configuration: "title" is the only prioritized field
title_semantic_field = SemanticField(field_name="title")
prioritized_fields = SemanticPrioritizedFields(title_field=title_semantic_field)
semantic_config = SemanticConfiguration(
    name="my-semantic-config",
    prioritized_fields=prioritized_fields,
)

# Semantic settings object that carries the configuration above
semantic_search = SemanticSearch(configurations=[semantic_config])

In [60]:
# Assemble the index definition (fields, vector search and semantic settings)
# and create it on the service, or update it if it already exists.
index_name = "pdf_index"
index = SearchIndex(
    name=index_name,
    fields=fields,
    vector_search=vector_search,
    semantic_search=semantic_search,
)

result = index_client.create_or_update_index(index)
print(f' {result.name} created')

 pdf_index created


### Upload documents

In [61]:
# One client serves the whole batch: it does not depend on the file being
# processed, so it is created once instead of once per iteration. It keeps the
# name `search_client` because the vector-search cell below reuses it.
search_client = SearchClient(endpoint=ENDPOINT, index_name=index_name, credential=CREDENTIALS)

for file_name in ["10001727.pdf", "10176815.pdf", "10399912.pdf", "11152490.pdf"]:
    ## parse pdf
    pdf_dict = pars_pdf(os.path.join("data", "input", file_name))

    ## vectorize data
    vec_data = {
        'id': pdf_dict['id'],
        'title': pdf_dict['title'],
        'content': pdf_dict['content'],
        'contentVector': get_embedding(pdf_dict["content"]),
    }

    ## upload to Azure vector store
    # upload_documents takes a list of documents and returns one indexing
    # result per document.
    result = search_client.upload_documents(documents=[vec_data])

    # Only report success when the service accepted the document; otherwise
    # show the reason instead of a misleading "Uploaded".
    failed = [item for item in result if not item.succeeded]
    if failed:
        for item in failed:
            print(f"{file_name} FAILED (key={item.key}): {item.error_message}")
    else:
        print(f"{file_name} Uploaded")

10001727.pdf Uploaded
10176815.pdf Uploaded
10399912.pdf Uploaded
11152490.pdf Uploaded


### Vector Search

In [84]:
# Pure Vector Search
query = "Resume of aviation engineer"

# The query text is embedded with the same helper used for the documents
embedding = get_embedding(query)
vector_query = VectorizedQuery(
    vector=embedding,
    k_nearest_neighbors=5,
    fields="contentVector",
)

## searching for vectorized query
search_options = {
    "search_text": None,  ## None means we are not searching text at all
    "vector_queries": [vector_query],
    "select": ["id", "title", "content"],
}
results = search_client.search(**search_options)

In [85]:
# Show id, title and similarity score of every hit, followed by two blank lines
for result in results:
    print(
        f"id = {result['id']}",
        f"title = {result['title']}",
        f"Score: {result['@search.score']}",
        "\n",
        sep="\n",
    )

id = 10176815
title = AVIATION ELECTRONICS TECHNICIAN
Score: 0.9150263


id = 10399912
title = HR PERSONNEL ASSISTANT
Score: 0.86070436


id = 10001727
title = SOUS CHEF
Score: 0.85298055


id = 11152490
title = DEPUTY PRINCIPAL
Score: 0.844396


