In [25]:
import base64
import csv
import os
import re

from dotenv import load_dotenv, find_dotenv
from azure.core.credentials import AzureKeyCredential
from azure.core.exceptions import HttpResponseError
from azure.search.documents import SearchClient
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import *
from azure.search.documents.models import VectorizedQuery
import openai
from openai import AzureOpenAI

In [26]:
# Load environment settings from the .env file (a single call is sufficient).
load_dotenv()

# Fail early, and by name, when a required Azure AI Search setting is absent;
# otherwise the SDK clients below fail later with far less helpful errors.
_missing_settings = [
    name
    for name in (
        "AZURE_SEARCH_SERVICE_ENDPOINT",
        "AZURE_SEARCH_INDEX_KEY",
        "AZURE_SEARCH_INDEX_NAME_TEXT",
    )
    if not os.getenv(name)
]
if _missing_settings:
    raise EnvironmentError(
        "Missing required environment variables: " + ", ".join(_missing_settings)
    )

# Azure AI Search index settings
# (read directly: wrapping the value in an f-string would turn a missing
# endpoint into the literal string "None").
service_endpoint = os.getenv("AZURE_SEARCH_SERVICE_ENDPOINT")
index_creds = AzureKeyCredential(os.getenv("AZURE_SEARCH_INDEX_KEY"))
index_name = os.getenv("AZURE_SEARCH_INDEX_NAME_TEXT")
image_index_name = "images-search-without-keywords"

# Clients for querying the text index and the image index
search_client = SearchClient(endpoint=service_endpoint, index_name=index_name, credential=index_creds)
image_search_client = SearchClient(endpoint=service_endpoint, index_name=image_index_name, credential=index_creds)
# Client for creating / updating index definitions
index_client = SearchIndexClient(service_endpoint, index_creds)

# Azure OpenAI settings for the module-level `openai` client
openai.api_type = "azure"
openai.api_key = os.getenv("OPENAI_API_KEY")
openai.azure_endpoint = os.getenv("OPENAI_API_ENDPOINT")
openai.api_version = os.getenv("OPENAI_API_VERSION")

# Explicit Azure OpenAI client, configured from the same environment variables
azure_openai_client = AzureOpenAI(
    api_key = os.getenv("OPENAI_API_KEY"),
    api_version = os.getenv("OPENAI_API_VERSION"),
    azure_endpoint = os.getenv("OPENAI_API_ENDPOINT")
)

In [27]:
def get_embedding(text, model="textembedding"):  # model is the deployment name; do NOT change it
    """Return the embedding vector for ``text``.

    Newlines in ``text`` are replaced with spaces, the result is sent as a
    single-element input to ``azure_openai_client.embeddings.create``, and the
    embedding of the first (only) returned item is handed back.

    Args:
        text: The string to embed.
        model: Deployment name passed through as ``model``.

    Returns:
        The ``embedding`` attribute of the first entry in the response data.
    """
    single_line = text.replace("\n", " ")
    response = azure_openai_client.embeddings.create(input=[single_line], model=model)
    first_item = response.data[0]
    return first_item.embedding

In [28]:
# Build one search document per CSV row.
# Columns are read by position: 0 = chapter, 1 = section, 2 = paragraph, 3 = content.
# The document key is "<chapter>-<section>-<paragraph>".
sections = []
# errors='ignore' silently drops bytes that are not valid UTF-8.
with open('ch4to6.csv', 'rt', newline='', encoding='utf-8', errors='ignore') as csvfile:
    csvreader = csv.reader(csvfile)
    for item in csvreader:
        if not item:
            # csv.reader yields an empty list for a blank line; nothing to index.
            continue
        if len(item) < 4:
            # Report the offending line instead of failing with a bare IndexError.
            raise ValueError(
                f"ch4to6.csv line {csvreader.line_num}: expected at least 4 columns, got {len(item)}"
            )
        chapter, section_no, paragraph, content = item[:4]
        sections.append({
            "id": f"{chapter}-{section_no}-{paragraph}",
            "Chapter": chapter,
            "Section": section_no,
            "Paragraph": paragraph,
            "Content": content,
            # One embedding request per row.
            "Embedding": get_embedding(content),
        })
print(f"Finished Indexing: {len(sections)} items in total")


Finished Indexing: 111 items in total


In [None]:
# Schema of the text index: a string key, four text fields that share the same
# settings, and one vector field bound to an HNSW vector-search profile.
text_field_names = ["Chapter", "Section", "Paragraph", "Content"]
text_fields = [
    SearchableField(
        name=field_name,
        type="Edm.String",
        analyzer_name="standard.lucene",
        filterable=True,
        sortable=True,
        facetable=True,
        searchable=True,
    )
    for field_name in text_field_names
]

embedding_field = SearchField(
    name="Embedding",
    type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
    hidden=False,
    searchable=True,
    filterable=False,
    sortable=False,
    facetable=False,
    vector_search_dimensions=1536,
    vector_search_profile_name="my-vector-config",
)

# The profile name referenced by the vector field maps to the HNSW algorithm config.
vector_search_settings = VectorSearch(
    profiles=[
        VectorSearchProfile(
            name="my-vector-config",
            algorithm_configuration_name="my-hnsw",
        )
    ],
    algorithms=[HnswAlgorithmConfiguration(name="my-hnsw")],
)

index = SearchIndex(
    name=index_name,
    fields=[
        SimpleField(name="id", type="Edm.String", key=True),
        *text_fields,
        embedding_field,
    ],
    vector_search=vector_search_settings,
)

index_client.create_or_update_index(index)

In [None]:
# Upload the prepared documents to the text index and report the outcome.
# NOTE: `sections` must still hold the document dicts built from the CSV; the
# query cell further down rebinds the same name to a list of section ids.
print("Uploading")  # announce before the (blocking) upload call, not after it
results = search_client.upload_documents(documents=sections)
succeeded = sum(1 for res in results if res.succeeded)
print(f"Indexed {len(results)} sections, {succeeded} succeeded")
# Name every document that was rejected so failures can be acted on.
for res in results:
    if not res.succeeded:
        print(f"Failed: {res.key}: {res.error_message}")

In [29]:
# Embed the question and run a pure vector search (no text query) against the
# "Embedding" field, keeping the 7 nearest paragraphs.
query = "Who is Sun Yat-sen? How is he related to HKU?" #your query keywords
query_vector = get_embedding(query)

r = search_client.search(
    search_text=None,
    top=7,
    vector_queries=[VectorizedQuery(
        vector=query_vector,
        fields="Embedding"
    )]
)

# NOTE: this rebinds `sections` (earlier: the documents built from the CSV) to
# the distinct "<chapter>-<section>" ids of the hits, in ranking order.
sections = []
search_results = []
for result in r:
    print("#########################################")
    print("Source: " + result["id"])
    print("Content: " + result["Content"])
    # Ids look like "<chapter>-<section>-<paragraph>". Drop only the final
    # "-<paragraph>" component; slicing off two characters breaks as soon as
    # the paragraph number has two or more digits ("2-8-10" -> "2-8-").
    section_id = result["id"].rsplit("-", 1)[0]
    if section_id not in sections:
        sections.append(section_id)

#########################################
Source: 2-8-3
Content: Before establishing himself in Hong Kong for his further studies Sun Yat-sen had made contact with many young radical reformers who belonged to the Sanhohui, a large anti-Manchu triad in Canton, and brought to Hong Kong with him his first half-formed thoughts of Chinese reform. Chan Siu-pak, of whom we shall hear more, the son of an enlightened father who was also a Christian convert, was the first student to be enrolled in the Canton Christian College as a youth of eighteen when it was first founded in March 1888. He had heard with interest of Sun's developing ideas on reform, and paying him a visit at the College of Medicine in Hong Kong became so impressed that he dropped out of the Christian College and enrolled in the College of Medicine in order to be in daily contact with Sun, leaving with Sun but without completing the course when Sun graduated in 1892. In the College of Medicine Chan joined Sun and two other refo

In [30]:
# Getting the total number of documents in the index
# search_client.get_document_count()

# Get neighboring documents (the whole section) for every section that was hit.
# Paragraphs are fetched by key "<section>-1", "<section>-2", ... and the walk
# stops at the first paragraph number that does not exist in the index.
search_results = []  # reset so re-running this cell does not duplicate sources
for _section_ in sections:
    i = 1
    while True:
        try:
            doc = search_client.get_document(key=f"{_section_}-{i}")
        except HttpResponseError as err:
            if err.status_code == 404:
                # No such paragraph: the end of this section has been reached.
                break
            # Anything else (auth, throttling, service error) is a real failure
            # and must not be mistaken for "end of section".
            raise
        print(doc["id"])
        search_results.append("Source: " + doc["id"] + "; Content: " + doc["Content"])
        i += 1

2-8-1
2-8-2
2-8-3
2-8-4
2-8-5
2-8-6
2-8-7
2-8-8
2-8-9
2-8-10
2-6-1
2-6-2
2-6-3
2-6-4
3-5-1
3-8-1
3-8-2
3-8-3
3-8-4


In [None]:
# System prompt for the answering model. The keyword line format is spelled out
# explicitly because the response is later parsed by taking its last line and
# stripping a "Keywords: " prefix.
systemMessage = """AI Assistant that helps user to answer questions from sources provided. Be specific in your answers.
                    Answer ONLY with the facts listed in the list of sources below.
                    After answering the user questions, start a new line formatted exactly as "Keywords: keyword1, keyword2, keyword3" giving 3 keywords (names, places, etc.) of your response. Do NOT give keywords "HKU", "The University of Hong Kong", "Hong Kong".
                    If there isn't enough information below, say you don't know. Do not generate answers that don't use the sources below. 
                    Each source has a name followed by colon and the actual information, always include the source name for each fact you use in the response. 
                    Use square brackets to reference the source, e.g. [info1.txt]. Don't combine sources, list each source separately, e.g. [info1.txt][info2.pdf].
                """

# The user turn carries the question followed by every retrieved source,
# each already formatted as "Source: <id>; Content: <text>".
messages = [
    {'role' : 'system', 'content' : systemMessage},
    {'role' : 'user', 'content' : query + "   Source:" + " ".join(search_results)}
]

In [None]:
def extract_keywords(chat_text):
    """Return the keywords listed on the last non-empty line of ``chat_text``.

    A leading "Keywords:" label (any casing) is removed, the rest is split on
    commas, and surrounding whitespace and empty entries are dropped.

    Args:
        chat_text: The model's reply; ``None`` or an empty string is accepted.

    Returns:
        A list of keyword strings; empty when there is no usable line.
    """
    non_empty_lines = [line.strip() for line in (chat_text or "").splitlines() if line.strip()]
    if not non_empty_lines:
        return []
    keyword_line = re.sub(r"^keywords\s*:\s*", "", non_empty_lines[-1], flags=re.IGNORECASE)
    return [kw.strip() for kw in keyword_line.split(",") if kw.strip()]


# Use the same explicitly configured Azure client that produces the embeddings.
chat_completion = azure_openai_client.chat.completions.create(
    model="summer", # Do not edit this. model="deployment_name"
    messages=messages,
    temperature=0.7,
    max_tokens=1024,
    n=1)

chat_content = chat_completion.choices[0].message.content
keywords = extract_keywords(chat_content)
print(chat_content)
print(keywords)
if not keywords:
    # Stop here with a clear message rather than searching for images with nothing.
    raise ValueError("The model response did not contain a keyword line")

In [None]:
# For each keyword, run a full-text search on the image index and collect the
# names of the top 3 images. `image_search_results` ends up with one inner list
# per keyword, in keyword order (an inner list is empty when nothing matched).
image_search_results = []

for image_search_query in keywords:
    # Full text search
    image_r = image_search_client.search(
        search_text=image_search_query,
        minimum_coverage=100,
        top=3
    )

    print('##############################################')
    image_search_keyword_result = []
    print("Query: " + image_search_query)
    print()
    for result in image_r:
        # f-strings keep the printout working even if a field is missing its text.
        print(f"Source: {result['Image_name']}")
        print(f"Caption: {result['Caption']}")
        image_search_keyword_result.append(result["Image_name"])
        print("//////////////////////////////////////////////")
    image_search_results.append(image_search_keyword_result)

In [None]:
# Show the collected image names: one inner list per keyword, in keyword order.
print(image_search_results)

In [None]:
# Find the most common image in image_search_results
import itertools

# image_search_results holds one list of image names per keyword. Flatten it so
# that individual image names are counted, not whole per-keyword lists.
images = list(itertools.chain.from_iterable(image_search_results))
# Ties go to the image that appears first; None when no search returned an image.
max(images, key=images.count) if images else None