# RAG Demo (24summer)

### 1. Install dependencies

In [None]:
# Upgrade pip, then restart the kernel before running the next cell
%pip install --upgrade pip

In [None]:
# Install the packages listed in requirements.txt; restart the kernel after installing
%pip install -r requirements.txt

### 2. Setup the environment
We use two Microsoft Azure APIs:
- Azure AI Search
- Azure OpenAI

In [None]:
import os
import base64
import re
from dotenv import load_dotenv
from pypdf import PdfReader, PdfWriter
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import *
from azure.search.documents.models import VectorizedQuery
import openai
from openai import AzureOpenAI

In [None]:
# Get Environment settings from .env file
load_dotenv()


def _require_env(name):
    """Return the value of environment variable `name`.

    Raises:
        RuntimeError: if the variable is unset or empty. Failing here names the
            missing setting instead of passing a placeholder on to the Azure
            clients, where the resulting error is much harder to read.
    """
    value = os.getenv(name)
    if not value:
        raise RuntimeError(
            f"Missing required setting '{name}': define it in your .env file or environment"
        )
    return value


# Azure AI Search Index Settings
service_endpoint = _require_env("AZURE_SEARCH_SERVICE_ENDPOINT")
index_creds = AzureKeyCredential(_require_env("AZURE_SEARCH_INDEX_KEY"))
index_name = _require_env("AZURE_SEARCH_INDEX_NAME")

## Client for uploading documents to, and querying, the index named `index_name`
search_client = SearchClient(endpoint=service_endpoint, index_name=index_name, credential=index_creds)
## Client used to create the index itself
index_client = SearchIndexClient(service_endpoint, index_creds)

# Azure Openai Settings
# Module-level configuration: used by the `openai.chat.completions.create(...)`
# calls in the chat cells further down.
openai.api_type = "azure"
openai.api_key = os.getenv("OPENAI_API_KEY")
openai.azure_endpoint = os.getenv("OPENAI_API_ENDPOINT")
openai.api_version = os.getenv("OPENAI_API_VERSION")

# Explicit client object: used by compute_embedding() for the embedding requests.
azure_openai_client = AzureOpenAI(
    api_key = os.getenv("OPENAI_API_KEY"),
    api_version = os.getenv("OPENAI_API_VERSION"),
    azure_endpoint = os.getenv("OPENAI_API_ENDPOINT")
)

### 3. Create Search Index
`name`: a unique name for the Index.
- Think of the index as a **directory**/**folder**

`fields`: a list of fields that will be fed into the index.
- Think of a field as a **tag**
- *MUST CONTAIN A KEY FIELD:* a string field that acts as the unique **identifier** (an ID for each document stored in the index). **Document IDs are case sensitive.**
- `SimpleField`
- `SearchableField` must have a specified analyzer
- `SearchField`

`vector_search`: the vector search method for the index.

Note: Every index must have a different name, so re-running this cell with an index name that already exists may lead to an error.

In [None]:
# Index schema: a key, the searchable section text, its embedding vector and
# two filterable/facetable fields recording where the section came from.
fields = [
    # The Key Field
    SimpleField(name="id", type="Edm.String", key=True),
    SearchableField(name="content", type="Edm.String", analyzer_name="standard.lucene"),
    # Vector field; it is tied to the vector search profile defined below by name.
    SearchField(
        name="embedding",
        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
        hidden=False,
        searchable=True,
        filterable=False,
        sortable=False,
        facetable=False,
        vector_search_dimensions=1536,
        vector_search_profile_name="my-vector-config",
    ),
    SimpleField(name="sourcepage", type="Edm.String", filterable=True, facetable=True),
    SimpleField(name="sourcefile", type="Edm.String", filterable=True, facetable=True),
]

# Vector search setup: one profile ("my-vector-config") that points at one
# HNSW algorithm configuration ("my-hnsw").
vector_search = VectorSearch(
    profiles=[
        VectorSearchProfile(name="my-vector-config", algorithm_configuration_name="my-hnsw"),
    ],
    algorithms=[
        HnswAlgorithmConfiguration(name="my-hnsw"),
    ],
)

index = SearchIndex(name=index_name, fields=fields, vector_search=vector_search)

result = index_client.create_index(index)
# Alternative that updates an index that already exists:
#   result = index_client.create_or_update_index(index, allow_index_downtime=True)

### 4. Index the document
Think of this as Slicing the documents.

To test out your own PDF docs, add them to the `data` folder and change the file name in the cell below.

In [None]:
# Path of the PDF to index.
filename = (
    "./data/"                       # folder that holds the documents
    + "The_Innovation_Wings.pdf"    # change to the name of your file (the name must not include any space)
)

def compute_embedding(text, model="textembedding"):  # model = deployment name; do NOT change the default
    """Return the embedding of `text` computed by the Azure OpenAI embeddings endpoint.

    Newlines in `text` are replaced by spaces before the request is sent.

    Args:
        text: the string to embed.
        model: passed through as the `model` argument of the embeddings request.

    Returns:
        The `embedding` attribute of the first item in the response's `data`.
    """
    single_line = text.replace("\n", " ")
    response = azure_openai_client.embeddings.create(input=[single_line], model=model)
    return response.data[0].embedding

# Running character count from the start of the document
offset = 0
# One tuple per page: (page_num, offset, page_text)
page_map = []

print(f"Extracting text from '{filename}' using PdfReader")

reader = PdfReader(filename)
pages = reader.pages
for page_num, page in enumerate(pages):
    page_text = page.extract_text()
    # Record where this page starts in the concatenated text, then advance
    # the running offset past it.
    page_map.append((page_num, offset, page_text))
    offset = offset + len(page_text)

In [None]:
# Chunking parameters read by split_text(); all three are character counts.
# Target length of a section before its end is moved to a sentence/word boundary.
MAX_SECTION_LENGTH = 1000
# How far past the target length split_text() keeps looking for a sentence ending.
SENTENCE_SEARCH_LIMIT = 100
# The next section starts this many characters before the previous section's end
# (before its start is moved to a sentence/word boundary).
SECTION_OVERLAP = 100


def filename_to_id(filename):
    """Build an identifier string from a file name.

    The result has the form ``file-<sanitised>-<hex>`` where ``<sanitised>`` is
    `filename` with every character other than ASCII letters, digits, ``_`` and
    ``-`` replaced by ``_``, and ``<hex>`` is the base16 encoding of the UTF-8
    bytes of the unmodified `filename`.
    """
    sanitised = re.sub("[^0-9a-zA-Z_-]", "_", filename)
    raw_bytes = filename.encode("utf-8")
    hex_suffix = base64.b16encode(raw_bytes).decode("ascii")
    return "-".join(("file", sanitised, hex_suffix))

def split_text(page_map):
    """Split the concatenated page text into overlapping sections.

    The text of all pages is joined into one string and cut into sections of
    roughly MAX_SECTION_LENGTH characters. Each cut is moved to a nearby
    sentence ending when one is found within the search limit, otherwise to the
    last word break seen. Consecutive sections overlap by about SECTION_OVERLAP
    characters.

    Args:
        page_map: list of (page_num, offset, page_text) tuples, where `offset`
            is the position of the page's first character in the concatenated
            text.

    Yields:
        (section_text, page_index) tuples; `page_index` is the index into
        `page_map` of the page that contains the first character of the section.
    """
    SENTENCE_ENDINGS = [".", "!", "?"]
    # Fix: the list used to end with a second " " entry, so a line break was
    # never treated as a word boundary. The duplicate is replaced by "\n".
    WORDS_BREAKS = [",", ";", ":", " ", "(", ")", "[", "]", "{", "}", "\t", "\n"]

    def find_page(offset):
        """Return the index in page_map of the page whose text contains `offset`."""
        l = len(page_map)
        for i in range(l - 1):
            if offset >= page_map[i][1] and offset < page_map[i + 1][1]:
                return i
        # Not before the start of any later page: it belongs to the last page.
        return l - 1

    all_text = "".join(p[2] for p in page_map)
    length = len(all_text)
    start = 0
    end = length
    while start + SECTION_OVERLAP < length:
        last_word = -1
        end = start + MAX_SECTION_LENGTH

        if end > length:
            end = length
        else:
            # Try to find the end of the sentence
            while end < length and (end - start - MAX_SECTION_LENGTH) < SENTENCE_SEARCH_LIMIT and all_text[end] not in SENTENCE_ENDINGS:
                if all_text[end] in WORDS_BREAKS:
                    last_word = end
                end += 1
            if end < length and all_text[end] not in SENTENCE_ENDINGS and last_word > 0:
                end = last_word # Fall back to at least keeping a whole word
        if end < length:
            # Include the boundary character itself in the section
            end += 1

        # Try to find the start of the sentence or at least a whole word boundary
        last_word = -1
        while start > 0 and start > end - MAX_SECTION_LENGTH - 2 * SENTENCE_SEARCH_LIMIT and all_text[start] not in SENTENCE_ENDINGS:
            if all_text[start] in WORDS_BREAKS:
                last_word = start
            start -= 1
        if all_text[start] not in SENTENCE_ENDINGS and last_word > 0:
            start = last_word
        if start > 0:
            # Skip the boundary character so the section begins just after it
            start += 1

        section_text = all_text[start:end]
        yield (section_text, find_page(start))

        last_table_start = section_text.rfind("<table")
        if (last_table_start > 2 * SENTENCE_SEARCH_LIMIT and last_table_start > section_text.rfind("</table")):
            # If the section ends with an unclosed table, we need to start the next section with the table.
            # If table starts inside SENTENCE_SEARCH_LIMIT, we ignore it, as that will cause an infinite loop for tables longer than MAX_SECTION_LENGTH
            # If last table starts inside SECTION_OVERLAP, keep overlapping
            start = min(end - SECTION_OVERLAP, start + last_table_start)
        else:
            start = end - SECTION_OVERLAP

    # Emit whatever is left between the last computed start and end, if it is
    # longer than the overlap.
    if start + SECTION_OVERLAP < end:
        yield (all_text[start:end], find_page(start))

# Organizing documents to upload to the Index
sections = []
file_id = filename_to_id(filename)
# File name without folder or extension; it is the same for every section.
source_stem = os.path.splitext(os.path.basename(filename))[0]
for i, (content, pagenum) in enumerate(split_text(page_map)):
    # One document per section. `i` is the running section number (used in the
    # id); `pagenum` is zero-based, so 1 is added for the source page label.
    sections.append({
        "id": f"{file_id}-page-{i}",
        "content": content,
        "embedding": compute_embedding(content),
        "sourcepage": source_stem + f"-{pagenum + 1}",
        "sourcefile": filename,
    })

In [None]:
# Upload the sections in consecutive slices of at most 1000 documents each;
# the last slice holds whatever remains.
batch_size = 1000
for first in range(0, len(sections), batch_size):
    batch = sections[first:first + batch_size]
    results = search_client.upload_documents(documents=batch)
    succeeded = sum(1 for r in results if r.succeeded)
    print(f"\tIndexed {len(results)} sections, {succeeded} succeeded")

We have completed all the Preparations for RAG. 

We have built an AI search index \(database\) for **Retrieving relevant information**, which will be used in Section 5.2.

### 5. Try Out RAG

ChatGPT is implemented for **Text Generation**.

Test out Section 5.1 first to see what it can do!

#### 5.1 **DEMO:** ChatGPT on its own \(Without RAG\)
Try asking ChatGPT some questions!

In [None]:
# Change the query to what you want to ask ChatGPT
query = "What is the capital city of India?"

# The conversation is a single user turn: no system message, no retrieved sources.
messages = [dict(role="user", content=query)]

chat_completion = openai.chat.completions.create(
    messages=messages,
    model="summer",  # Do not edit this. model="deployment_name"
    n=1,
    max_tokens=1024,
    temperature=0.7,
)

# Left as the last expression so the notebook displays the reply as the cell output.
chat_content = chat_completion.choices[0].message.content
chat_content

Now modify the role of ChatGPT through prompts. You can:
- Add a system message
- Add few-shot examples

In [None]:
# Change the query to what you want to ask ChatGPT
query = "Write a poem about university life"

# Change systemMessage to describe how you want ChatGPT to behave.
# Everything between the triple quotes, including the leading spaces on the
# continuation lines, is sent to the model as written.
systemMessage = '''You are a Shakespearean writing assistant who speaks in a Shakespearean style. 
                    You help people come up with creative ideas and content like stories, poems, and songs that use Shakespearean style of writing style, including words like "thou" and "hath”.
                    Here are some example of Shakespeare's style:
                    - Romeo, Romeo! Wherefore art thou Romeo?
                    - Love looks not with the eyes, but with the mind; and therefore is winged Cupid painted blind.
                    - Shall I compare thee to a summer's day? Thou art more lovely and more temperate.'''

# Conversation sent to the model: the system message, one example exchange
# (a user question followed by the assistant answer we want imitated), and
# finally the real query.
messages = [
    {'role' : 'system', 'content' : systemMessage},
    # Change the content here to your example question
    {'role' : 'user', 'content' : 'Please write a short text turning down an invitation to dinner.'},
    # Change the content here to your example answer
    {'role' : 'assistant', 'content' : '''Dearest,
                                        Regretfully, I must decline thy invitation.
                                        Prior engagements call me hence. Apologies.'''},
    {'role' : 'user', 'content' : query }
]

chat_completion = openai.chat.completions.create(
    model="summer", # Do not edit this. model="deployment_name"
    messages=messages, 
    temperature=0.7, 
    max_tokens=1024, 
    n=1)

# n=1 requests a single completion, so the reply is the first (only) choice.
chat_content = chat_completion.choices[0].message.content
print(chat_content)

#### 5.2 Implementing a RAG model

Obtain related information using Vector Search

In [None]:
query = "What is SIG?" # your query keywords
# Embed the query with the same function that produced the "embedding" value of
# every indexed section, so the vector search below can compare the two.
query_vector = compute_embedding(query)

def nonewlines(s: str) -> str:
    """Return `s` with every newline and carriage return replaced by a space.

    Fix: the first replacement used to swap a space for a space (a no-op), so
    newlines were left in place; it now targets the newline character.
    Each character is replaced individually, so a "\\r\\n" pair becomes two
    spaces.
    """
    return s.replace('\n', ' ').replace('\r', ' ')

# Pure vector query (no search text): ask for the 3 sections whose "embedding"
# is closest to the query vector.
r = search_client.search(
    search_text=None,
    top=3,
    vector_queries=[VectorizedQuery(
        vector=query_vector,
        fields="embedding"
    )]
)

# Fix: read the paged result into a list once. Iterating `r` directly in the
# comprehension consumed it, so the print loop below had nothing left to show.
retrieved_docs = list(r)

# "<sourcepage>: <content>" strings; these are handed to the chat model as
# sources in the next cell.
results = [doc["sourcepage"] + ": " + nonewlines(doc["content"]) for doc in retrieved_docs]

for result in retrieved_docs:
    print("Source: " + result["sourcepage"])
    print(nonewlines(result["content"]))

Query GPT with **Query** + **Retrieved Information**

In [None]:
# Change systemMessage to describe how you want ChatGPT to behave.
# Everything between the triple quotes, including the leading spaces on the
# continuation lines, is sent to the model as written.
systemMessage = """AI Assistant that helps user to answer questions from sources provided. Be brief in your answers.
                    Answer ONLY with the facts listed in the list of sources below. 
                    If there isn't enough information below, say you don't know. Do not generate answers that don't use the sources below. 
                    Each source has a name followed by colon and the actual information, always include the source name for each fact you use in the response. 
                    Use square brackets to reference the source, e.g. [info1.txt]. Don't combine sources, list each source separately, e.g. [info1.txt][info2.pdf].
                """

# The user turn carries the question followed by the retrieved sources
# (the "<sourcepage>: <content>" strings built in the vector search cell).
# NOTE(review): the sources are joined with a single space, so nothing marks
# where one source ends and the next begins — consider a newline separator.
messages = [
    {'role' : 'system', 'content' : systemMessage},
    {'role' : 'user', 'content' : query + "   Source:" + " ".join(results)}
]

chat_completion = openai.chat.completions.create(
    model="summer", # Do not edit this. model="deployment_name"
    messages=messages, 
    temperature=0.7, 
    max_tokens=1024, 
    n=1)

# n=1 requests a single completion, so the reply is the first (only) choice.
chat_content = chat_completion.choices[0].message.content
print(chat_content)