In [None]:
# ===============================================================================================================#
# Copyright 2024 Infosys Ltd.                                                                          #
# Use of this source code is governed by Apache License Version 2.0 that can be found in the LICENSE file or at  #
# http://www.apache.org/licenses/                                                                                #
# ===============================================================================================================#

## Tool 02 - Semantic Search (Interactive)
This notebook demonstrates how `uc_41_retrieval_online.ipynb` can be used to build an interactive UI.

#### Import libraries

In [None]:
import os
import re
import json
import shutil
import infy_dpp_sdk
import infy_fs_utils
from _internal_utils.pipeline_helper import PipelineHelper
from IPython.display import display, HTML, Markdown
from _internal_utils.semantic_search_visualizer import SemanticSearchVisualizer

#### Set environment variables
<div  style="line-height: 1;">
    <span style="color:Red"><b>NOTE:</b> The pipeline uses environment variables which need to be set by the developer.<br>
In production, the developer needs to set them as required.<br>
In this notebook, you can provide them using the code below.<br>
To set or change the values, please refer to <i>installation.ipynb</i>.</span>
</div>

In [None]:
%store -r CUSTOM_LLM_MIXTRAL_INFERENCE_URL
os.environ['CUSTOM_LLM_MIXTRAL_INFERENCE_URL']=CUSTOM_LLM_MIXTRAL_INFERENCE_URL

%store -r OPENAI_KEY
os.environ['OPENAI_KEY'] = OPENAI_KEY
%store -r OPENAI_SERVER_URL
os.environ['OPENAI_SERVER_URL']=OPENAI_SERVER_URL

#### Define configuration file path

In [None]:
# Parent folder shared by the storage and container roots.
_TOOL_ROOT_PATH = 'C:/DPP/infy_libraries_client/tool'

# Root folder used to build the file-system handler's storage_root_uri.
STORAGE_ROOT_PATH = f'{_TOOL_ROOT_PATH}/STORAGE'
# Root folder passed to the SDK client config as container_root_path.
CONTAINER_ROOT_PATH = f'{_TOOL_ROOT_PATH}/CONTAINER'
# Pipeline input config file; it is read and written through the file-system
# handler (presumably relative to STORAGE_ROOT_PATH - confirm).
PIPELINE_INPUT_CONFIG_FILE_PATH = '/data/config/dpp_pipeline_retriever_online_input_config.json'

#### Copying files
<div  style="line-height: 1;"><span style="color:Red"><b>NOTE: </b>In this notebook, the code below copies sample files to folders in <i>STORAGE_ROOT_PATH</i>.<br>
In production, the data and config files should be kept under the respective folders in <i>STORAGE_ROOT_PATH</i>.<br>
</span></div>

In [None]:
# Seed STORAGE_ROOT_PATH with the sample config and vector DB files.
# Every step tolerates folders that already exist, so the cell can be re-run.
shared_data_dir = os.path.abspath('../data')  # source of sample/config
tool_data_dir = os.path.abspath('./data')     # source of sample/vectordb

# exist_ok=True replaces the "check, then create" pattern (no race between the
# check and the creation); creating data/input also creates the parent data
# folder.
os.makedirs(f'{STORAGE_ROOT_PATH}/data/input', exist_ok=True)
shutil.copytree(f'{shared_data_dir}/sample/config',
                f'{STORAGE_ROOT_PATH}/data/config',
                dirs_exist_ok=True)

# Created up front so the target folder exists even before the copy starts.
os.makedirs(f'{STORAGE_ROOT_PATH}/data/vectordb', exist_ok=True)
shutil.copytree(f'{tool_data_dir}/sample/vectordb',
                f'{STORAGE_ROOT_PATH}/data/vectordb',
                dirs_exist_ok=True)

#### Initialize Client Config

In [None]:
# Storage settings: a local "file://" root; server URL and keys are left blank.
storage_config_data = infy_fs_utils.data.StorageConfigData(
    storage_root_uri=f"file://{STORAGE_ROOT_PATH}",
    storage_server_url="",
    storage_access_key="",
    storage_secret_key="",
)

# Client settings: only the container root path is supplied.
client_config_data = infy_dpp_sdk.ClientConfigData(
    container_data={
        "container_root_path": f"{CONTAINER_ROOT_PATH}",
    },
)

# Register the file-system handler under the DPP key only when none is
# registered yet, so re-running this cell does not add it a second time.
file_sys_handler = infy_fs_utils.provider.FileSystemHandler(storage_config_data)
dpp_fs_handler_registered = infy_fs_utils.manager.FileSystemManager().has_fs_handler(
    infy_dpp_sdk.common.Constants.FSH_DPP)
if not dpp_fs_handler_registered:
    infy_fs_utils.manager.FileSystemManager().add_fs_handler(
        file_sys_handler, infy_dpp_sdk.common.Constants.FSH_DPP)

infy_dpp_sdk.ClientConfigManager().load(client_config_data)

#### Initialize Logging

In [None]:
# Logging settings; the commented-out entries are optional fields left unset.
logging_config_data = infy_fs_utils.data.LoggingConfigData(
    # logger_group_name="my_group_1",
    logging_level=10,  # presumably a Python logging level (10 == logging.DEBUG) - confirm
    logging_format="",
    logging_timestamp_format="",
    log_file_data={
        "log_file_dir_path": "/logs",
        "log_file_name_prefix": "inference",
        # "log_file_name_suffix": "1",
        "log_file_extension": ".log",
    },
)

In [None]:
# Register the file-system logging handler under the DPP key only when none is
# registered yet, so re-running this cell does not add it a second time.
dpp_logging_handler_registered = (
    infy_fs_utils.manager.FileSystemLoggingManager().has_fs_logging_handler(
        infy_dpp_sdk.common.Constants.FSLH_DPP))
if not dpp_logging_handler_registered:
    fs_logging_handler = infy_fs_utils.provider.FileSystemLoggingHandler(
        logging_config_data, file_sys_handler)
    infy_fs_utils.manager.FileSystemLoggingManager().add_fs_logging_handler(
        fs_logging_handler, infy_dpp_sdk.common.Constants.FSLH_DPP)

### Inference Pipeline - Q&A visualization 

In [None]:
# ---- Create response data -----
metadata = infy_dpp_sdk.data.MetaData(
    standard_data=infy_dpp_sdk.data.StandardData(
        filepath=infy_dpp_sdk.data.ValueData()))
document_data = infy_dpp_sdk.data.DocumentData(metadata=metadata)
context_data = {
}
response_data = infy_dpp_sdk.data.ProcessorResponseData(
    document_data=document_data, context_data=context_data)
document_data_json=json.loads(response_data.json(indent=4))

In [None]:
def highlight_words(orig_content, query, reduce=False):
    """Wrap whole-word occurrences of the query's keywords in ``<b>`` tags.

    The query is split on non-alphanumeric characters, stop words are dropped,
    and each remaining keyword is matched case-insensitively as a whole word.
    Every match is wrapped exactly as it is spelled in the text.

    All keywords are applied in one regex pass, so markup inserted for one
    keyword is never re-scanned for another. Substituting keyword by keyword
    corrupts the output when a query word (for example "b") matches the
    inserted ``<b>`` tags, and double-wraps words repeated in the query.

    Args:
        orig_content (str): Text to search; it is not modified.
        query (str): Free-text user query.
        reduce (bool): When True, keep only the lines that contain ``<b>``
            after highlighting.

    Returns:
        str: The highlighted text, or only its highlighted lines joined by
        newlines when ``reduce`` is True.
    """
    STOP_WORDS = """
    a, an, and, are, as, at, be, but, by, for, if, in, into, is, it, no, not, of, on, or, s, such, t,
    that, the, their, then, there, these, they, this, to, was, what, will, with
    """
    stop_words = {x.strip() for x in STOP_WORDS.split(',') if x.strip()}

    # Lower-cased, de-duplicated keywords in query order (dict keeps insertion
    # order); matching is case-insensitive, so one entry per word is enough.
    query_words = [x for x in re.sub('[^A-Za-z0-9]+', ' ', query).split(' ') if x]
    keywords = list(dict.fromkeys(
        word.lower() for word in query_words if word.lower() not in stop_words))

    content = orig_content
    if keywords:
        # Keywords are purely alphanumeric here; re.escape is defensive only.
        pattern = re.compile(
            r'\b(?:' + '|'.join(re.escape(word) for word in keywords) + r')\b',
            re.IGNORECASE)
        content = pattern.sub(lambda found: f'<b>{found.group(0)}</b>', content)

    if reduce:
        content = '\n'.join(
            line for line in content.split('\n') if '<b>' in line)
    return content

In [None]:
# UI helper object: supplies the query text, receives the rendered output and
# hosts the submit callback used by the code in this cell.
ss_visualizer = SemanticSearchVisualizer()

def _run_retrieval_pipeline(user_query):
    """Run the retriever pipeline for ``user_query`` and return its query results.

    The question is written into the first ``QueryRetriever`` query of the
    pipeline input config file, the orchestrator is run on the prepared
    document data, and the first response is saved to a local JSON file.

    Args:
        user_query (str): Question entered by the user.

    Returns:
        The ``queries`` entry of the ``query_retriever`` context data of the
        first processor response.
    """
    input_config_data = json.loads(
        file_sys_handler.read_file(PIPELINE_INPUT_CONFIG_FILE_PATH))
    retriever_queries = (
        input_config_data['processor_input_config']['QueryRetriever']['queries'])
    retriever_queries[0]['question'] = user_query
    file_sys_handler.write_file(
        PIPELINE_INPUT_CONFIG_FILE_PATH, json.dumps(input_config_data, indent=4))

    dpp_orchestrator = infy_dpp_sdk.orchestrator.OrchestratorNativeBasic(
        input_config_file_path=PIPELINE_INPUT_CONFIG_FILE_PATH)
    processor_response_list = dpp_orchestrator.run_batch(
        [infy_dpp_sdk.data.DocumentData(**document_data_json.get('document_data'))],
        [document_data_json.get('context_data')])
    with open("./data/processor_response_data_list.json", "w", encoding="utf-8") as f:
        json.dump(processor_response_list[0].dict(), f, indent=4)

    return processor_response_list[0].context_data.get("query_retriever").get("queries")


def _build_results_html(queries_list, user_query):
    """Render one HTML card per match found in ``queries_list``.

    Args:
        queries_list: Iterable of query results; each item's ``top_k_matches``
            is a list of dicts whose values are lists of match records.
        user_query (str): Question whose keywords are highlighted in each card.

    Returns:
        str: Concatenated card HTML; an empty string when there are no matches.
    """
    output = ''
    for query in queries_list:
        # Flatten the per-source match lists into one list of match records.
        match_list = [match
                      for match_dict in query['top_k_matches']
                      for matches in match_dict.values()
                      for match in matches]
        total_records = len(match_list)
        for idx, match in enumerate(match_list):
            raw_content = match['content']
            # Summary: only the chunk lines that contain a highlighted keyword.
            result = highlight_words(raw_content, user_query, reduce=True)
            html_content = raw_content.replace('\n','<br/>')
            card_html = f"""
            <b># {idx+1} of {total_records}</b><br/>
            <b>Distance:</b> {match['score']}<br/>
            <div style='line-height: 14pt;'>
                {result}
            </div>
            <div style='line-height: 14pt; border:0px solid red; background-color: #eee;'>
                <details>
                    <summary>Chunk</summary>
                    {html_content}
                </details>
            </div>
            <b>Source:</b> {match['meta_data']['doc_name']} | <b>Page: </b> {match['meta_data']['page_no']}            
            """
            # Separator between cards, but not after the last one.
            card_html += "<br/><hr/>" if idx+1 < total_records else ''
            output += card_html
    return output


def form_submit_button_clicked(_):
    """Handle a form submit: run the search and show the matches in the UI.

    An empty query is rejected without running the pipeline. If the pipeline
    or the rendering fails, the "Fetching" placeholder is replaced by a short
    failure notice and the exception is re-raised.

    Args:
        _: Argument supplied by the submit callback; unused.
    """
    user_query = ss_visualizer.get_input_text()
    if not user_query or not user_query.strip():
        # Nothing to search for: do not rewrite the config or run the pipeline.
        ss_visualizer.set_output_text('Please enter a question.')
        return

    ss_visualizer.set_output_text('Fetching. Please wait...')
    try:
        queries_list = _run_retrieval_pipeline(user_query)
        output = _build_results_html(queries_list, user_query)
    except Exception as ex:
        # Without this the UI would stay on "Fetching. Please wait..." forever.
        ss_visualizer.set_output_text(
            f'Failed to fetch results ({type(ex).__name__}). '
            'See the error output for details.')
        raise

    ss_visualizer.set_output_text(output or 'No matches found.')
    
def count_tokens(text):
    """Return a token count for ``text``.

    Placeholder implementation: the count is simply ``len(text)``. Replace the
    body with custom token-counting logic as needed.
    """
    return len(text)

# Sample questions shown above the search form. Despite the variable name the
# text is Markdown; the trailing spaces at the line ends are Markdown hard line
# breaks and must be kept.
help_html="""
**Sample Question(s):**   
What is the percentage of women employees in Infosys?  
What's the operating margin?  
What are list of equipment involved in Virtual Reality (VR)?  
Which football player has scored 15 goals from table?  
In which year was Infosys Certified excellent in employee conditions?  
<hr/>
"""

display(Markdown(help_html))
# Register the submit handler and the token counter, then render the UI.
ss_visualizer.on_form_submit_callback(form_submit_button_clicked)
ss_visualizer.set_token_counter_fn(count_tokens)
ss_visualizer.show_ui()