In [None]:
# ===============================================================================================================#
# Copyright 2023 Infosys Ltd.                                                                          #
# Use of this source code is governed by Apache License Version 2.0 that can be found in the LICENSE file or at  #
# http://www.apache.org/licenses/                                                                                #
# ===============================================================================================================#

In [None]:
import html
import json
import os
import re
import shutil

import infy_dpp_sdk
import infy_fs_utils
from IPython.display import display, HTML, Markdown

from _internal_utils.pipeline_helper import PipelineHelper
from _internal_utils.semantic_search_visualizer import SemanticSearchVisualizer

#### Set environment variables
<div  style="line-height: 1;">
    <span style="color:Red"><b>NOTE:</b> The pipeline uses environment variables which need to be set by the developer.<br>
In production, the developer needs to set them as required.<br>
In this notebook, you can provide them using the code below.<br>
To set or change the values, please refer to <i>installation.ipynb</i></span>
</div>

In [None]:
# Restore the custom LLM inference URLs previously persisted with `%store`
# (see installation.ipynb) and publish them as environment variables.
# NOTE(review): if a value was never stored, the matching assignment below is
# expected to fail with NameError - confirm the store was populated first.
%store -r CUSTOM_LLM_BLOOM_INFERENCE_URL
%store -r CUSTOM_LLM_LLAMA_INFERENCE_URL
%store -r CUSTOM_LLM_MIXTRAL_INFERENCE_URL
%store -r CUSTOM_LLM_LLAMA_3_1_INFERENCE_URL
os.environ['CUSTOM_LLM_BLOOM_INFERENCE_URL']=CUSTOM_LLM_BLOOM_INFERENCE_URL
os.environ['CUSTOM_LLM_LLAMA_INFERENCE_URL']=CUSTOM_LLM_LLAMA_INFERENCE_URL
os.environ['CUSTOM_LLM_MIXTRAL_INFERENCE_URL']=CUSTOM_LLM_MIXTRAL_INFERENCE_URL
os.environ['CUSTOM_LLM_LLAMA_3_1_INFERENCE_URL']=CUSTOM_LLM_LLAMA_3_1_INFERENCE_URL

# OpenAI key and server URL, restored and exported the same way.
%store -r OPENAI_KEY
os.environ['OPENAI_KEY'] = OPENAI_KEY
%store -r OPENAI_SERVER_URL
os.environ['OPENAI_SERVER_URL']=OPENAI_SERVER_URL

# Inference URL of the custom Mistral embedding endpoint.
%store -r CUSTOM_EMB_MISTRAL_INFERENCE_URL
os.environ['CUSTOM_EMB_MISTRAL_INFERENCE_URL']=CUSTOM_EMB_MISTRAL_INFERENCE_URL

#### Define configuration file path

In [None]:
# Both roots live under the same client folder, so the base is spelled once.
_DPP_CLIENT_ROOT = 'C:/DPP/infy_libraries_client'
# Root folder that holds the pipeline's data and config files.
STORAGE_ROOT_PATH = f'{_DPP_CLIENT_ROOT}/STORAGE'
# Root folder handed to the DPP client as its container location.
CONTAINER_ROOT_PATH = f'{_DPP_CLIENT_ROOT}/CONTAINER'

#### Copying files
<div style="line-height: 1;">
<span style="color:Red"><b>NOTE: </b>In this notebook, the code below is used to copy sample files to folders in <i>STORAGE_ROOT_PATH</i>.<br>
In production, the data and config files should be kept under their respective folders in <i>STORAGE_ROOT_PATH</i>.<br>
</span>
</div>

In [None]:
# Sample data lives in the `data` folder next to this notebook's parent directory.
curr_data = os.path.dirname(os.getcwd())+'/data'

# One makedirs call is enough: it creates the intermediate `data` folder as
# well, and exist_ok=True keeps the cell re-runnable without a separate
# (race-prone) existence check.
storage_input_dir = f'{STORAGE_ROOT_PATH}/data/input'
os.makedirs(storage_input_dir, exist_ok=True)

# Copy the sample document to be indexed.
shutil.copy(f'{curr_data}/sample/input/AR_2022-23_page-14-17.pdf',
            f'{storage_input_dir}/AR_2022-23_page-14-17.pdf')
# Copy the sample pipeline configs; existing files are overwritten in place.
shutil.copytree(f'{curr_data}/sample/config',f'{STORAGE_ROOT_PATH}/data/config',
                dirs_exist_ok=True)

#### Initialize Client Config

In [None]:
# Storage settings: a file:// URI rooted at STORAGE_ROOT_PATH, with the
# server URL and credentials left blank.
storage_config_data = infy_fs_utils.data.StorageConfigData(
    storage_root_uri=f"file://{STORAGE_ROOT_PATH}",
    storage_server_url="",
    storage_access_key="",
    storage_secret_key="",
)

# Client settings: only the container root path is supplied.
client_config_data = infy_dpp_sdk.ClientConfigData(
    container_data={"container_root_path": CONTAINER_ROOT_PATH},
)

file_sys_handler = infy_fs_utils.provider.FileSystemHandler(
    storage_config_data)

# Register the handler under the DPP key only once, so re-running the cell
# does not add it a second time.
fs_handler_key = infy_dpp_sdk.common.Constants.FSH_DPP
fs_handler_registered = infy_fs_utils.manager.FileSystemManager().has_fs_handler(
    fs_handler_key)
if not fs_handler_registered:
    infy_fs_utils.manager.FileSystemManager().add_fs_handler(
        file_sys_handler, fs_handler_key)

infy_dpp_sdk.ClientConfigManager().load(client_config_data)

#### Initialize Logging

In [None]:
# Logging settings for the pipeline. Optional fields that are not used here
# are kept as commented-out hints.
logging_config_data = infy_fs_utils.data.LoggingConfigData(
    # logger_group_name="my_group_1",
    # 10 equals logging.DEBUG in Python's standard logging module
    # (presumably interpreted that way by the library - confirm).
    logging_level=10,
    logging_format="",
    logging_timestamp_format="",
    log_file_data={
        "log_file_dir_path": "/logs",
        "log_file_name_prefix": "hybrid",
        # "log_file_name_suffix": "1",
        "log_file_extension": ".log",
    },
)

In [None]:
# Register the logging handler under the DPP key only if it is not there yet,
# so the cell can be re-run safely.
fs_logging_handler_key = infy_dpp_sdk.common.Constants.FSLH_DPP
fs_logging_handler_registered = (
    infy_fs_utils.manager.FileSystemLoggingManager().has_fs_logging_handler(
        fs_logging_handler_key))
if not fs_logging_handler_registered:
    fs_logging_handler = infy_fs_utils.provider.FileSystemLoggingHandler(
        logging_config_data, file_sys_handler)
    infy_fs_utils.manager.FileSystemLoggingManager().add_fs_logging_handler(
        fs_logging_handler, fs_logging_handler_key)

### Indexing to VectorDb & Sparse Index

In [None]:
# Path of the indexing pipeline's input config file.
# NOTE(review): presumably relative to STORAGE_ROOT_PATH and one of the files
# copied into <STORAGE_ROOT_PATH>/data/config earlier - confirm.
INDEX_INPUT_CONFIG_FILE_PATH = '/data/config/dpp_hybrid_search_indexing_input_config.json'

In [None]:
# Show the pipeline card for the indexing config, using the storage and
# container roots defined earlier in this notebook.
PipelineHelper(INDEX_INPUT_CONFIG_FILE_PATH, STORAGE_ROOT_PATH, CONTAINER_ROOT_PATH).show_pipeline_card()

In [None]:
# Create the orchestrator for the indexing pipeline from its input config file.
dpp_orchestrator = infy_dpp_sdk.orchestrator.OrchestratorNative(
        input_config_file_path=INDEX_INPUT_CONFIG_FILE_PATH)

In [None]:
# Run the indexing pipeline; the responses are inspected in the next code cell.
processor_response_list = dpp_orchestrator.run_batch()

<div  style="line-height: 1;">
    <span style="color:Green"><b>NOTE: </b> The results of the pipeline will be available in <i>processor_response_data.json</i> file at <i>work_folder_path</i>.</span></div>

In [None]:
# Report the `request_closer` entry of the first response's context data.
print("Output of indexing pipeline is available at below location: ")
first_response = processor_response_list[0]
request_closer_data = first_response.context_data.get('request_closer')
print(json.dumps(request_closer_data, indent=4))

### Hybrid Search Visualizer

#### Retrieval 

In [None]:
# Path of the hybrid-search retriever pipeline's input config file. The form
# callbacks below read this file, modify it and write it back on each submit.
HYBRID_INPUT_CONFIG_FILE_PATH = '/data/config/dpp_hybrid_search_retriever_input_config.json'

In [None]:
# UI helper for the retrieval form; the callback and token counter are attached further down.
ss_visualizer = SemanticSearchVisualizer()


def _build_retrieval_rows(query):
    """Build the HTML ``<tr>`` rows comparing the ranked matches of one query.

    Args:
        query: One entry of the retriever's ``queries`` list. Each item of
            its ``top_k_matches`` is checked for a ``sparseindex``,
            ``vectordb`` or ``rrf`` key (first key found wins).

    Returns:
        str: One table row per rank with the columns Rank, Vector, Sparse
        and RRF. A cell is left empty when that list has no match at the
        given rank.
    """
    sparseindex_list, vectordb_list, rrf_list = [], [], []
    for match in query['top_k_matches']:
        if 'sparseindex' in match:
            sparseindex_list.extend(match['sparseindex'])
        elif 'vectordb' in match:
            vectordb_list.extend(match['vectordb'])
        elif 'rrf' in match:
            rrf_list.extend(match['rrf'])

    # Size the rows from THIS query only. A running maximum across queries
    # would pad later, shorter queries with blank rows.
    row_count = max(len(sparseindex_list), len(vectordb_list), len(rrf_list))

    rows = ""
    for i in range(row_count):
        row = "<tr>"
        # Ranks are 1-based for display.
        row += f"<td style='border: 1px solid black; height: 100px; text-align: center;'>{i + 1}</td>"
        for match_list in [vectordb_list, sparseindex_list, rrf_list]:
            if i < len(match_list):
                row += f"<td style='border: 1px solid black; height: 100px;'>{generate_html_for_matches([match_list[i]])}</td>"
            else:
                row += "<td style='border: 1px solid black; height: 100px;'></td>"
        row += "</tr>"
        rows += row
    return rows


def form_submit_button_clicked(_):
    """Run retrieval for the submitted question and show the ranked matches.

    Reads the question from ``ss_visualizer``, writes it into the retriever
    config file (with the second processor disabled), runs the pipeline and
    renders the Vector / Sparse / RRF matches side by side in an HTML table.

    Args:
        _: Event argument passed by the form; not used.
    """
    user_query = ss_visualizer.get_input_text()

    ss_visualizer.set_output_text('Fetching. Please wait...')

    # The config file on disk is updated in place, so the change persists
    # after this callback returns.
    input_config_data = json.loads(file_sys_handler.read_file(
        HYBRID_INPUT_CONFIG_FILE_PATH))
    input_config_data['processor_input_config']['QueryRetriever']['queries'][0]['question'] = user_query
    # processor_list[1] is presumably the Reader (see the message below) - it
    # is switched off because only retrieval results are shown here.
    input_config_data['processor_list'][1]['enabled'] = False
    print("Reader is enabled:",input_config_data['processor_list'][1]['enabled'])
    file_sys_handler.write_file(HYBRID_INPUT_CONFIG_FILE_PATH, json.dumps(
                                input_config_data, indent=4))

    orchestrator = infy_dpp_sdk.orchestrator.OrchestratorNativeBasic(
        input_config_file_path=HYBRID_INPUT_CONFIG_FILE_PATH)
    responses = orchestrator.run_batch()

    # Tolerate an empty response list or a missing `query_retriever` entry:
    # the table is then rendered with its header only, instead of raising and
    # leaving the "Fetching" message on screen.
    query_retriever_data = responses[0].context_data.get("query_retriever") if responses else None
    queries_list = (query_retriever_data or {}).get("queries") or []

    output = "<table style='border-collapse: collapse; width: 100%;'>"
    output += "<tr><th style='border: 1px solid black; width: 5%; text-align: center;'><b>Rank</b></th><th style='border: 1px solid black; width: 23.75%; text-align: center;'><b>Vector</b></th><th style='border: 1px solid black; width: 23.75%; text-align: center;'><b>Sparse</b></th><th style='border: 1px solid black; width: 23.75%; text-align: center;'><b>RRF</b></th></tr>"
    for query in queries_list:
        output += _build_retrieval_rows(query)
    output += "</table>"
    ss_visualizer.set_output_text(output)

# Palette used to tint result cards; it is reused cyclically once exhausted.
colors = [
    "#e32636", "#a4c639", "#5d8aa8", "#efdecd", "#ffbf00",
    "#008080", "#008000", "#7fffd4", "#b2beb5", "#915c83",
    "#A52A2A", "#FFFFF0", "#FFC0CB", "#9966cc", "#cd9575",
    "#4682B4", "#FA8072", "#FF69B4", "#32CD32", "#DAA520",
]
# Remembers the colour handed out to each chunk id so it stays stable.
chunk_id_to_color = {}


def assign_color_to_chunk_id(chunk_id):
    """Return the colour tied to ``chunk_id``, allocating one on first use.

    New ids receive the next palette entry in order of first appearance;
    the palette wraps around when every colour has been used.
    """
    assigned = chunk_id_to_color.get(chunk_id)
    if assigned is None:
        palette_slot = len(chunk_id_to_color) % len(colors)
        assigned = colors[palette_slot]
        chunk_id_to_color[chunk_id] = assigned
    return assigned

def generate_html_for_matches(match_list):
    """Render retrieval matches as colour-coded HTML cards.

    Args:
        match_list: Iterable of match dicts. A match is rendered only when it
            has a non-empty ``meta_data['chunk_id']`` and a non-empty
            ``content``; ``score`` is optional and shown as ``N/A`` when absent.

    Returns:
        str: Concatenated HTML, one ``<div>`` card per rendered match. It is
        an empty string when nothing qualifies.
    """
    html_output = ""
    for match in match_list:
        meta_data = match.get('meta_data') or {}
        chunk_id = meta_data.get('chunk_id', '')
        if chunk_id == '':
            continue
        # The colour is allocated before the content check, so a chunk keeps
        # its palette slot even when this particular match is skipped.
        color = assign_color_to_chunk_id(chunk_id)

        content = match.get('content', '')
        if content == '':
            continue
        score = match.get('score', 'N/A')

        # Chunk text comes from indexed documents and may contain `<`, `>` or
        # `&`. Escape every interpolated value so it is shown as text instead
        # of being parsed as markup (which could break the table layout).
        safe_chunk_id = html.escape(str(chunk_id))
        safe_content = html.escape(str(content))
        safe_score = html.escape(str(score))

        html_output += f"""
        <div style='background-color: {color}; padding: 10px;'>
            <div style='text-align: left; margin: 0; padding: 0;'><b>ChunkId:</b> {safe_chunk_id}</div>
            <details style='text-align: left; margin: 0; padding: 0;'>
                <summary style='margin: 0; padding: 0;'><b>Content:</b></summary>
                <div style='margin: 0; padding: 0;'>{safe_content}</div>
            </details>
            <div style='text-align: left; margin: 0; padding: 0; white-space: nowrap;'><b>Score:</b> {safe_score}</div>
        </div>
        """
    return html_output


def count_tokens(text):
    """Return the length of ``text``, used as the token count shown in the UI."""
    return len(text)


# Markdown help text shown above the retrieval form. The trailing spaces
# before each newline are Markdown hard line breaks; they are written inside
# explicit string literals so an editor cannot strip them.
help_html = (
    "\n"
    "**Sample Question(s):**   \n"
    "What is the percentage of women employees?  \n"
    "What is the operating margin?  \n"
    "What is the percentage increase in return of equity from last fiscal year?  \n"
    "What is the percent YoY growth and cc growth in revenue?    \n"
    "What did the CEO and his global leadership team learn early on?    \n"
    "What rating did infosys secure on the MSCI esg assessment?    \n"
    "<hr/>\n"
)

# Render the sample questions as Markdown above the form.
display(Markdown(help_html))
# Attach the submit callback and the token counter, then show the form.
ss_visualizer.on_form_submit_callback(form_submit_button_clicked)
ss_visualizer.set_token_counter_fn(count_tokens)
ss_visualizer.show_ui()

#### Search

In [None]:
# Separate UI helper for the search form, so it does not share the retrieval form's instance.
ss_visualizer_search = SemanticSearchVisualizer()


def _extract_reader_answer(processor_response_list):
    """Pull the reader's answer out of a pipeline run's responses.

    Returns:
        ``model_output['answer']`` of the first reader output when it is
        available, the raw ``model_output`` when it has no ``answer`` entry,
        or ``None`` when the reader produced no output at all.
    """
    try:
        model_output = processor_response_list[0].context_data.get(
            "reader")["output"][0]["model_output"]
    except (TypeError, KeyError, IndexError):
        # No reader entry / no output. Repeating the same lookup in a
        # fallback would only raise again, so report "no answer" instead.
        return None
    try:
        return model_output["answer"]
    except (TypeError, KeyError):
        # model_output is not a mapping with an `answer` key; show it as is.
        return model_output


def _run_reader_search(input_config_data, enable_vectordb, enable_sparse):
    """Run the pipeline once with the given storages enabled; return the answer.

    The storage flags are set on ``input_config_data`` in place and the
    config file on disk is rewritten before the pipeline runs.
    """
    update_storage_configs(input_config_data, enable_vectordb=enable_vectordb,
                           enable_sparse=enable_sparse)
    file_sys_handler.write_file(HYBRID_INPUT_CONFIG_FILE_PATH, json.dumps(input_config_data, indent=4))
    orchestrator = infy_dpp_sdk.orchestrator.OrchestratorNativeBasic(
        input_config_file_path=HYBRID_INPUT_CONFIG_FILE_PATH)
    return _extract_reader_answer(orchestrator.run_batch())


def form_submit_button_clicked_search(_):
    """Answer the submitted question with vector, sparse and RRF search.

    Reads the question from ``ss_visualizer_search``, enables the second
    processor in the retriever config and runs the pipeline three times
    (vector only, sparse only, both), then shows the three answers side by
    side in an HTML table. The config file on disk is left with both
    storages enabled.

    Args:
        _: Event argument passed by the form; not used.
    """
    user_query = ss_visualizer_search.get_input_text()

    ss_visualizer_search.set_output_text('Fetching. Please wait...')

    input_config_data = json.loads(file_sys_handler.read_file(
        HYBRID_INPUT_CONFIG_FILE_PATH))
    input_config_data['processor_input_config']['QueryRetriever']['queries'][0]['question'] = user_query
    # processor_list[1] is presumably the Reader (see the message below).
    input_config_data['processor_list'][1]['enabled'] = True
    print("Reader is enabled:",input_config_data['processor_list'][1]['enabled'])

    # Same run-and-extract sequence for each mode; only the storage flags differ.
    vector_response = _run_reader_search(input_config_data, enable_vectordb=True, enable_sparse=False)
    sparse_response = _run_reader_search(input_config_data, enable_vectordb=False, enable_sparse=True)
    rrf_response = _run_reader_search(input_config_data, enable_vectordb=True, enable_sparse=True)

    # Answers are model-generated text; escape them so characters such as `<`
    # or `&` are displayed literally instead of being parsed as markup.
    cell_style = "border: 1px solid black; text-align: center; vertical-align: top;"
    answer_cells = "".join(
        f"<td style='{cell_style}'>{html.escape(str(answer))}</td>"
        for answer in (vector_response, sparse_response, rrf_response))

    output = "<table style='border-collapse: collapse; width: 100%;'>"
    output += "<tr><th style='border: 1px solid black; width: 25%; text-align: center;'><b>Vector</b></th><th style='border: 1px solid black; width: 25%; text-align: center;'><b>Sparse</b></th><th style='border: 1px solid black; width: 25%; text-align: center;'><b>RRF</b></th></tr>"
    output += f"<tr>{answer_cells}</tr>"
    output += "</table>"
    ss_visualizer_search.set_output_text(output)
    
def update_storage_configs(input_config_data,enable_vectordb,enable_sparse ):
    """Set the ``enabled`` flag of every Reader storage entry, in place.

    Args:
        input_config_data: Pipeline config dict; its
            ``processor_input_config -> Reader -> storage`` section is modified.
        enable_vectordb: Value written to each entry under ``vectordb``.
        enable_sparse: Value written to each entry under ``sparseindex``.

    Other storage keys are left untouched. Nothing is returned.
    """
    reader_storage = input_config_data['processor_input_config']['Reader']['storage']
    enabled_by_storage = {
        'vectordb': enable_vectordb,
        'sparseindex': enable_sparse,
    }
    for storage_name, enabled_flag in enabled_by_storage.items():
        if storage_name not in reader_storage:
            continue
        storage_entries = reader_storage[storage_name]
        for entry_name in storage_entries:
            storage_entries[entry_name]['enabled'] = enabled_flag

def count_token_search(text):
    """Return the length of ``text`` as a stand-in token count.

    This is a sample; custom token counting logic is meant to replace it.
    """
    return len(text)


# Sample questions listed above the search form.
search_sample_questions = [
    "What is the percentage of women employees?",
    "What is the operating margin?",
    "What is the percentage increase in return of equity from last fiscal year?",
    "What is the percent YoY growth and cc growth in revenue?",
    "What did the CEO and his global leadership team learn early on?",
    "What rating did infosys secure on the MSCI esg assessment?",
]
# Markdown needs two trailing spaces for a hard line break. Appending them in
# code guarantees every question gets its own line; a hand-typed line without
# them is merged with the following one when rendered.
help_html = (
    "\n**Sample Question(s):**   \n"
    + "".join(f"{question}  \n" for question in search_sample_questions)
    + "<hr/>\n"
)

# Render the sample questions as Markdown above the form.
display(Markdown(help_html))
# Attach the submit callback and the token counter, then show the form.
ss_visualizer_search.on_form_submit_callback(form_submit_button_clicked_search)
ss_visualizer_search.set_token_counter_fn(count_token_search)
ss_visualizer_search.show_ui()