In [None]:
# ===============================================================================================================#
# Copyright 2024 Infosys Ltd.                                                                          #
# Use of this source code is governed by Apache License Version 2.0 that can be found in the LICENSE file or at  #
# http://www.apache.org/licenses/                                                                                #
# ===============================================================================================================#

## UC_41 - Retrieval_online from vectordb 
<div  style="line-height: 1;">
    <span style="color:Green">Demonstrates retrieval without dependency on group_request_file.</span></div>

#### Import libraries

In [None]:
import os
import json
import shutil
import pandas as pd
import infy_dpp_sdk
import infy_fs_utils
from _internal_utils.pipeline_helper import PipelineHelper

#### Set environment variables
<div  style="line-height: 1;">
    <span style="color:Red"><b>NOTE:</b> The pipeline uses environment variables which need to be set by the developer.<br>
In production, the developer needs to set them as required.<br>
In this notebook, you can provide them using the code below.<br>
To set or change the values, please refer to <i>installation.ipynb</i>.</span>
</div>

In [None]:
%store -r OPENAI_KEY
os.environ['OPENAI_KEY'] = OPENAI_KEY
%store -r OPENAI_SERVER_URL
os.environ['OPENAI_SERVER_URL']=OPENAI_SERVER_URL

#### Define configuration file path

In [None]:
# Root folders used by the client. The defaults match the original sample
# layout; set DPP_STORAGE_ROOT_PATH / DPP_CONTAINER_ROOT_PATH in the environment
# to run the notebook on a machine with a different layout without editing it.
STORAGE_ROOT_PATH = os.environ.get(
    'DPP_STORAGE_ROOT_PATH', 'C:/DPP/infy_libraries_client/STORAGE')
CONTAINER_ROOT_PATH = os.environ.get(
    'DPP_CONTAINER_ROOT_PATH', 'C:/DPP/infy_libraries_client/CONTAINER')
# Path of the pipeline input config file. It is read later through the storage
# file-system handler (rooted at STORAGE_ROOT_PATH), not opened directly.
PIPELINE_INPUT_CONFIG_FILE_PATH = '/data/config/dpp_pipeline_retriever_online_input_config.json'

#### Copying files
<div  style="line-height: 1;"><span style="color:Red"><b>NOTE: </b>In this notebook, the cell below is used to copy sample files to folders in <i>STORAGE_ROOT_PATH</i>.<br>
In production, the data and config files should be kept under the respective folders in <i>STORAGE_ROOT_PATH</i>.<br>
</span></div>

In [None]:
# Absolute path of the ./data folder, resolved against the current working directory.
curr_data = os.path.abspath('./data')
# os.makedirs creates missing parent folders as well, so a single call covers
# both <STORAGE_ROOT_PATH>/data and <STORAGE_ROOT_PATH>/data/input.
# exist_ok=True keeps the cell re-runnable without a separate existence check.
os.makedirs(f'{STORAGE_ROOT_PATH}/data/input', exist_ok=True)
# Copy the sample config files into the storage config folder; dirs_exist_ok=True
# lets the copy proceed when the destination folder already exists.
shutil.copytree(f'{curr_data}/sample/config', f'{STORAGE_ROOT_PATH}/data/config',
                dirs_exist_ok=True)

#### Initialize Client Config

In [None]:
# Storage settings: a file:// URI rooted at STORAGE_ROOT_PATH; the server URL
# and access/secret keys are passed as empty strings.
storage_config_data = infy_fs_utils.data.StorageConfigData(
    storage_root_uri=f"file://{STORAGE_ROOT_PATH}",
    storage_server_url="",
    storage_access_key="",
    storage_secret_key="",
)

# Client settings: only the container root path is supplied.
client_config_data = infy_dpp_sdk.ClientConfigData(
    container_data={
        "container_root_path": f"{CONTAINER_ROOT_PATH}",
    },
)

file_sys_handler = infy_fs_utils.provider.FileSystemHandler(storage_config_data)

# Add the handler under the FSH_DPP key only if none is registered under it yet.
if not infy_fs_utils.manager.FileSystemManager().has_fs_handler(
        infy_dpp_sdk.common.Constants.FSH_DPP):
    infy_fs_utils.manager.FileSystemManager().add_fs_handler(
        file_sys_handler, infy_dpp_sdk.common.Constants.FSH_DPP)

infy_dpp_sdk.ClientConfigManager().load(client_config_data)

#### Initialize Logging

In [None]:
# Logging settings handed to the file-system logging handler.
logging_config_data = infy_fs_utils.data.LoggingConfigData(
    # logger_group_name="my_group_1",
    logging_level=10,  # 10 is the numeric value of Python's logging.DEBUG
    logging_format="",
    logging_timestamp_format="",
    log_file_data={
        "log_file_dir_path": "/logs",
        "log_file_name_prefix": "retriever",
        # "log_file_name_suffix": "1",
        "log_file_extension": ".log",
    },
)

In [None]:
# Add a file-system logging handler under the FSLH_DPP key only if none is
# registered under that key yet.
if not infy_fs_utils.manager.FileSystemLoggingManager().has_fs_logging_handler(
        infy_dpp_sdk.common.Constants.FSLH_DPP):
    # The handler is built from the logging config and the storage file-system handler.
    fs_logging_handler = infy_fs_utils.provider.FileSystemLoggingHandler(
        logging_config_data, file_sys_handler)
    infy_fs_utils.manager.FileSystemLoggingManager().add_fs_logging_handler(
        fs_logging_handler, infy_dpp_sdk.common.Constants.FSLH_DPP)

### Query (picked from config file)

In [None]:
# Read the pipeline input config through the storage handler and parse it as JSON.
input_config_data = json.loads(
    file_sys_handler.read_file(PIPELINE_INPUT_CONFIG_FILE_PATH))
# Show the question of the first query configured for the QueryRetriever processor.
first_query = input_config_data['processor_input_config']['QueryRetriever']['queries'][0]
print(first_query['question'])

## Run the Pipeline

In [None]:
# ---- Create response data -----
# Build the objects inside-out: an argument-less ValueData as the filepath,
# wrapped in StandardData, wrapped in MetaData, wrapped in DocumentData.
filepath_value = infy_dpp_sdk.data.ValueData()
standard_data = infy_dpp_sdk.data.StandardData(filepath=filepath_value)
metadata = infy_dpp_sdk.data.MetaData(standard_data=standard_data)
document_data = infy_dpp_sdk.data.DocumentData(metadata=metadata)

# The pipeline starts with an empty context.
context_data = {}

response_data = infy_dpp_sdk.data.ProcessorResponseData(
    document_data=document_data,
    context_data=context_data,
)

# Serialise to JSON and parse it back to obtain a plain-dict form of the response.
document_data_json = json.loads(response_data.json(indent=4))

In [None]:
# Create the orchestrator from the pipeline input config file path defined earlier.
dpp_orchestrator = infy_dpp_sdk.orchestrator.OrchestratorNativeBasic(
        input_config_file_path=PIPELINE_INPUT_CONFIG_FILE_PATH)

In [None]:
# Rebuild the DocumentData from its plain-dict form and fetch the matching context.
input_document_data = infy_dpp_sdk.data.DocumentData(
    **document_data_json.get('document_data'))
input_context_data = document_data_json.get('context_data')

# run_batch is given two one-element lists: the documents and their contexts.
processor_response_list = dpp_orchestrator.run_batch(
    [input_document_data], [input_context_data])

### Verify results

In [None]:
# Heading line printed ahead of the results table shown in the next cell.
print("Output of retrieval pipeline is: ")

In [None]:
# Take the top-k matches of the first query from the first pipeline response.
first_query_result = processor_response_list[0].context_data.get(
    "query_retriever").get("queries")[0]
top_k_matches_list = first_query_result.get("top_k_matches")

# Each entry of top_k_matches_list is a dict whose values are lists of matches;
# flatten all of them into one list.
match_list = [
    hit
    for matches_by_key in top_k_matches_list
    for hits in matches_by_key.values()
    for hit in hits
]

# One metadata dict per match, in the same order as match_list.
meta_data_list = [hit.get("meta_data") for hit in match_list]

data = {
    "Content": [hit.get("content") for hit in match_list],
    "Score": [hit.get("score") for hit in match_list],
    "Metadata-page_no": [meta.get("page_no") for meta in meta_data_list],
    "Metadata-doc_name": [meta.get("doc_name") for meta in meta_data_list],
}
# Last expression of the cell, so the notebook renders the table.
pd.DataFrame(data)

In [None]:
# print(json.dumps(processor_response_list[0].context_data.get("query_retriever"),indent=4))