In [18]:
from vectordb_processing.utils import (
    connection_string_create,
    create_db,
    documents_in_folder,
    custom_document_loader,
    create_nodes,
    vector_store_create,
    populated_tables,
    index_load,
    query_engine_create
)
from llama_index.vector_stores.postgres import PGVectorStore
from llama_index.core.postprocessor import SimilarityPostprocessor
from llama_index.core import PromptTemplate
from pprint import pprint
import json
from tools.llm_select import gemini_flash

In [19]:
llm, embed_model = gemini_flash(api_path="/home/johan/Code/secrets/google_api.json")

The online model has been selected 💸


In [20]:
naming = "dd"
db_setup = {
    "llm": {
        "model": "models/gemini-1.5-flash-latest",
        "host": "cloud",
        "input_price": 0.35,
        "output_price": 1.05,
    },
    "vdb": {
        "db_name": f"embed001_{naming}_doc",
        "embedding": "models/embedding-001",
        "embed_dim": 768,
        "context_window": 8192,
        "embed_length": None,
        "chunk_size": 512,
        "chunk_overlap": 128,
        "protocol": "postgres",
        "hostname": "localhost",
        "username": "test1",
        "password": "test1",
        "port": 5432,
        "database": "postgres",
        "table_name": naming,
    },
}

In [21]:
connection_postgres = connection_string_create(
    protocol=db_setup["vdb"]["protocol"],
    hostname=db_setup["vdb"]["hostname"],
    username=db_setup["vdb"]["username"],
    password=db_setup["vdb"]["password"],
    port=db_setup["vdb"]["port"],
    database=db_setup["vdb"]["database"],
)
print(connection_postgres)

postgres://test1:test1@localhost:5432/postgres


In [22]:
create_db(
    db_name=db_setup["vdb"]["db_name"],
    connection_string=connection_postgres,
    auto_commit=True,
    rebuild=False,
)

Connecting to: postgres://test1:test1@localhost:5432/postgres
embed001_dd_doc exists
Vector extenstion has been created or exists


In [23]:
file_names, file_paths = documents_in_folder(
    "/home/johan/Code/crewai_quarterly/bin/data/questions", 200
)
pprint(file_names)
pprint(file_paths)
print(len(file_names))

documents = custom_document_loader(file_paths)

['ILPA_Due_Diligence_Questionnaire_v1.2.pdf']
['/home/johan/Code/crewai_quarterly/bin/data/questions/ILPA_Due_Diligence_Questionnaire_v1.2.pdf']
1


Procesing documents: 100%|██████████| 1/1 [00:00<00:00,  8.94 file/s]


In [24]:
nodes = create_nodes(documents=documents, cover_pg_date=False)
pprint(nodes)

[TextNode(id_='87cac178107b0062bda4f533c192dff9c2fca3e13f97f33b3d50eba4e213bd9f', embedding=None, metadata={'total_pages': 29, 'file_path': '/home/johan/Code/crewai_quarterly/bin/data/questions/ILPA_Due_Diligence_Questionnaire_v1.2.pdf', 'source': '1'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text='Due Diligence QuestionnaireVersion 1.2Most-Recently Revised in September 2018', mimetype='text/plain', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n'),
 TextNode(id_='88b12e78b083abfbcc6f1aedca49d7b5536a7d06b977c66e58bd1d088a2da61d', embedding=None, metadata={'total_pages': 29, 'file_path': '/home/johan/Code/crewai_quarterly/bin/data/questions/ILPA_Due_Diligence_Questionnaire_v1.2.pdf', 'source': '2'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text='Table of Contents \nOverview \n...................................

In [25]:
vector_store = PGVectorStore.from_params(
    database=db_setup["vdb"]["db_name"],
    host=db_setup["vdb"]["hostname"],
    password=db_setup["vdb"]["password"],
    port=db_setup["vdb"]["port"],
    user=db_setup["vdb"]["username"],
    table_name=db_setup["vdb"]["table_name"],
    embed_dim=db_setup["vdb"]["embed_dim"],
    hybrid_search=True,
    text_search_config="english",
)

In [26]:
connection_rag = connection_string_create(
    protocol=db_setup["vdb"]["protocol"],
    hostname=db_setup["vdb"]["hostname"],
    username=db_setup["vdb"]["username"],
    password=db_setup["vdb"]["password"],
    port=db_setup["vdb"]["port"],
    database=db_setup["vdb"]["db_name"],
)

In [27]:
table_names = populated_tables(connection_string=connection_rag)
print(table_names)

['data_dd']


In [28]:
sotrage_context, hybrid_index = index_load(
    vdb_connection=connection_rag, vector_store=vector_store, nodes=nodes
)

The index has been LOADED


In [29]:
query_engine = hybrid_index.as_query_engine(
    vector_store_query_mode="hybrid",
    similarity_top_k=7,
    vector_store_kwargs={
        "ivfflat_probes": 10,
        "hnsw_ef_search": 300
    },
    node_postprocessors=[SimilarityPostprocessor(similarity_cutoff=0.40)],
)

In [13]:
prompt_tmpl = PromptTemplate("""\
You are a professional investment analyst assisting a financial advisor who is doing a due dilligence. \n
---------------------\n
{context_str}
---------------------\n
Given the context information and not prior knowledge, answer the query.\n
Please write the answer as an investment professional who is writing a formal reponse to a question, being as detailed as possible,
explaining the thinking behind the answer. Do not reference any graphs.\n
Query: {query_str}
Answer: \
""")

In [14]:
query_engine.update_prompts(
    {"response_synthesizer:text_qa_template": prompt_tmpl}
)
prompts_dict = query_engine.get_prompts()
print(prompts_dict)

{'response_synthesizer:text_qa_template': PromptTemplate(metadata={'prompt_type': <PromptType.CUSTOM: 'custom'>}, template_vars=['context_str', 'query_str'], kwargs={}, output_parser=None, template_var_mappings=None, function_mappings=None, template='You are a professional investment analyst assisting a financial advisor who is doing a due dilligence. \n\n---------------------\n\n{context_str}\n---------------------\n\nGiven the context information and not prior knowledge, answer the query.\n\nPlease write the answer as an investment professional who is writing a formal reponse to a question, being as detailed as possible,\nexplaining the thinking behind the answer. Do not reference any graphs.\n\nQuery: {query_str}\nAnswer: '), 'response_synthesizer:refine_template': SelectorPromptTemplate(metadata={'prompt_type': <PromptType.REFINE: 'refine'>}, template_vars=['query_str', 'existing_answer', 'context_msg'], kwargs={}, output_parser=None, template_var_mappings={}, function_mappings={},

In [15]:
query_string = 'Please provide me with 5 important questions i should ask the fund managers?'

query_embedding = embed_model.get_query_embedding(query_string)
retrieval_response = query_engine.retrieve(
    query_string,
)

In [16]:
ret_scores = []
for ret_response in retrieval_response:
    ret_scores.append(ret_response.score)
pprint(ret_scores)
pprint(retrieval_response)

[0.675215626381406, 0.6731123959961128, 0.6712644881257158, 0.6621815373817636]
[NodeWithScore(node=TextNode(id_='88b12e78b083abfbcc6f1aedca49d7b5536a7d06b977c66e58bd1d088a2da61d', embedding=None, metadata={'total_pages': 29, 'file_path': '/home/johan/Code/crewai_quarterly/bin/data/questions/ILPA_Due_Diligence_Questionnaire_v1.2.pdf', 'source': '2'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text='The ILPA Due Diligence \nQuestionnaire (DDQ) reflects the view of the participants involved in the creation thereof as to best \npractices with respect to fund diligence. However, no limited partner should utilize the DDQ as a \nsubstitute for its own determination as to what information such limited partner will need or desire \nwith respect to any particular investment. Further, no representation is made that the DDQ, when \nprovided by general partners to their prospective investor base, will include all desirable information \nor will be fully inclu

In [17]:
ask = True
if ask:
    response = query_engine.query(
        query_string,
        # streaming=True,
    )
    pprint(response)

Response(response='Based on the provided ILPA Due Diligence Questionnaire '
                  '(DDQ), here are five important questions to ask the fund '
                  'managers during due diligence:\n'
                  '\n'
                  '1. **Fund Strategy and Performance:** \n'
                  '    * **Question:**  "Can you provide a detailed breakdown '
                  'of your investment strategy, including specific sectors, '
                  'geographies, and investment themes?  Please also share your '
                  'historical performance data, including both realized and '
                  'unrealized returns, for your previous funds.  How do you '
                  'measure and manage risk within your portfolio?"\n'
                  '    * **Rationale:** This question delves into the core of '
                  "the fund's investment approach. Understanding the strategy, "
                  'historical performance, and risk management practices is '
     