In [1]:
import random
from src.configs.env_config import config
from src.services.db import chroma_service
from pathlib import Path
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import FlashrankRerank
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_chroma import Chroma
from pprint import pprint
from langchain_core.prompts import PromptTemplate
from src.services.utils import (
    text_splitter_recursive_char,
    create_chunk_ids,
    json_to_documents,
)
from src.services.processors import DocumentsPreprocessing
from src.services.vectorstore import ChromaStore
from src.services.retrievers import MultiQRerankedRetriever

In [2]:
client = chroma_service()
client.heartbeat()

1744045750875475842

In [3]:
collection_name = "local_collection"

In [None]:
if client.get_collection(collection_name):
    client.delete_collection(collection_name)

collection = client.get_or_create_collection(collection_name)
collection

In [11]:
pdf_data_src = Path("_dev_nb/output_data/pdf_loader")
web_data_src = Path("_dev_nb/output_data/web_loader")

### classes


In [12]:
client.get_collection(collection_name).count()

0

In [13]:
json_path = web_data_src / "setics_stad_docs_clean.json"
docs = json_to_documents(filename=json_path)

print(f"Got {len(docs)} documents")

Got 525 documents


In [14]:
docs_copy1 = docs.copy()
random.shuffle(docs_copy1)
split_idx1 = random.randint(100, len(docs_copy1) - 1)
docs_part1 = docs_copy1[:split_idx1]

docs_copy2 = docs.copy()
random.shuffle(docs_copy2)
split_idx2 = random.randint(100, len(docs_copy2) - 1)
docs_part2 = docs_copy2[:split_idx2]

print(f"Dataset 1 -> Part 1: {len(docs_part1)} | Part 2: {len(docs_part2)}")

Dataset 1 -> Part 1: 231 | Part 2: 362


In [15]:
processor = DocumentsPreprocessing()
chunks1, ids1 = await processor(documents=docs_part1)
chunks2, ids2 = await processor(documents=docs_part2)

print(f"Created {len(chunks1)} and {len(chunks2)} chunks")

Created 500 and 812 chunks


In [16]:
store = ChromaStore()

In [17]:
added_count, skipped_count, skipped_sources = await store.add_documents(
    documents=chunks1, ids=ids1, collection_name=collection_name
)

print(f"Added {added_count} chunks to the collection")
print(f"Skipped {skipped_count} chunks")
print(f"Skipped sources: {skipped_sources}")

Added 500 chunks to the collection
Skipped 0 chunks
Skipped sources: []


In [18]:
client.get_collection(collection_name).count()

500

In [19]:
added_count, skipped_count, skipped_sources = await store.add_documents(
    documents=chunks2, ids=ids2, collection_name=collection_name
)

print(f"Added {added_count} chunks to the collection")
print(f"Skipped {skipped_count} chunks")
print(f"Skipped sources: {skipped_sources}")

Added 494 chunks to the collection
Skipped 318 chunks
Skipped sources: ['https://docs.setics-sttar.com/advanced-designer-user-manual/2.3/fr/topic/cabling-diagram', 'https://docs.setics-sttar.com/advanced-designer-user-manual/2.3/en/topic/import-infrastructure-data', 'https://docs.setics-sttar.com/advanced-designer-user-manual/2.3/fr/topic/property-table', 'https://docs.setics-sttar.com/advanced-designer-user-manual/2.3/en/topic/creation-tool', 'https://docs.setics-sttar.com/advanced-designer-user-manual/2.3/en/topic/import-endpoints?q=import+engineering+rules', 'https://docs.setics-sttar.com/advanced-designer-user-manual/2.3/en/topic/topology3', 'https://docs.setics-sttar.com/advanced-designer-user-manual/2.3/fr/topic/setics-sttar-data-model', 'https://docs.setics-sttar.com/advanced-designer-user-manual/2.3/fr/topic/cable-system-commands', 'https://docs.setics-sttar.com/advanced-designer-user-manual/2.3/fr/topic/network-service-areas-report', 'https://docs.setics-sttar.com/advanced-des

In [20]:
client.get_collection(collection_name).count()

994

In [21]:
processor = DocumentsPreprocessing()
chunks, ids = await processor(documents=docs)

print(f"Created {len(chunks)} chunks")

Created 1145 chunks


In [22]:
added_count, skipped_count, skipped_sources = await store.add_documents(
    documents=chunks, ids=ids, collection_name=collection_name
)

print(f"Added {added_count} chunks to the collection")
print(f"Skipped {skipped_count} chunks")
print(f"Skipped sources: {skipped_sources}")

Added 151 chunks to the collection
Skipped 994 chunks
Skipped sources: ['https://docs.setics-sttar.com/advanced-designer-user-manual/2.3/en/topic/topology', 'https://docs.setics-sttar.com/advanced-designer-user-manual/2.3/fr/topic/endpoint-support-context-menu', 'https://docs.setics-sttar.com/advanced-designer-user-manual/2.3/fr/topic/search-by-cost-effectiveness-using-actual-costs', 'https://docs.setics-sttar.com/advanced-designer-user-manual/2.3/en/topic/naming-rules-syntax', 'https://docs.setics-sttar.com/advanced-designer-user-manual/2.3/fr/topic/duct-assembly-datasheet', 'https://docs.setics-sttar.com/advanced-designer-user-manual/2.3/en/topic/cable-system-commands', 'https://docs.setics-sttar.com/advanced-designer-user-manual/2.3/fr/topic/splicing-plans-options', 'https://docs.setics-sttar.com/advanced-designer-user-manual/2.3/fr/topic/start-network-optimization', 'https://docs.setics-sttar.com/advanced-designer-user-manual/2.3/fr/topic/installing-a-workstation-license', 'https:/

In [23]:
client.get_collection(collection_name).count()

1145

In [24]:
query = "What is the purpose of the Advanced Designer?"

In [25]:
retriever = MultiQRerankedRetriever()
results = await retriever(query=query, collection_name=collection_name)

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:langchain.retrievers.multi_query:Generated queries: ['What are the main functions and objectives of the Advanced Designer?  ', 'How does the Advanced Designer contribute to its field or industry?  ', 'Can you explain the role and significance of the Advanced Designer in design processes?']
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://localhost:8000/api/v2/tenants/default_tenant/databases/default_database/collections/e83ff1ff-b3c7-428c-a85b-59afcf629143/query "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://localhost:8000/api/v2/tenants/default_tenant/databases/default_database/collections/e83ff1ff-b3c7-428c-a85b-59afcf629143/query "HTTP/1.1 20

In [26]:
for result in results:
    pprint(f"Metadata: {result.metadata}")
    print("-" * 80)

("Metadata: {'id': 'import-engineering-rules-739-bd4de5dc', 'relevance_score': "
 "0.9990605, 'language': 'en', 'source': "
 "'https://docs.setics-sttar.com/advanced-designer-user-manual/2.3/en/topic/import-engineering-rules', "
 "'description': '', 'title': 'Search - Setics Sttar Advanced Designer  |  "
 "User Manual - Version 2.3'}")
--------------------------------------------------------------------------------
("Metadata: {'id': 'define-engineering-rules-483-3b2e7cb1', 'relevance_score': "
 "0.998524, 'title': 'Define Engineering Rules - Setics Sttar Advanced "
 "Designer  |  User Manual - Version 2.3', 'description': 'Specifying "
 'engineering rules allows you to control how Setics Sttar Advanced Designer '
 "will automatically create the architecture and size the network. A...', "
 "'source': "
 "'https://docs.setics-sttar.com/advanced-designer-user-manual/2.3/en/topic/define-engineering-rules', "
 "'language': 'en'}")
-----------------------------------------------------------

In [27]:
print(results[0])

page_content='!To edit the cable system a Sttar Advanced Designer is…

Project Configuration File

EXPORT NETWORK DATA » Project Configuration File

 You can save the project in two different formats: *.sdproj and *.sdconfig. Both formats allow you to save the project settings, however they differ on how the data is saved.

The *.sdproj format independently contains all the data, so it can be used completely…

Cable System Tab

USER INTERFACE » Cable System Tab

 The Cable System window allows you to set level by level, support by support, the cable sizing rules.

 Main window
 Commands and Filters
 Splicing properties
 Cable Properties
 Available Cables
 Available Equipment

!To edit the cable system a Sttar Advanced…

Preprocess with FME

APPENDICES » Setting of Filters and Preprocessing of Input Data » Preprocess with FME' metadata={'id': 'import-engineering-rules-739-bd4de5dc', 'relevance_score': 0.9990605, 'language': 'en', 'source': 'https://docs.setics-sttar.com/advanced-designe

### prototyping


In [None]:
llm = ChatOpenAI(
    model="gpt-4o-mini",
    temperature=0,
    api_key=config.OPENAI_API_KEY,
    max_tokens=1000,
)

In [None]:
pdf_json_path = pdf_data_src / "xplore_pdf_3_clean.json"
pdf_docs = json_to_documents(filename=pdf_json_path)
len(pdf_docs)

In [None]:
web_json_path = web_data_src / "setics_stad_docs_clean.json"
web_docs = json_to_documents(filename=web_json_path)
len(web_docs)

In [None]:
web_json_path_2 = web_data_src / "setics_stpl_docs_clean.json"
web_docs_2 = json_to_documents(filename=web_json_path_2)
len(web_docs_2)

In [None]:
img_json_path = web_data_src / "setics_stad_img_docs.json"
img_docs = json_to_documents(filename=img_json_path)
len(img_docs)

In [None]:
# for i, doc in enumerate(pdf_docs):
#     print(f"Doc {i}: length {len(doc.page_content)}")

In [None]:
pdf_chunks = text_splitter_recursive_char(pdf_docs)
len(pdf_chunks)

In [None]:
# for i, doc in enumerate(web_docs):
#     print(f"Doc {i}: length {len(doc.page_content)}")

In [None]:
web_chunks = text_splitter_recursive_char(web_docs)
len(web_chunks)

In [None]:
web_chunks_2 = text_splitter_recursive_char(web_docs_2)
len(web_chunks_2)

In [None]:
# for i, doc in enumerate(web_chunks):
#     print(f"Doc {i}: length {len(doc.page_content)}")

In [None]:
pdf_chunks_ids = create_chunk_ids(pdf_chunks)
web_chunks_ids = create_chunk_ids(web_chunks)
web_chunks_2_ids = create_chunk_ids(web_chunks_2)

img_ids = [i.metadata["id"] for i in img_docs]

print(
    pdf_chunks_ids[:2], web_chunks_ids[:2], web_chunks_2_ids[:2], img_ids[:2], sep="\n"
)

In [None]:
# pprint(web_chunks[10].metadata)

In [None]:
openai_embedding = OpenAIEmbeddings(
    model="text-embedding-3-large", openai_api_key=config.OPENAI_API_KEY
)

In [None]:
vector_store = Chroma(
    client=client,
    collection_name=collection.name,
    embedding_function=openai_embedding,
)

In [None]:
documents_with_ids = [
    (web_chunks, web_chunks_ids),
    (pdf_chunks, pdf_chunks_ids),
    (web_chunks_2, web_chunks_2_ids),
    (img_docs, img_ids),
]

for docs, ids in documents_with_ids:
    vector_store.add_documents(documents=docs, ids=ids)

In [None]:
collection.count()

In [None]:
# retriever = vector_store.as_retriever(
#     search_type="mmr",
#     # search_type="similarity_score_threshold",
#     # search_kwargs={"k": 3, "score_threshold": 0.5},
#     search_kwargs={"k": 3},
# )

In [None]:
# retriever = MultiQueryRetriever.from_llm(retriever=vector_store.as_retriever(), llm=llm)

# retriever = SelfQueryRetriever.from_llm(
#     llm=llm,
#     vectorstore=vector_store,
# )

In [None]:
# wrapping base retriever with FlashRank compressor

# create MultiQueryRetriever
base_retriever = vector_store.as_retriever(search_kwargs={"k": 10})
multi_query_retriever = MultiQueryRetriever.from_llm(retriever=base_retriever, llm=llm)

# add reranker on top
compressor = FlashrankRerank(top_n=3)
retriever = ContextualCompressionRetriever(
    base_compressor=compressor, base_retriever=multi_query_retriever
)

In [None]:
# query = "What is the installation requirement for flower pot?"
# query = "What can you tell me about Setics Sttar?"
# query = "In Sttar, how to add a new infrastructure layer?"
# query = "In Sttar, how to manually split some lines in the interface, in the map view?"
# query = " In sttar, how can we manage the support properties, for the reusable infrastructure?"
query = " What is the differences between the advanced designer and the planner?"

In [None]:
# # Set logging for the queries
# import logging

# logging.basicConfig()
# logging.getLogger("langchain.retrievers.multi_query").setLevel(logging.INFO)

In [None]:
results = retriever.invoke(query)
results

In [None]:
# for result in results:
#     print(result.page_content)
#     print("\n\n===\n\n")

### chatbot


In [28]:
llm = ChatOpenAI(
    model="gpt-4o-mini",
    temperature=0,
    api_key=config.OPENAI_API_KEY,
    # max_tokens=1000,
)

In [29]:
template = """You are an assistant for question-answering tasks.
Use the following pieces of retrieved context to answer the question.
If you don't know the answer, just say that you don't know.
Use three sentences maximum and keep the answer concise.
Context: {context}
Question: {question}
Answer:"""

prompt = PromptTemplate.from_template(template)

docs_content = "\n\n".join(doc.page_content for doc in results)

messages = prompt.invoke({"question": query, "context": docs_content})
response = llm.invoke(messages)

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


In [30]:
pprint(response.content)

('The purpose of the Sttar Advanced Designer is to automatically create the '
 'architecture and size the network by applying specified engineering rules. '
 'It organizes the access network as a hierarchical tree structure, with a '
 'central office at the top and interconnected nodes managing specific service '
 'areas. This tool facilitates efficient project configuration and cable '
 'system management.')


In [32]:
collection = client.get_collection(collection_name)
collection.get(
    ids=results[0].metadata["id"], include=["documents", "metadatas", "embeddings"]
)

INFO:httpx:HTTP Request: GET http://localhost:8000/api/v2/tenants/default_tenant/databases/default_database/collections/local_collection "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://localhost:8000/api/v2/tenants/default_tenant/databases/default_database/collections/e83ff1ff-b3c7-428c-a85b-59afcf629143/get "HTTP/1.1 200 OK"


{'ids': ['import-engineering-rules-739-bd4de5dc'],
 'embeddings': array([[ 0.00307267,  0.0403012 , -0.01763814, ..., -0.00069231,
          0.00238089, -0.02035431]]),
 'metadatas': [{'id': 'import-engineering-rules-739-bd4de5dc',
   'title': 'Search - Setics Sttar Advanced Designer  |  User Manual - Version 2.3',
   'description': '',
   'source': 'https://docs.setics-sttar.com/advanced-designer-user-manual/2.3/en/topic/import-engineering-rules',
   'language': 'en'}],
 'documents': ['!To edit the cable system a Sttar Advanced Designer is…\n\nProject Configuration File\n\nEXPORT NETWORK DATA » Project Configuration File\n\n You can save the project in two different formats: *.sdproj and *.sdconfig. Both formats allow you to save the project settings, however they differ on how the data is saved.\n\nThe *.sdproj format independently contains all the data, so it can be used completely…\n\nCable System Tab\n\nUSER INTERFACE » Cable System Tab\n\n The Cable System window allows you to se