In [1]:
%load_ext autoreload
%autoreload 2

In [37]:
#import libraries
import json
import os
from annoy import AnnoyIndex
import re
import gradio as gr

In [3]:
import utils.markdown as markdown
import utils.file_utils as file_utils
import utils.cohere as cohere
import utils.openai as openai
import utils.openrouter as openrouter
import utils.token as token
import utils.embeddings as embeddings_util

In [7]:
temp_json_files = file_utils.walk_files("jsons", [".json"])
print(temp_json_files)
# remove files that do not have "pages" in data
json_files = []
for file in temp_json_files:
    with open(file, "r") as f:
        data = json.load(f)
        if "pages" in data:
            json_files.append(file)

project_names = []
for file in json_files:
    # get the project name from the file path
    project_name = os.path.basename(os.path.dirname(file))
    # if project_name not in project_names:
    project_names.append(project_name)
print(project_names)

['jsons/8607-8649-8675-8699-McHugh-Street/Application Form Plan of Subdivision or Condominium.pdf.json', 'jsons/8607-8649-8675-8699-McHugh-Street/Condominium Drawings.pdf.json', 'jsons/8607-8649-8675-8699-McHugh-Street/DPC Exemption Cover Letter (December 2021).pdf.json', 'jsons/0-1095-1185-North-Talbot-Road/SDN-001_21 - Drawings.pdf.json', 'jsons/0-1095-1185-North-Talbot-Road/SDN-001_21 - Storm Water Management Report (updated).pdf.json', 'jsons/0-1095-1185-North-Talbot-Road/SDN-001_21 - Storm Water Management Report.pdf.json', 'jsons/0-1095-1185-North-Talbot-Road/SDN-001_21 - 1095 and 1185 North Talbot Survey.pdf.json', 'jsons/0-1095-1185-North-Talbot-Road/SDN-001_21 - Acoustical Report.pdf.json', 'jsons/0-1095-1185-North-Talbot-Road/SDN-001_21 - Bellocorp Development - North Talbot Tree Report.pdf.json', 'jsons/0-1095-1185-North-Talbot-Road/SDN-001_21 - Draft Plan.pdf.json', 'jsons/0-1095-1185-North-Talbot-Road/SDN-001_21 - Endangered Species Act Study.pdf.json', 'jsons/0-1095-1185-

In [None]:

project_name = "0-1095-1185-North-Talbot-Road"
project_files = None
index = None
chunks = None
urls = None
page_numbers = None

def get_project_names():
    # Use the existing `project_names` variable
    return sorted(list(set(project_names)))

def get_project_files(project_name):
    # Filter files based on the selected project name
    files = []
    for file, project in zip(json_files, project_names):
        if project == project_name:
            files.append(file)
    return files

def sanitize_file_name(file_name):
    # Replace unsafe characters with underscores
    return re.sub(r'[<>:"/\\|?*]', '_', file_name)

def load_project(proj):
    global project_name
    global project_files
    global index
    global chunks
    global urls
    global page_numbers
    print(f"loading project: {project_name}")
    project_name = proj
    # load project files
    project_files = get_project_files(project_name)
    print(f"project files: {project_files}")
    # load project index
    index = AnnoyIndex(384, 'angular')
    # /home/singh4u1/citywindsor/fast_embeddings/project/0-666-676-684-696-chatham-street-west_embedding_references.json
    # index.load(f"fast_embeddings/project/{project_name}_embeddings.ann")
    # load project chunks
    chunks = []
    urls = []
    page_numbers = []
    # urls are present in applications.json
    current_project_data = []
    
    with open("applications.json", 'r') as f:
        data = json.load(f)
        for project in data:
            if project['file_name'] == project_name:
                current_project_data = project
                break
    pdf_links = current_project_data['pdf_links']

    for file in project_files:
        num_chunks = 0
        with open(file, 'r') as f:
            data = json.load(f)
            num_chunks = len(data['pages'])
            for i in range(num_chunks):
                chunks.append(data['pages'][i]['markdown'])
            # chunks.append(data['pages'][0]['markdown'])
                page_numbers.append(i+1)
        # create embeddings for chunks
    # get urls from current_project_data
    # print(current_project_data)
    
    # for file in project_files:
        # print(file)
        # if file = jsons/0-1095-1185-North-Talbot-Road/SDN-001_21 - Subdivision Application Signed Commissioned.pdf.json
        # file_name = SDN-001_21 - Subdivision Application Signed Commissioned
        file_name = os.path.basename(file).split(".")[0]
        # print(file_name)
        for pdf in pdf_links:
            # print(pdf)
            # print(pdf['name'])
            # print(f"{file_name}=={pdf['name']}")
            pdf_name = sanitize_file_name(pdf['name'])
            if file_name == pdf_name:
                for i in range(num_chunks):
                    urls.append(pdf['url'])
    print("urls")
    print(urls)
    embeddings = embeddings_util.generate_embeddings_fastembed(chunks)
        # add embeddings to index
    for i in range(len(embeddings)):
        index.add_item(i, embeddings[i])
    # build index
    index.build(10)




def chat_with_project(user_input, history):
    global project_name
    global index
    global chunks
    global urls
    global page_numbers
    # Simulate a response based on the project name and user input
    # response = f"Response for project '{project_name}': You said '{user_input}'"
    # chat_history.append((user_input, response))
    # return chat_history
    nearest_neighbors = index.get_nns_by_vector(embeddings_util.generate_embeddings_fastembed(user_input)[0], 20)
    documents = []
    print(len(chunks))
    print(nearest_neighbors)
    for i in range(len(nearest_neighbors)):
        documents.append({
            "id": f"{nearest_neighbors[i]}",
            "data": chunks[nearest_neighbors[i]],
        })
    system_prompt = """
    You are a helpful assistant that answers questions based on the context provided. 
    You are given a list of documents and a question. 
    Your task is to answer the question based on the context provided.
    Your responses should be concise and relevant to the question asked. Do not include any information that is not relevant to the question.
    """
    response = cohere.query(message = user_input, documents = documents, )
    response_text = response[0].message.content[0].text
    for citation in reversed(response[0].message.citations):
        # get the index
        ind = citation.sources[0].id
        # get the file and page number
        print(urls)
        url = urls[int(ind)]
        response_text = response_text.replace(citation.text, f"[{citation.text}]({url}#page={page_numbers[int(ind)]})")
    return response_text

def random_response(input, history):
    # Simulate a random response
    return f"Random response to: {input}"

load_project(project_name)
resp = chat_with_project("Who is the agent for this application?", None)
print(resp)

with gr.Blocks() as demo:
    project_selector = gr.Dropdown(
        choices=get_project_names(),
        label="Select a Project",
        interactive=True
    )
    chat_interface = gr.ChatInterface(
        fn=chat_with_project,
        type="messages"
    )
    project_selector.change(
        fn=load_project,
        inputs=project_selector,
        outputs=None
    )
demo.launch()


loading project: 0-1095-1185-North-Talbot-Road
project files: ['jsons/0-1095-1185-North-Talbot-Road/SDN-001_21 - 1095 and 1185 North Talbot Survey.pdf.json', 'jsons/0-1095-1185-North-Talbot-Road/SDN-001_21 - Acoustical Report.pdf.json', 'jsons/0-1095-1185-North-Talbot-Road/SDN-001_21 - Bellocorp Development - North Talbot Tree Report.pdf.json', 'jsons/0-1095-1185-North-Talbot-Road/SDN-001_21 - Draft Plan.pdf.json', 'jsons/0-1095-1185-North-Talbot-Road/SDN-001_21 - Endangered Species Act Study.pdf.json', 'jsons/0-1095-1185-North-Talbot-Road/SDN-001_21 - Planning Rationale Report.pdf.json', 'jsons/0-1095-1185-North-Talbot-Road/SDN-001_21 - Species At Risk (SAR).pdf.json', 'jsons/0-1095-1185-North-Talbot-Road/SDN-001_21 - Subdivision Application Signed Commissioned.pdf.json']
urls
['https://www.citywindsor.ca/Documents/residents/planning/land-development/development-applications/current-development-applications/SDN-001_21%20-%201095%20and%201185%20North%20Talbot%20Survey.pdf', 'https://ww

[0;93m2025-05-10 00:26:52.160519844 [W:onnxruntime:, session_state.cc:1166 VerifyEachNodeIsAssignedToAnEp] Some nodes were not assigned to the preferred execution providers which may or may not have an negative impact on performance. e.g. ORT explicitly assigns shape related ops to CPU to improve perf.[m
[0;93m2025-05-10 00:26:52.160555852 [W:onnxruntime:, session_state.cc:1168 VerifyEachNodeIsAssignedToAnEp] Rerunning with verbose output on a non-minimal build will show node assignments.[m
[0;93m2025-05-10 00:26:53.297804447 [W:onnxruntime:, session_state.cc:1166 VerifyEachNodeIsAssignedToAnEp] Some nodes were not assigned to the preferred execution providers which may or may not have an negative impact on performance. e.g. ORT explicitly assigns shape related ops to CPU to improve perf.[m
[0;93m2025-05-10 00:26:53.297836948 [W:onnxruntime:, session_state.cc:1168 VerifyEachNodeIsAssignedToAnEp] Rerunning with verbose output on a non-minimal build will show node assignments.[m


144
[130, 143, 139, 129, 138, 132, 131, 29, 133, 136, 140, 80, 68, 81, 142, 137, 134, 141, 135, 89]
[]
['https://www.citywindsor.ca/Documents/residents/planning/land-development/development-applications/current-development-applications/SDN-001_21%20-%201095%20and%201185%20North%20Talbot%20Survey.pdf', 'https://www.citywindsor.ca/Documents/residents/planning/land-development/development-applications/current-development-applications/SDN-001_21%20-%20Acoustical%20Report.pdf', 'https://www.citywindsor.ca/Documents/residents/planning/land-development/development-applications/current-development-applications/SDN-001_21%20-%20Acoustical%20Report.pdf', 'https://www.citywindsor.ca/Documents/residents/planning/land-development/development-applications/current-development-applications/SDN-001_21%20-%20Acoustical%20Report.pdf', 'https://www.citywindsor.ca/Documents/residents/planning/land-development/development-applications/current-development-applications/SDN-001_21%20-%20Acoustical%20Report.pdf



[0;93m2025-05-10 00:27:09.669395567 [W:onnxruntime:, session_state.cc:1166 VerifyEachNodeIsAssignedToAnEp] Some nodes were not assigned to the preferred execution providers which may or may not have an negative impact on performance. e.g. ORT explicitly assigns shape related ops to CPU to improve perf.[m
[0;93m2025-05-10 00:27:09.669442946 [W:onnxruntime:, session_state.cc:1168 VerifyEachNodeIsAssignedToAnEp] Rerunning with verbose output on a non-minimal build will show node assignments.[m


144
[143, 130, 139, 138, 129, 132, 131, 29, 140, 133, 136, 142, 80, 81, 134, 137, 68, 141, 28, 135]
[]


[0;93m2025-05-10 00:27:34.655132272 [W:onnxruntime:, session_state.cc:1166 VerifyEachNodeIsAssignedToAnEp] Some nodes were not assigned to the preferred execution providers which may or may not have an negative impact on performance. e.g. ORT explicitly assigns shape related ops to CPU to improve perf.[m
[0;93m2025-05-10 00:27:34.655170945 [W:onnxruntime:, session_state.cc:1168 VerifyEachNodeIsAssignedToAnEp] Rerunning with verbose output on a non-minimal build will show node assignments.[m


144
[143, 130, 139, 138, 129, 132, 131, 29, 140, 133, 136, 142, 80, 81, 134, 137, 68, 141, 28, 135]
[]
['https://www.citywindsor.ca/Documents/residents/planning/land-development/development-applications/current-development-applications/SDN-001_21%20-%201095%20and%201185%20North%20Talbot%20Survey.pdf', 'https://www.citywindsor.ca/Documents/residents/planning/land-development/development-applications/current-development-applications/SDN-001_21%20-%20Acoustical%20Report.pdf', 'https://www.citywindsor.ca/Documents/residents/planning/land-development/development-applications/current-development-applications/SDN-001_21%20-%20Acoustical%20Report.pdf', 'https://www.citywindsor.ca/Documents/residents/planning/land-development/development-applications/current-development-applications/SDN-001_21%20-%20Acoustical%20Report.pdf', 'https://www.citywindsor.ca/Documents/residents/planning/land-development/development-applications/current-development-applications/SDN-001_21%20-%20Acoustical%20Report.pdf

[0;93m2025-05-10 00:29:24.833177949 [W:onnxruntime:, session_state.cc:1166 VerifyEachNodeIsAssignedToAnEp] Some nodes were not assigned to the preferred execution providers which may or may not have an negative impact on performance. e.g. ORT explicitly assigns shape related ops to CPU to improve perf.[m
[0;93m2025-05-10 00:29:24.833224707 [W:onnxruntime:, session_state.cc:1168 VerifyEachNodeIsAssignedToAnEp] Rerunning with verbose output on a non-minimal build will show node assignments.[m


144
[143, 15, 131, 130, 133, 141, 129, 27, 122, 45, 47, 121, 135, 51, 134, 43, 38, 123, 56, 49]
[]
['https://www.citywindsor.ca/Documents/residents/planning/land-development/development-applications/current-development-applications/SDN-001_21%20-%201095%20and%201185%20North%20Talbot%20Survey.pdf', 'https://www.citywindsor.ca/Documents/residents/planning/land-development/development-applications/current-development-applications/SDN-001_21%20-%20Acoustical%20Report.pdf', 'https://www.citywindsor.ca/Documents/residents/planning/land-development/development-applications/current-development-applications/SDN-001_21%20-%20Acoustical%20Report.pdf', 'https://www.citywindsor.ca/Documents/residents/planning/land-development/development-applications/current-development-applications/SDN-001_21%20-%20Acoustical%20Report.pdf', 'https://www.citywindsor.ca/Documents/residents/planning/land-development/development-applications/current-development-applications/SDN-001_21%20-%20Acoustical%20Report.pdf', '

[0;93m2025-05-10 00:29:54.056151377 [W:onnxruntime:, session_state.cc:1166 VerifyEachNodeIsAssignedToAnEp] Some nodes were not assigned to the preferred execution providers which may or may not have an negative impact on performance. e.g. ORT explicitly assigns shape related ops to CPU to improve perf.[m
[0;93m2025-05-10 00:29:54.056188306 [W:onnxruntime:, session_state.cc:1168 VerifyEachNodeIsAssignedToAnEp] Rerunning with verbose output on a non-minimal build will show node assignments.[m
[0;93m2025-05-10 00:30:06.834758808 [W:onnxruntime:, session_state.cc:1166 VerifyEachNodeIsAssignedToAnEp] Some nodes were not assigned to the preferred execution providers which may or may not have an negative impact on performance. e.g. ORT explicitly assigns shape related ops to CPU to improve perf.[m
[0;93m2025-05-10 00:30:06.834800536 [W:onnxruntime:, session_state.cc:1168 VerifyEachNodeIsAssignedToAnEp] Rerunning with verbose output on a non-minimal build will show node assignments.[m


31
[16, 7, 20, 6, 4, 2, 15, 9, 10, 12, 19, 18, 17, 11, 13, 3, 5, 8, 14, 0]
[]
['https://www.citywindsor.ca/Documents/residents/planning/land-development/development-applications/current-development-applications/CDM-004_22%20(CDM-6939)%20-%2001178-0312.pdf', 'https://www.citywindsor.ca/Documents/residents/planning/land-development/development-applications/current-development-applications/CDM-004_22%20(CDM-6939)%20-%2001178-0312.pdf', 'https://www.citywindsor.ca/Documents/residents/planning/land-development/development-applications/current-development-applications/CDM-004_22%20(CDM-6939)%20-%20Application%20for%20Plan%20of%20Condo.pdf', 'https://www.citywindsor.ca/Documents/residents/planning/land-development/development-applications/current-development-applications/CDM-004_22%20(CDM-6939)%20-%20Application%20for%20Plan%20of%20Condo.pdf', 'https://www.citywindsor.ca/Documents/residents/planning/land-development/development-applications/current-development-applications/CDM-004_22%20(CDM-6