# Setting up requirements

In [66]:
!pip install cohere umap-learn altair annoy datasets tqdm



# Importing the necessary libraries

In [None]:
import os
import cohere
import numpy as np
import re
import pandas as pd
from tqdm import tqdm
from datasets import load_dataset
import umap
import altair as alt
from sklearn.metrics.pairwise import cosine_similarity
from annoy import AnnoyIndex
import warnings
import random

  from .autonotebook import tqdm as notebook_tqdm


# Reading a File

In [1]:
# read the following file
file_path="texts/0-666-676-684-696-chatham-street-west/Acoustical and Vibration Report.txt"
file_paths = [
    "texts/0-666-676-684-696-chatham-street-west/Acoustical and Vibration Report.txt",
    "texts/0-Clairview-Avenue/Environmental Evaluation Report.txt"
]
# read both files
texts = []
for path in file_paths:
    with open(path, 'r') as file:
        lines = file.readlines()
    # convert lines to a single string
    text = ''.join(lines)
    # trim starting and ending whitespace
    text = text.strip()
    texts.append(text)

# combine texts into a single string
text = ' '.join(texts)
# print first 100 characters of the combined string
print(text[:100])

ACOUSTICAL  AND VIBRATION REPORT  
696 CHATHAM  STREET WEST  
RESIDENTIAL CONDO DEVELOPMENT  
WINDSO


# Creating chunks

In [69]:
chunk_size = 1536
overlap = 128
chunks = []
for i in range(0, len(text), chunk_size - overlap):
    chunk = text[i:i + chunk_size]
    if len(chunk) == chunk_size:
        chunks.append(chunk)
    else:
        # pad the last chunk with spaces
        chunks.append(chunk + ' ' * (chunk_size - len(chunk)))
# print the number of chunks
print(f"Number of chunks: {len(chunks)}")

Number of chunks: 72


# Embedding Chunks

In [None]:
import cohere
# embed chunks
model_name = 'embed-v4.0'
api_key = os.getenv('COHERE_API_KEY')
input_type = 'search_document'
embedding_types = ['float']
embedding_dimensions = 1536

co = cohere.ClientV2(api_key)

response = co.embed(
    texts = chunks,
    model = model_name,
    input_type = input_type,
    embedding_types=['float'],
    output_dimension=embedding_dimensions,
)

print(response)
embeddings = response.embeddings
print(embeddings)


float_=[[0.014709473, -0.040527344, 0.036621094, 0.00024986267, -0.029052734, 0.0021362305, 0.026977539, 0.003250122, 0.011657715, 0.02722168, -0.01574707, -0.00018119812, 0.018798828, -0.021240234, 0.028564453, 0.03149414, 0.002380371, 0.017944336, 0.011108398, -0.026000977, 0.039794922, -0.029541016, -0.01940918, -0.008972168, -0.04321289, 0.0099487305, 0.022338867, -0.015625, 0.05053711, -0.0046081543, 0.0066223145, 0.040771484, -0.01965332, -0.005554199, -0.03125, 0.01574707, 0.014953613, -0.021118164, -0.008056641, 0.017456055, 0.008361816, -0.0087890625, -0.004699707, 0.00061035156, -0.010314941, 0.0038909912, 0.020019531, -0.033447266, 0.002029419, -0.019165039, 0.012268066, 0.0051879883, 0.0043945312, -0.026489258, 0.0026550293, 0.030517578, 0.013916016, -0.020385742, -0.017700195, 0.013122559, -0.017822266, -0.018798828, 0.026489258, 0.010375977, 0.0014190674, -0.03125, -0.009155273, 0.022705078, 0.025268555, -0.026855469, 0.011352539, -0.004760742, -0.044189453, -0.006134033,

# Embedding Entire Document

In [None]:
# embed the entire text
response_entire = co.embed(
    texts = [text],
    model = model_name,
    input_type = input_type,
    embedding_types=['float'],
    output_dimension=embedding_dimensions,
)
print(response_entire)
embeddings_entire = response_entire.embeddings.float




In [None]:
print(embeddings)

float_=[[0.014526367, -0.040527344, 0.037353516, 0.00015163422, -0.029296875, 0.0019989014, 0.02722168, 0.003189087, 0.01159668, 0.02734375, -0.01586914, -0.00020503998, 0.019165039, -0.021606445, 0.029296875, 0.03149414, 0.0026550293, 0.017944336, 0.011474609, -0.025512695, 0.040039062, -0.030151367, -0.01953125, -0.009216309, -0.04321289, 0.010131836, 0.022338867, -0.01574707, 0.049560547, -0.0047912598, 0.0068359375, 0.040771484, -0.01977539, -0.005432129, -0.03125, 0.016113281, 0.014770508, -0.021118164, -0.007873535, 0.016967773, 0.008361816, -0.008300781, -0.004760742, 0.00079345703, -0.009887695, 0.004486084, 0.020507812, -0.033203125, 0.0014038086, -0.019165039, 0.012329102, 0.0049743652, 0.0046081543, -0.026489258, 0.0028533936, 0.030151367, 0.01373291, -0.020385742, -0.017456055, 0.0126953125, -0.017700195, -0.018432617, 0.026489258, 0.010131836, 0.0012130737, -0.03125, -0.009521484, 0.022827148, 0.025390625, -0.026855469, 0.0119018555, -0.0050964355, -0.044433594, -0.0060119

# Creating a Vector Store for Chunks

In [70]:
# check if search_index is already built
try:
    search_index = AnnoyIndex(embedding_dimensions, 'angular')
    search_index.load('text.ann')
    print("Search index loaded.")
except FileNotFoundError:
    print("Search index not found. Building a new one.")
    # build the search index
    search_index = AnnoyIndex(embedding_dimensions, 'angular')
    for i in range(len(embeddings)):
        search_index.add_item(i, embeddings[i])
    search_index.build(10)
    search_index.save('text.ann')
embeds = embeddings.float
search_index = AnnoyIndex(np.array(embeds).shape[1], 'angular')
for i in range(len(embeds)):
    search_index.add_item(i, embeds[i])
search_index.build(10)
search_index.save('text.ann')

Search index loaded.


True

# Creating a Vector Store for Entire Document

In [71]:
# check if search_index_entire is already built
try:
    search_index_entire = AnnoyIndex(embedding_dimensions, 'angular')
    search_index_entire.load('text_entire.ann')
    print("Search index for entire text loaded.")
except FileNotFoundError:
    print("Search index for entire text not found. Building a new one.")
    # build the search index for entire text
    search_index_entire = AnnoyIndex(embedding_dimensions, 'angular')
    for i in range(len(embeddings_entire)):
        search_index_entire.add_item(i, embeddings_entire[i])
    search_index_entire.build(10)
    search_index_entire.save('text_entire.ann')
# # create annoy index for entire text
# search_index_entire = AnnoyIndex(np.array(embeddings_entire).shape[1], 'angular')
# for i in range(len(embeddings_entire)):
#     search_index_entire.add_item(i, embeddings_entire[i])
# search_index_entire.build(10)
# search_index_entire.save('text_entire.ann')

Search index for entire text loaded.


# Creating Embeddings for Test Queries

In [156]:
good_query = "can you find me the vibration report for 696 chatham street west"
# create false query a random string the same length as the good query as a control

false_query = "".join(random.choices("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789", k=len(good_query)))
queries = [
    "can you find me the vibration report for 696 chatham street west", 
    "can you find me the vibration report for the project on chatham street", 
    "can you find me vibration report for the office at ouellette",
    false_query]
query = queries.copy()
input_query_type = 'search_query'

query_embed = co.embed(
    texts=query,
    model=model_name,
    input_type=input_query_type,
    embedding_types=['float'],
    output_dimension=embedding_dimensions,
).embeddings.float
good_embed = query_embed[0]
false_embed = query_embed[1]


## This function takes in a query and a vector store, and returns the best similarity score

In [157]:
def compare(embed, search_index, chunks):
    # get the most similar items
    similar_item_ids = search_index.get_nns_by_vector(embed, 10, include_distances=True)
    # print most similar items
    num_items = 10
    # sort the items by distance
    sorted_items = sorted(zip(similar_item_ids[0], similar_item_ids[1]), key=lambda x: x[1])
    # print the most similar items
    for i in range(num_items if num_items < len(sorted_items) else len(sorted_items)):
        item_id, distance = sorted_items[i]
        print(f"Item ID: {item_id}, Distance: {distance} \n")
        chunk = chunks[item_id].strip().replace('\n', ' ')
        # remove repetitive spaces
        chunk = re.sub(r'\s+', ' ', chunk)
        print(f"Chunk: {chunk}")
        print(f"Similarity score: {distance}")
    best_distance = min(similar_item_ids[1])
    # best_similarity_score = abs(1/(2 - best_distance))
    best_similarity_score = best_distance
    return best_similarity_score

In [158]:
outputs_chunked = []

for query in query_embed:
    best_similarity_score = compare(query, search_index, chunks)
    # get the most similar items
    # similar_item_ids = search_index.get_nns_by_vector(query, 10, include_distances=True)
    # # print most similar items
    # # push best similarity score to outputs
    # best_distance = min(similar_item_ids[1])
    # best_similarity_score = abs(1/(1 - best_distance))
    outputs_chunked.append(best_similarity_score)


Item ID: 69, Distance: 0.8414503335952759 

Chunk: Appendix C PICTURES VIBRATION RESULTS NOISE METER MONI TORTING Receiver Location 1 Looking North on Caron Ave Looking West on Chatham Street Looking North from University Avenue Looking S outh from University Avenue Chatham Street Development Acoustical and Vibration Study Appendix D SOUND TRA NSMISSION CLASS Receiver 1 BR/LR COMPONENT F AREA W AREA STC Floor 4 -10 Bed Window 8.7 3.3 36 Floor 10-16 Bed Window 8.7 3.3 35 Floor 4 -10 Living Window 11.7 3.3 29 Floor 10-16 Living Window 11.7 3.3 28 Floor 4-16 Living Door 11.7 2.3 30 Receiver 2 BR/LR COMPONENT F AREA W AREA STC Floor 4 -10 Bed Window 8.7 3.3 35 Floor 10-16 Bed Window 8.7 3.3 34 Floor 4 -10 Living Window 11.7 3.3 27 Floor 10-16 Living Window 11.7 3.3 27 Floor 4-16 Living Door 11.7 2.3 29 Receiver 3 BR/LR COMPONENT F AREA W AREA STC Floor 4 -10 Bed Window 8.7 3.3 36 Floor 10-16 Bed Window 8.7 3.3 34 Floor 4 -10 Living Window 11.7 3.3 29 Floor 10-16 Living Window 11.7 3.3 27 F

In [159]:
outputs_entire = []
for query in query_embed:
    # get the most similar items from the entire index
    best_similarity_score_entire = compare(query, search_index_entire, [text])
    # similar_item_ids_entire = search_index_entire.get_nns_by_vector(query, 10, include_distances=True)
    # # print most similar items
    # # push best similarity score to outputs
    # best_distance_entire = min(similar_item_ids_entire[1])
    # best_similarity_score_entire = abs(1/(1 - best_distance_entire))
    outputs_entire.append(best_similarity_score_entire)

Item ID: 0, Distance: 1.2057173252105713 

Similarity score: 1.2057173252105713
Item ID: 0, Distance: 1.216853380203247 

Similarity score: 1.216853380203247
Item ID: 0, Distance: 1.262367606163025 

Similarity score: 1.262367606163025
Item ID: 0, Distance: 1.2570544481277466 

Similarity score: 1.2570544481277466


In [160]:
# visualize grouped scores dynamically from outputs_chunked and outputs_entire
def visualize_dynamic_scores(outputs_chunked, outputs_entire, queries, title):
    dynamic_scores = []
    for i, score in enumerate(outputs_chunked):
        query_type = queries[i]
        dynamic_scores.append((score, 'Chunked', query_type))
    for i, score in enumerate(outputs_entire):
        query_type = queries[i]
        dynamic_scores.append((score, 'Entire doc', query_type))

    df = pd.DataFrame(dynamic_scores, columns=['Score', 'Chunk Type', 'Query Type'])
    # maintain the order of queries
    df['Query Type'] = pd.Categorical(df['Query Type'], categories=queries, ordered=True)
    chart = alt.Chart(df).mark_bar().encode(
        x=alt.X('Query Type', sort=queries),
        y='Score',
        color='Query Type',
        column='Chunk Type'
    ).properties(
        title=title
    )
    return chart

# visualize the dynamic scores
# print(queries)
print("queries")
for i, query in enumerate(queries):
    print(f"{i}: {query}")
chart = visualize_dynamic_scores(outputs_chunked, outputs_entire, queries, 'Raw Distance Scores(lower is better)')
chart.show()

# also visualize unprocessed scores, scores are processed by 1/(2 - score), so undo that
def visualize_unprocessed_scores(outputs_chunked, outputs_entire, queries, title):
    unprocessed_scores = []
    for i, score in enumerate(outputs_chunked):
        query_type = queries[i]
        unprocessed_score = 1/(1 - (2 - score)/2)**2
        unprocessed_scores.append((unprocessed_score, 'Chunked', query_type))
    for i, score in enumerate(outputs_entire):
        query_type = queries[i]
        unprocessed_score = 1/(1 - (2 - score)/2)**2
        unprocessed_scores.append((unprocessed_score, 'Entire doc', query_type))

    df = pd.DataFrame(unprocessed_scores, columns=['Score', 'Chunk Type', 'Query Type'])
    # maintain the order of queries
    df['Query Type'] = pd.Categorical(df['Query Type'], categories=queries, ordered=True)
    chart = alt.Chart(df).mark_bar().encode(
        x=alt.X('Query Type', sort=queries),
        y='Score',
        color='Query Type',
        column='Chunk Type'
    ).properties(
        title=title
    )
    return chart
# visualize the unprocessed scores
print("queries")
for i, query in enumerate(queries):
    print(f"{i}: {query}")
chart = visualize_unprocessed_scores(outputs_chunked, outputs_entire, queries, 'Similarity Scores')
chart.show()



queries
0: can you find me the vibration report for 696 chatham street west
1: can you find me the vibration report for the project on chatham street
2: can you find me vibration report for the office at ouellette
3: CWvfmPN5WeR0a0xq31vCLX8VcLzi9IAw8iKguZgKa1YR9vhe3rtlR9StKemrtEbl


queries
0: can you find me the vibration report for 696 chatham street west
1: can you find me the vibration report for the project on chatham street
2: can you find me vibration report for the office at ouellette
3: CWvfmPN5WeR0a0xq31vCLX8VcLzi9IAw8iKguZgKa1YR9vhe3rtlR9StKemrtEbl
