In [63]:
import configparser
config = configparser.RawConfigParser()
config.read("../api_key.ini")

['../api_key.ini']

In [64]:
print(config.sections())

['API_KEY']


In [65]:
import cohere
API_KEY = config[config.sections()[0]]['api_key']
co = cohere.Client(API_KEY)

In [66]:
# Read processed txt file
with open("../data/processed/processed_data.txt", "r") as f:
    text = f.read()

In [67]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Create basic configurations to chunk the text
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=256,
    chunk_overlap=10,
    length_function=len,
    is_separator_regex=False,
)

# Split the text into chunks with some overlap
chunks_ = text_splitter.create_documents([text])
chunks = [c.page_content for c in chunks_]
print(f"The text has been broken down in {len(chunks)} chunks.")

The text has been broken down in 216 chunks.


In [68]:
response = co.embed(
    texts= chunks,
    model="embed-english-v3.0",
    input_type="search_document",
    embedding_types=['float']
)
embeddings = response.embeddings.float
print(f"We just computed {len(embeddings)} embeddings.")

We just computed 216 embeddings.


In [69]:
import numpy as np
vector_database = {i: np.array(embedding) for i, embedding in enumerate(embeddings)}
# print(vector_database.items())

In [70]:
# from flask import current_app as app


# search_limit = app.config.get("SEARCH_LIMIT")

In [71]:
# created_connector = co.create_connector(
#             name="connector1",
#             url="https://connector-example.com/search",
#         )

In [72]:
# connectors=[{"id": "web-search"}, {"id": "customer-connector-id"}]

In [73]:
# add the user's message to the chat history
# chat.chat_history.append(
#     ChatMessage(
#         role="USER", message="I want to know the number of orders from the last week.",
#     )
# )

In [74]:
query = "I want to know the company with the most number of orders from the last week."

In [75]:
# Because the text being embedded is the search query, we set the input type as search_query
response = co.embed(
    texts=[query],
    model="embed-english-v3.0",
    input_type="search_query",
    embedding_types=['float']
)
query_embedding = response.embeddings.float[0]
print("query_embedding: ", query_embedding)

query_embedding:  [0.002998352, -0.021011353, -0.026168823, -0.03390503, -0.028396606, -0.01399231, -0.03225708, 0.020401001, -0.014503479, -0.0005517006, 0.029647827, 0.036254883, -0.05899048, -0.039764404, 0.017959595, -0.0146865845, 0.007888794, 0.017578125, 0.049041748, 0.009185791, 0.031921387, 0.011894226, 0.018218994, -0.032928467, 0.001420021, 0.052001953, -0.015205383, -0.034973145, -0.0048446655, 0.00066804886, -0.00919342, -0.0068740845, 0.03564453, -0.051727295, 0.023834229, -0.012680054, 0.0026760101, -0.012771606, -0.004295349, -0.028015137, -0.006958008, 0.012062073, 0.03656006, 0.013809204, -0.009750366, -0.025482178, 0.021148682, -0.023117065, -0.005367279, 0.0038585663, -0.034606934, -0.021652222, -0.01625061, 0.043426514, -0.02557373, -0.0033302307, -0.016326904, 0.008003235, 0.024093628, -0.011413574, -0.022064209, 0.0012187958, 0.02607727, 0.019851685, -0.024673462, 0.00070667267, 0.0012226105, 0.030075073, 0.056518555, -0.025863647, -0.055633545, 0.023666382, 0.01

In [76]:
def cosine_similarity(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

# Calculate similarity between the user question & each chunk
similarities = [cosine_similarity(query_embedding, chunk) for chunk in embeddings]
print("similarity scores: ", similarities)

# Get indices of the top 10 most similar chunks
sorted_indices = np.argsort(similarities)[::-1]

# Keep only the top 10 indices
top_indices = sorted_indices[:10]
print("Here are the indices of the top 10 chunks after retrieval: ", top_indices)

# Retrieve the top 10 most similar chunks
top_chunks_after_retrieval = [chunks[i] for i in top_indices]
print("Here are the top 10 chunks after retrieval: ")
for t in top_chunks_after_retrieval:
    print("== " + t)

similarity scores:  [0.19735060379034985, 0.18745365147579593, 0.20127570055731128, 0.16501056131026207, 0.18620543033439244, 0.17871383239795793, 0.18416427449568268, 0.20405121517676159, 0.18371309256002027, 0.18480619409783489, 0.1911918449497137, 0.18335934553977967, 0.19463815687543123, 0.19683399733618667, 0.18308676934222748, 0.19924703556703524, 0.16178255238710215, 0.18138049111167384, 0.1749983592356369, 0.19085751837459236, 0.2033818213836378, 0.18084058295997488, 0.18787061555366474, 0.18580319294830655, 0.1737393555215823, 0.1950819293628719, 0.198254417951345, 0.18791147888409926, 0.20227402691241436, 0.16265615612612153, 0.18649159001053406, 0.17714535924609245, 0.19509484315686396, 0.20131298395921265, 0.17852274350796904, 0.1850490399198802, 0.18617994604965865, 0.17284260548138158, 0.19514433044903062, 0.24450774564074057, 0.2448523294627634, 0.2734316212547236, 0.2622482717793046, 0.27126438793323815, 0.2441421845084376, 0.23229562170381765, 0.2888175099640865, 0.240

In [77]:
print(response)



In [78]:
response = co.rerank(
    query=query,
    documents=top_chunks_after_retrieval,
    top_n=3,
    model="rerank-english-v2.0",
)

top_chunks_after_rerank = [result.document['text'] for result in response]
print("Here are the top 3 chunks after rerank: ")
for t in top_chunks_after_rerank:
    print("== " + t)

AttributeError: 'tuple' object has no attribute 'document'

In [None]:
from cohere import ChatMessage
chat = co.chat(
    message="", # sample message -> should be the user input
    model="command",
    chat_history=[
        ChatMessage(
            role="CHATBOT", message="Hi!",
        ), ChatMessage(
            role="CHATBOT", message="How can I help you today?",
        ),
    ],
    preamble="Based on the data from the last week,",
    prompt_truncation="AUTO",
    temperature=0.2,
    
)

print(chat)
# AND WE SHOULD RETURN THE ANSWER TO THE WEB APP