In [204]:
import configparser

# Parse the local ini file that stores the API credentials.
config = configparser.RawConfigParser()
# `read` silently skips files it cannot open and returns only the paths it
# actually parsed, so check the result here instead of failing later with an
# obscure IndexError/KeyError when the key is looked up.
loaded_files = config.read("../api_key.ini")
if not loaded_files:
    raise FileNotFoundError("Could not read '../api_key.ini'; the API key file is required.")
# Keep the list of parsed files as the cell's displayed value.
loaded_files

['../api_key.ini']

In [205]:
# Show which sections the parsed ini file defines (section names only).
section_names = config.sections()
print(section_names)

['API_KEY']


In [206]:
import cohere

# The key is looked up in the first section listed by the parsed ini file,
# under the option name 'api_key'.
first_section = config.sections()[0]
API_KEY = config[first_section]['api_key']

# Client object used by every later embed / rerank / chat call.
co = cohere.Client(API_KEY)

In [207]:
# Read processed txt file.
# An explicit encoding keeps decoding independent of the machine's locale
# default. NOTE(review): assumes the file was written as UTF-8 — confirm
# against the preprocessing step that produces it.
with open("../data/processed/processed_data.txt", "r", encoding="utf-8") as f:
    text = f.read()

In [208]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Chunking settings: sizes are measured with `len`, separators are treated as
# plain strings rather than regular expressions.
splitter_options = dict(
    chunk_size=256,
    chunk_overlap=10,
    length_function=len,
    is_separator_regex=False,
)
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

# Split the single input text into document objects, then keep only the raw
# text of each one for the embedding step.
chunks_ = text_splitter.create_documents([text])
chunks = [document.page_content for document in chunks_]
print(f"The text has been broken down in {len(chunks)} chunks.")

The text has been broken down in 216 chunks.


In [209]:
# Embed every chunk. The input type "search_document" is used here because
# these texts are the corpus being searched; the query is embedded separately.
document_embed_options = {
    "texts": chunks,
    "model": "embed-english-v3.0",
    "input_type": "search_document",
    "embedding_types": ['float'],
}
response = co.embed(**document_embed_options)

# Only the float embeddings were requested, so read them from `.float`.
embeddings = response.embeddings.float
print(f"We just computed {len(embeddings)} embeddings.")

We just computed 216 embeddings.


In [210]:
import numpy as np

# Minimal in-memory "vector database": position of the chunk -> its embedding
# converted to a NumPy array.
vector_database = dict(enumerate(map(np.array, embeddings)))

In [211]:
# from flask import current_app as app


# search_limit = app.config.get("SEARCH_LIMIT")

In [212]:
# created_connector = co.create_connector(
#             name="connector1",
#             url="https://connector-example.com/search",
#         )

In [213]:
# connectors=[{"id": "web-search"}, {"id": "customer-connector-id"}]

In [214]:
# add the user's message to the chat history
# chat.chat_history.append(
#     ChatMessage(
#         role="USER", message="I want to know the number of orders from the last week.",
#     )
# )

In [215]:
# User question that drives retrieval: it is embedded for the similarity
# search and also passed to the reranker.
query = "What was our total revenue from last week ?"

In [216]:
# The text embedded here is the search query rather than a corpus chunk, hence
# input_type "search_query".
response = co.embed(
    model="embed-english-v3.0",
    input_type="search_query",
    embedding_types=['float'],
    texts=[query],
)

# A single text was sent, so the query vector is the first float embedding.
query_float_embeddings = response.embeddings.float
query_embedding = query_float_embeddings[0]
print("query_embedding: ", query_embedding)

query_embedding:  [0.02381897, -0.032470703, -0.040008545, -0.031433105, -0.038726807, -0.04055786, -0.0067749023, 0.05001831, 0.01108551, -0.017837524, -0.026428223, 0.02758789, -0.045410156, -0.026184082, 0.035339355, -0.022644043, 0.016799927, -0.028579712, 0.022903442, -0.004508972, 0.038391113, -0.006717682, 0.058898926, -0.0053749084, -0.0036964417, 0.018508911, -0.04827881, 0.0017585754, -0.018814087, -0.00699234, -0.0057525635, 0.015037537, 0.013900757, -0.017852783, 0.019821167, -0.011260986, 0.05895996, -0.040130615, -0.0001937151, 0.0062789917, -0.008346558, -0.0032215118, 0.02885437, -0.043304443, -0.0073890686, 0.024398804, -0.022338867, -0.022506714, -0.009544373, 0.025878906, -0.019760132, -0.003156662, 0.004852295, 0.03527832, -0.019927979, 0.011207581, -0.031311035, -0.018508911, 0.053497314, 0.021774292, -0.016296387, -0.039245605, 0.015792847, -0.04333496, -0.040130615, -0.041900635, -0.029754639, 0.022842407, 0.03665161, -0.027877808, 0.016799927, 0.061065674, 0.015

In [217]:
def cosine_similarity(a, b):
    """Return the cosine similarity between two vectors.

    Args:
        a: First vector (sequence or array of numbers).
        b: Second vector, same length as ``a``.

    Returns:
        ``dot(a, b) / (norm(a) * norm(b))``, or ``0.0`` when either vector
        has zero norm.
    """
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    # A zero vector would make this 0/0 = nan. np.argsort places nan last, so
    # a descending ranking built from it would put that vector first.
    if denominator == 0:
        return 0.0
    return np.dot(a, b) / denominator

# Number of candidate chunks kept from the embedding search. The original
# comments and final message said 10 while the code kept 100; the value 100 is
# preserved and the wording now follows it.
TOP_K_RETRIEVAL = 100

# Calculate similarity between the user question & each chunk
similarities = [cosine_similarity(query_embedding, chunk) for chunk in embeddings]
print("similarity scores: ", similarities)

# Chunk indices ordered from most to least similar
sorted_indices = np.argsort(similarities)[::-1]

# Keep only the best TOP_K_RETRIEVAL indices (slicing tolerates fewer chunks)
top_indices = sorted_indices[:TOP_K_RETRIEVAL]
print(f"Here are the indices of the top {len(top_indices)} chunks after retrieval: ", top_indices)

# Retrieve the text of the selected chunks, most similar first
top_chunks_after_retrieval = [chunks[i] for i in top_indices]
print(f"Here are the top {len(top_chunks_after_retrieval)} chunks after retrieval: ")
for t in top_chunks_after_retrieval:
    print("== " + t)

similarity scores:  [0.41717228016092284, 0.4188158450269509, 0.419766038696218, 0.4169232832722593, 0.4229889984929003, 0.41168341738647457, 0.4271075944122892, 0.41071390772644284, 0.42271261153013795, 0.39974090170428345, 0.43157178416457387, 0.4130843995096269, 0.4171533044833727, 0.418206971365064, 0.4184277031655353, 0.4165029605633698, 0.41413324172040517, 0.4192036234244964, 0.40991552669762926, 0.4230024821968877, 0.4171453695240185, 0.4177248396948615, 0.40039437579443377, 0.42496888934291166, 0.40789494301112905, 0.4191857720372448, 0.42079213460790815, 0.4204635967060326, 0.41792936635770234, 0.4143271016681167, 0.4229314120547407, 0.41014905643426064, 0.427010265073673, 0.4147101121208574, 0.4181504080307799, 0.40341195689030573, 0.4267600155504714, 0.41012172323125073, 0.4145339218056802, 0.4170524205475181, 0.3631540341314049, 0.4283728722344874, 0.3829537171113998, 0.36854515034652746, 0.3893661813289152, 0.4006165822102692, 0.3946675443945951, 0.3955117507644023, 0.372

In [218]:
# Debug aid: when cells run top to bottom, `response` was last assigned by the
# query-embedding call, so this prints that response rather than the
# document-embedding one.
print(response)



In [219]:
# Second-stage ranking: the chunks kept by the embedding search are passed to
# the rerank endpoint together with the query; 10 results are requested and
# the document text is returned with each result.
rerank_options = dict(
    model="rerank-english-v2.0",
    query=query,
    documents=top_chunks_after_retrieval,
    top_n=10,
    return_documents=True,
)
ranked_response = co.rerank(**rerank_options)

In [220]:
# Inspect the raw rerank results (document text, index and relevance score).
rerank_results = ranked_response.results
print(rerank_results)

[RerankResponseResultsItem(document=RerankResponseResultsItemDocument(text='The product with Pearl Roadshow 5-Piece Drum Set description has resulted in 42000 euros in revenue this week.\nThe product with Conn 52BSP CONNstellation Series Bb Trumpet description has resulted in 42000 euros in revenue this week.'), index=87, relevance_score=0.24653335), RerankResponseResultsItem(document=RerankResponseResultsItemDocument(text='The product with PDP by DW 7-Piece Concept Maple Shell Pack description has resulted in 57000 euros in revenue this week.\nThe product with Universal Audio Apollo Twin X description has resulted in 55800 euros in revenue this week.'), index=68, relevance_score=0.23405632), RerankResponseResultsItem(document=RerankResponseResultsItemDocument(text="The product with D'Addario Helicore Double Bass Strings description has resulted in 10600 euros in revenue this week.\nThe product with Bunnel Pupil Violin description has resulted in 10600 euros in revenue this week."), in

In [221]:
# Collect the text of every reranked result, in the order the response lists them.
top_chunks_after_rerank = [result.document.text for result in ranked_response.results]
# Report the real number of chunks; the previous hard-coded "top 3" did not
# match the number of results actually printed.
print(f"Here are the top {len(top_chunks_after_rerank)} chunks after rerank: ")
for t in top_chunks_after_rerank:
    print("== " + t)

Here are the top 3 chunks after rerank: 
== The product with Pearl Roadshow 5-Piece Drum Set description has resulted in 42000 euros in revenue this week.
The product with Conn 52BSP CONNstellation Series Bb Trumpet description has resulted in 42000 euros in revenue this week.
== The product with PDP by DW 7-Piece Concept Maple Shell Pack description has resulted in 57000 euros in revenue this week.
The product with Universal Audio Apollo Twin X description has resulted in 55800 euros in revenue this week.
== The product with D'Addario Helicore Double Bass Strings description has resulted in 10600 euros in revenue this week.
The product with Bunnel Pupil Violin description has resulted in 10600 euros in revenue this week.
== The product with Sonor SQ1 3-Piece Shell Pack description has resulted in 173600 euros in revenue this week.
The product with Behringer X32 Digital Mixer description has resulted in 158400 euros in revenue this week.
== The product with EarthQuaker Devices Avalanch

In [222]:
# retrieved documents
documents = [{"title": "chunk "+str(i), "snippet": top_chunks_after_rerank[i]} for i in range(0, len(top_chunks_after_rerank))]
# documents = [
#     {"title": "chunk 0", "snippet": top_chunks_after_rerank[0]},
#     {"title": "chunk 1", "snippet": top_chunks_after_rerank[1]},
#     {"title": "chunk 2", "snippet": top_chunks_after_rerank[2]},
#     {"title": "chunk 3", "snippet": top_chunks_after_rerank[3]},
#     {"title": "chunk 4", "snippet": top_chunks_after_rerank[4]},
#   ]

In [223]:
# Preamble text for the chat request: it describes the assistant's role (an
# ERP data assistant limited to last week's data) and the style guide for its
# answers. The string is sent verbatim, so edits here change model behavior.
preamble = """
##Task & Context: You are an AI assistant integrated with the company's Enterprise Resource Planning (ERP) system containing data from the last week only.
Your role is to help employees access and understand data from the ERP system through natural language interactions.
Employees will ask you questions or make requests related to various business operations like inventory, sales, accounting, manufacturing, etc. 
Your task is to query the relevant data from the ERP databases, analyze and synthesize it as needed, and provide helpful responses to the employees.

##Style Guide:
Use clear, professional language tailored for a workplace context.
Respond succinctly when possible, but provide detailed explanations when the query requires it.
Maintain objectivity and avoid injecting personal opinions unless explicitly asked.
Speak in the first-person from the perspective of an AI assistant (e.g. "I retrieved the latest inventory data from the system.")
Use proper formatting like bulleted lists, tables, and code snippets where appropriate to make responses easier to parse.
Do not include unrequested data, but do provide relevant additional context if it can enhance the usefulness of your response.
Be polite and constructive. If you cannot fulfill a request, explain why in a respectful manner.
"""

In [224]:
from cohere import ChatMessage

# Seed the conversation with the two chatbot greeting turns.
greeting_turns = [
    ChatMessage(role="CHATBOT", message=greeting)
    for greeting in ("Hi!", "How can I help you today?")
]

# Generate the answer from the reranked chunks passed as `documents`.
# NOTE(review): the message below is hard-coded and differs from `query`,
# which drove retrieval and reranking — confirm this is intentional.
chat = co.chat(
    model="command",
    preamble=preamble,
    chat_history=greeting_turns,
    message="Find the revenue for each product sold and calculate the sum.",
    documents=documents,
    temperature=0.2,
    prompt_truncation="AUTO",
)

print("Final answer:\n")
print(chat.text)
# AND WE SHOULD RETURN THE ANSWER TO THE WEB APP

Final answer:

I retrieved the latest revenue data from the system. 

Here are the products sold and the revenue for each, sorted in descending order:

| Product Name | Revenue |
|---|---|
| Sonor SQ1 3-Piece Shell Pack | 173600 euros |
| Behringer X32 Digital Mixer | 158400 euros |
| Universal Audio Apollo Twin X | 55800 euros |
| PDP by DW 7-Piece Concept Maple Shell Pack | 57000 euros |
| Electro-Voice ELX200-10P Powered Speaker | 28400 euros |
| Jean Paul USA AS-400 Student Alto | 28500 euros |
| Native Instruments Komplete Audio 6 | 17750 euros |
| Sonor AQ2 Safari 4-Piece Drum Set | 17500 euros |
| IK Multimedia iRig Keys 37 Pro | 7200 euros |
| Cecilio CVN-300 Ebony Fitted Violin | 7150 euros |
| Pearl D50 Drum Throne | 3660 euros |
| D'Addario Prelude Violin String Set | 3760 euros | 

The total sum of revenue from all products sold this week is **518400 euros**.
