In [None]:
from openai_completion import OpenAIAssistant
from utils.utils import replace_placeholders_in_dict

# Description
This notebook goes through each of the available sources and tests the LLM's decision making for each one. Use it for debugging.

In [None]:
# Prompt template for the source-selection step: the model is shown a question
# and four candidate sources (CSV, SQL, Chroma, Wikipedia-Api) and is asked to
# reply with only the source it wants to use.
# `{question}` is a placeholder; it is filled in by passing this dict to
# replace_placeholders_in_dict together with a {"question": ...} mapping.
# NOTE(review): the whitespace inside the triple-quoted string is part of the
# prompt that is sent to the model, so it is left untouched.
source_decision = {
    "system_message": "You are helpful assistant that will reply as short as possible",
    "user_messages": ["""<Context>
    You need to find the answer to the question
        <Question> 
            {question}
        </Question>     
    and you have the following sources available:
</Context>

<Sources>
    1. CSV : contains personal information about when I was in specific countries
    2. SQL : contains temperature information in different cities around the world for the past 20 years
    3. Chroma : contains wikipedia scraped information about countries
    4. Wikipedia-Api: all of wikipedia
</Sources>

<Instructions>
    Return which source you want to use for the query , only the source
</Instructions> """]
    ,
    "assistant_messages": [],
}

In [None]:
# Sample questions for the source-selection prompt; each one is substituted
# into the `{question}` placeholder of `source_decision`.
question1 = "What is the capital of france"
question2 = "Where was I in june 2019"
question3 = "What was the temperature in paris in february 2020"

In [None]:
# Build one filled-in copy of the source-selection prompt per sample question.
# The calls run in the same order as the questions are listed.
prompt1, prompt2, prompt3 = (
    replace_placeholders_in_dict(source_decision, {"question": sample_question})
    for sample_question in (question1, question2, question3)
)

In [None]:
# Create the assistant once; the same instance is reused for every completion
# request in the rest of the notebook.
assistant = OpenAIAssistant(model="gpt-3.5-turbo")
# Send the source-selection prompt built for question1; the prompt dict's keys
# are passed as keyword arguments.
response1 = assistant.get_openai_completion(**prompt1)

In [None]:
# Display the raw completion returned for question1.
response1

In [None]:
# Send the source-selection prompt built for question2 and display the raw completion.
response2 = assistant.get_openai_completion(**prompt2)
response2

In [None]:
# Send the source-selection prompt built for question3 and display the raw completion.
response3 = assistant.get_openai_completion(**prompt3)
response3

## CSV prompt

In [None]:
import pandas as pd

In [None]:
# Load the trips CSV and flatten it into a list of plain tuples, one per row.
# to_records() is called with its defaults, so each tuple starts with the
# DataFrame index -- presumably the "Row nr" element described in the CSV
# prompt's <Format> section (verify against the file's columns).
df = pd.read_csv("./data/trips.csv")
data = df.to_records().tolist()

In [None]:
# Prompt template for answering a question from the CSV source.
# The conversation is: user asks which source to use -> assistant (scripted)
# picks the CSV source -> user supplies the data and the answer format.
# Placeholders: `{question}` in the first user message, `{data}` in the second.
# Fix: the answer-format <Format> tag was opened but never closed before
# </Instructions>; the closing tag is now present so the pseudo-XML is balanced.
# NOTE(review): the prompt asks for JSON but spells the boolean as True/False,
# which is not valid JSON -- confirm how the reply is parsed downstream.
csv_prompt = {
    "system_message": "You are a JSON machine that can only type JSON",
    "user_messages": [
        "What source would you like to use to answer the question: {question}",
        """<Context>
    Here is the csv file formatted as a list of tuples with the following elements for each tuple:
    <Format>
        Element 1: Row nr
        Element 2: Country
        Element 3: City
        Element 4: Date of visit
        Element 5: Who I was with
        Element 6: Reason for visit
    </Format>
    <Data>
        {data}
    </Data>
</Context>
<Instructions>
    Answer the question in the following format based on the data above, reply only JSON:
    <Format>
        "Answer": the answer to the question in one sentence,
        "Missing_information": True/False (if the answer is not in the data)
    </Format>
</Instructions>""",
    ],
    "assistant_messages": [
        "I would like to use the CSV source, which contains personal information about trips"
    ],
}

In [None]:
# Fill the CSV prompt template with a sample question and the trip rows
# loaded from the CSV file.
question_csv = "Where was I in june 2019, and who was I with"
prompt_csv = replace_placeholders_in_dict(csv_prompt, {"question": question_csv, "data": data})

In [None]:
# Send the filled-in CSV prompt and display the raw completion.
response_csv = assistant.get_openai_completion(**prompt_csv)
response_csv

In [None]:
# Print only the message text of the first choice in the completion.
print(response_csv["choices"][0]["message"]["content"])

## SQL

In [None]:
# Prompt template for turning a question into a SQL query.
# The conversation is: user asks which source to use -> assistant (scripted)
# picks the SQL source -> user supplies the table schema and asks for SQL only.
# Placeholder: `{question}` in the first user message.
# NOTE(review): the schema rows look like `PRAGMA table_info` output for the
# Temperature table -- confirm they still match the database.
sql_prompt = {
    "system_message": "You are a SQL machine that can only type SQL ",
    "user_messages": [
        """What source would you like to use to answer the question: 
        <Question>
            {question}
        </Question>
            """,
        """<Context>
    You have one table available called Temperature with the following schema.
    <Table Schema>
        (0, 'region', 'TEXT', 0, None, 0)
        (1, 'country', 'TEXT', 0, None, 0)
        (2, 'state', 'TEXT', 0, None, 0)
        (3, 'city', 'TEXT', 0, None, 0)
        (4, 'month', 'INTEGER', 0, None, 0)
        (5, 'day', 'INTEGER', 0, None, 0)
        (6, 'year', 'INTEGER', 0, None, 0)
        (7, 'avgtemperaturef', 'REAL', 0, None, 0)
        (8, 'avgtemperaturec', 'REAL', 0, None, 0)
    </Table Schema>
</Context>
<Instructions> 
   Your task is to return a SQL query to answer the question, only SQL without any other text 
</Instructions>""",
    ],
    "assistant_messages": [
        "I would like to use the SQL source, which contains temperature information in different cities around the world for the past 20 years"
    ],
}

In [None]:
# Fill the SQL prompt template with a sample question and display the result
# so the final prompt can be inspected before it is sent.
question_sql = "What was the temperature in Paris the 10th of february 2020"
prompt_sql = replace_placeholders_in_dict(sql_prompt, {"question": question_sql})
prompt_sql

In [None]:
# Send the filled-in SQL prompt and display the raw completion.
response_sql = assistant.get_openai_completion(**prompt_sql)
response_sql

In [None]:
# Keep the message text of the first choice (the SQL the model wrote) and print it.
query_response = response_sql["choices"][0]["message"]["content"]
print(query_response)

In [None]:
# Prompt template for the second SQL step: given the query that was run and
# the rows it returned, ask the model to answer the original question.
# Placeholders: `{question}` in the first user message; `{query}` and
# `{result}` in the second.
# Fixes: the typo "witht he" in the context text, and the <Format> tag that
# was opened but never closed before </Instructions>.
# NOTE(review): the prompt asks for JSON but spells the boolean as True/False,
# which is not valid JSON -- confirm how the reply is parsed downstream.
sql_prompt_followup = {
    "system_message": "You are helpful assistant ",
    "user_messages": [
        """What source would you like to use to answer the question: 
        <Question>
            {question}
        </Question>
            """,
        """<Context>
                From querying the SQL database with the following query:
                <Query>
                    {query}
                </Query>
                You get the following result:
                <Result>
                    {result}
                </Result>
            </Context>
            <Instructions>
                Answer the question in the following format based on the data above, reply only JSON:
                <Format>
                    "Answer": the answer to the question in one sentence,
                    "Missing_information": True/False (if the answer is not in the data)
                </Format>
            </Instructions>""",
    ],
    "assistant_messages": [
        "I would like to use the SQL source, which contains temperature information in different cities around the world for the past 20 years"
    ],
}

In [None]:
# Display the SQL text returned by the model (as a repr, so newlines show as \n).
query_response

In [None]:
# Query the local SQLite database and keep the fetched rows in `result`.
import sqlite3
from contextlib import closing

# NOTE(review): this SQL is hard-coded rather than taken from `query_response`.
# The follow-up prompt reports the model's query alongside `result`, so confirm
# the two are the same before trusting the follow-up answer.
temperature_query = "SELECT avgtemperaturec\nFROM Temperature\nWHERE city = 'Paris' AND day = 10 AND month = 2 AND year = 2020;"

# closing() releases the connection even if the query raises; the original
# cell left the connection open for the lifetime of the kernel.
with closing(sqlite3.connect("./data/my_database.db")) as conn:
    cursor = conn.cursor()
    cursor.execute(temperature_query)
    # fetchall() materialises the rows, so they stay usable after the close
    result = cursor.fetchall()

print(result)

In [None]:
# Fill the follow-up template with the question, the SQL text returned by the
# model, and the rows fetched from the database.
followup_values = {
    "question": question_sql,
    "query": response_sql["choices"][0]["message"]["content"],
    "result": result,
}
followup_sql = replace_placeholders_in_dict(sql_prompt_followup, followup_values)

In [None]:
# Send the filled-in follow-up prompt and display the raw completion.
response_sql_followup = assistant.get_openai_completion(**followup_sql)
response_sql_followup

In [None]:
# Print only the message text of the first choice in the follow-up completion.
print(response_sql_followup["choices"][0]["message"]["content"])

## Chroma

In [None]:
import chromadb
from chromadb.utils import embedding_functions
from dotenv import load_dotenv
load_dotenv()
import os
import json

In [None]:
# Read the HuggingFace API key from the environment (load_dotenv() has been
# called in the imports cell). os.getenv returns None when HF_API_KEY is unset.
hf_api_key = os.getenv("HF_API_KEY")

In [None]:
# Chroma's HuggingFaceEmbeddingFunction, configured with the API key and the
# BAAI/bge-base-en-v1.5 model name.
hf_ef = embedding_functions.HuggingFaceEmbeddingFunction(hf_api_key, model_name="BAAI/bge-base-en-v1.5")

In [None]:
# Open the Chroma store persisted under ./data/chroma_db, then fetch the
# "country_information" collection (creating it if it does not exist yet)
# with hf_ef attached as its embedding function.
chroma_client = chromadb.PersistentClient(path="./data/chroma_db")
collection = chroma_client.get_or_create_collection("country_information", embedding_function=hf_ef)

### Load the data to split into chunks (about 300 words each by default)

In [None]:
#load data
with open("./data/countries/country_information.json", "r") as f:
    countries_information = json.load(f)

In [None]:
# Compare the character counts of the two text fields for one country.
# A notebook cell only displays its last expression, so the two lengths are
# returned together as a (content, summary) tuple; previously the first
# length was computed and silently discarded.
len(countries_information["Algeria"]["content"]), len(countries_information["Algeria"]["summary"])

### Divide into chunks ending at full stops

In [None]:
import re

def split_into_chunks(text, chunk_size=300, min_last_chunk_size=100):
    """Split ``text`` into word chunks that end on a sentence boundary where possible.

    Words (whitespace-separated tokens) are collected until a chunk holds
    ``chunk_size`` of them. The chunk is then extended up to and including the
    next word that ends with '.'. If no such word exists in the remainder of
    the text, the chunk is closed as it is and the current word starts the
    next chunk.

    Any words left over at the end form a final chunk; if that remainder is
    shorter than ``min_last_chunk_size`` words and an earlier chunk exists, it
    is merged into that earlier chunk instead.

    Returns:
        A list of strings, each one a chunk's words joined by single spaces.
    """
    tokens = text.split()
    total = len(tokens)
    finished = []
    current = []
    pos = 0

    while pos < total:
        # Still room in the current chunk: take the word and move on.
        if len(current) + 1 <= chunk_size:
            current.append(tokens[pos])
            pos += 1
            continue

        # The chunk is full: find the first sentence-ending word at or after pos.
        stop = next(
            (j for j in range(pos, total) if tokens[j].endswith('.')),
            None,
        )

        if stop is None:
            # No full stop ahead: close the chunk here and start a new one.
            finished.append(current)
            current = [tokens[pos]]
            pos += 1
        else:
            # Absorb everything through the sentence end, then start afresh.
            finished.append(current + tokens[pos:stop + 1])
            current = []
            pos = stop + 1

    if current:
        if finished and len(current) < min_last_chunk_size:
            finished[-1].extend(current)
        else:
            finished.append(current)

    return [' '.join(piece) for piece in finished]

def process_json(data):
    """Chunk every country's text and build parallel document/metadata/id lists.

    ``data`` maps a country name to a dict holding 'content' and 'summary'
    text. Each text is split with ``split_into_chunks``; every chunk becomes
    one document with a metadata dict describing its position and neighbours.

    Returns:
        A ``(documents, metadatas, ids)`` tuple of equally long lists. Ids are
        the strings "1", "2", ... in the order the chunks are produced.
    """
    documents, metadatas, ids = [], [], []
    next_id = 1

    for country, details in data.items():
        for source in ('content', 'summary'):
            chunks = split_into_chunks(details[source])
            # Tokenise each chunk once; neighbours are looked up by index below.
            tokenized = [piece.split() for piece in chunks]
            final_idx = len(chunks) - 1

            for idx, chunk in enumerate(chunks):
                documents.append(chunk)

                # The string 'None' (not the None object) marks a missing neighbour.
                prev_last_word = tokenized[idx - 1][-1] if idx > 0 else 'None'
                if idx < final_idx:
                    upcoming_words = ' '.join(tokenized[idx + 1][:3])
                else:
                    upcoming_words = 'None'

                metadatas.append({
                    "country": country,
                    "paragraph": idx + 1,
                    "last_word": tokenized[idx][-1],
                    "next_words": upcoming_words,
                    "last_word_prev_chunk": prev_last_word,
                    "source": source,
                })
                ids.append(str(next_id))
                next_id += 1

    return documents, metadatas, ids


# Chunk the loaded country data into three parallel lists: chunk texts,
# per-chunk metadata dicts, and string ids.
documents, metadatas, ids = process_json(countries_information)


In [None]:
# Spot-check one entry. The three lists are parallel, so the same index is
# used for each; the original printed ids[3] next to documents[2] and
# metadatas[2], showing the id of a different chunk.
sample_index = 2
print(documents[sample_index])
print(metadatas[sample_index])
print(ids[sample_index])

In [None]:
from sentence_transformers import SentenceTransformer
# Embed every chunk locally, using the same model name that the collection's
# HuggingFace embedding function was configured with.
sentences_1 = documents
model = SentenceTransformer('BAAI/bge-base-en-v1.5')
# NOTE(review): embeddings are normalized here; confirm the embedding function
# used for query_texts produces vectors normalized the same way.
embeddings_1 = model.encode(sentences_1, normalize_embeddings=True)

In [None]:
# Convert the encoded output to plain Python lists via .tolist() and display
# how many embeddings there are.
embeddings = embeddings_1.tolist()
len(embeddings)

In [None]:
# Store the chunks with their precomputed embeddings.
# upsert is used instead of add: the collection is persisted on disk and the
# ids are regenerated as "1", "2", ... on every run, so re-running the notebook
# must overwrite the existing entries rather than collide with them.
collection.upsert(embeddings=embeddings, documents=documents, metadatas=metadatas, ids=ids)

In [None]:
# Query the collection with a text question, asking for 4 results, and display
# the full response.
collection.query(query_texts=["What is the capital of France?"], n_results=4)

In [None]:
# Run the same question again asking for 5 results, and display only the
# matched document texts.
results_query = collection.query(query_texts=["What is the capital of France?"], n_results=5)
results_query["documents"]