In [26]:
from openai_completion import OpenAIAssistant
from utils.utils import replace_placeholders_in_dict

In [18]:
# Routing prompt template: asks the model which of the four listed sources should be
# used to answer `{question}`.
# The `{question}` placeholder is filled in later via `replace_placeholders_in_dict`, and the
# resulting dict is unpacked as keyword arguments into `assistant.get_openai_completion`,
# so the three keys below are that method's parameter names.
# NOTE(review): how user/assistant messages are ordered into a conversation is decided by
# the helper class, which is not shown here.
source_decision = {
    "system_message": "You are helpful assistant that will reply as short as possible",
    "user_messages": ["""<Context>
    You need to find the answer to the question
        <Question> 
            {question}
        </Question>     
    and you have the following sources available:
</Context>

<Sources>
    1. CSV : contains personal information about when I was in specific countries
    2. SQL : contains temperature information in different cities around the world for the past 20 years
    3. Chroma : contains wikipedia scraped information about countries
    4. Wikipedia-Api: all of wikipedia
</Sources>

<Instructions>
    Return which source you want to use for the query , only the source
</Instructions> """]
    ,
    # No prior assistant turns for the routing step.
    "assistant_messages": [],
}

In [19]:
question1 = "What is the capital of france"
question2 = "Where was I in june 2019"
question3 = "What was the temperature in paris in february 2020"

In [27]:
# Fill the routing template once per example question, in the same order as before.
prompt1, prompt2, prompt3 = (
    replace_placeholders_in_dict(source_decision, {"question": question_text})
    for question_text in (question1, question2, question3)
)

In [49]:
assistant = OpenAIAssistant(model="gpt-3.5-turbo")
response1 = assistant.get_openai_completion(**prompt1)

In [22]:
response1

<OpenAIObject chat.completion id=chatcmpl-89eEiFNSHjDPNLVKq5K2dIZyCh04q at 0x1facebf2d90> JSON: {
  "id": "chatcmpl-89eEiFNSHjDPNLVKq5K2dIZyCh04q",
  "object": "chat.completion",
  "created": 1697310312,
  "model": "gpt-3.5-turbo-0613",
  "choices": [
    {
      "index": 0,
      "message": {
        "role": "assistant",
        "content": "I would use the Chroma source for the query."
      },
      "finish_reason": "stop"
    }
  ],
  "usage": {
    "prompt_tokens": 160,
    "completion_tokens": 11,
    "total_tokens": 171
  }
}

In [23]:
response2 = assistant.get_openai_completion(**prompt2)
response2

<OpenAIObject chat.completion id=chatcmpl-89eEnDs8c75pGPnmsi640oppDvwDg at 0x1fabec50d60> JSON: {
  "id": "chatcmpl-89eEnDs8c75pGPnmsi640oppDvwDg",
  "object": "chat.completion",
  "created": 1697310317,
  "model": "gpt-3.5-turbo-0613",
  "choices": [
    {
      "index": 0,
      "message": {
        "role": "assistant",
        "content": "CSV"
      },
      "finish_reason": "stop"
    }
  ],
  "usage": {
    "prompt_tokens": 163,
    "completion_tokens": 1,
    "total_tokens": 164
  }
}

In [24]:
response3 = assistant.get_openai_completion(**prompt3)
response3

<OpenAIObject chat.completion id=chatcmpl-89eF6B9iWvBu5NLHHsPdj9GYQQS4B at 0x1fabed01080> JSON: {
  "id": "chatcmpl-89eF6B9iWvBu5NLHHsPdj9GYQQS4B",
  "object": "chat.completion",
  "created": 1697310336,
  "model": "gpt-3.5-turbo-0613",
  "choices": [
    {
      "index": 0,
      "message": {
        "role": "assistant",
        "content": "SQL"
      },
      "finish_reason": "stop"
    }
  ],
  "usage": {
    "prompt_tokens": 166,
    "completion_tokens": 1,
    "total_tokens": 167
  }
}

## csv prompt

In [31]:
import pandas as pd

In [36]:
# Load the trips CSV and flatten it into a list of plain tuples so it can be pasted
# into the prompt text.
df = pd.read_csv("./data/trips.csv")
# to_records() includes the DataFrame index as the first field by default; that is the
# "Row nr" element described in the CSV prompt's <Format> section.
data = df.to_records().tolist()

In [50]:
# Prompt template for answering a question from the trips CSV.
# Placeholders: `{question}` (the user's question) and `{data}` (the CSV rows as a list of
# tuples); both are filled in later via `replace_placeholders_in_dict`.
# The first user message plus the single assistant message replay the earlier routing
# decision, so the model sees that the CSV source was already chosen.
# NOTE(review): the <Format> tag opened inside <Instructions> is never closed. Left as is
# because editing the prompt text changes what is sent to the model.
csv_prompt = {
    "system_message": "You are a JSON machine that can only type JSON",
    "user_messages": [
        "What source would you like to use to answer the question: {question}",
        """<Context>
    Here is the csv file formatted as a list of tuples with the following elements for each tuple:
    <Format>
        Element 1: Row nr
        Element 2: Country
        Element 3: City
        Element 4: Date of visit
        Element 5: Who I was with
        Element 6: Reason for visit
    </Format>
    <Data>
        {data}
    </Data>
</Context>
<Instructions>
    Answer the question in the following format based on the data above, reply only JSON:
    <Format>
        "Answer": the answer to the question in one sentence,
        "Missing_information": True/False (if the answer is not in the data)
</Instructions>""",
    ],
    "assistant_messages": [
        "I would like to use the CSV source, which contains personal information about trips"
    ],
}

In [51]:
question_csv = "Where was I in june 2019, and who was I with"
prompt_csv = replace_placeholders_in_dict(csv_prompt, {"question": question_csv, "data": data})

In [52]:
response_csv = assistant.get_openai_completion(**prompt_csv)
response_csv

<OpenAIObject chat.completion id=chatcmpl-89eZCeBi7pt1N5eceMYYjrWLIZGcE at 0x1facfab9bc0> JSON: {
  "id": "chatcmpl-89eZCeBi7pt1N5eceMYYjrWLIZGcE",
  "object": "chat.completion",
  "created": 1697311582,
  "model": "gpt-3.5-turbo-0613",
  "choices": [
    {
      "index": 0,
      "message": {
        "role": "assistant",
        "content": "{\n  \"Answer\": \"In June 2019, you were in Germany (Wuppertal) and you were alone.\",\n  \"Missing_information\": false\n}"
      },
      "finish_reason": "stop"
    }
  ],
  "usage": {
    "prompt_tokens": 1145,
    "completion_tokens": 35,
    "total_tokens": 1180
  }
}

In [53]:
print(response_csv["choices"][0]["message"]["content"])

{
  "Answer": "In June 2019, you were in Germany (Wuppertal) and you were alone.",
  "Missing_information": false
}


## SQL

In [67]:
# Prompt template that asks the model to write a SQL query for `{question}` against the
# single `Temperature` table described below.
# The first user message plus the assistant message replay the earlier routing decision.
# NOTE(review): the schema rows look like `PRAGMA table_info` output (cid, name, type, ...);
# confirm they still match the actual database before relying on them.
sql_prompt = {
    "system_message": "You are a SQL machine that can only type SQL ",
    "user_messages": [
        """What source would you like to use to answer the question: 
        <Question>
            {question}
        </Question>
            """,
        """<Context>
    You have one table available called Temperature with the following schema.
    <Table Schema>
        (0, 'region', 'TEXT', 0, None, 0)
        (1, 'country', 'TEXT', 0, None, 0)
        (2, 'state', 'TEXT', 0, None, 0)
        (3, 'city', 'TEXT', 0, None, 0)
        (4, 'month', 'INTEGER', 0, None, 0)
        (5, 'day', 'INTEGER', 0, None, 0)
        (6, 'year', 'INTEGER', 0, None, 0)
        (7, 'avgtemperaturef', 'REAL', 0, None, 0)
        (8, 'avgtemperaturec', 'REAL', 0, None, 0)
    </Table Schema>
</Context>
<Instructions> 
   Your task is to return a SQL query to answer the question, only SQL without any other text 
</Instructions>""",
    ],
    "assistant_messages": [
        "I would like to use the SQL source, which contains temperature information in different cities around the world for the past 20 years"
    ],
}

In [89]:
question_sql = "What was the temperature in Paris the 10th of february 2020"
prompt_sql = replace_placeholders_in_dict(sql_prompt, {"question": question_sql})
prompt_sql

{'system_message': 'You are a SQL machine that can only type SQL ',
 'user_messages': ['What source would you like to use to answer the question: \n        <Question>\n            What was the temperature in Paris the 10th of february 2020\n        </Question>\n            ',
  "<Context>\n    You have one table available called Temperature with the following schema.\n    <Table Schema>\n        (0, 'region', 'TEXT', 0, None, 0)\n        (1, 'country', 'TEXT', 0, None, 0)\n        (2, 'state', 'TEXT', 0, None, 0)\n        (3, 'city', 'TEXT', 0, None, 0)\n        (4, 'month', 'INTEGER', 0, None, 0)\n        (5, 'day', 'INTEGER', 0, None, 0)\n        (6, 'year', 'INTEGER', 0, None, 0)\n        (7, 'avgtemperaturef', 'REAL', 0, None, 0)\n        (8, 'avgtemperaturec', 'REAL', 0, None, 0)\n    </Table Schema>\n</Context>\n<Instructions> \n   Your task is to return a SQL query to answer the question, only SQL without any other text \n</Instructions>"],
 'assistant_messages': ['I would like 

In [90]:
response_sql = assistant.get_openai_completion(**prompt_sql)
response_sql

<OpenAIObject chat.completion id=chatcmpl-89etld4iRHDgBY6WD3Y93TQncsJKW at 0x1facf6246d0> JSON: {
  "id": "chatcmpl-89etld4iRHDgBY6WD3Y93TQncsJKW",
  "object": "chat.completion",
  "created": 1697312857,
  "model": "gpt-3.5-turbo-0613",
  "choices": [
    {
      "index": 0,
      "message": {
        "role": "assistant",
        "content": "SELECT avgtemperaturec \nFROM Temperature \nWHERE city = 'Paris' AND month = 2 AND day = 10 AND year = 2020;"
      },
      "finish_reason": "stop"
    }
  ],
  "usage": {
    "prompt_tokens": 318,
    "completion_tokens": 31,
    "total_tokens": 349
  }
}

In [91]:
# Keep the generated SQL text from the first completion choice, then show it.
query_response = response_sql["choices"][0]["message"]["content"]
print(query_response)

SELECT avgtemperaturec 
FROM Temperature 
WHERE city = 'Paris' AND month = 2 AND day = 10 AND year = 2020;


In [92]:
# Prompt template for the second SQL step: turns the query result into a JSON answer.
# Placeholders: `{question}` (original question), `{query}` (the SQL that was run) and
# `{result}` (the rows it returned); all are filled in later via `replace_placeholders_in_dict`.
# NOTE(review): the prompt text contains the typo "witht he" and an unclosed <Format> tag.
# Both are left untouched because editing the string changes what is sent to the model.
sql_prompt_followup = {
    "system_message": "You are helpful assistant ",
    "user_messages": [
        """What source would you like to use to answer the question: 
        <Question>
            {question}
        </Question>
            """,
        """<Context>
                From querying the SQL database witht he following query:
                <Query>
                    {query}
                </Query>
                You get the following result:
                <Result>
                    {result}
                </Result>
            </Context>
            <Instructions>
                Answer the question in the following format based on the data above, reply only JSON:
                <Format>
                    "Answer": the answer to the question in one sentence,
                    "Missing_information": True/False (if the answer is not in the data)
            </Instructions>""",
    ],
    "assistant_messages": [
        "I would like to use the SQL source, which contains temperature information in different cities around the world for the past 20 years"
    ],
}

In [93]:
query_response

"SELECT avgtemperaturec \nFROM Temperature \nWHERE city = 'Paris' AND month = 2 AND day = 10 AND year = 2020;"

In [94]:
# Query the database with the SQL text the model produced.
import sqlite3
from pathlib import Path

# NOTE(security): `query_response` is SQL written by the LLM, i.e. untrusted input.
# The database is therefore opened read-only (URI parameter `mode=ro`), so a wrong or
# malicious statement cannot modify the data or create a new database file.
# `cursor.execute` also refuses to run more than one statement per call.
db_uri = Path("./data/my_database.db").resolve().as_uri() + "?mode=ro"
conn = sqlite3.connect(db_uri, uri=True)
cursor = conn.cursor()

# Execute the model's query itself rather than a hand-copied literal, so the rows
# always belong to the query that is reported back in the follow-up prompt.
cursor.execute(query_response)
result = cursor.fetchall()

print(result)

[(7.9,)]


In [96]:
# Build the follow-up prompt from the question, the generated SQL and the rows it returned.
followup_sql = replace_placeholders_in_dict(
    sql_prompt_followup,
    dict(
        question=question_sql,
        query=response_sql["choices"][0]["message"]["content"],
        result=result,
    ),
)

In [98]:
response_sql_followup = assistant.get_openai_completion(**followup_sql)
response_sql_followup

<OpenAIObject chat.completion id=chatcmpl-89eueToJsDtPePAbfacVn3B86CiL5 at 0x1facfdff060> JSON: {
  "id": "chatcmpl-89eueToJsDtPePAbfacVn3B86CiL5",
  "object": "chat.completion",
  "created": 1697312912,
  "model": "gpt-3.5-turbo-0613",
  "choices": [
    {
      "index": 0,
      "message": {
        "role": "assistant",
        "content": "{\n    \"Answer\": \"The temperature in Paris on the 10th of February 2020 was 7.9 degrees Celsius.\",\n    \"Missing_information\": false\n}"
      },
      "finish_reason": "stop"
    }
  ],
  "usage": {
    "prompt_tokens": 231,
    "completion_tokens": 36,
    "total_tokens": 267
  }
}

In [99]:
print(response_sql_followup["choices"][0]["message"]["content"])

{
    "Answer": "The temperature in Paris on the 10th of February 2020 was 7.9 degrees Celsius.",
    "Missing_information": false
}


## Chroma

In [9]:
import chromadb
from chromadb.utils import embedding_functions
from dotenv import load_dotenv
load_dotenv()
import os
import json

In [6]:
hf_api_key = os.getenv("HF_API_KEY")

In [7]:
hf_ef = embedding_functions.HuggingFaceEmbeddingFunction(hf_api_key, model_name="BAAI/bge-base-en-v1.5")

In [8]:
chroma_client = chromadb.PersistentClient(path="./data/chroma_db")
collection = chroma_client.get_or_create_collection("country_information", embedding_function=hf_ef)

### Split the data into chunks of roughly 300 words

In [10]:
# Load the scraped country information: a dict keyed by country name whose values
# hold (at least) a "content" and a "summary" text.
# The encoding is given explicitly so decoding does not depend on the platform default;
# the texts contain non-ASCII characters.
with open("./data/countries/country_information.json", "r", encoding="utf-8") as f:
    countries_information = json.load(f)

In [14]:
# Only the value of a cell's last expression is displayed, so the number shown for this
# cell is the summary length; the content length on the first line is computed but not shown.
len(countries_information["Algeria"]["content"])
len(countries_information["Algeria"]["summary"])

3356

### Divide into chunks ending at full stops

In [38]:
import re

def split_into_chunks(text, chunk_size=300, min_last_chunk_size=100, sentence_endings=('.',)):
    """Split ``text`` into chunks of words that end on a sentence boundary.

    Words are collected until the chunk holds ``chunk_size`` words. The chunk is then
    extended up to and including the next word that ends with one of
    ``sentence_endings``, so a chunk may be longer than ``chunk_size``. If no such word
    remains in the text, the chunk is closed as it is and the rest of the text is cut
    purely by word count.

    Args:
        text: The string to split. Words are obtained with ``str.split()``, so any run
            of whitespace separates words and chunks are re-joined with single spaces.
        chunk_size: Number of words collected before a sentence end is looked for.
        min_last_chunk_size: If fewer than this many words are left over at the end,
            they are appended to the previous chunk instead of forming their own.
        sentence_endings: Suffix, or iterable of suffixes, marking the end of a
            sentence. Defaults to ``('.',)``, i.e. only a full stop.

    Returns:
        The chunk strings in order; an empty list when ``text`` contains no words.
    """
    # str.endswith accepts a tuple of suffixes; a bare string counts as one suffix.
    if isinstance(sentence_endings, str):
        endings = (sentence_endings,)
    else:
        endings = tuple(sentence_endings)

    words = text.split()
    total = len(words)
    chunks = []
    chunk = []
    i = 0
    # Becomes False once a scan reaches the end of the text without finding a sentence
    # end; later scans would start further right and could not find one either.
    ending_ahead = True

    while i < total:
        if len(chunk) + 1 <= chunk_size:
            chunk.append(words[i])
            i += 1
            continue

        # The word budget is used up: look for the next sentence end, starting with
        # the current word itself.
        stop = None
        if ending_ahead:
            stop = next((j for j in range(i, total) if words[j].endswith(endings)), None)
            ending_ahead = stop is not None

        if stop is None:
            # No sentence end left: close the chunk here and start the next one
            # with the current word.
            chunks.append(chunk)
            chunk = [words[i]]
            i += 1
        else:
            chunk.extend(words[i:stop + 1])
            chunks.append(chunk)
            chunk = []
            i = stop + 1

    if chunk:
        # Avoid a tiny trailing chunk by folding it into the previous one.
        if len(chunk) < min_last_chunk_size and chunks:
            chunks[-1].extend(chunk)
        else:
            chunks.append(chunk)

    return [' '.join(chunk) for chunk in chunks]

def process_json(data):
    """Chunk every country's texts and build parallel lists for the vector store.

    For each country in ``data`` the 'content' and 'summary' texts are split with
    ``split_into_chunks``. Every chunk yields one document string, one metadata dict
    and one id; ids are consecutive integers starting at 1, stored as strings.

    Args:
        data: Mapping of country name to a mapping that has 'content' and 'summary'
            text entries.

    Returns:
        A ``(documents, metadatas, ids)`` tuple of three lists of equal length.
    """
    documents, metadatas, ids = [], [], []

    for country, details in data.items():
        for source in ('content', 'summary'):
            pieces = split_into_chunks(details[source])
            final_position = len(pieces) - 1

            for position, piece in enumerate(pieces):
                documents.append(piece)

                # The string 'None' marks a missing neighbour at either end.
                if position == 0:
                    previous_last_word = 'None'
                else:
                    previous_last_word = pieces[position - 1].split()[-1]

                if position < final_position:
                    upcoming_words = ' '.join(pieces[position + 1].split()[:3])
                else:
                    upcoming_words = 'None'

                metadatas.append({
                    "country": country,
                    "paragraph": position + 1,
                    "last_word": piece.split()[-1],
                    "next_words": upcoming_words,
                    "last_word_prev_chunk": previous_last_word,
                    "source": source
                })
                # One id per stored chunk, counted across all countries and sources.
                ids.append(f"{len(ids) + 1}")

    return documents, metadatas, ids


documents, metadatas, ids = process_json(countries_information)


In [47]:
# Inspect one chunk together with its own metadata and id.
# The three lists are index-aligned, so the same index is used for all of them
# (the id was previously read from index 3, i.e. from a different chunk).
print(documents[2])
print(metadatas[2])
print(ids[2])

The name was given by Buluggin ibn Ziri after he established the city on the ruins of the Phoenician city of Icosium in 950. It was employed by medieval geographers such as Muhammad al-Idrisi and Yaqut al-Hamawi. The Ottoman Empire extended the name of al-Jazā'ir over the entire country, deriving it from the name of the capital city.Thus, it shares its etymology with numerous other places, such as Alzira in Valencia, Algeciras in Andalusia, Lezíria in Portugal, Cizre in Turkey, Gżira in Malta, the Nile island of Gezira in Egypt, and the state of Gezira in Sudan. History Prehistory and ancient history Around ~1.8-million-year-old stone artifacts from Ain Hanech (Algeria) were considered to represent the oldest archaeological materials in North Africa. Stone artifacts and cut-marked bones that were excavated from two nearby deposits at Ain Boucherit are estimated to be ~1.9 million years old, and even older stone artifacts to be as old as ~2.4 million years. Hence, the Ain Boucherit evid

In [54]:
from sentence_transformers import SentenceTransformer
# Embed every chunk locally. The model name is the same one passed to the HuggingFace
# embedding function that the Chroma collection was created with.
# `sentences_1` is another name for the `documents` list, not a copy.
sentences_1 = documents
model = SentenceTransformer('BAAI/bge-base-en-v1.5')
# normalize_embeddings=True requests normalized vectors from encode().
embeddings_1 = model.encode(sentences_1, normalize_embeddings=True)

Downloading (…)db36e/.gitattributes: 100%|██████████| 1.52k/1.52k [00:00<?, ?B/s]
Downloading (…)_Pooling/config.json: 100%|██████████| 190/190 [00:00<00:00, 197kB/s]
Downloading (…)88b99db36e/README.md: 100%|██████████| 90.2k/90.2k [00:00<00:00, 90.3MB/s]
Downloading (…)b99db36e/config.json: 100%|██████████| 777/777 [00:00<?, ?B/s] 
Downloading (…)ce_transformers.json: 100%|██████████| 124/124 [00:00<00:00, 121kB/s]
Downloading pytorch_model.bin: 100%|██████████| 438M/438M [00:17<00:00, 25.5MB/s]]
Downloading (…)nce_bert_config.json: 100%|██████████| 52.0/52.0 [00:00<?, ?B/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 125/125 [00:00<?, ?B/s] 
Downloading (…)db36e/tokenizer.json: 100%|██████████| 711k/711k [00:00<00:00, 3.05MB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 366/366 [00:00<?, ?B/s] 
Downloading (…)88b99db36e/vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 38.6MB/s]
Downloading (…)99db36e/modules.json: 100%|██████████| 349/349 [00:00<?, ?B/s] 


In [61]:
embeddings = embeddings_1.tolist()
len(embeddings)

4717

In [60]:
# Store all chunks with their precomputed vectors, metadata and ids.
# NOTE(review): re-running this cell submits the same ids again. Confirm how the installed
# chromadb version treats duplicate ids before running it twice.
collection.add(embeddings=embeddings, documents=documents, metadatas=metadatas, ids=ids)

In [62]:
# Retrieve the 10 stored chunks closest to the question.
# NOTE(review): `query_texts` is presumably embedded by the collection's embedding function
# (`hf_ef`, the hosted HuggingFace API), while the stored vectors came from the local
# SentenceTransformer model. The model name is the same, but confirm both paths produce comparable vectors.
collection.query(query_texts=["What is the capital of France?"], n_results=10)

{'ids': [['2293',
   '2347',
   '2346',
   '2327',
   '2311',
   '2295',
   '2345',
   '2331',
   '2320',
   '2070']],
 'distances': [[0.5583115816116333,
   0.5583115816116333,
   0.7429412007331848,
   0.746076226234436,
   0.7520112991333008,
   0.7665204405784607,
   0.7938966751098633,
   0.8169093132019043,
   0.8186788558959961,
   0.8273496627807617]],
 'metadatas': [[{'country': 'France',
    'last_word': 'War.',
    'last_word_prev_chunk': 'None',
    'next_words': 'In the 16th',
    'paragraph': 1,
    'source': 'content'},
   {'country': 'France',
    'last_word': 'War.',
    'last_word_prev_chunk': 'None',
    'next_words': 'In the 16th',
    'paragraph': 1,
    'source': 'summary'},
   {'country': 'France',
    'last_word': 'Office',
    'last_word_prev_chunk': 'flag.',
    'next_words': 'None',
    'paragraph': 54,
    'source': 'content'},
   {'country': 'France',
    'last_word': 'life.',
    'last_word_prev_chunk': 'France.',
    'next_words': 'The Industrial Revoluti