## Data analysis and visualisation with an Open AI assistant


#### Import libraries
Note: openai==1.2.4 was required here

In [4]:
import os
from dotenv import load_dotenv
from openai import OpenAI
from openai.types import Image, ImagesResponse
import json
from IPython.display import display, Markdown

#### Define a series of helper functions

In [46]:
# Capture json output from Assitants API
def show_json(obj):
    display(json.loads(obj.model_dump_json()))

# Print messages from threads so you can read them
def pretty_print(messages):
    print("# Messages")
    for m in messages:
        try:
            print(f"{m.role}: {m.content[0].text.value}")
        except:
            pass
    print()

# Output any image created by the assistant
def save_images(open_ai_file_name, desired_file_name):
    client = OpenAI()

    image_data = client.files.content(open_ai_file_name)
    image_data_bytes = image_data.read()

    with open(f"./{desired_file_name}.png", "wb") as file:
        file.write(image_data_bytes)

# Retrieve the thread from the assistants api
def get_response(thread):
    return client.beta.threads.messages.list(thread_id=thread.id, order="asc")

#### Define variables
Note: Open api key is called from .env file - you will need to create your own .env file and obtain your own key

In [6]:
load_dotenv()
OPENAI_MODEL = "gpt-4"
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
client = OpenAI()

#### Assign the input file
This is a file a downloaded from the World Bank Development Indicators dataset [WDI](https://databank.worldbank.org/source/world-development-indicators)

In [28]:

file_path = "./data/API_NY.GDP.PCAP.PP.CD_DS2_en_csv_v2_5728866.csv"

file = client.files.create(
  file=open(file_path, "rb"),
  purpose='assistants'
)

#### Create the assistant
This will also appear in Open AI UI for Assiants when you are logged in

In [8]:
assistant = client.beta.assistants.create(
  name="Data visualizer",
  description="You are great at creating beautiful data visualizations. You analyze data present in .csv files, understand trends, and come up with data visualizations relevant to those trends. You also share a brief text summary of the trends observed.",
  model="gpt-4-1106-preview",
  tools=[{"type": "code_interpreter"}],
  file_ids=[file.id]
)

#### Create a thread

In [9]:

thread = client.beta.threads.create()

#### Add a message to the thread
Note: Here I instruct the assitant to skip the forst 4 rows, previously it did work out that this needed to be done, but you use a lot of tokens and it doesn't always work

In [10]:
message = client.beta.threads.messages.create(
      thread_id=thread.id,
      role = "user",
      content = "Analyse the data for Switzerland and the UK. skip the first 4 rows when reading data in",
      file_ids= [file.id]
)

#### submit messages to the agent

In [11]:

run = client.beta.threads.runs.create(
  thread_id=thread.id,
  assistant_id=assistant.id
)

#### retrieve messages from assistant

In [12]:
messages = client.beta.threads.messages.list(thread_id=thread.id)
show_json(messages)

{'data': [{'id': 'msg_v7TK979CgP7oDvpFSfFZocYk',
   'assistant_id': None,
   'content': [{'text': {'annotations': [],
      'value': 'Analyse the data for Switzerland and the UK. skip the first 4 rows when reading data in'},
     'type': 'text'}],
   'created_at': 1699972512,
   'file_ids': ['file-tYweUlUpLBcPBOHffpeCJ9vF'],
   'metadata': {},
   'object': 'thread.message',
   'role': 'user',
   'run_id': None,
   'thread_id': 'thread_woqFhBQoDqGgfZtVCsRxClb3'}],
 'object': 'list',
 'first_id': 'msg_v7TK979CgP7oDvpFSfFZocYk',
 'last_id': 'msg_v7TK979CgP7oDvpFSfFZocYk',
 'has_more': False}

In [13]:
pretty_print(get_response(thread))

# Messages
user: Analyse the data for Switzerland and the UK. skip the first 4 rows when reading data in
assistant: The data appears to be structured with countries listed in the rows and years as columns. Each row corresponds to a specific country with information about its GDP per capita, PPP (current international $) over the years. Let's now filter out the information for Switzerland and the United Kingdom (UK).
assistant: We have successfully isolated the data for Switzerland and the United Kingdom. Each dataset contains the GDP per capita, PPP (current international $) for the respective country, along with a time series of these values across multiple years. There are some missing values in the earlier years, which is understandable given historical data availability.

Do you have specific analyses or a specific time range in mind for these two countries?



#### Add further instructions

In [14]:
message = client.beta.threads.messages.create(
      thread_id=thread.id,
      role = "user",
      content = "Calculate annual average growth in GDP per capita for the UK and Switzerland for each decade from 1900 to 2020",
      file_ids= [file.id]
)

In [15]:
run = client.beta.threads.runs.create(
  thread_id=thread.id,
  assistant_id=assistant.id
)

In [16]:
messages = client.beta.threads.messages.list(thread_id=thread.id)
show_json(messages)

{'data': [{'id': 'msg_m7Ba6kJrcj5CyiJ2uzdbdCB6',
   'assistant_id': 'asst_4xBr4555QJc29a7FMddTngC7',
   'content': [{'text': {'annotations': [], 'value': ''}, 'type': 'text'}],
   'created_at': 1699972565,
   'file_ids': [],
   'metadata': {},
   'object': 'thread.message',
   'role': 'assistant',
   'run_id': 'run_yJFFwzHGnKJifI6EofWVw9xk',
   'thread_id': 'thread_woqFhBQoDqGgfZtVCsRxClb3'},
  {'id': 'msg_tsF3wI4kMM8Ih54VWOGx2Bi5',
   'assistant_id': None,
   'content': [{'text': {'annotations': [],
      'value': 'Calculate annual average growth in GDP per capita for the UK and Switzerland for each decade from 1900 to 2020'},
     'type': 'text'}],
   'created_at': 1699972560,
   'file_ids': ['file-tYweUlUpLBcPBOHffpeCJ9vF'],
   'metadata': {},
   'object': 'thread.message',
   'role': 'user',
   'run_id': None,
   'thread_id': 'thread_woqFhBQoDqGgfZtVCsRxClb3'},
  {'id': 'msg_oIYfrK1GVWE3twCNSlaZ8E7m',
   'assistant_id': 'asst_4xBr4555QJc29a7FMddTngC7',
   'content': [{'text': {'ann

In [18]:
pretty_print(get_response(thread))

# Messages
user: Analyse the data for Switzerland and the UK. skip the first 4 rows when reading data in
assistant: The data appears to be structured with countries listed in the rows and years as columns. Each row corresponds to a specific country with information about its GDP per capita, PPP (current international $) over the years. Let's now filter out the information for Switzerland and the United Kingdom (UK).
assistant: We have successfully isolated the data for Switzerland and the United Kingdom. Each dataset contains the GDP per capita, PPP (current international $) for the respective country, along with a time series of these values across multiple years. There are some missing values in the earlier years, which is understandable given historical data availability.

Do you have specific analyses or a specific time range in mind for these two countries?
user: Calculate annual average growth in GDP per capita for the UK and Switzerland for each decade from 1900 to 2020
assistan

In [19]:
message = client.beta.threads.messages.create(
      thread_id=thread.id,
      role = "user",
      content = "Plot a chart of GDP per Capita over time for Switzerland and the UK",
      file_ids= [file.id]
)
run = client.beta.threads.runs.create(
  thread_id=thread.id,
  assistant_id=assistant.id
)
messages = client.beta.threads.messages.list(thread_id=thread.id)
show_json(messages)

{'data': [{'id': 'msg_ZNrl1ZAqaH2KAlEfjAPm019e',
   'assistant_id': None,
   'content': [{'text': {'annotations': [],
      'value': 'Plot a chart of GDP per Capita over time for Switzerland and the UK'},
     'type': 'text'}],
   'created_at': 1699972655,
   'file_ids': ['file-tYweUlUpLBcPBOHffpeCJ9vF'],
   'metadata': {},
   'object': 'thread.message',
   'role': 'user',
   'run_id': None,
   'thread_id': 'thread_woqFhBQoDqGgfZtVCsRxClb3'},
  {'id': 'msg_NlFMUGoBeUXqtyebWTtTMnSm',
   'assistant_id': 'asst_4xBr4555QJc29a7FMddTngC7',
   'content': [{'text': {'annotations': [],
      'value': 'Here are the annual average growth rates in GDP per capita, expressed as percentages, for each decade where data is available:\n\n| Decade | Switzerland | United Kingdom |\n|--------|-------------|----------------|\n| 1960s  | N/A         | N/A            |\n| 1970s  | N/A         | N/A            |\n| 1980s  | N/A         | N/A            |\n| 1990s  | 2.01%       | 3.67%          |\n| 2000s  | 3

#### Extracting images
1. Find the messages containing an image
2. extract the filename
3. Decode and save that image

In [20]:
messages = client.beta.threads.messages.list(thread_id=thread.id, order="asc")
for m in messages:
    print(m.content)

[MessageContentText(text=Text(annotations=[], value='Analyse the data for Switzerland and the UK. skip the first 4 rows when reading data in'), type='text')]
[MessageContentText(text=Text(annotations=[], value="The data appears to be structured with countries listed in the rows and years as columns. Each row corresponds to a specific country with information about its GDP per capita, PPP (current international $) over the years. Let's now filter out the information for Switzerland and the United Kingdom (UK)."), type='text')]
[MessageContentText(text=Text(annotations=[], value='We have successfully isolated the data for Switzerland and the United Kingdom. Each dataset contains the GDP per capita, PPP (current international $) for the respective country, along with a time series of these values across multiple years. There are some missing values in the earlier years, which is understandable given historical data availability.\n\nDo you have specific analyses or a specific time range in

In [21]:
file_name = m.content[-2].image_file.file_id
print(m.content[-2].image_file.file_id)

file-8mrc4f1I64x45z9npEWG6BaV


In [25]:
save_images(file_name,"ch_uk_gdp_per_capita")

#### Add further instructions

In [26]:
message = client.beta.threads.messages.create(
      thread_id=thread.id,
      role = "user",
      content = "“Forecast GDP per Capita growth rates for the next five years",
      file_ids= [file.id]
)
run = client.beta.threads.runs.create(
  thread_id=thread.id,
  assistant_id=assistant.id
)
messages = client.beta.threads.messages.list(thread_id=thread.id)

In [29]:
messages = client.beta.threads.messages.list(thread_id=thread.id, order="asc")
for m in messages:
    print(m.content)

[MessageContentText(text=Text(annotations=[], value='Analyse the data for Switzerland and the UK. skip the first 4 rows when reading data in'), type='text')]
[MessageContentText(text=Text(annotations=[], value="The data appears to be structured with countries listed in the rows and years as columns. Each row corresponds to a specific country with information about its GDP per capita, PPP (current international $) over the years. Let's now filter out the information for Switzerland and the United Kingdom (UK)."), type='text')]
[MessageContentText(text=Text(annotations=[], value='We have successfully isolated the data for Switzerland and the United Kingdom. Each dataset contains the GDP per capita, PPP (current international $) for the respective country, along with a time series of these values across multiple years. There are some missing values in the earlier years, which is understandable given historical data availability.\n\nDo you have specific analyses or a specific time range in

In [39]:
pretty_print(get_response(thread))

# Messages
user: Analyse the data for Switzerland and the UK. skip the first 4 rows when reading data in
assistant: The data appears to be structured with countries listed in the rows and years as columns. Each row corresponds to a specific country with information about its GDP per capita, PPP (current international $) over the years. Let's now filter out the information for Switzerland and the United Kingdom (UK).
assistant: We have successfully isolated the data for Switzerland and the United Kingdom. Each dataset contains the GDP per capita, PPP (current international $) for the respective country, along with a time series of these values across multiple years. There are some missing values in the earlier years, which is understandable given historical data availability.

Do you have specific analyses or a specific time range in mind for these two countries?
user: Calculate annual average growth in GDP per capita for the UK and Switzerland for each decade from 1900 to 2020
assistan

In [40]:
message = client.beta.threads.messages.create(
      thread_id=thread.id,
      role = "user",
      content = "Plot charts showing the linear trend used to create the forecasts",
      file_ids= [file.id]
)
run = client.beta.threads.runs.create(
  thread_id=thread.id,
  assistant_id=assistant.id
)
messages = client.beta.threads.messages.list(thread_id=thread.id)

In [41]:
messages = client.beta.threads.messages.list(thread_id=thread.id, order="asc")
for m in messages:
    print(m.content)

[MessageContentText(text=Text(annotations=[], value='Analyse the data for Switzerland and the UK. skip the first 4 rows when reading data in'), type='text')]
[MessageContentText(text=Text(annotations=[], value="The data appears to be structured with countries listed in the rows and years as columns. Each row corresponds to a specific country with information about its GDP per capita, PPP (current international $) over the years. Let's now filter out the information for Switzerland and the United Kingdom (UK)."), type='text')]
[MessageContentText(text=Text(annotations=[], value='We have successfully isolated the data for Switzerland and the United Kingdom. Each dataset contains the GDP per capita, PPP (current international $) for the respective country, along with a time series of these values across multiple years. There are some missing values in the earlier years, which is understandable given historical data availability.\n\nDo you have specific analyses or a specific time range in

In [43]:
m.content[-3].image_file.file_id

'file-Wm6U6UmIX6kgBMnjzpMUQl27'

In [44]:
file_name_01 = m.content[-2].image_file.file_id
file_name_02 = m.content[-3].image_file.file_id
print(file_name_01)
print(file_name_02)

file-LoYF2Zpq9VKk1iXdPRYNgM1W
file-Wm6U6UmIX6kgBMnjzpMUQl27


In [47]:
save_images(file_name_01,"uk_forecast")
save_images(file_name_02,"ch_forecast")