In [29]:
!pip install openai


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [30]:
## Get the API key from the 'apikey' file
# Relative path, so it is resolved against the notebook's working directory.
filename = '../apikey'
def get_file_contents(filename):
    """Read a text file and return its contents without surrounding whitespace.

    When the file does not exist, a short message is printed and ``None`` is
    returned instead of raising.
    """
    try:
        with open(filename, 'r') as handle:
            raw_text = handle.read()
    except FileNotFoundError:
        print("'%s' file not found" % filename)
        return None
    return raw_text.strip()
# Read the key from disk (None if the file is missing) and build the API
# client that the request cell further down uses.
api_key = get_file_contents(filename)

from openai import OpenAI

client = OpenAI(api_key=api_key)

In [31]:
import pandas as pd

# Load the stock-price CSV (TSLA, going by the file name) into a DataFrame.
# The path is relative to the notebook's working directory.
df = pd.read_csv("../example/data/Stock_price_TSLA.csv")

In [32]:
def _native_number(value, as_integer: bool):
    """Convert a pandas/numpy scalar into a plain Python ``int`` or ``float``.

    A missing value (NaN / ``pd.NA``, e.g. the minimum of an empty column)
    cannot be cast to ``int``; it becomes ``None`` for integer columns and
    ``nan`` for float columns.
    """
    if pd.isna(value):
        return None if as_integer else float("nan")
    return int(value) if as_integer else float(value)


def base_summary(df: pd.DataFrame, n_samples: int = 3) -> list:
    """Describe every column of ``df`` as a list of property dictionaries.

    Each entry has the form ``{"column": name, "properties": {...}}``; the
    properties are ``dtype``, ``min``, ``max``, ``num_unique_values``,
    ``samples`` and the empty placeholders ``semantic_type`` / ``description``.

    ``dtype`` is one of:

    * ``"boolean"``  - boolean columns (no min/max).
    * ``"number"``   - any integer, float or complex dtype, whatever its
      width or nullability. min/max are plain Python ``int``/``float``;
      complex columns report ``None`` because they cannot be cast to either.
    * ``"date"``     - datetime64 columns, or text columns in which at least
      one value parses as a date. min/max are taken over the non-missing
      values and keep the column's own value type.
    * ``"category"`` / ``"string"`` - remaining text columns; ``"category"``
      when fewer than half of the rows are distinct, ``"string"`` otherwise
      (an empty column counts as ``"string"``).
    * ``str(dtype)`` - anything else.

    Args:
        df: Frame to summarise.
        n_samples: Maximum number of distinct non-null example values to
            report per column.

    Returns:
        One dictionary per column, in column order.
    """
    import warnings  # local import: only used to silence date-parsing noise

    types = pd.api.types

    def get_samples(column):
        non_null = column.dropna().unique()
        return non_null[:n_samples].tolist()

    def has_parseable_dates(column) -> bool:
        # to_datetime warns whenever it cannot infer a single format, which
        # would otherwise fire for every free-text column.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            return bool(pd.to_datetime(column, errors="coerce").notna().any())

    properties_list = []
    for column in df.columns:
        col_data = df[column]
        dtype = col_data.dtype
        min_val = max_val = None
        # Text may be stored as plain objects or with the pandas string dtype.
        is_text = dtype == object or isinstance(dtype, pd.StringDtype)

        # Booleans are tested first because pandas counts bool as numeric.
        if types.is_bool_dtype(dtype):
            dtype_str = "boolean"
        elif types.is_complex_dtype(dtype):
            dtype_str = "number"
        elif types.is_numeric_dtype(dtype):
            dtype_str = "number"
            as_integer = types.is_integer_dtype(dtype)
            min_val = _native_number(col_data.min(), as_integer)
            max_val = _native_number(col_data.max(), as_integer)
        elif types.is_datetime64_any_dtype(dtype) or (is_text and has_parseable_dates(col_data)):
            dtype_str = "date"
            # Drop missing entries first: comparing strings with NaN raises.
            present = col_data.dropna()
            min_val, max_val = present.min(), present.max()
        elif is_text:
            n_rows = len(col_data)
            # An empty column has no distinct/total ratio; treat it as text.
            is_mostly_distinct = n_rows == 0 or col_data.nunique() / n_rows >= 0.5
            dtype_str = "string" if is_mostly_distinct else "category"
        else:
            dtype_str = str(dtype)

        properties = {
            "dtype": dtype_str,
            "min": min_val,
            "max": max_val,
            "num_unique_values": col_data.nunique(),
            "samples": get_samples(col_data),
            "semantic_type": "",
            "description": ""
        }

        properties_list.append({"column": column, "properties": properties})

    return properties_list

In [33]:
# Daily range: distance between the session high and low
df['Daily Range'] = df['High'].sub(df['Low'])

# Open-to-close move, expressed as a fraction of the opening price
df['Percentage Change'] = df['Close'].sub(df['Open']).div(df['Open'])

# Running volume-weighted average of the closing price
df['Volume Weighted Average Price'] = (
    df['Close'].mul(df['Volume']).cumsum().div(df['Volume'].cumsum())
)

In [34]:
df

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,Daily Range,Percentage Change,Volume Weighted Average Price
0,2022-11-21,175.850006,176.770004,167.539993,167.869995,167.869995,92882700,9.230011,-0.045380,167.869995
1,2022-11-22,168.630005,170.919998,166.190002,169.910004,169.910004,78452300,4.729996,0.007591,168.804091
2,2022-11-23,173.570007,183.619995,172.500000,183.199997,183.199997,109536700,11.119995,0.055482,174.418327
3,2022-11-25,185.059998,185.199997,180.630005,182.860001,182.860001,50672700,4.569992,-0.011888,175.708539
4,2022-11-28,179.960007,188.500000,179.000000,182.919998,182.919998,92905200,9.500000,0.016448,177.287011
...,...,...,...,...,...,...,...,...,...,...
245,2023-11-13,215.600006,225.399994,211.610001,223.710007,223.710007,140447600,13.789993,0.037616,203.113584
246,2023-11-14,235.029999,238.139999,230.720001,237.410004,237.410004,149771600,7.419998,0.010126,203.263701
247,2023-11-15,239.289993,246.699997,236.449997,242.839996,242.839996,150354000,10.250000,0.014836,203.436842
248,2023-11-16,239.490005,240.880005,230.960007,233.589996,233.589996,136816800,9.919998,-0.024636,203.556404


In [35]:
# Build the per-column summary (up to 3 sample values per column); it is
# interpolated into the prompt text further down.
summary = base_summary(df, 3)

## Insightful Question

In [60]:
# System prompt: sets the analyst persona and the rules every generated
# question must follow. Sent as the "system" message of the chat request.
SYSTEM_INSTRUCTIONS = """
You are an experienced financial data analyst who can generate a given number of insightful QUESTIONS about data, when given a summary of the data. 
Each question should refer to one visualization (THE VISUALIZATION MUST REFERENCE THE EXACT COLUMN FIELDS FROM THE SUMMARY), and a rationale (JUSTIFICATION FOR WHICH dataset FIELDS ARE USED and what we will learn from the visualization). 
Each question MUST mention the exact fields from the dataset summary above
The VISUALIZATIONS YOU RECOMMEND MUST FOLLOW VISUALIZATION BEST PRACTICES (e.g., must use bar charts instead of pie charts for comparing quantities) AND BE MEANINGFUL 
(e.g., plot longitude and latitude on maps where appropriate). 
MAKE SURE THE QUESTION IS A FINANCIAL/Economic QUESTION INSTEAD OF A STATISTIC QUESTION.
"""

In [61]:
# Output-format prompt: asks for a fenced JSON list of objects with the keys
# "index", "question", "visualization" and "rationale". It is appended to the
# request text when the chat message is built.
FORMAT_INSTRUCTIONS = """
THE OUTPUT MUST BE A CODE SNIPPET OF A VALID LIST OF JSON OBJECTS. IT MUST USE THE FOLLOWING FORMAT:

```[
    { "index": 0,  "question": "How", "visualization": "histogram of X", "rationale": "This tells about "} ..
    ]
```
THE OUTPUT SHOULD ONLY USE THE JSON FORMAT ABOVE.
"""

In [62]:
# Number of questions to request from the model.
n = 5

# Request text: interpolates `n` and the column summary. `summary` is a list,
# so the f-string embeds its Python repr as-is.
user_prompt = f"""
The number of QUESTIONS to generate is {n}. The questions should be based on the data summary below, \n\n .
{summary} \n\n

MAKE SURE THE QUESTION IS A FINANCIAL/Economic QUESTION INSTEAD OF A STATISTIC QUESTION.
"""

In [63]:

# Ask the chat model for `n` questions about the summarised data and print
# the raw reply (expected to be the JSON list that FORMAT_INSTRUCTIONS asks for).
completion = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[
        {"role": "system", "content": SYSTEM_INSTRUCTIONS},
        # The request is the user's turn. Labelling it "assistant" presents it
        # to the model as something the model itself had already said and
        # leaves the conversation without any user message.
        {
            "role": "user",
            "content": f"{user_prompt}\n\n {FORMAT_INSTRUCTIONS} \n\n. The generated {n} insights are: \n ",
        },
    ],
)

print(completion.choices[0].message.content)

[
    { "index": 0,  "question": "What is the range of daily percentage change in the stock prices?", "visualization": "line chart of Percentage Change", "rationale": "This visualization will show the fluctuations in the percentage change of the stock prices over time, allowing us to understand the volatility of the stock."},
    { "index": 1,  "question": "What is the average daily range of the stock prices?", "visualization": "bar chart of Daily Range", "rationale": "This visualization will show the average range of the stock prices each day, allowing us to understand the price movement within a day."},
    { "index": 2,  "question": "What is the average daily volume of the stock?", "visualization": "bar chart of Volume", "rationale": "This visualization will show the average volume of the stock traded each day, allowing us to understand the level of activity in the market."},
    { "index": 3,  "question": "What is the distribution of daily stock prices?", "visualization": "histogra