This sample notebook shows how to analyse data fetched from the Data Lake.

It computes and charts the correlation between the per-minute average prices of AAPL and TSLA on a given day.

In [None]:
# Pick the feed and the date that every query in this notebook runs against.
# To see which feeds are available to you, open Help Menu -> "View Feed Data Permissions".

# Example settings (`products` holds the two symbols whose prices are compared):
products = ["AAPL", "TSLA"]
dt = "2025-02-06"
feed = "xdp_nyse_integrated"

In [None]:
import pandas as pd
import maystreet_data as md


def avg_price_per_minute():
    """
    Query the Data Lake for the average price per minute of each product.

    Reads the notebook-level settings ``feed``, ``dt`` and ``products`` and runs
    one SQL query against the ``mt_trade`` table that:

    * keeps only rows matching the chosen date, feed and products,
    * divides ``ExchangeTimestamp`` by 1e9 before passing it to
      ``FROM_UNIXTIME`` and truncates the result to the minute, and
    * averages ``price`` per minute, producing one column per entry in
      ``products`` (not just the first two).

    Returns:
        A Pandas dataframe, ordered by minute, built from the query result:
        ``minute_timestamp`` followed by one ``avg_<product>_price`` column per
        product. A product's average is NULL in SQL for any minute in which it
        has no rows.

    Raises:
        ValueError: if ``products`` is empty, because the resulting ``IN ()``
            filter would not be valid SQL.

    NOTE(review): the column aliases are built from the product strings exactly
    as written; confirm the query engine preserves their case, since later
    cells look the columns up as ``avg_<product>_price``.
    """

    if not products:
        raise ValueError("products must contain at least one symbol")

    # The settings are interpolated straight into the SQL text, so they must
    # come from trusted notebook configuration, never from untrusted input.

    # One conditional average per product: CASE yields NULL for rows belonging
    # to other products, and AVG ignores NULLs.
    avg_columns = ",\n        ".join(
        f"AVG(CASE WHEN product = '{p}' THEN price ELSE NULL END) AS avg_{p}_price"
        for p in products
    )
    product_filter = ", ".join(f"'{p}'" for p in products)

    query = f"""
    SELECT
        DATE_TRUNC('minute', FROM_UNIXTIME(ExchangeTimestamp / 1000000000)) AS minute_timestamp,
        {avg_columns}
    FROM
        "prod_lake"."p_mst_data_lake".mt_trade
    WHERE
        dt = '{dt}'
        AND f = '{feed}'
        AND product IN ({product_filter})
    GROUP BY 1
    ORDER BY 1
    """

    return pd.DataFrame(md.query(md.DataSource.DATA_LAKE, query))


# Forward-fill, then back-fill, so a minute in which a product did not trade
# reuses the nearest known average (the back-fill covers gaps at the very start).
# DataFrame.ffill()/bfill() replace fillna(method=...), which pandas has deprecated.
prices = avg_price_per_minute().ffill().bfill()


In [None]:
import numpy as np


# Correlation matrix of the two per-minute average price series.
# The off-diagonal entries hold the correlation between the two products.

first_avg = prices[f"avg_{products[0]}_price"]
second_avg = prices[f"avg_{products[1]}_price"]
np.corrcoef(first_avg, second_avg)


In [None]:
import matplotlib.pyplot as plt


# Plot the per-minute average price of one product against the other.

plt.rcParams.update({"figure.figsize": [10, 10]})

fig, ax = plt.subplots()

# White figure background.
fig.patch.set_facecolor((1, 1, 1))

x_product, y_product = products[0], products[1]
ax.set(xlabel=f"Avg {x_product} price", ylabel=f"Avg {y_product} price")

# linewidth=0 hides the connecting line so only the square markers are drawn.
ax.plot(
    prices[f"avg_{x_product}_price"],
    prices[f"avg_{y_product}_price"],
    marker="s",
    linewidth=0,
)

plt.show()
