# Utility functions

In [None]:
import asyncio
import io
import os
from collections.abc import AsyncIterator
from datetime import datetime

import aiohttp
import async_lru
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm
from scipy.stats import norm

In [None]:
@async_lru.alru_cache(maxsize=128)
async def get_data(url: str) -> bytes:
    """Download the raw response body of ``url``.

    The request is sent with an ``Accept: python/pickle`` header, and the
    function is wrapped in ``alru_cache`` (``maxsize=128``), keyed on ``url``.

    Args:
        url (str): URL to query

    Returns:
        bytes: raw response body (expected to be pickled data)

    Raises:
        aiohttp.ClientResponseError: if the server answers with an error status
    """
    print(f"Getting data from {url}")
    async with aiohttp.ClientSession() as session:
        async with session.get(url, headers={"Accept": "python/pickle"}) as response:
            # Fail fast on HTTP errors so an error page is never returned
            # (and cached) as if it were pickled data.
            response.raise_for_status()
            return await response.read()


@async_lru.alru_cache(maxsize=128)
async def head_data(url: str) -> dict:
    """Issue a HEAD request against ``url`` and return the response headers.

    Args:
        url (str): URL to query

    Returns:
        dict: headers of the response
    """
    print(f"Heading data from {url}")
    accept_pickle = {"Accept": "python/pickle"}
    async with aiohttp.ClientSession() as session, session.head(
        url, headers=accept_pickle
    ) as response:
        # Flatten the header mapping into a plain dict.
        return {name: value for name, value in response.headers.items()}

# Prices

In [None]:
# First calendar year requested from the API.
time_begin = 2009


async def get_prices() -> pd.DataFrame:
    """Download every yearly price extract and stack them into one frame.

    One request per calendar year, from ``time_begin`` through the current
    year, goes through ``get_data``; the responses are unpickled in the order
    in which they complete, so the result is not sorted by date.

    Returns:
        pd.DataFrame: Dataframe of prices
    """
    url_template = "https://high-frequency-data.shriimpe.fr/api/data/price?start_date={}-01-01&end_date={}-12-31"
    current_year = datetime.now().year

    pending = [
        get_data(url_template.format(year, year))
        for year in range(time_begin, current_year + 1)
    ]

    # NOTE: the payload is unpickled as received, so the endpoint must be trusted.
    frames = [
        pd.read_pickle(io.BytesIO(await finished))
        for finished in asyncio.as_completed(pending, timeout=600)
    ]

    return pd.concat(frames)


async def head_prices() -> AsyncIterator[dict]:
    """Send a HEAD request for every yearly price extract.

    One request per calendar year, from ``time_begin`` through the current
    year, is issued via ``head_data``. This is an async generator: consume it
    with ``async for``.

    Yields:
        dict: response headers of one request, in completion order
    """
    base_url = "https://high-frequency-data.shriimpe.fr/api/data/price?start_date={}-01-01&end_date={}-12-31"

    requests = [
        head_data(base_url.format(year, year))
        for year in range(time_begin, datetime.now().year + 1)
    ]

    for request in asyncio.as_completed(requests, timeout=600):
        yield await request

In [None]:
if not os.path.exists(f"data/prices_{datetime.today().strftime("%Y-%m-%d")}.pkl"):
    async for headers in head_prices():
        display(headers)

In [None]:
if os.path.exists(f"data/prices_{datetime.today().strftime("%Y-%m-%d")}.pkl"):
    print("Reading prices from Pickle file")
    print("File size:", os.path.getsize(f"data/prices_{datetime.today().strftime("%Y-%m-%d")}.pkl") / 1e6, "MB")
    prices = pd.read_pickle(f"data/prices_{datetime.today().strftime("%Y-%m-%d")}.pkl")
else:
    print("Removing previous files...")
    files = os.listdir("data/")
    for file in files:
        if file.startswith("prices_") and file.endswith(".pkl"):
            print(f"Removing {file}")
            os.remove(file)
    
    print("Gathering data from API...")
    prices = await get_prices()

    prices.drop(columns=["volume"], inplace=True)

    prices["date_time"] = pd.to_datetime(prices["date_time"])

    prices.sort_values(by="date_time", inplace=True)

    prices.reset_index(drop=True, inplace=True)
    prices.to_pickle(f"data/prices_{datetime.today().strftime("%Y-%m-%d")}.pkl")


print(f"{prices.shape=}")

In [None]:
# Preview the first rows of the price table.
display(prices.head())

In [None]:
# Summary statistics as computed by DataFrame.describe().
display(prices.describe())

In [None]:
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in display() only adds a stray "None" to the cell output.
prices.info()

In [None]:
# Clean-up pass: show, then drop, every row whose price rose by more than 10
# relative to the previous row; afterwards drop rows priced below 45.
diff = prices["price"].diff()
jump_labels = diff.index[(diff > 10).to_numpy()]
display(jump_labels)
prices.drop(index=jump_labels, inplace=True)
prices.drop(index=prices[prices["price"] < 45].index, inplace=True)

In [None]:
# Keep only the ticks stamped exactly 09:30:00; these are plotted below as the
# daily opening prices.
# The .dt accessors evaluate the whole column at once instead of calling a
# Python lambda for every row (the column is already used through .dt
# elsewhere in this notebook, so it is expected to be datetime-typed).
stamps = prices["date_time"].dt
daily = prices[(stamps.hour == 9) & (stamps.minute == 30) & (stamps.second == 0)]

In [None]:
# Line chart of the whole price history.
plt.figure(figsize=(20, 10))
plt.plot(prices["date_time"], prices["price"])
plt.gca().set(title="Prices", xlabel="Date", ylabel="Price")
plt.show()

In [None]:
# Overlay the 09:30 samples (semi-transparent red) on the full tick series.
plt.figure(figsize=(20, 10))
plt.plot(prices["date_time"], prices["price"], label="High frequency prices")
plt.plot(
    daily["date_time"],
    daily["price"],
    color="red",
    alpha=0.5,
    label="Daily opening prices",
)
plt.gca().set(title="Prices from high frequency data", xlabel="Date", ylabel="Price")
plt.legend()
plt.show()

In [None]:
# Independent copy of the ticks recorded on 4 January 2022 between midnight
# and 16:00, both bounds included.
day = prices.loc[
    prices["date_time"].between(datetime(2022, 1, 4), datetime(2022, 1, 4, 16))
].copy(deep=True)

In [None]:
# Tick-by-tick prices of the single-day extract.
plt.figure(figsize=(20, 10))
plt.plot(day["date_time"], day["price"], label="High frequency prices")
plt.gca().set(
    title="Prices from high frequency data over one day",
    xlabel="Date",
    ylabel="Price",
)
plt.legend()
plt.show()

# Log prices

In [None]:
# Natural log of the price, stored as a new column on both the full history
# and the single-day extract.
prices["log_prices"] = np.log(prices["price"])
day["log_prices"] = np.log(day["price"])

In [None]:
# Line chart of the log-price history.
plt.figure(figsize=(20, 10))
plt.plot(prices["date_time"], prices["log_prices"])
plt.gca().set(title="Log prices", xlabel="Date", ylabel="Log price")
plt.show()

# Frequency variation

In [None]:
# Re-sample the single-day frame `day` at a range of spacings.
# For each spacing, walk the timestamps in order and keep a tick only once at
# least that much time has elapsed since the previously kept tick; the first
# tick is always kept. The result maps the spacing label to the kept rows.
prices_variable_freq: dict[str, pd.DataFrame] = {}
for interval in (
    "10 seconds",
    "20 seconds",
    "30 seconds",
    "35 seconds",
    "40 seconds",
    "45 seconds",
    "50 seconds",
    "55 seconds",
    "1 minutes",
    "90 seconds",
    "2 minutes",
    "150 seconds",
    "3 minutes",
    "4 minutes",
    "5 minutes",
    "6 minutes",
    "7 minutes",
    "8 minutes",
    "10 minutes",
    "11 minutes",
    "12 minutes",
    "13 minutes",
    "14 minutes",
    "15 minutes",
):
    # Parse the spacing once per interval rather than once per tick.
    step = pd.Timedelta(interval)

    # Select the column by name: a positional iloc[0, 0] silently depends on
    # "date_time" being the first column of the frame.
    dates = [day["date_time"].iloc[0]]

    for date in day["date_time"]:
        if date - dates[-1] >= step:
            dates.append(date)

    prices_variable_freq[interval] = day[day["date_time"].isin(dates)]

In [None]:
# For every sampling interval: sum of the squared increments of the log price.
volatilities = {
    interval: np.sum(sampled["log_prices"].diff().dropna() ** 2)
    for interval, sampled in prices_variable_freq.items()
}

display(volatilities)

In [None]:
# Sum of squared increments against the sampling interval label.
plt.figure(figsize=(20, 10))
plt.plot(list(volatilities), list(volatilities.values()), "g+-")
plt.gca().set_title("Realized volatility dynamics")
plt.show()

In [None]:
# Log-price increments for every sampling interval (leading NaN removed).
diffs = {
    interval: sampled["log_prices"].diff().dropna()
    for interval, sampled in prices_variable_freq.items()
}

In [None]:
# np.cov of each increment series, keyed by sampling interval.
# NOTE(review): on a single 1-D input np.cov is the sample variance, not a
# lagged autocovariance — confirm this is the quantity intended here.
covariances = {
    interval: np.cov(increments) for interval, increments in diffs.items()
}

In [None]:
# Covariance value against the sampling interval label.
plt.figure(figsize=(20, 10))
plt.plot(list(covariances), list(covariances.values()), "r+-")
plt.gca().set_title("Covariance dynamics")
plt.show()

# Microstructure noise

In [None]:
# Log return = first difference of the log price, ln(p_t) - ln(p_{t-1}).
# Taking the log of a shifted price difference, log(10 + diff), is not a
# return and is NaN whenever the price falls by 10 or more between rows.
prices["log_returns"] = np.log(prices["price"]).diff()

In [None]:
# Line chart of the log-return column over time.
plt.figure(figsize=(20, 10))
plt.plot(prices["date_time"], prices["log_returns"])
plt.gca().set(title="Log returns", xlabel="Date", ylabel="Log returns")
plt.show()

In [None]:
# Autocorrelation of the log-return column for up to 100 lags; NaN handling is
# delegated to statsmodels' "conservative" mode.
acf = sm.tsa.acf(prices["log_returns"], missing="conservative", nlags=100)
# Bare expression: rendered as the cell output in a notebook.
acf

In [None]:
# Stem plot of the autocorrelations from lag 2 onwards (lags 0 and 1 are left out).
plt.figure(figsize=(20, 10))
plt.stem(range(2, len(acf)), acf[2:])
plt.gca().set(
    title="Autocorrelation Function (ACF): {} values".format(len(acf)),
    xlabel="Lag",
    ylabel="Autocorrelation",
)
plt.show()

# Microstructure noise size

In [None]:
# Rows dated on or after 1 January of the previous calendar year.
last_year = prices[prices["date_time"] >= datetime(datetime.today().year - 1, 1, 1)]
# Bare expression: rendered as the cell output in a notebook.
last_year.describe()

In [None]:
# Per-day statistics over `last_year`: for every calendar date, store
# (sum of squared first differences of that day's "log_returns", row count).
# Grouping once replaces a full-column comparison per day, and the loop
# variable is no longer called `day`, which used to overwrite the single-day
# DataFrame of the same name.
# NOTE(review): differencing a return series yields second differences of the
# log price — confirm this is the intended input.
volatility_per_day = {}

# sort=False keeps the dates in their order of first appearance.
returns_by_date = last_year.groupby(last_year["date_time"].dt.date, sort=False)[
    "log_returns"
]
for trading_date, data in returns_by_date:
    volatility_per_day[trading_date] = (
        np.sum(data.diff().dropna() ** 2),
        len(data),
    )

In [None]:
# Daily noise size: sqrt(squared_sum / (2 * row_count)) for each date.
microsturcture_noise_size = {
    date: np.sqrt(1 / (2 * row_count) * squared_sum)
    for date, (squared_sum, row_count) in volatility_per_day.items()
}

In [None]:
# Daily noise size plotted against the date.
plt.figure(figsize=(20, 10))
plt.plot(
    list(microsturcture_noise_size),
    list(microsturcture_noise_size.values()),
    "r+-",
)
plt.gca().set(
    title="Microstructure noise size per day",
    xlabel="Date",
    ylabel="Microstructure noise size",
)
plt.show()

In [None]:
# Histogram of the daily noise sizes with a normal density overlaid, using the
# sample mean and standard deviation as its parameters.
# The values, their maximum, their moments and the evaluation grid are each
# computed once; a plain list (not a dict view) is handed to plt.hist.
noise_sizes = list(microsturcture_noise_size.values())
largest = max(noise_sizes)
noise_mean = np.mean(noise_sizes)
noise_std = np.std(noise_sizes)
grid = np.linspace(0, largest, 1000)

plt.figure(figsize=(20, 10))
plt.title("Histogram of microstructure noise size")
plt.xlabel("Microstructure noise size")
plt.ylabel("Frequency")
plt.axis(xmin=0, xmax=largest)
plt.hist(noise_sizes, bins=100, density=True, label="Data")
plt.plot(
    grid,
    norm.pdf(grid, loc=noise_mean, scale=noise_std),
    label="Fitted curve",
)
print(
    "Mean:",
    noise_mean,
    "\nStandard deviation:",
    noise_std,
)
plt.legend()
plt.show()

# Estimated daily volatility

In [None]:
# First component of each per-day tuple (the squared-difference sum) by date.
plt.figure(figsize=(20, 10))
plt.plot(
    list(volatility_per_day),
    [entry[0] for entry in volatility_per_day.values()],
    "g+-",
)
plt.gca().set(title="Volatility per day", xlabel="Date", ylabel="Volatility")
plt.show()