In [None]:
# Importing the necessary libraries
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import aiohttp
import json
import asyncio

# Visualizing the data

In [None]:
async def get_data(year: int) -> pd.DataFrame:
    """Download one calendar year of price records and return them as a DataFrame.

    Args:
        year: Calendar year to request; the query spans ``<year>-01-01`` to
            ``<year>-12-31``.

    Returns:
        A DataFrame built from the decoded JSON payload, without the
        ``volume`` column.

    Raises:
        Exception: If the server answers with a status other than 200.
    """
    url = (
        "https://high-frequency-data.shriimpe.fr/api/data/price"
        f"?start_date={year}-01-01&end_date={year}-12-31"
    )

    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            if response.status != 200:
                raise Exception(f"Error fetching data from {url}")
            payload = await response.text()

    records = json.loads(payload)
    # errors="ignore": if the payload holds no records, the resulting frame has
    # no "volume" column and a strict drop would raise KeyError instead of
    # returning the (empty) frame.
    return pd.DataFrame(records).drop(columns=["volume"], errors="ignore")


dfs = await asyncio.gather(*[get_data(year) for year in range(2009, 2024)])

In [None]:
# Stack the yearly frames returned by the download step into a single table.
df = pd.concat(dfs)
# JSON cannot carry datetime objects, so parse the timestamps explicitly: later
# cells compare the index with `datetime` values and subtract index entries
# from each other.
# NOTE(review): assumes `date_time` holds date strings pandas can parse —
# confirm against the API payload.
df["date_time"] = pd.to_datetime(df["date_time"])
df = df.set_index("date_time")
df

In [None]:
# Plot the tick price over the whole sample
fig, ax = plt.subplots(figsize=(30, 15))
ax.plot(df.index, df["price"], marker=".", linestyle="-", color="b", label="price")
ax.set_title("Evolution of the price of the ticker S&P500 Value Index (IVE)")
ax.set_xlabel("Date")
ax.set_ylabel("Close price")
ax.grid(True)
# One tick on each 1 January from 2009 to 2023. Stepping by a fixed 365 days
# would slide the ticks off 1 January once a leap year (2012) is crossed.
date_list = [datetime(year, 1, 1) for year in range(2009, 2024)]
ax.set_xticks(date_list)
ax.legend(fontsize=20)
fig.savefig("Image_1.png")
plt.show()

### Outliers 

We notice an abnormal price drop on one specific date in 2010.

The dates and prices for this time range are:

In [None]:
# Show every tick priced at or below 28
df.loc[df["price"] <= 28]

We notice that this anomaly is only present on 2010-05-06 at 2:00 PM, when prices move from more than $20 to $1.10.

To be sure about this we take into consideration another source of financial data: Yahoo Finance. 

In [None]:
import yfinance as yf

# Price history for IVE (S&P 500 Value Index ticker symbol) from Yahoo Finance
ticker_symbol = "IVE"
sp500_value_index = yf.Ticker(ticker_symbol)
data_yfinance = sp500_value_index.history(period="20Y")

# Round-trip the index through "%Y-%m-%d" strings so that only the calendar
# date of each row is kept
data_yfinance.index = pd.to_datetime(
    data_yfinance.index.strftime("%Y-%m-%d"), format="%Y-%m-%d"
)

# Keep 2009-09-28 through 2023-11-06 (both bounds included), then discard the
# columns this analysis does not use
data_yfinance = data_yfinance.loc[
    (data_yfinance.index >= datetime(2009, 9, 28))
    & (data_yfinance.index <= datetime(2023, 11, 6))
].drop(columns=["Dividends", "Stock Splits", "Capital Gains"])

data_yfinance

In [None]:
# Plot the Yahoo Finance close price over time
fig, ax = plt.subplots(figsize=(30, 15))
ax.plot(
    data_yfinance.index,
    data_yfinance["Close"],
    marker="",
    linestyle="-",
    color="r",
    label="Close price",
)
ax.set_title(
    "Evolution of the Close price of the ticker S&P500 Value Index from Yahoo Finance"
)
ax.set_xlabel("Date")
ax.set_ylabel("Close price")
ax.grid()
# Tick positions are datetime objects, the same type as the x data, one per
# 1 January from 2009 to 2023. Stepping by a fixed 365 days would slide the
# ticks off 1 January once a leap year (2012) is crossed.
date_list = [datetime(year, 1, 1) for year in range(2009, 2024)]
ax.set_xticks(date_list)
ax.legend(fontsize=20)
fig.savefig("Image_yfinance_IVE.png")
plt.show()

In [None]:
# Plot the prices from the two data sources on the same figure
fig, ax = plt.subplots(figsize=(30, 15))
ax.plot(
    data_yfinance.index,
    data_yfinance["Close"],
    marker="",
    linestyle="-",
    color="r",
    label="Close price",
)
ax.plot(df.index, df["price"], marker=".", linestyle="-", color="b", label="price")
ax.set_title(
    "Evolution of the Close price of the ticker S&P500 Value Index from Yahoo Finance and from Tick Data"
)
ax.set_xlabel("Date")
ax.set_ylabel("Close price")
ax.grid()
# Tick positions are datetime objects, the same type as the x data, one per
# 1 January from 2009 to 2023. Stepping by a fixed 365 days would slide the
# ticks off 1 January once a leap year (2012) is crossed.
date_list = [datetime(year, 1, 1) for year in range(2009, 2024)]
ax.set_xticks(date_list)
ax.legend(fontsize=20)
fig.savefig("comparison_yfinance_data.png")
plt.show()

We take a look into the data from Yahoo finance for the specific date of the outlier:

In [None]:
# Yahoo Finance row(s) dated 2010-05-06
data_yfinance.loc[data_yfinance.index == "2010-05-06"]

We observe that for this same date, the data from Yahoo Finance indicates a low price of $21.50. Therefore, we treat the abnormally low tick prices as outliers and remove them: in practice, the next cell drops every tick priced at or below $28.

In [None]:
# Flag the ticks priced at or below 28 and keep their timestamps for reference
is_outlier = (df["price"] <= 28).to_numpy()
index_to_drop = df.index[is_outlier].to_list()
# Filter row by row instead of dropping by index label: a label-based drop
# would also remove valid ticks that share a timestamp with an outlier.
# Rebinding `df` (no in-place mutation) keeps the cell safe to re-run.
df = df.loc[~is_outlier]

We plot the price chart again now that the outliers have been removed:

In [None]:
# Plot the tick price again, this time on the cleaned data
fig, ax = plt.subplots(figsize=(30, 15))
ax.plot(df.index, df["price"], marker=".", linestyle="-", color="b", label="price")
ax.set_title(
    "Evolution of the price of the ticker S&P500 Value Index (IVE) without the outliers"
)
ax.set_xlabel("Date")
ax.set_ylabel("Close price")
ax.grid(True)
# One tick on each 1 January from 2009 to 2023. Stepping by a fixed 365 days
# would slide the ticks off 1 January once a leap year (2012) is crossed.
date_list = [datetime(year, 1, 1) for year in range(2009, 2024)]
ax.set_xticks(date_list)
ax.legend(fontsize=20)
fig.savefig("Image_1_without_outliers.png")
plt.show()

### Focusing on the last year 

In [None]:
# Restrict both sources to observations dated 1 January 2022 or later
df_last_y = df.loc[df.index >= datetime(2022, 1, 1)]
data_yfinance_last_y = data_yfinance.loc[
    data_yfinance.index >= datetime(2022, 1, 1)
]

In [None]:
# Overlay the tick prices and the Yahoo Finance close for the restricted period
fig, ax = plt.subplots(figsize=(30, 15))
ax.plot(df_last_y.index, df_last_y["price"], linestyle="-", color="b", label="price")
ax.plot(
    data_yfinance_last_y.index,
    data_yfinance_last_y["Close"],
    linestyle="-",
    color="r",
    label="price YFinance",
)
ax.set_title(
    "Evolution of the price of the ticker S&P500 Value Index (IVE) from last year until today (Yahoo Finance)"
)
ax.set_xlabel("Date")
ax.set_ylabel("Close price")
ax.grid(True)
# No custom x ticks here: matplotlib picks the tick positions itself
ax.legend(fontsize=20)
fig.savefig("compare_yfinance_last_year_to_today.png")
plt.show()

# Realized Volatility 

### Returns for different observations frequency 

- Réaliser une fonction qui renvoie les retours (returns) selon les différentes fréquences d'observation
- Réaliser différentes fonctions qui calculent la volatilité selon différentes méthodes de calcul
- Déterminer la long-range volatility estimation
- Déterminer la microstructure noise de différentes façons
- La calculer pour différents intervalles et trouver un pattern
- Tracer la daily volatility pour l'année passée

In [None]:
# Sampling intervals taken into consideration for the return series.
# Each entry is a duration string written in the same form as the one that
# `returns` passes to pd.Timedelta.
observation_frequencies = [
    "30 seconds",
    "1 minutes",
    "3 minutes",
    "5 minutes",
    "10 minutes",
    "15 minutes",
    "1 hours",
    "1 days",
]


# Build a return series sampled at (at least) the requested spacing
def returns(frequency, df):
    """Sample ``df`` at the requested spacing and add simple returns.

    Starting from the first row, a row is kept as soon as its timestamp is at
    least ``frequency`` later than the previously kept row. ``Returns`` is the
    fractional change of ``price`` between consecutive kept rows.

    Args:
        frequency: Minimum spacing between kept rows, as a string accepted by
            ``pd.Timedelta`` (for example ``"30 seconds"`` or ``"1 days"``).
        df: Frame with a ``price`` column and an index of timestamps whose
            differences can be compared with a ``pd.Timedelta``.

    Returns:
        A new DataFrame holding the kept rows plus a ``Returns`` column (NaN on
        the first kept row). ``df`` itself is left untouched; an empty ``df``
        yields an empty result.
    """
    min_gap = pd.Timedelta(frequency)

    kept_positions = []
    last_kept = None
    for position, timestamp in enumerate(df.index):
        if last_kept is None or timestamp - last_kept >= min_gap:
            kept_positions.append(position)
            last_kept = timestamp

    # Select by position rather than by label: with repeated timestamps a label
    # lookup would pull in every row sharing a kept timestamp. The copy makes
    # sure the new column is not written into a slice of the caller's frame.
    partial_df = df.iloc[kept_positions].copy()
    partial_df["Returns"] = partial_df["price"].pct_change()

    return partial_df

In [None]:
# Returns sampled at the first (shortest) observation frequency
frequency = observation_frequencies[0]
partial_df = returns(frequency, df)
fig, ax = plt.subplots(figsize=(30, 15))
ax.plot(partial_df.index, partial_df["Returns"], label="Returns")
# The title is built from `frequency` so it always names the selected frequency
ax.set_title(
    f"Returns of the ticker S&P500 Value Index (IVE) with a frequency of {frequency}"
)
ax.set_xlabel("Date")
ax.set_ylabel("Returns")
ax.grid(True)
ax.legend(fontsize=15)
plt.show()

### Volatility based on different time range

In [None]:
# Window lengths available for bucketing observations, keyed by label.
# Months and years are fixed-length approximations (30, 90, 180 and 365 days).
time_range = {
    "1 days": pd.Timedelta(days=1),
    "1 weeks": pd.Timedelta(weeks=1),
    "1 months": pd.Timedelta(days=30),
    "3 months": pd.Timedelta(days=90),
    "6 months": pd.Timedelta(days=180),
    "1 years": pd.Timedelta(days=365),
}

In [None]:
# Evenly spaced window boundaries between two dates
def date_range(start_date, end_date, time_r):
    """Return dates spaced by ``time_range[time_r]``, starting at ``start_date``.

    Dates are appended until one is no longer earlier than ``end_date``, so the
    last element is the first date at or after ``end_date`` (or ``start_date``
    itself when it is not earlier than ``end_date``).

    Args:
        start_date: First date of the list.
        end_date: Date the list must reach or pass.
        time_r: Key into the module-level ``time_range`` mapping that selects
            the step.

    Returns:
        The list of dates, in increasing order.
    """
    step = time_range[time_r]
    dates = [start_date]
    while dates[-1] < end_date:
        dates.append(dates[-1] + step)
    return dates

In [None]:
# Standard deviation of the returns inside consecutive windows of a given length
def volatility(time_r, df_with_returns):
    """Compute the standard deviation of ``Returns`` per consecutive window.

    Window boundaries come from ``date_range`` between the first and the last
    index entry of ``df_with_returns``; each window includes its left boundary
    and excludes its right one.

    Args:
        time_r: Window length label, forwarded to ``date_range``.
        df_with_returns: Frame that already holds a ``Returns`` column.

    Returns:
        A tuple ``(values, date_list)`` where ``values[i]`` is the standard
        deviation of the returns falling in ``[date_list[i], date_list[i + 1])``.
    """
    timestamps = df_with_returns.index
    date_list = date_range(timestamps[0], timestamps[-1], time_r)

    window_stds = [
        df_with_returns[(timestamps >= window_start) & (timestamps < window_end)][
            "Returns"
        ].std()
        for window_start, window_end in zip(date_list[:-1], date_list[1:])
    ]
    return (window_stds, date_list)

In [None]:
# Standard deviation of the sampled returns within consecutive one-day windows
daily_volatility, date_list = volatility("1 days", partial_df)

In [None]:
fig, ax = plt.subplots(figsize=(30, 15))
# The returns are fractional changes, so their standard deviation is scaled by
# 100 to match the "(%)" unit announced on the y axis.
daily_volatility_pct = np.asarray(daily_volatility, dtype=float) * 100
ax.plot(date_list[:-1], daily_volatility_pct, label="Daily Volatility")
# The title is built from `frequency`, the sampling frequency of `partial_df`
ax.set_title(
    f"Daily Volatility of the ticker S&P500 Value Index (IVE) with a frequency of {frequency}"
)
ax.set_xlabel("Date")
ax.set_ylabel("Volatility (%)")
ax.grid(True)
ax.legend(fontsize=15)
plt.show()

### Long-range volatility estimation 

In [None]:
# Windowed volatility computed from returns requested at a "1 days" frequency
def long_range_volatility(long_range_time, df):
    """Compute windowed volatility from returns requested at a daily frequency.

    Args:
        long_range_time: Window length label, forwarded to ``volatility``.
        df: Price frame, forwarded to ``returns`` together with the frequency
            ``"1 days"``.

    Returns:
        Whatever ``volatility`` returns for the resulting frame: a tuple made of
        the per-window standard deviations and the window boundaries.
    """
    return volatility(long_range_time, returns("1 days", df))

In [None]:
# Volatility over consecutive "1 months" windows.
# NOTE(review): this rebinds `daily_volatility` and `date_list`, replacing the
# one-day results computed earlier — consider dedicated names.
daily_volatility, date_list = long_range_volatility("1 months", df)

# Market Noise Microstructure