In this notebook we compare the statistical properties for the following bar sampling techniques:

- Time bars
- Volume bars
- Dollar bars

We look into the following statistical properties:

- Normality of (log) returns.
- Serial correlation.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import mlfinlab as ml
from alpha_vantage.timeseries import TimeSeries
%matplotlib inline



In [2]:
#set credentials for stock API
ts = TimeSeries(key = 'PTME3FLJAHB9CY', output_format = 'pandas')

In [34]:
#define global variables
DATE = "date"
PRICE = "4. close"
VOLUME = "5. volume"

In [4]:
#load  daily sample data
df, df_meta = ts.get_daily_adjusted(symbol = 'MSFT', outputsize = 'full')

In [27]:
#load minutely sample data
df, df_meta = ts.get_intraday(symbol = "MSFT", interval = "1min", outputsize = "full")

In [31]:
df

Unnamed: 0_level_0,1. open,2. high,3. low,4. close,5. volume
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-01-24 16:00:00,164.8900,165.0600,164.870,165.0400,511882.0
2020-01-24 15:59:00,164.9800,165.0000,164.860,164.8900,385888.0
2020-01-24 15:58:00,165.1100,165.1700,164.960,164.9600,259380.0
2020-01-24 15:57:00,165.1150,165.1800,165.095,165.1050,102700.0
2020-01-24 15:56:00,165.1801,165.2300,165.080,165.1100,168237.0
...,...,...,...,...,...
2020-01-17 09:35:00,166.6800,166.6800,166.680,166.6800,142245.0
2020-01-17 09:34:00,166.8500,166.9000,166.550,166.5877,172559.0
2020-01-17 09:33:00,166.9000,166.9475,166.720,166.8400,145436.0
2020-01-17 09:32:00,167.1200,167.1599,166.880,166.8801,205048.0


In [28]:
def get_sample_df(df, date_col, price_col, vol_col):
    df = df.reset_index()[[date_col, price_col, vol_col]]
    return df

In [29]:
def returns(df, price_col, log = False):
    if log:
        df[price_col] = np.log(df[price_col])
    
    df["return"] = df[price_col].diff(-1)
    return df

In [93]:
sample_df = get_sample_df(df, DATE, PRICE, VOLUME)

In [103]:
vol_bars = ml.data_structures.get_volume_bars(sample_df, 200000)
vol_bars.shape

Reading data in batches:
Batch number: 0
Returning bars 



(367, 10)

In [105]:
dollar_bars = ml.data_structures.get_dollar_bars(sample_df, threshold = 20000000)
dollar_bars.shape

Reading data in batches:
Batch number: 0
Returning bars 



(558, 10)

In [107]:
fig, ax = plt.subplots(figsize = (10, 6))
ax.plot(vol_bars["date_time"], vol_bars["close"], "o")
ax.plot(dollar_bars["date_time"], dollar_bars["close"], "x")
ax.plot(sample_df[PRICE])

[<matplotlib.lines.Line2D at 0x134402b10>]

ValueError: view limit minimum -36872.433333333334 is less than 1 and is an invalid Matplotlib date value. This often happens if you pass a non-datetime value to an axis that has datetime units

In [None]:
ax.plot()