In [40]:
"""
Implementation of a simple database by using hierarchical data format (HDF5)
"""

'\nImplementation of a simple database by using hierarchical data format (HDF5)\n'

In [41]:
import pandas as pd
import numpy as np
from pandas_datareader import data
import matplotlib.pyplot as plt
import h5py

In [42]:
# define a function to load our financial data and save it in a pickle file
# pickle file is fast, but it will occupy much space
# so we try h5 file
def load_financial_data(start_date,end_date,output_file):
    try:
        df = pd.read_pickle(output_file)
    except FileNotFoundError:
        df = data.DataReader('AMZN','yahoo',start_date,end_date)
        df.to_pickle(output_file)
    return df

In [43]:
amzn_data = load_financial_data(start_date = '2000-01-01',end_date = '2020-01-01',output_file = 'amzn.pkl')

In [44]:
# we transform the dataframe to hierarchical data format
# second parameter: identify for the group ---> 'amzn_data': key of the h5 file
amzn_data.to_hdf('amzn_data.h5','amzn_data',mode = 'w',format = 'table',data_columns = True) # save for data columns

  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)


In [45]:
# we load our data from h5 file
# and check whether we can convert it into a dataframe
amzn_data_from_h5 = h5py.File('amzn_data.h5')

In [46]:
# we check the keys
print(list(amzn_data_from_h5['amzn_data'].keys()))

['_i_table', 'table']


In [47]:
amzn_data_from_h5['amzn_data']['_i_table'][:] 
# the first key is empty as we check
# it may relate to how pandas work with h5py

AttributeError: 'slice' object has no attribute 'encode'

In [31]:
# use 'amzn_data' as a key
# we can check how our data is saved
# the index is the timeframe in seconds
amzn_data_from_h5['amzn_data']['table'][:]

array([( 946857600000000000,  4.4781251 ,  3.95234394,  4.07499981,  4.46875   , 322352000,  4.46875   ),
       ( 946944000000000000,  4.57499981,  4.0875001 ,  4.26875019,  4.09687519, 349748000,  4.09687519),
       ( 947030400000000000,  3.7562499 ,  3.4000001 ,  3.5250001 ,  3.48749995, 769148000,  3.48749995),
       ...,
       (1577404800000000000, 95.06999969, 93.30049896, 94.14600372, 93.48999786, 123732000, 93.48999786),
       (1577664000000000000, 94.19999695, 92.03099823, 93.69999695, 92.34449768,  73494000, 92.34449768),
       (1577750400000000000, 92.66300201, 91.6115036 , 92.09999847, 92.39199829,  50130000, 92.39199829)],
      dtype=[('index', '<i8'), ('High', '<f8'), ('Low', '<f8'), ('Open', '<f8'), ('Close', '<f8'), ('Volume', '<i8'), ('Adj Close', '<f8')])

In [21]:
# then we check attributes in alphabet order
for attribute in amzn_data_from_h5['amzn_data']['table'].attrs.items(): # return a tuple
    print(attribute)

('Adj Close_dtype', b'float64')
('Adj Close_kind', b'(lp0\nVAdj Close\np1\na.')
('Adj Close_meta', b'N.')
('CLASS', b'TABLE')
('Close_dtype', b'float64')
('Close_kind', b'(lp0\nVClose\np1\na.')
('Close_meta', b'N.')
('FIELD_0_FILL', 0)
('FIELD_0_NAME', b'index')
('FIELD_1_FILL', 0.0)
('FIELD_1_NAME', b'High')
('FIELD_2_FILL', 0.0)
('FIELD_2_NAME', b'Low')
('FIELD_3_FILL', 0.0)
('FIELD_3_NAME', b'Open')
('FIELD_4_FILL', 0.0)
('FIELD_4_NAME', b'Close')
('FIELD_5_FILL', 0)
('FIELD_5_NAME', b'Volume')
('FIELD_6_FILL', 0.0)
('FIELD_6_NAME', b'Adj Close')
('High_dtype', b'float64')
('High_kind', b'(lp0\nVHigh\np1\na.')
('High_meta', b'N.')
('Low_dtype', b'float64')
('Low_kind', b'(lp0\nVLow\np1\na.')
('Low_meta', b'N.')
('NROWS', 5031)
('Open_dtype', b'float64')
('Open_kind', b'(lp0\nVOpen\np1\na.')
('Open_meta', b'N.')
('TITLE', Empty(dtype=dtype('S1')))
('VERSION', b'2.7')
('Volume_dtype', b'int64')
('Volume_kind', b'(lp0\nVVolume\np1\na.')
('Volume_meta', b'N.')
('index_kind', b'datetime6

In [None]:
# note: 
# 1. the probability of getting currpt data is high 
# 2. limited features ---> cannot remove arrays ---> doesn't not allow deletion
# 3. low performance ---> it doesn't use system caching (well, it is indeed quite slow)