In [96]:
import gzip
import json
import os
from pathlib import Path

import h5py
import matplotlib.pyplot as plt
import pandas as pd
from tqdm.notebook import tqdm

import datamine.io as dm

In [82]:
# Directory holding the daily gzipped CME crypto index files.
path = Path("/media/hemu/Data/Markets/Crypto/CME/CRYPTOCURRENCY")

# Sorted directory listing; preview the first five entries, one per line.
files = sorted(path.iterdir())
print('\n'.join(str(entry) for entry in files[:5]))

/media/hemu/Data/Markets/Crypto/CME/CRYPTOCURRENCY/20170407_btcIndexJson.gz
/media/hemu/Data/Markets/Crypto/CME/CRYPTOCURRENCY/20170408_btcIndexJson.gz
/media/hemu/Data/Markets/Crypto/CME/CRYPTOCURRENCY/20170409_btcIndexJson.gz
/media/hemu/Data/Markets/Crypto/CME/CRYPTOCURRENCY/20170410_btcIndexJson.gz
/media/hemu/Data/Markets/Crypto/CME/CRYPTOCURRENCY/20170411_btcIndexJson.gz


In [3]:
# pd.read_json(files[0], )

In [4]:
# Credentials are read from the environment so they never end up in the notebook
# or its history. Set CME_DATAMINE_USERNAME / CME_DATAMINE_PASSWORD before running.
# NOTE(review): the password previously hard-coded here should be rotated.
myDatamine = dm.DatamineCon(
    username=os.environ['CME_DATAMINE_USERNAME'],
    password=os.environ['CME_DATAMINE_PASSWORD'],
    path='/media/hemu/Data/Markets/Crypto/CME/',
)

In [5]:
# Inspect the DataMine client's `data_catalog` attribute.
cat = myDatamine.data_catalog
print(cat)

{}


In [6]:
# Load the CRYPTOCURRENCY dataset through the DataMine client with download=True
# and limit=10, then print whatever it returns.
dat = myDatamine.load_dataset("CRYPTOCURRENCY", download=True, limit=10, dataset_args={})
print(dat)

downloading CRYPTOCURRENCY data: 0it [00:00, ?it/s]
limiting to 10/997 files
reading CRYPTOCURRENCY data: 100%|██████████| 10/10 [00:03<00:00,  2.70it/s]
concatenating 10 dataframes


                          symbol    rptSeq  mdEntryPx   mdEntryTime  \
mdEntryDateTime                                                       
2017-04-21 00:00:00+00:00   BRTI  17434413    1235.64  00:00:00.000   
2017-04-21 00:00:01+00:00   BRTI  17434414    1235.66  00:00:01.000   
2017-04-21 00:00:02+00:00   BRTI  17434415    1235.67  00:00:02.000   
2017-04-21 00:00:03+00:00   BRTI  17434416    1235.63  00:00:03.000   
2017-04-21 00:00:04+00:00   BRTI  17434417    1235.67  00:00:04.000   
...                          ...       ...        ...           ...   
2017-04-30 23:59:54+00:00   BRTI  18298407    1363.34  23:59:54.000   
2017-04-30 23:59:55+00:00   BRTI  18298408    1363.52  23:59:55.000   
2017-04-30 23:59:56+00:00   BRTI  18298409    1363.46  23:59:56.000   
2017-04-30 23:59:57+00:00   BRTI  18298410    1363.50  23:59:57.000   
2017-04-30 23:59:58+00:00   BRTI  18298411    1363.16  23:59:58.000   

                          mdUpdateAction openCloseSettlFlag  netChgPrevDay  

In [210]:
# Column dtype specification consumed by process_raw_df: each key is a dtype string and
# each value is a column name or a tuple of column names. A key of the form
# 'date:<format>' is handled as a datetime conversion, with the text after 'date:'
# passed as the parsing format.
DTYPES = {'category': ('mdEntryCode', 'mdEntryType', 'mdUpdateAction',
                       'symbol', 'openCloseSettlFlag'),
          'int64': ('rptSeq',),
          'float': ('netChgPrevDay', 'netPctChg', 'mdEntryPx'),
          'date:%Y%m%d_%H:%M:%S.%f': 'mdEntryDateTime'}

def read_raw_df(filename):
    """Read a gzipped JSON-lines file into a DataFrame of market-data entries.

    Every line is parsed as JSON. Lines without an 'mdEntries' key are skipped;
    for the others only the first element of 'mdEntries' is kept. The
    'mdEntryDate' and 'mdEntryTime' columns are joined with '_' into a new
    'mdEntryDateTime' string column and then dropped.

    Parameters
    ----------
    filename : path-like
        Path of the gzip-compressed file to read.

    Returns
    -------
    pandas.DataFrame
        One row per kept entry, with 'mdEntryDateTime' as the last column.
    """
    with gzip.open(filename, 'rt', encoding='utf-8') as handle:
        messages = (json.loads(raw) for raw in handle)
        entries = [msg['mdEntries'][0] for msg in messages if 'mdEntries' in msg]
    frame = pd.DataFrame(entries)
    frame['mdEntryDateTime'] = frame['mdEntryDate'] + '_' + frame['mdEntryTime']
    return frame.drop(columns=['mdEntryDate', 'mdEntryTime'])
    
def process_raw_df(df, dtypes):
    """Cast the columns of `df` according to a dtype specification (best effort).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to convert. It is modified in place and also returned.
    dtypes : dict
        Maps a dtype string to a column name or an iterable of column names.
        A key starting with 'date' triggers datetime parsing (UTC): plain
        'date' lets pandas infer the format, anything else passes the text
        after the first five characters (i.e. after 'date:') as the format.
        Every other key is handed to `Series.astype`.

    Returns
    -------
    pandas.DataFrame
        The same `df` object. Columns named in `dtypes` but absent from `df`
        are skipped, and a column that cannot be converted keeps its
        original values.
    """
    for dtype, cols in dtypes.items():
        col_names = (cols,) if isinstance(cols, str) else cols
        is_date = dtype.startswith('date')
        date_format = None if dtype == 'date' else dtype[5:]
        for col in col_names:
            if col not in df:
                continue
            if is_date:
                # Explicit stand-in for the deprecated errors='ignore' option:
                # a column that cannot be parsed is deliberately left as-is.
                try:
                    df[col] = pd.to_datetime(df[col], format=date_format, utc=True)
                except (ValueError, TypeError):
                    pass
            else:
                df[col] = df[col].astype(dtype, errors='ignore')
    return df

def load_df(filename, dtypes):
    """Read a raw CME file with read_raw_df and cast its columns with process_raw_df.

    Parameters
    ----------
    filename : path-like
        Gzipped JSON-lines file to read.
    dtypes : dict
        Dtype specification forwarded to process_raw_df.

    Returns
    -------
    pandas.DataFrame
        The converted frame.
    """
    return process_raw_df(read_raw_df(filename), dtypes)

def partition_df(df):
    """Split `df` into one sub-frame per distinct value of its 'symbol' column.

    Returns
    -------
    dict
        Maps each symbol, in order of first appearance, to the rows of `df`
        whose 'symbol' equals it. The original index is preserved.
    """
    symbol_col = df['symbol']
    return {sym: df[symbol_col == sym] for sym in symbol_col.unique()}

def bounds_ok(df, hd_filename, key):
    """Check that `df` starts strictly after the data already stored under `key`.

    Assumes `df` is in CME format (has a datetime 'mdEntryDateTime' column) and
    that the table stored under `key` has an integer 'timestamps' column.

    Parameters
    ----------
    df : pandas.DataFrame
        New data about to be appended.
    hd_filename : path-like
        HDF5 store to check against.
    key : str
        Key of the table inside the store, with or without a leading '/'.

    Returns
    -------
    bool
        True when the file does not exist, the key is absent, the stored table
        or `df` is empty, or the first timestamp of `df` is greater than the
        last stored timestamp. False otherwise.
    """
    hd_filename = Path(hd_filename)
    if df.empty or not hd_filename.is_file():
        return True
    with pd.HDFStore(hd_filename, mode='r') as store:
        # HDFStore.keys() reports names with a leading '/', so comparing a bare
        # 'group/name' against that list never matches; `in store` accepts
        # both spellings.
        if key not in store:
            return True
        nrows = store.get_storer(key).nrows
        if not nrows:
            return True
        # Read exactly the last stored row.
        last_stored = store.select(key, start=nrows - 1, stop=nrows)['timestamps'].iloc[0]
    df_start = df['mdEntryDateTime'].iloc[0].value
    return bool(df_start > last_stored)

def save_to_hdf(df, hd_filename, key):
    """Append the timestamps and prices of `df` to an HDF5 table.

    Parameters
    ----------
    df : pandas.DataFrame
        CME-format frame with 'mdEntryDateTime' (datetime) and 'mdEntryPx' columns.
    hd_filename : path-like
        HDF5 store to append to (opened in append mode).
    key : str
        Key of the table inside the store.

    Raises
    ------
    IndexError
        If bounds_ok reports that `df` does not start after the stored data.
    """
    if not bounds_ok(df, hd_filename, key):
        # Report the combined timestamp column: frames produced by read_raw_df
        # no longer have a separate 'mdEntryTime' column.
        raise IndexError("Overlapped index", df['mdEntryDateTime'].iloc[0])
    # Explicit 64-bit width so the integer timestamps do not depend on the
    # platform's default `int` size.
    _df = pd.DataFrame({'timestamps': df['mdEntryDateTime'].astype('int64'),
                        'price': df['mdEntryPx']})
    _df.to_hdf(hd_filename, key=key, mode='a', append=True, format='table')
    
def save_partitions_to_hdf(dfs, hd_filename, key):
    """Save every frame in `dfs` under '<key>/<name>' in the HDF5 store.

    Parameters
    ----------
    dfs : dict
        Maps a partition name to its DataFrame.
    hd_filename : path-like
        HDF5 store passed on to save_to_hdf.
    key : str
        Key prefix shared by all partitions.
    """
    for partition_name, partition in dfs.items():
        save_to_hdf(partition, hd_filename, '/'.join((key, partition_name)))
    
def load_from_hdf(hd_filename, key):
    """Read the table stored under `key` and convert 'timestamps' to datetimes.

    Parameters
    ----------
    hd_filename : path-like
        HDF5 store to read.
    key : str
        Key of the table inside the store.

    Returns
    -------
    pandas.DataFrame
        The stored table with its 'timestamps' column passed through
        `pd.to_datetime`.
    """
    stored = pd.read_hdf(hd_filename, key=key)
    return stored.assign(timestamps=pd.to_datetime(stored['timestamps']))
    
def cme_format_to_hdf(cme_filename, hd_filename, key):
    """Convert one raw CME file and append its per-symbol data to the HDF5 store.

    The file is loaded with the module-level DTYPES specification, split by
    symbol, and each partition is saved under '<key>/<symbol>'.
    """
    frame = load_df(cme_filename, DTYPES)
    per_symbol = partition_df(frame)
    save_partitions_to_hdf(per_symbol, hd_filename, key)

In [211]:
def convert_all_to_hdf(files, hd_filename, key):
    """Run cme_format_to_hdf on each entry of `files`, in order, with a progress bar.

    Parameters
    ----------
    files : iterable of path-like
        Raw CME files to convert.
    hd_filename : path-like
        Destination HDF5 store.
    key : str
        Key prefix under which the data is saved.
    """
    for cme_file in tqdm(files):
        cme_format_to_hdf(cme_file, hd_filename, key)
            

In [212]:
# Smoke-test the pipeline on the first file: load it, split it by symbol,
# list the symbols found and display the BRTI partition.
df = load_df(files[0], DTYPES)
dfs = partition_df(df)
print(dfs.keys())
dfs['BRTI']

dict_keys(['BRTI', 'BRR'])


Unnamed: 0,symbol,rptSeq,mdEntryType,mdEntryPx,mdUpdateAction,openCloseSettlFlag,netChgPrevDay,netPctChg,mdEntryCode,mdEntryDateTime
0,BRTI,16224820,INDEX_VALUE,1190.52,NEW,,,,,2017-04-07 00:00:00+00:00
1,BRTI,16224821,INDEX_VALUE,1190.52,NEW,,,,,2017-04-07 00:00:01+00:00
2,BRTI,16224822,INDEX_VALUE,1190.60,NEW,,,,,2017-04-07 00:00:02+00:00
3,BRTI,16224823,INDEX_VALUE,1190.53,NEW,,,,,2017-04-07 00:00:03+00:00
4,BRTI,16224824,INDEX_VALUE,1190.55,NEW,,,,,2017-04-07 00:00:04+00:00
...,...,...,...,...,...,...,...,...,...,...
86396,BRTI,16311215,INDEX_VALUE,1193.48,NEW,,,,,2017-04-07 23:59:55+00:00
86397,BRTI,16311216,INDEX_VALUE,1193.45,NEW,,,,,2017-04-07 23:59:56+00:00
86398,BRTI,16311217,INDEX_VALUE,1193.38,NEW,,,,,2017-04-07 23:59:57+00:00
86399,BRTI,16311218,INDEX_VALUE,1193.38,NEW,,,,,2017-04-07 23:59:58+00:00


In [213]:
# Integer `.value` of the first BRTI timestamp — the same quantity bounds_ok
# compares against the stored 'timestamps' column.
ts = dfs['BRTI'].iloc[0].mdEntryDateTime
ts.value

1491523200000000000

In [214]:
# HDF5 file used as the destination (and later the source) of the converted data.
savepath = Path("/media/hemu/Data/Markets/Crypto/CME/crypto.hdf5")

In [215]:
# Test single file
# cme_format_to_hdf(files[0], savepath, 'min')

In [216]:
# Refresh the sorted listing of the raw-data directory.
files = sorted(path.iterdir())
# NOTE(review): `done` is not referenced in the visible cells — presumably a manual
# marker of how many files were already converted; confirm.
done = 988

In [217]:
# NOTE(review): called without the required `path_or_buf` and `key` arguments, so this
# cell raises TypeError. It looks like a leftover experiment — confirm and remove.
df.to_hdf()

TypeError: to_hdf() missing 2 required positional arguments: 'path_or_buf' and 'key'

In [221]:
# Convert files[10:50] and append them to the HDF5 store under the 'min' key prefix.
convert_all_to_hdf(files[10:50], savepath, 'min')

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=40.0), HTML(value='')))




In [222]:
# Open the store read-only and list the keys it contains.
with pd.HDFStore(savepath, mode='r') as f:
    print(f.keys())

['/min/BRR', '/min/BRTI']


In [225]:
# Read the BRTI table back from the store. The reset_index result is only displayed;
# it is not assigned, so `df` keeps its stored index.
df = load_from_hdf(savepath, 'min/BRTI')
df.reset_index(drop=True)

Unnamed: 0,timestamps,price
0,2017-04-07 00:00:00,1190.52
1,2017-04-07 00:00:01,1190.52
2,2017-04-07 00:00:02,1190.60
3,2017-04-07 00:00:03,1190.53
4,2017-04-07 00:00:04,1190.55
...,...,...
4321029,2017-05-26 23:59:55,2250.84
4321030,2017-05-26 23:59:56,2250.84
4321031,2017-05-26 23:59:57,2251.87
4321032,2017-05-26 23:59:58,2251.60


In [62]:
# Parameters for a manual request against the CME DataMine REST API.
base_url = 'https://datamine.cmegroup.com/cme/api/v1'
dataset = 'CRYPTOCURRENCY'
file = "20210109-CRYPTOCURRENCY"

# Credentials are read from the environment so they never end up in the notebook.
# Set CME_DATAMINE_API_ID / CME_DATAMINE_PASSWORD before running.
# NOTE(review): the password previously hard-coded here should be rotated.
user = os.environ["CME_DATAMINE_API_ID"]
password = os.environ["CME_DATAMINE_PASSWORD"]
data = {
    "user": user,
    "password": password
}

# Other endpoints tried earlier, kept for reference:
#   f"{base_url}/list?dataset={dataset}"
#   f"{base_url}/list?fid={file}"
url = f"{base_url}/download?fid=20181203-CRYPTOCURRENCY"

In [63]:
# NOTE(review): `requests` is not imported in any visible cell — confirm it is imported
# elsewhere, otherwise this fails on a fresh kernel.
# NOTE(review): the credentials are sent as form fields in the POST body and the recorded
# response is "unauthorized"; presumably the API expects a different authentication
# scheme — confirm against the CME DataMine API documentation.
resp = requests.post(url, data=data)

In [64]:
# Show the raw response body and its parsed JSON form.
print(resp.text)
print(resp.json())

{"error":"unauthorized","error_description":"Full authentication is required to access this resource"}
{'error': 'unauthorized', 'error_description': 'Full authentication is required to access this resource'}
