In [2]:
import glob
import os
import zipfile

import pandas as pd

In [3]:
folders = glob.glob("L:/data/fx/HistData/*")

In [None]:
for folder in folders:
    files = glob.glob(f"{folder}/*.zip")
    if len(files) == 0:
        continue

    symbol = os.path.basename(folder)
    frame = "M1"
    os.makedirs(f"../../Data/HistData/", exist_ok=True)

    existing_files = glob.glob(f"../../Data/HistData/{symbol}*.csv")
    if len(existing_files) > 0:
        continue

    df = None
    for filename in files:
        with zipfile.ZipFile(filename) as zf:
            zip_files = zf.namelist()
            for zip_file in zip_files:
                if zip_file.endswith(".csv"):
                    with zf.open(zip_file) as f:
                        temp_df = pd.read_csv(f, sep=";", index_col=0, header=None, parse_dates=[0],  date_format='%Y%m%d %H%M%S')
                    if len(temp_df.columns) == 0:
                        with zf.open(zip_file) as f:
                            temp_df = pd.read_csv(f, header=None)
                            ohlc_columns = temp_df.columns[2:]
                            datetime_index = pd.to_datetime(temp_df[0] + ' ' + temp_df[1], format='%Y.%m.%d %H:%M')
                            temp_df = temp_df[ohlc_columns].copy()
                            temp_df.index = datetime_index
                    if df is None:
                        df = temp_df
                    else:
                        df = pd.concat([df, temp_df], axis=0)
    
    start_year = df.index[0].year
    end_year = df.index[-1].year
    csv_path = os.path.abspath(f"../../Data/HistData/{symbol}_{start_year}-{end_year}_{frame}.csv")
    df.to_csv(csv_path)

In [1]:
from lzma import LZMADecompressor, LZMAError, FORMAT_AUTO
import optparse
import os
import struct
from datetime import datetime, timedelta
from urllib import request
from urllib.error import HTTPError

# 3rd party modules
import pandas as pd


def decompress_lzma(data):
    results = []
    len(data)
    while True:
        decomp = LZMADecompressor(FORMAT_AUTO, None, None)
        try:
            res = decomp.decompress(data)
        except LZMAError:
            if results:
                break
            else:
                raise
        results.append(res)
        data = decomp.unused_data
        if not data:
            break
        if not decomp.eof:
            raise LZMAError("Compressed data ended before the end-of-stream marker was reached")
    return b"".join(results)


def tokenize(buffer):
    token_size = 20
    token_count = int(len(buffer) / token_size)
    tokens = list(map(lambda x: struct.unpack_from('>3I2f', buffer, token_size * x), range(0, token_count)))
    return tokens


def normalize_tick(symbol, day, time, ask, bid, ask_vol, bid_vol):
    date = day + timedelta(milliseconds=time)

    # TODO 網羅する。この通過ペア以外も有るかも
    if any(map(lambda x: x in symbol.lower(), ['usdrub', 'xagusd', 'xauusd', 'jpy'])):
        point = 1000
    else:
        point = 100000

    return [date, ask/point, bid/point, round(ask_vol * 1000000), round(bid_vol * 1000000)]


def download_ticks(symbol, day):
    url_prefix='https://datafeed.dukascopy.com/datafeed'

    ticks_day = []
    for h in range(0, 24):
        file_name = f'{h:02d}h_ticks.bi5'
        url = f'{url_prefix}/{symbol}/{day.year:04d}/{day.month-1:02d}/{day.day:02d}/{file_name}'
        print(f'downloading: {url}')

        req = request.Request(url)
        try:
            with request.urlopen(req) as res:
                res_body = res.read()
        except HTTPError:
            print('download failed. continuing..')
            continue

        if len(res_body):
            try:
                data = decompress_lzma(res_body)
            except LZMAError:
                print('decompress failed. continuing..')
                continue
        else:
            data = []

        tokenized_data = tokenize(data)
        ticks_hour = list(map(lambda x: normalize_tick(symbol, day + timedelta(hours=h), *x), tokenized_data))
        ticks_day.extend(ticks_hour)

    return ticks_day


def format_to_csv_for_ticks(ticks):
    return '\n'.join(map(lambda x: '{},{},{},{},{}'.format(x[0].strftime('%Y-%m-%d %H:%M:%S.%f'), *x[1:]), ticks))+'\n'


def format_to_csv_for_candle(ticks, scale):
    df = pd.DataFrame(ticks, columns=['Date', 'Ask', 'Bid', 'AskVolume', 'BidVolume'])
    df = df.drop(['Ask', 'AskVolume', 'BidVolume'], axis=1)
    df['Date'] = pd.to_datetime(df['Date'])
    df.set_index('Date', inplace=True)

    df_ohlc = df.resample(scale).ohlc()
    df_ohlcv = df_ohlc.assign(Volume=df.iloc[:, 0].resample(scale).count())

    csv_str = df_ohlcv.to_csv(header=False, date_format = '%Y-%m-%d %H:%M:%S')

    return csv_str

In [None]:
import os

symbol = "USTBONDTRUSD"
start_date = datetime(2007, 7, 19)
end_date = datetime(2024, 12, 1)
output_csv = os.path.join("Data", f'{symbol}_{start_date.strftime("%Y-%m-%d")}_{end_date.strftime("%Y-%m-%d")}.csv')

d = start_date
with open(output_csv, 'w') as f:
    while d <= end_date:
        ticks_day = download_ticks(symbol, d)
        f.write(format_to_csv_for_ticks(ticks_day))
        d += timedelta(days=1)

downloading: https://datafeed.dukascopy.com/datafeed/USTBONDTRUSD/2000/00/01/00h_ticks.bi5
downloading: https://datafeed.dukascopy.com/datafeed/USTBONDTRUSD/2000/00/01/01h_ticks.bi5
download failed. continuing..
downloading: https://datafeed.dukascopy.com/datafeed/USTBONDTRUSD/2000/00/01/02h_ticks.bi5
downloading: https://datafeed.dukascopy.com/datafeed/USTBONDTRUSD/2000/00/01/03h_ticks.bi5
downloading: https://datafeed.dukascopy.com/datafeed/USTBONDTRUSD/2000/00/01/04h_ticks.bi5
downloading: https://datafeed.dukascopy.com/datafeed/USTBONDTRUSD/2000/00/01/05h_ticks.bi5
download failed. continuing..
downloading: https://datafeed.dukascopy.com/datafeed/USTBONDTRUSD/2000/00/01/06h_ticks.bi5
downloading: https://datafeed.dukascopy.com/datafeed/USTBONDTRUSD/2000/00/01/07h_ticks.bi5
downloading: https://datafeed.dukascopy.com/datafeed/USTBONDTRUSD/2000/00/01/08h_ticks.bi5
downloading: https://datafeed.dukascopy.com/datafeed/USTBONDTRUSD/2000/00/01/09h_ticks.bi5
downloading: https://datafeed.

URLError: <urlopen error [WinError 10060] 接続済みの呼び出し先が一定の時間を過ぎても正しく応答しなかったため、接続できませんでした。または接続済みのホストが応答しなかったため、確立された接続は失敗しました。>

In [None]:
for symbol in ["UKGILTTRGBP", "BUNDTREUR"]:
    start_date = datetime(2000, 1, 1)
    end_date = datetime(2024, 12, 1)
    output_csv = os.path.join("Data", f'{symbol}_{start_date.strftime("%Y-%m-%d")}_{end_date.strftime("%Y-%m-%d")}.csv')

    d = start_date
    with open(output_csv, 'w') as f:
        while d <= end_date:
            ticks_day = download_ticks(symbol, d)
            f.write(format_to_csv_for_ticks(ticks_day))
            d += timedelta(days=1)