In [None]:
#### Preamble ####
# Purpose: Extracts, downloads, and saves the historical data of BTC/USDT
# Author: Jiazhou(Justin) Bi
# Date: 3 October 2024
# Contact: justin.bi@mail.utoronto.ca
# License: None
# Pre-requisites: see requirements.txt
# Any other information needed? None

# Claim

Currently, this notebook extracts historical BTC/USDT data from Binance at 1-minute, 1-hour, and 1-day intervals, starting at 2017-08-17 04:00:00. Binance appears to tolerate a large number of API calls, which makes extraction at high granularity (1m, compared to 1h or 1d) effortless. As of Nov 14, 2024, the data retrieval process takes around 22 to 23 minutes on my computer. Note that Binance only offers BTC/USDT historical data starting from 2017-08-17. For longer historical records, please use data from other exchanges. For more details, please see: https://docs.ccxt.com/#/.

In [None]:
# import necessary packages
import ccxt
import pandas as pd
from datetime import datetime

# --- Extraction settings ---
# Market to download and the candle width used for the first pass.
symbol = 'BTC/USDT'
timeframe = '1m'  # other timeframe strings: '5m', '15m', '1h', '1d'

# Exchange client. Candidates noted by the author: binance, coinbase pro,
# kraken, bitfinex.
exchange = ccxt.binance()

# Earliest moment to request, written as an ISO 8601 string and parsed by ccxt.
since = exchange.parse8601('2017-01-01T00:00:00Z')

# Author's notes on how far back BTC history reaches on each exchange:
#   * Binance      - available starting from 2017.
#   * Coinbase Pro - may go back to 2015 for BTC-USD, but is limited by the
#                    recent deprecation of Coinbase Pro.
#   * Bitfinex     - goes back to around 2013.
#   * Kraken       - from as early as 2011 for BTC-USD, though granularity may
#                    vary for older data.

In [3]:
# Retrieve the full candle history in batches (one request per page).
#
# A cell-local cursor is used instead of advancing the shared `since`, so this
# cell can be re-run on its own: it always restarts from the configured start
# date rather than from wherever a previous run stopped (which, combined with
# the `all_data = []` reset, would silently produce a near-empty dataset).
all_data = []
cursor = since
while cursor < exchange.milliseconds():
    # limit=1000 is the documented per-request maximum of Binance's spot klines
    # endpoint; the previous value of 500 needed twice as many round trips to
    # download the same candles.
    data = exchange.fetch_ohlcv(symbol, timeframe, since=cursor, limit=1000)
    if not data:
        # An empty page means there is nothing further to download.
        break
    all_data.extend(data)
    # Move 1 ms past the last candle received so the next page does not repeat it.
    cursor = data[-1][0] + 1

# The original run (limit=500) took roughly 22 mins on the author's PC.

In [4]:
# Turn the raw candle rows into a labelled frame and convert the millisecond
# epoch timestamps into pandas datetimes, then preview the first rows.
ohlcv_columns = ['timestamp', 'open', 'high', 'low', 'close', 'volume']
df = pd.DataFrame(all_data, columns=ohlcv_columns).assign(
    timestamp=lambda frame: pd.to_datetime(frame['timestamp'], unit='ms')
)
print(df.head())

            timestamp     open     high      low    close    volume
0 2017-08-17 04:00:00  4261.48  4261.48  4261.48  4261.48  1.775183
1 2017-08-17 04:01:00  4261.48  4261.48  4261.48  4261.48  0.000000
2 2017-08-17 04:02:00  4280.56  4280.56  4280.56  4280.56  0.261074
3 2017-08-17 04:03:00  4261.48  4261.48  4261.48  4261.48  0.012008
4 2017-08-17 04:04:00  4261.48  4261.48  4261.48  4261.48  0.140796


In [None]:
# Persist the 1-minute candles as a Parquet file, leaving out the row index.
parquet_path_1m = '../data/01-raw_data/raw_data_1m.parquet'
df.to_parquet(parquet_path_1m, index=False)
# Shape recorded by the author for this frame: (3804375, 6)

In [12]:
# Repeat the paginated download at a 1-hour candle interval.
# `since` is reset here, so this cell restarts from the configured start date
# every time it is run.
timeframe = '1h'
since = exchange.parse8601('2017-01-01T00:00:00Z')

# Retrieve data in batches.
all_data = []
while since < exchange.milliseconds():
    # limit=1000 is the documented per-request maximum of Binance's spot klines
    # endpoint; the previous value of 500 needed twice as many round trips.
    data = exchange.fetch_ohlcv(symbol, timeframe, since=since, limit=1000)
    if not data:
        # An empty page means there is nothing further to download.
        break
    all_data.extend(data)
    # Move 1 ms past the last candle received so the next page does not repeat it.
    since = data[-1][0] + 1

# Label the columns and convert the millisecond epoch timestamps to datetimes.
df = pd.DataFrame(all_data, columns=['timestamp', 'open', 'high', 'low', 'close', 'volume'])
df['timestamp'] = pd.to_datetime(df['timestamp'], unit='ms')
print(df.head())

            timestamp     open     high      low    close     volume
0 2017-08-17 04:00:00  4261.48  4313.62  4261.32  4308.83  47.181009
1 2017-08-17 05:00:00  4308.83  4328.69  4291.37  4315.32  23.234916
2 2017-08-17 06:00:00  4330.29  4345.45  4309.37  4324.35   7.229691
3 2017-08-17 07:00:00  4316.62  4349.99  4287.41  4349.99   4.443249
4 2017-08-17 08:00:00  4333.32  4377.85  4333.32  4360.69   0.972807


In [None]:
# Persist the 1-hour candles as a Parquet file, leaving out the row index.
parquet_path_1h = '../data/01-raw_data/raw_data_1h.parquet'
df.to_parquet(parquet_path_1h, index=False)
# Shape recorded by the author for this frame: (63423, 6)

In [14]:
# Report the (rows, columns) size of the frame currently held in `df`.
frame_shape = df.shape
print(frame_shape)

(63423, 6)


In [15]:
# Repeat the paginated download at a 1-day candle interval.
# `since` is reset here, so this cell restarts from the configured start date
# every time it is run.
timeframe = '1d'
since = exchange.parse8601('2017-01-01T00:00:00Z')

# Retrieve data in batches.
all_data = []
while since < exchange.milliseconds():
    # limit=1000 is the documented per-request maximum of Binance's spot klines
    # endpoint; the previous value of 500 needed twice as many round trips.
    data = exchange.fetch_ohlcv(symbol, timeframe, since=since, limit=1000)
    if not data:
        # An empty page means there is nothing further to download.
        break
    all_data.extend(data)
    # Move 1 ms past the last candle received so the next page does not repeat it.
    since = data[-1][0] + 1

# Label the columns and convert the millisecond epoch timestamps to datetimes.
df = pd.DataFrame(all_data, columns=['timestamp', 'open', 'high', 'low', 'close', 'volume'])
df['timestamp'] = pd.to_datetime(df['timestamp'], unit='ms')
print(df.head())

   timestamp     open     high      low    close       volume
0 2017-08-17  4261.48  4485.39  4200.74  4285.08   795.150377
1 2017-08-18  4285.08  4371.52  3938.77  4108.37  1199.888264
2 2017-08-19  4108.37  4184.69  3850.00  4139.98   381.309763
3 2017-08-20  4120.98  4211.08  4032.62  4086.29   467.083022
4 2017-08-21  4069.13  4119.62  3911.79  4016.00   691.743060


In [17]:
# Persist the 1-day candles as a Parquet file, leaving out the row index.
parquet_path_1d = '../data/01-raw_data/raw_data_1d.parquet'
df.to_parquet(parquet_path_1d, index=False)
# Shape recorded by the author for this frame: (2649, 6)