# Imports

In [48]:
import requests
import os

import pandas as pd
import pyarrow as pa

from datetime import datetime, timedelta

In [2]:
# Raise pandas' display limits so wide/long frames are shown with less truncation.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# `display` and `HTML` are part of the public IPython.display module; importing
# `display` from IPython.core.display is deprecated.
from IPython.display import display, HTML
# Inject CSS that sets elements with class "container" to 85% width.
display(HTML("<style>.container { width:85% !important; }</style>"))

# Collect from Yahoo Finance

In [None]:
def _download_data(stock_ticker: str, time_from: str, time_to: str):
    data_dir = "data_yahoo_stock/" # Relative path to here
    parquet_filename = f"{data_dir}/ticker={stock_ticker}.parquet"
    temp_csv_filename = f"./temp.csv"

    if os.path.exists(parquet_filename):
        print(f"Data for [ {stock_ticker} ] exists, skipping")
    else:
        print(f"Attempting to download [ {stock_ticker} ] data")
        url = f"https://query1.finance.yahoo.com/v7/finance/download/{stock_ticker}?period1={time_from}&period2={time_to}&interval=1d&events=history&includeAdjustedClose=false"
        req = requests.get(url)
        url_content = req.content

        csv_file = open(temp_csv_filename, "wb")
        csv_file.write(url_content)
        csv_file.close()

        df =pd.read_csv(temp_csv_filename)
        df.to_parquet(parquet_filename)
    
    os.remove(temp_csv_filename)

In [None]:
# Ticker symbols to download, in the original order (98 symbols).
# NOTE(review): some of these symbols may have been renamed or delisted since
# the list was compiled — verify before relying on a fresh download.
stocks_we_want = """
    AAPL MSFT AMZN GOOG GOOGL FB BABA TSLA TSM JPM
    V JNJ WMT MA UNH DIS PG BAC HD NVDA
    PYPL CMCSA INTC ASML XOM VZ NFLX KO ADBE T
    TM NKE CSCO ABT CVX PFE NVS CRM MRK ORCL
    PEP AVGO ABBV ACN TMO LLY MCD TXN WFC MDT
    DHR TMUS COST HON C UL QCOM UNP HDB SAP
    NEE AMGN LIN BA BMY UPS PM SHOP RY LOW
    AZN SNE CHTR MS SBUX SNY TOT CAT NVO TD
    HSBC IBM SCHW RTX DE GS AXP AMAT MMM BLK
    GE PDD ABNB JD EL INTU BUD AMT
""".split()

In [None]:
# Query window as Unix-second strings: the 365 days ending 2021-03-30 00:00.
# The datetimes are naive, so .timestamp() interprets them in the machine's
# local timezone.
time_from, time_to = (
    str(int(moment.timestamp()))
    for moment in (
        datetime(2021, 3, 30, 0, 0) - timedelta(days=365),
        datetime(2021, 3, 30, 0, 0),
    )
)

In [None]:
# Fetch every ticker in the list; tickers whose parquet file already exists are
# skipped inside _download_data. The time window must be passed explicitly —
# _download_data has no defaults for it, so omitting it raises TypeError.
for stock_ticker in stocks_we_want:
    _download_data(stock_ticker=stock_ticker, time_from=time_from, time_to=time_to)

# Collect from Binance

In [3]:
from api_utilities import BinanceHelper
from binance import Client, ThreadedWebsocketManager, ThreadedDepthCacheManager

In [5]:
# BinanceHelper is a project-local helper (api_utilities); only its api_key and
# secret_key attributes are used here.
# NOTE(review): presumably it loads the credentials from outside the notebook — confirm.
binance_helper = BinanceHelper()

# Binance client built from those credentials; the cells below call it.
client = Client(binance_helper.api_key, binance_helper.secret_key)

## Playing with the API

### get market depth

In [6]:
# Order-book snapshot for the BNBBTC pair. The output below is a dict with a
# 'lastUpdateId' and 'bids' given as pairs of strings
# (presumably [price, quantity] — confirm against the Binance API docs).
client.get_order_book(symbol='BNBBTC')

{'lastUpdateId': 2174659151,
 'bids': [['0.00984100', '16.21600000'],
  ['0.00983900', '7.68500000'],
  ['0.00983800', '5.41800000'],
  ['0.00983700', '5.19900000'],
  ['0.00983600', '87.26500000'],
  ['0.00983500', '116.62600000'],
  ['0.00983400', '29.12500000'],
  ['0.00983300', '9.61700000'],
  ['0.00983200', '5.44600000'],
  ['0.00983100', '20.00000000'],
  ['0.00983000', '5.23300000'],
  ['0.00982900', '33.06400000'],
  ['0.00982800', '20.94800000'],
  ['0.00982700', '0.04100000'],
  ['0.00982600', '8.61000000'],
  ['0.00982500', '7.10100000'],
  ['0.00982400', '1.05500000'],
  ['0.00982300', '18.49100000'],
  ['0.00982200', '1.92000000'],
  ['0.00982100', '0.15800000'],
  ['0.00982000', '3.66300000'],
  ['0.00981900', '16.82900000'],
  ['0.00981800', '114.94300000'],
  ['0.00981700', '2.84600000'],
  ['0.00981600', '75.10000000'],
  ['0.00981500', '3.34800000'],
  ['0.00981400', '8.65100000'],
  ['0.00981300', '25.66500000'],
  ['0.00981200', '2.64200000'],
  ['0.00981100', '1.0

### get all symbol prices

In [7]:
# Prices for all symbols, returned as a list of {'symbol', 'price'} dicts
# (prices are strings, as the output below shows).
client.get_all_tickers()

[{'symbol': 'ETHBTC', 'price': '0.07719700'},
 {'symbol': 'LTCBTC', 'price': '0.00375900'},
 {'symbol': 'BNBBTC', 'price': '0.00984400'},
 {'symbol': 'NEOBTC', 'price': '0.00112800'},
 {'symbol': 'QTUMETH', 'price': '0.00348800'},
 {'symbol': 'EOSETH', 'price': '0.00138300'},
 {'symbol': 'SNTETH', 'price': '0.00002691'},
 {'symbol': 'BNTETH', 'price': '0.00119100'},
 {'symbol': 'BCCBTC', 'price': '0.07908100'},
 {'symbol': 'GASBTC', 'price': '0.00021390'},
 {'symbol': 'BNBETH', 'price': '0.12750000'},
 {'symbol': 'BTCUSDT', 'price': '49554.93000000'},
 {'symbol': 'ETHUSDT', 'price': '3824.96000000'},
 {'symbol': 'HSRBTC', 'price': '0.00041400'},
 {'symbol': 'OAXETH', 'price': '0.00017780'},
 {'symbol': 'DNTETH', 'price': '0.00002801'},
 {'symbol': 'MCOETH', 'price': '0.00577200'},
 {'symbol': 'ICNETH', 'price': '0.00166300'},
 {'symbol': 'MCOBTC', 'price': '0.00021140'},
 {'symbol': 'WTCBTC', 'price': '0.00001935'},
 {'symbol': 'WTCETH', 'price': '0.00023700'},
 {'symbol': 'LRCBTC', 'p

### get historical kline data from any date range

In [17]:
# fetch 1 minute klines for the last 2 days up until now
# (2 days x 1440 minutes/day = 2880 rows, matching the count printed below)
klines = client.get_historical_klines("BNBBTC", Client.KLINE_INTERVAL_1MINUTE, "2 day ago UTC")
print(len(klines))

2880


In [10]:
# fetch 30 minute klines for the last month of 2017 (1 Dec 2017 up to 1 Jan 2018)
# Each kline in the output is a 12-element list; _download_crypto_data below
# names those 12 fields.
klines = client.get_historical_klines("ETHBTC", Client.KLINE_INTERVAL_30MINUTE, "1 Dec, 2017", "1 Jan, 2018")
klines

[[1512086400000,
  '0.04368400',
  '0.04375100',
  '0.04334200',
  '0.04366500',
  '2081.85600000',
  1512088199999,
  '90.79655078',
  3904,
  '976.19100000',
  '42.59074736',
  '271480.34213668'],
 [1512088200000,
  '0.04360200',
  '0.04369900',
  '0.04325100',
  '0.04350100',
  '2420.48100000',
  1512089999999,
  '105.27683806',
  2775,
  '1133.24800000',
  '49.31486895',
  '271300.32546398'],
 [1512090000000,
  '0.04350100',
  '0.04379400',
  '0.04304900',
  '0.04370500',
  '2192.51500000',
  1512091799999,
  '95.48824264',
  2359,
  '1029.30200000',
  '44.86783356',
  '271143.81327337'],
 [1512091800000,
  '0.04374900',
  '0.04392000',
  '0.04361300',
  '0.04378100',
  '1482.24800000',
  1512093599999,
  '64.86877196',
  2112,
  '704.81600000',
  '30.85380066',
  '271394.84389969'],
 [1512093600000,
  '0.04375800',
  '0.04424900',
  '0.04364800',
  '0.04403800',
  '2073.49800000',
  1512095399999,
  '90.95341447',
  2763,
  '996.44700000',
  '43.72006243',
  '271126.63134592'],
 [

In [11]:
# fetch weekly klines for NEOBTC since it was listed: no end date is given, and
# the requested start (1 Jan 2017) is earlier than the first kline in the
# output below, which opens in July 2017.
klines = client.get_historical_klines("NEOBTC", Client.KLINE_INTERVAL_1WEEK, "1 Jan, 2017")
klines

[[1499644800000,
  '0.00375000',
  '0.00375000',
  '0.00235700',
  '0.00263800',
  '473736.36000000',
  1500249599999,
  '1196.77166665',
  8973,
  '286442.84000000',
  '723.69695083',
  '218593.00000000'],
 [1500249600000,
  '0.00263800',
  '0.00345800',
  '0.00255700',
  '0.00326100',
  '1594660.22000000',
  1500854399999,
  '4589.96992084',
  31760,
  '1000499.53000000',
  '2879.09366021',
  '359801.00000000'],
 [1500854400000,
  '0.00326600',
  '0.00326900',
  '0.00240100',
  '0.00261200',
  '3971754.21000000',
  1501459199999,
  '11114.83093982',
  51384,
  '2219779.02000000',
  '6233.71564475',
  '533884.00000000'],
 [1501459200000,
  '0.00261100',
  '0.00498400',
  '0.00249700',
  '0.00482100',
  '4986580.16000000',
  1502063999999,
  '17488.29338244',
  69886,
  '2569264.33000000',
  '9026.54218521',
  '630085.00000000'],
 [1502064000000,
  '0.00481900',
  '0.01381600',
  '0.00440800',
  '0.01170400',
  '8971365.12000000',
  1502668799999,
  '70289.94990929',
  149558,
  '45833

## Actual data collection

In [61]:
def _download_crypto_data(symbol: str, date_obj: datetime):
    truth_path_to_here = os.path.dirname(os.path.realpath('data_binance_crypto/'))
    data_dir = f"{truth_path_to_here}/data_binance_crypto/symbol={symbol}/day={date_obj.date()}"
    parquet_filename = f"{data_dir}/data.parquet"

    if os.path.exists(parquet_filename):
        print(f"Data for [ {symbol} - {date_obj.date()} ] exists, skipping")
    else:
        os.makedirs(data_dir)
        print(f"Attempting to download [ {symbol} - {date_obj.date()} ] data")
        
        klines = client.get_historical_klines(
            symbol=symbol, 
            interval=Client.KLINE_INTERVAL_1MINUTE,
            start_str=str(int(date_obj.timestamp())),
            end_str=str(int((date_obj + timedelta(days=1)).timestamp())),
            
        )
    
        df = pd.DataFrame.from_records(
            data=klines,
            columns=[
                "open_time",
                "open",
                "high",
                "low",
                "close",
                "volume",
                "close_time",
                "quote_asset_vol",
                "num_trades",
                "taker_buy_base_asset_vol",
                "taker_buy_quote_asset_vol",
                "ignore_this"
            ]
        )

        df.to_parquet(parquet_filename)

In [65]:
symbol = "BTCUSDT"

# Date range to download, as YYYY-MM-DD. `end_date` is exclusive: with
# start_date == end_date the loop body never runs.
start_date = "2021-01-01"
end_date = "2021-01-01"

date_obj = datetime.strptime(start_date, "%Y-%m-%d")
end_date_obj = datetime.strptime(end_date, "%Y-%m-%d")

# Compare datetimes with `<` instead of date strings with `!=`, so the loop
# still terminates when end_date is earlier than start_date or is not
# zero-padded (the string comparison would never match and loop forever).
while date_obj < end_date_obj:
    _download_crypto_data(symbol=symbol, date_obj=date_obj)
    date_obj += timedelta(days=1)

Data for [ BTCUSDT ] exists, skipping
Data for [ BTCUSDT ] exists, skipping
Data for [ BTCUSDT ] exists, skipping
Data for [ BTCUSDT ] exists, skipping
Data for [ BTCUSDT ] exists, skipping
Data for [ BTCUSDT ] exists, skipping
Data for [ BTCUSDT ] exists, skipping
Data for [ BTCUSDT ] exists, skipping
Data for [ BTCUSDT ] exists, skipping
Data for [ BTCUSDT ] exists, skipping
Data for [ BTCUSDT ] exists, skipping
Data for [ BTCUSDT ] exists, skipping
Data for [ BTCUSDT ] exists, skipping
Data for [ BTCUSDT ] exists, skipping
Data for [ BTCUSDT ] exists, skipping
Data for [ BTCUSDT ] exists, skipping
Data for [ BTCUSDT ] exists, skipping
Data for [ BTCUSDT ] exists, skipping
Data for [ BTCUSDT ] exists, skipping
Data for [ BTCUSDT ] exists, skipping
Data for [ BTCUSDT ] exists, skipping
Data for [ BTCUSDT ] exists, skipping
Data for [ BTCUSDT ] exists, skipping
Data for [ BTCUSDT ] exists, skipping
Data for [ BTCUSDT ] exists, skipping
Data for [ BTCUSDT ] exists, skipping
Data for [ B

FileExistsError: [Errno 17] File exists: '/home/tomm/Desktop/Projects/Quant-exploration/data_binance_crypto/symbol=BTCUSDT/day=2022-12-08'

In [77]:
# Read everything under data_binance_crypto/ into one frame.
# The `symbol` and `day` columns in the output are not among the columns
# _download_crypto_data writes; they presumably come from the
# `symbol=.../day=...` directory names.
df = pd.read_parquet("data_binance_crypto/", engine="pyarrow")
df

Unnamed: 0,open_time,open,high,low,close,volume,close_time,quote_asset_vol,num_trades,taker_buy_base_asset_vol,taker_buy_quote_asset_vol,ignore_this,symbol,day
0,1609459200000,28923.63000000,28961.66000000,28913.12000000,28961.66000000,27.45703200,1609459259999,794382.04398665,1292,16.77719500,485390.82682460,0,BTCUSDT,2021-01-01
1,1609459260000,28961.67000000,29017.50000000,28961.01000000,29009.91000000,58.47750100,1609459319999,1695802.89696884,1651,33.73381800,978176.46820208,0,BTCUSDT,2021-01-01
2,1609459320000,29009.54000000,29016.71000000,28973.58000000,28989.30000000,42.47032900,1609459379999,1231358.69059884,986,13.24744400,384076.85445305,0,BTCUSDT,2021-01-01
3,1609459380000,28989.68000000,28999.85000000,28972.33000000,28982.69000000,30.36067700,1609459439999,880016.76348383,959,9.45602800,274083.07514154,0,BTCUSDT,2021-01-01
4,1609459440000,28982.67000000,28995.93000000,28971.80000000,28975.65000000,24.12433900,1609459499999,699226.20560386,726,6.81464400,197519.37488805,0,BTCUSDT,2021-01-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
349285,1630454160000,47183.19000000,47186.35000000,47157.89000000,47173.84000000,15.11035000,1630454219999,712854.74467820,550,4.74441000,223804.23984810,0,BTCUSDT,2021-08-31
349286,1630454220000,47173.84000000,47176.25000000,47119.92000000,47141.60000000,17.57768000,1630454279999,828732.93736670,516,6.13219000,289198.75703480,0,BTCUSDT,2021-08-31
349287,1630454280000,47141.59000000,47141.60000000,47100.03000000,47130.89000000,25.39038000,1630454339999,1196233.82688950,672,12.62499000,594835.65456950,0,BTCUSDT,2021-08-31
349288,1630454340000,47130.89000000,47130.89000000,47100.89000000,47100.89000000,22.50441000,1630454399999,1060434.02449820,437,10.49114000,494374.67997310,0,BTCUSDT,2021-08-31


# Placeholder header