In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
from selenium.common.exceptions import TimeoutException

from binance.client import Client
from pathlib import Path

import pandas as pd

from typing import List

import requests
import os
import zipfile

from datetime import datetime, timedelta, timezone

from secret import api_key, secret_key

In [6]:
url = "https://data.binance.vision/?prefix=data/spot/monthly/klines/BTCUSDT/1m/"
download_dir = "buffer"
# merge_dir = "binance-ticker"
# merge_dir = "binance-stablecoin"

yf_symbols = [
    # Currencies
    'EURUSD=X',  # Euro vs US Dollar
    'USDJPY=X',  # US Dollar vs Japanese Yen
    'GBPUSD=X',  # British Pound vs US Dollar
    'DXY-Y.NYB',      # US Dollar Index

    # Commodities
    'GC=F',  # Gold
    'SI=F',  # Silver
    'CL=F',  # Crude Oil
    'NG=F',  # Natural Gas

    # Major Indices
    '^GSPC',   # S&P 500
    '^DJI',    # Dow Jones Industrial Average
    '^IXIC',   # Nasdaq Composite
    '^RUT',    # Russell 2000
    '^FTSE',   # FTSE 100 (London)
    '^GDAXI',  # DAX (Germany)
    '^N225',   # Nikkei 225 (Japan)
    '^HSI',    # Hang Seng Index (Hong Kong)

    # Important Stocks
    'AAPL',    # Apple
    'MSFT',    # Microsoft
    'AMZN',    # Amazon
    'GOOGL',   # Alphabet (Google)
    'TSLA',    # Tesla
    'META',    # Meta Platforms (formerly Facebook)
    'BRK-B',   # Berkshire Hathaway
    'NVDA',    # NVIDIA
    'JPM',     # JPMorgan Chase
    'V',       # Visa
    'PG',      # Procter & Gamble
    'UNH',     # UnitedHealth Group
    'DIS',     # The Walt Disney Company
    'BABA',    # Alibaba Group
    'TSM',     # Taiwan Semiconductor Manufacturing Company
]

binance_tickers = [
    # Cryptocurrencies
    'BTCUSDT',  # Bitcoin vs USDT
    'ETHUSDT',  # Ethereum vs USDT
    'BNBUSDT',  # Binance Coin vs USDT
    'ADAUSDT',  # Cardano vs USDT
    'SOLUSDT',  # Solana vs USDT
    'XRPUSDT',  # XRP vs USDT
    'DOTUSDT',  # Polkadot vs USDT
    'DOGEUSDT', # Dogecoin vs USDT
    'AVAXUSDT', # Avalanche vs USDT
    'LINKUSDT', # Chainlink vs USDT
    'MATICUSDT',# Polygon vs USDT
    'LTCUSDT',  # Litecoin vs USDT
    'BCHUSDT',  # Bitcoin Cash vs USDT
    'ATOMUSDT', # Cosmos vs USDT
    'ALGOUSDT', # Algorand vs USDT
    'XLMUSDT',  # Stellar vs USDT
    'VETUSDT',  # VeChain vs USDT
    'ETCUSDT',  # Ethereum Classic vs USDT
    'THETAUSDT',# THETA vs USDT
    'XTZUSDT',  # Tezos vs USDT
]

binance_stablecoins = [
    # Stablecoins
    'BNBUSDT',  # Tether vs Binance Coin
    'BTCUSDT',  # Tether vs Bitcoin
    'ETHUSDT',  # Tether vs Ethereum
    'USDCUSDT', # USD Coin vs USDT
]


In [3]:
client = Client(api_key = api_key, api_secret = secret_key, tld = "com")

In [4]:
def get_binance_historical_klines_archive_links(symbol: str) -> List[str]:
    # Set up the WebDriver (using Chrome in this example)
    driver = webdriver.Chrome()

    # Open the URL
    url = f"https://data.binance.vision/?prefix=data/spot/monthly/klines/{symbol}/1m/"
    driver.get(url)

    # Wait for the first link to appear (with a timeout of 10 seconds)
    try:
        element_present = EC.presence_of_element_located((By.XPATH, '//tbody[@id="listing"]/tr'))
        WebDriverWait(driver, 10).until(element_present)
    except TimeoutException:
        print("Timed out waiting for page to load")
    finally:
        # Parse the HTML content using BeautifulSoup
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # Find all the links that end with '.csv'
        zip_links = []
        for a in soup.find_all('a', href=True):
            href = a['href']
            if href.endswith('.zip'):
                zip_links.append(href)

        # Close the browser
        driver.quit()
    return zip_links


def download_archive_and_convert_to_csv(archive_link: str) -> None:
    file_name = archive_link.split("/")[-1]
    file_name_without_ext = Path(file_name).stem
    ticker_dir = os.path.join(download_dir, file_name_without_ext.split('-')[0])

    zip_path = os.path.join(ticker_dir, file_name)
    csv_path = os.path.join(ticker_dir, file_name_without_ext+".csv")
    print(f"Downloading {file_name}...")

    # Download the file
    response = requests.get(archive_link)
    response.raise_for_status()  # Ensure we notice bad responses

    if not os.path.exists(ticker_dir):
        os.makedirs(ticker_dir)
    
    # Save the ZIP file to the specified directory
    with open(zip_path, 'wb') as file:
        file.write(response.content)
    print(f"{file_name} downloaded and saved to {zip_path}")

    # Extract the ZIP file
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(ticker_dir)
    print(f"{file_name} extracted to {csv_path}")

    # Delete the ZIP file
    os.remove(zip_path)
    print(f"{file_name} deleted from {zip_path}")


def merge_into_single_csv(csv_directory: str, output_directory: str) -> None:
    # Define the directory containing CSV files and the output file name
    symbol_name = Path(csv_directory).stem
    output_file = os.path.join(output_directory, symbol_name+".csv") 

    # List all CSV files in the directory
    csv_files = [f for f in os.listdir(csv_directory) if f.endswith('.csv')]

    # Initialize an empty list to hold DataFrames
    dataframes = []

    # Iterate through the list of CSV files and read each file into a DataFrame
    for csv_file in csv_files:
        file_path = os.path.join(csv_directory, csv_file)
        print(f"Reading {file_path}...")
        df = pd.read_csv(file_path)
        df.columns = ["Open Time", "Open", "High", "Low", "Close",
                "Volume", "Close Time", "Quote Asset Volume", "Number of Trades", "Taker Buy Base Asset Volume",
                "Taker Buy Quote Asset Volume", "Ignore"]
        # Convert 'Open Time' to datetime
        df["Date"] = pd.to_datetime(df["Open Time"], unit="ms")
        df.drop(columns=["Open Time"], inplace=True)

        # Set 'Date' as the index
        df.set_index("Date", inplace=True)
        dataframes.append(df)

    # Concatenate all DataFrames into a single DataFrame
    merged_df = pd.concat(dataframes, axis=0)

    # Select and reorder columns
    merged_df = merged_df[["High", "Low", "Close", "Volume"]].copy()

    # Save the merged DataFrame to a new CSV file
    merged_df.to_csv(output_file)
    print(f"Merged CSV saved to {output_file}")


def get_current_month_days_count():
    # Get the current date
    current_date = datetime.now()

    # Get the first day of the current month
    first_day_of_month = current_date.replace(day=1)

    # Calculate the timedelta from the first day of the month to the current date
    delta = current_date - first_day_of_month

    # Extract the number of days from the timedelta object
    days_in_current_month = delta.days + 1  # +1 to include the current day
    return days_in_current_month


def store_binance_ticker(ticker_name: str, interval: str, days: str) -> None:
    now = datetime.now(timezone.utc)
    past = str(now - timedelta(days = days))

    bars = client.get_historical_klines(symbol = ticker_name, interval = interval, start_str = past, end_str = None, limit = 1000)
    df = pd.DataFrame(bars)
    print(df.columns)
    # df["Date"] = pd.to_datetime(df.iloc[:,0], unit = "ms")
    df.columns = ["Open Time", "Open", "High", "Low", "Close",
                "Volume", "Close Time", "Quote Asset Volume", "Number of Trades", "Taker Buy Base Asset Volume",
                "Taker Buy Quote Asset Volume", "Ignore"]
    # df.drop(columns=["Open"], inplace=True)
    df.set_index("Open Time", inplace = True)
    df.to_csv(os.path.join(download_dir, ticker_name, ticker_name+"_current_month.csv"), index=True)

In [8]:
# Download and extract each ZIP file
for symbols, directory in zip([binance_stablecoins, binance_tickers], ["binance-stablecoin", "binance-ticker"]):
    for symbol in symbols:
        # Get list of links to Binance historical kline data packed in ZIP archive
        kline_archive_links: List[str] = get_binance_historical_klines_archive_links(symbol)

        # # Download every ZIP archive and convert to CSV
        for link in kline_archive_links:
            download_archive_and_convert_to_csv(link)

        # Get recent month ticker data and store into CSV
        store_binance_ticker(symbol, interval="1m", days=get_current_month_days_count())

        #Merge all together
        merge_into_single_csv(os.path.join(download_dir, symbol), os.path.join("data", directory))

Reading buffer\BNBUSDT\BNBUSDT-1m-2017-11.csv...
Reading buffer\BNBUSDT\BNBUSDT-1m-2017-12.csv...
Reading buffer\BNBUSDT\BNBUSDT-1m-2018-01.csv...
Reading buffer\BNBUSDT\BNBUSDT-1m-2018-02.csv...
Reading buffer\BNBUSDT\BNBUSDT-1m-2018-03.csv...
Reading buffer\BNBUSDT\BNBUSDT-1m-2018-04.csv...
Reading buffer\BNBUSDT\BNBUSDT-1m-2018-05.csv...
Reading buffer\BNBUSDT\BNBUSDT-1m-2018-06.csv...
Reading buffer\BNBUSDT\BNBUSDT-1m-2018-07.csv...
Reading buffer\BNBUSDT\BNBUSDT-1m-2018-08.csv...
Reading buffer\BNBUSDT\BNBUSDT-1m-2018-09.csv...
Reading buffer\BNBUSDT\BNBUSDT-1m-2018-10.csv...
Reading buffer\BNBUSDT\BNBUSDT-1m-2018-11.csv...
Reading buffer\BNBUSDT\BNBUSDT-1m-2018-12.csv...
Reading buffer\BNBUSDT\BNBUSDT-1m-2019-01.csv...
Reading buffer\BNBUSDT\BNBUSDT-1m-2019-02.csv...
Reading buffer\BNBUSDT\BNBUSDT-1m-2019-03.csv...
Reading buffer\BNBUSDT\BNBUSDT-1m-2019-04.csv...
Reading buffer\BNBUSDT\BNBUSDT-1m-2019-05.csv...
Reading buffer\BNBUSDT\BNBUSDT-1m-2019-06.csv...
Reading buffer\BNBUS

FileNotFoundError: [WinError 3] The system cannot find the path specified: 'buffer\\ADAUSDT'