In [1]:
# Created by LP
# Date: 2024-11-27
# Trade only the money you can't afford to lose
# Then go back to the mine
# And try again.
# This was coded with love <3


<h1 style="color: #2c3e50;">Data Collection Bot</h1>

<p style="font-size: 16px; line-height: 1.6;">
    The <strong>Data Collection Bot</strong> is a Python-based tool designed for traders to continuously 
    fetch and store market data from exchanges. It ensures precision and consistency by aligning data collection 
    with specific timeframe boundaries, making it an essential tool for systematic trading strategies.
</p>

<h2 style="color: #34495e;">Key Features:</h2>
<ul style="font-size: 16px; line-height: 1.6;">
    <li>Automatically collects OHLCV (Open, High, Low, Close, Volume) data for any trading pair.</li>
    <li>Aligns data collection with exact timeframe boundaries (e.g., 1-minute, 15-minute intervals).</li>
    <li>Customizable options for saving data, including folder paths and additional filename identifiers.</li>
    <li>Handles errors gracefully, ensuring continuous operation without data loss.</li>
    <li>Supports asynchronous saving to minimize delays in data fetching.</li>
    <li>THE BOT FOCUSES THE FETCHING AROUND THE TIMEFRAME CHANGE AND THEN IS IDLE, good bot.</li>
</ul>

<h2 style="color: #34495e;">Why Is Data Collection Important for Traders?</h2>
<p style="font-size: 16px; line-height: 1.6;">
    For traders, having access to accurate and comprehensive historical market data is critical for building 
    robust trading strategies. Here’s why collecting data with a tool like <strong>Data Collection Bot</strong> is essential:
</p>
<ul style="font-size: 16px; line-height: 1.6;">
    <li><strong>Backtesting Strategies:</strong> Reliable historical data allows traders to test their strategies 
        before deploying them in live markets, reducing risk.</li>
    <li><strong>Real-Time Insights:</strong> Continuously updated data enables traders to respond promptly 
        to market movements and make informed decisions.</li>
    <li><strong>Machine Learning Models:</strong> Data is the foundation for training predictive models 
        and improving algorithmic trading systems.</li>
    <li><strong>Risk Management:</strong> Accurate data helps traders assess volatility, liquidity, and 
        other key market metrics.</li>
    <li><strong>Market Analysis:</strong> A clean dataset allows for detailed analysis, enabling traders 
        to identify patterns, trends, and anomalies.</li>
</ul>

<h2 style="color: #34495e;">Example Use Case:</h2>
<p style="font-size: 16px; line-height: 1.6;">
    A trader interested in 1-minute price movements of <strong>BTC/USDC</strong> uses the bot to collect 
    data continuously. The bot saves the data in a structured format, allowing the trader to analyze trends 
    or train a machine learning model to predict price movements.
</p>

<h2 style="color: #34495e;">How to Use:</h2>
<ol style="font-size: 16px; line-height: 1.6;">
    <li>Specify the trading pair, timeframe, and exchange in the bot configuration.</li>
    <li>Optionally customize the save folder and filename for organizing your datasets.</li>
    <li>Run the bot in a Jupyter Notebook or Python script to start collecting data.</li>
    <li>Access your datasets in CSV format for analysis, backtesting, or training models.</li>
</ol>

<p style="font-size: 16px; line-height: 1.6; color: #2c3e50;">
    With <strong>Data Collection Bot</strong>, traders can focus on strategy development while the bot handles 
    the time-consuming task of data gathering. This ensures a streamlined and efficient workflow, empowering 
    traders to stay ahead in dynamic markets.
</p>



In [2]:
# setup

import ccxt
hyperliquid = ccxt.hyperliquid()
#hyperliquid.set_sandbox_mode(True)  # enable sandbox mode
print(hyperliquid)

Hyperliquid


In [3]:
hyperliquid.timeframes

{'1m': '1m',
 '3m': '3m',
 '5m': '5m',
 '15m': '15m',
 '30m': '30m',
 '1h': '1h',
 '2h': '2h',
 '4h': '4h',
 '8h': '8h',
 '12h': '12h',
 '1d': '1d',
 '3d': '3d',
 '1w': '1w',
 '1M': '1M'}

In [4]:
#my testnet account
params = {"user": "0x2038B73AE49F18EC2Faa0fC9E56e6c898d6C51b9", "type":"swap"} #not my address lmao
hyperliquid.fetchBalance(params)

{'info': {'marginSummary': {'accountValue': '0.0',
   'totalNtlPos': '0.0',
   'totalRawUsd': '0.0',
   'totalMarginUsed': '0.0'},
  'crossMarginSummary': {'accountValue': '0.0',
   'totalNtlPos': '0.0',
   'totalRawUsd': '0.0',
   'totalMarginUsed': '0.0'},
  'crossMaintenanceMarginUsed': '0.0',
  'withdrawable': '0.0',
  'assetPositions': [],
  'time': '1732713993741'},
 'USDC': {'total': 0.0, 'free': 0.0, 'used': 0.0},
 'timestamp': 1732713993741,
 'datetime': '2024-11-27T13:26:33.741Z',
 'free': {'USDC': 0.0},
 'used': {'USDC': 0.0},
 'total': {'USDC': 0.0}}

In [5]:
symbol = "BTC/USDC:USDC"
order_book = hyperliquid.fetchOrderBook(symbol)
len(order_book['asks'])
# it appears that the orderbook is capped at the top 20

20

In [6]:
timeframe = '1m'
ohlcv = hyperliquid.fetchOHLCV(symbol, timeframe, limit = 5000)
len(ohlcv)

5000

In [7]:
ohlcv[0]

[1732414020000, 98449.0, 98476.0, 98445.0, 98476.0, 2.07054]

In [8]:
from datetime import datetime, timezone
import pandas as pd
import time
# Define the date in July 2024
date = datetime(2023, 9, 1, tzinfo=timezone.utc)

# Get the UTC timestamp in seconds and convert to milliseconds
timestamp_ms = int(date.timestamp() * 1000)

#print(f"UTC Timestamp for Sep 1, 2024: {timestamp_ms}")


def fetch_ohlcv(symbol, timeframe, exchange, timestamp_ms = None, limit = 5):
    current_timestamp_ms = int(time.time() * 1000) - 360000
    ohlcv = exchange.fetchOHLCV(symbol, timeframe, since = current_timestamp_ms, limit = limit)
    df = pd.DataFrame(ohlcv, columns=['timestamp', 'open', 'high', 'low', 'close', 'volume'])
    df['timestamp'] = pd.to_datetime(df['timestamp'], unit='ms')
    df.set_index('timestamp', inplace=True)
    return pd.DataFrame(df)

data = fetch_ohlcv(symbol, timeframe, hyperliquid)
data



Unnamed: 0_level_0,open,high,low,close,volume
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2024-11-27 13:21:00,93570.0,93579.0,93534.0,93543.0,1.30422
2024-11-27 13:22:00,93571.0,93655.0,93571.0,93654.0,1.03909
2024-11-27 13:23:00,93653.0,93653.0,93606.0,93606.0,0.05615
2024-11-27 13:24:00,93636.0,93646.0,93605.0,93614.0,0.00963
2024-11-27 13:25:00,93597.0,93673.0,93593.0,93663.0,0.99439


# My good bot

In [13]:
import pandas as pd
import time
import os
from datetime import datetime, timedelta
from concurrent.futures import ThreadPoolExecutor


class DataCollectionBot:
    def __init__(self, symbol, timeframe, exchange, dataset_size=5000, save_folder="data-collection", name_field=""):
        """
        Initialize the Data Collection Bot.
        :param symbol: Trading pair (e.g., "BTC/USDC:USDC").
        :param timeframe: Timeframe (e.g., '1m', '15m', '1h').
        :param exchange: Exchange object.
        :param dataset_size: Initial number of rows to fetch.
        :param save_folder: Folder to save the collected data.
        :param name_field: Additional string to include in the filename.
        """
        print('Hi, I am a good bot for creating crypto ohlcv datasets from hyperliquid ;) made by Leonardo')
        self.symbol = symbol
        self.timeframe = timeframe
        self.exchange = exchange
        self.dataset_size = dataset_size
        self.save_folder = save_folder
        self.name_field = f"-{name_field}" if name_field else ""
        self.filename = None
        self.last_timestamp = None
        self.data = None

        # Ensure save folder exists
        os.makedirs(self.save_folder, exist_ok=True)

    def fetch_ohlcv(self, since=None, limit=5000):
        """
        Fetch consistent OHLCV data from the exchange.
        """
        ohlcv = self.exchange.fetchOHLCV(self.symbol, self.timeframe, since=since, limit=limit)
        if not ohlcv:
            print("No data received. Retrying...")
            return pd.DataFrame()

        # Convert to DataFrame
        df = pd.DataFrame(ohlcv, columns=['timestamp', 'open', 'high', 'low', 'close', 'volume'])
        df['timestamp'] = pd.to_datetime(df['timestamp'], unit='ms')
        df.set_index('timestamp', inplace=True)
        return df

    def calculate_next_interval(self, last_timestamp):
        """
        Calculate the next interval based on the last fetched timestamp and timeframe.
        """
        timeframe_to_delta = {
            '1m': timedelta(minutes=1),
            '5m': timedelta(minutes=5),
            '15m': timedelta(minutes=15),
            '1h': timedelta(hours=1),
            '4h': timedelta(hours=4),
            '1d': timedelta(days=1),
        }
        if self.timeframe not in timeframe_to_delta:
            raise ValueError(f"Unsupported timeframe: {self.timeframe}")

        delta = timeframe_to_delta[self.timeframe]
        aligned_timestamp = last_timestamp.replace(second=0, microsecond=0)
        return aligned_timestamp + delta

    def save_data_to_csv(self, data):
        """
        Save or append data to the CSV file.
        """
        if not os.path.exists(self.filename):
            data.to_csv(self.filename, mode='w', header=True)
        else:
            data.to_csv(self.filename, mode='a', header=False)

    def calculate_since(self):
        """
        Calculate the `since` timestamp to fetch the initial dataset.
        """
        timeframe_to_ms = {
            '1m': 60 * 1000,
            '5m': 5 * 60 * 1000,
            '15m': 15 * 60 * 1000,
            '1h': 60 * 60 * 1000,
            '4h': 4 * 60 * 60 * 1000,
            '1d': 24 * 60 * 60 * 1000,
        }
        if self.timeframe not in timeframe_to_ms:
            raise ValueError(f"Unsupported timeframe: {self.timeframe}")

        timeframe_ms = timeframe_to_ms[self.timeframe]
        current_timestamp_ms = int(time.time() * 1000)
        aligned_timestamp_ms = current_timestamp_ms - (current_timestamp_ms % timeframe_ms)
        return aligned_timestamp_ms - (timeframe_ms * self.dataset_size)

    def run(self):
        """
        Run the data collection bot.
        """
        # Initial fetch
        since = self.calculate_since()
        self.data = self.fetch_ohlcv(since=since, limit=self.dataset_size)

        if self.data.empty:
            print("Failed to fetch initial dataset. Exiting.")
            return

        # Create filename and save initial data
        start_date = self.data.index[0].strftime('%Y-%m-%d')
        self.filename = f"{self.save_folder}/{self.symbol.replace('/', '')}-{self.timeframe}{self.name_field}-{start_date}.csv"

        with ThreadPoolExecutor() as executor:
            executor.submit(self.save_data_to_csv, self.data)
        print(f"Starting dataset saved to {self.filename} with {len(self.data)} rows.")

        # Update last timestamp
        self.last_timestamp = self.data.index[-1]
        self.last_timestamp = self.last_timestamp.replace(second=0, microsecond=0)

        # Continuous data collection
        while True:
            next_interval = self.calculate_next_interval(self.last_timestamp)
            current_time = datetime.utcnow()

            if current_time < next_interval:
                time_to_sleep = (next_interval - current_time).total_seconds()
                print(f"Waiting for the next interval: {next_interval} (current time: {current_time})")
                print('\n')
                time.sleep(max(0, time_to_sleep))

            try:
                while True:
                    new_since = int(next_interval.timestamp() * 1000) - 1
                    print(f"Fetching new data with since={new_since} ({datetime.utcfromtimestamp(new_since / 1000)})")
                    print(f'(current time: {datetime.utcnow()})')
                    new_data = self.fetch_ohlcv(since=new_since, limit=1)
                    if not new_data.empty:
                        break
                    print(f"No new data available yet. Retrying...")
                    time.sleep(0.02)

                # Filter out overlapping rows
                #print('new_data.index', new_data.index)
                #print('self.last_timestamp', self.last_timestamp)
                new_data = new_data[new_data.index > self.last_timestamp]

                if not new_data.empty:
                    print("dropped new data: ", pd.DataFrame(new_data, columns=['open', 'high', 'low', 'close', 'volume']))
                    self.data = pd.concat([self.data, new_data])
                    self.last_timestamp = self.data.index[-1]

                    with ThreadPoolExecutor() as executor:
                        executor.submit(self.save_data_to_csv, new_data)
                    print(f"New data saved to {self.filename}. Dataset now has {len(self.data)} rows.")
            except Exception as e:
                print(f"Error fetching data: {e}")


In [15]:
# setup

import ccxt
hyperliquid = ccxt.hyperliquid()
#hyperliquid.set_sandbox_mode(True)  # enable sandbox mode - aka testnet
print(hyperliquid)


bot = DataCollectionBot(
    symbol="BTC/USDC:USDC",
    timeframe="1m",
    exchange=hyperliquid,  # Replace with your exchange instance
    dataset_size=5000,
    save_folder="DataCollectionByBot",
    name_field="Example-notebook"
)

bot.run()


Hyperliquid
Hi, I am a good bot for creating crypto ohlcv datasets from hyperliquid ;) made by Leonardo
Starting dataset saved to DataCollectionByBot/BTCUSDC:USDC-1m-Example-notebook-2024-11-24.csv with 5000 rows.
Fetching new data with since=1732714319999 (2024-11-27 13:31:59.999000)
(current time: 2024-11-27 13:32:56.837192)
dropped new data:                          open     high      low    close   volume
timestamp                                                       
2024-11-27 13:32:00  93854.0  93866.0  93763.0  93839.0  1.78159
New data saved to DataCollectionByBot/BTCUSDC:USDC-1m-Example-notebook-2024-11-24.csv. Dataset now has 5001 rows.
Waiting for the next interval: 2024-11-27 13:33:00 (current time: 2024-11-27 13:32:57.477628)


Fetching new data with since=1732714379999 (2024-11-27 13:32:59.999000)
(current time: 2024-11-27 13:33:00.003772)
No data received. Retrying...
No new data available yet. Retrying...
Fetching new data with since=1732714379999 (2024-11-27 13:32:59

KeyboardInterrupt: 

As you can see this API has a bit of latency, I think I can make the collection faster but it should be fine for most timeframes.