In [10]:
import requests
import pandas as pd
import time
import os
from concurrent.futures import ThreadPoolExecutor, as_completed

In [None]:
# Function to get the top 1000 coins sorted by volume from CoinGecko
def get_top_1000_coins(api_key, limit=1000, timeout=30):
    """Fetch the highest-volume coins from the CoinGecko Pro ``/coins/markets`` endpoint.

    Pages of 250 coins, ordered by ``volume_desc`` and priced in USD, are
    requested until ``limit`` coins have been collected, the API returns an
    empty page, or a request fails.

    Args:
        api_key: Value sent in the ``X-CG-Pro-API-Key`` header.
        limit: Maximum number of coins to return (default 1000).
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A list of at most ``limit`` coin records exactly as decoded from the
        JSON responses. If a request fails, the coins collected so far are
        returned (possibly an empty list).
    """
    url = 'https://pro-api.coingecko.com/api/v3/coins/markets'
    headers = {'X-CG-Pro-API-Key': api_key}
    all_coins = []
    page = 1
    while len(all_coins) < limit:
        params = {
            'vs_currency': 'usd',
            'order': 'volume_desc',
            'per_page': 250,  # Fetch 250 coins per page
            'page': page
        }
        try:
            # Without a timeout a stalled connection would block the notebook forever.
            response = requests.get(url, headers=headers, params=params, timeout=timeout)
        except requests.RequestException as exc:
            print(f"Error fetching coin list: {exc}")
            break
        if response.status_code != 200:
            print(f"Error fetching coin list: {response.status_code}, {response.text}")
            break
        coins = response.json()
        if not coins:
            break  # No more coins to fetch
        all_coins.extend(coins)
        page += 1
        # Pause only when another page will actually be requested.
        if len(all_coins) < limit:
            time.sleep(1)  # To respect the API rate limit
    return all_coins[:limit]  # Trim the overshoot from the last page

In [None]:
# Function to get historical market data for a coin (including volume)
def get_historical_data(coin_id, start_date, end_date, interval, api_key, timeout=30):
    """Fetch USD price and volume history for one coin from CoinGecko Pro.

    Calls ``/coins/{coin_id}/market_chart/range`` with the range given by
    ``start_date`` and ``end_date``.

    Args:
        coin_id: Coin identifier interpolated into the request URL.
        start_date: Object exposing ``.timestamp()``; sent as the ``from`` bound
            (truncated to whole seconds).
        end_date: Object exposing ``.timestamp()``; sent as the ``to`` bound.
        interval: Passed through unchanged as the ``interval`` query parameter.
        api_key: Value sent in the ``X-CG-Pro-API-Key`` header.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A ``(prices, volumes)`` tuple holding the ``prices`` and
        ``total_volumes`` entries of the JSON response (each defaulting to an
        empty list when absent). ``([], [])`` is returned when the request
        fails or the status code is not 200.
    """
    url = f'https://pro-api.coingecko.com/api/v3/coins/{coin_id}/market_chart/range'
    params = {
        'vs_currency': 'usd',
        'from': int(start_date.timestamp()),
        'to': int(end_date.timestamp()),
        'interval': interval
    }
    headers = {'X-CG-Pro-API-Key': api_key}
    try:
        # A timeout keeps one stalled connection from pinning a worker thread forever.
        response = requests.get(url, headers=headers, params=params, timeout=timeout)
    except requests.RequestException as exc:
        # Report transport failures the same way as HTTP errors: log and return empty data.
        print(f"Error fetching data for {coin_id}: {exc}")
        return [], []
    if response.status_code != 200:
        print(f"Error fetching data for {coin_id}: {response.status_code}, {response.text}")
        return [], []
    data = response.json()
    prices = data.get('prices', [])
    volumes = data.get('total_volumes', [])
    return prices, volumes

In [None]:
# Function to fetch data for a single coin (including volume)
def fetch_coin_data(coin, start_date, end_date, interval, api_key):
    """Download one coin's history and return it as a date-indexed DataFrame.

    Args:
        coin: Mapping with at least the keys ``'id'`` and ``'symbol'``.
        start_date, end_date, interval, api_key: Forwarded unchanged to
            ``get_historical_data``.

    Returns:
        ``(coin_name, df)`` where ``coin_name`` is the upper-cased symbol and
        ``df`` has columns ``price`` and ``volume`` indexed by ``Date``
        (millisecond timestamps converted with ``pd.to_datetime``).
        ``(None, None)`` is returned when either series is empty or when any
        exception is raised while fetching or building the frame; the
        exception is printed, not re-raised.
    """
    coin_id = coin['id']
    coin_name = coin['symbol'].upper()
    print(f'Fetching data for {coin_name} ({coin_id})...')
    try:
        prices, volumes = get_historical_data(coin_id, start_date, end_date, interval, api_key)
        if prices and volumes:
            df = pd.DataFrame(prices, columns=['timestamp', 'price'])
            # Align volume to price by timestamp rather than by row position, so a
            # missing or extra point in either series cannot shift volumes onto the
            # wrong dates. Duplicate volume timestamps keep their last value.
            volume_by_timestamp = (
                pd.DataFrame(volumes, columns=['timestamp', 'volume'])
                .drop_duplicates('timestamp', keep='last')
                .set_index('timestamp')['volume']
            )
            df['volume'] = df['timestamp'].map(volume_by_timestamp)
            df['Date'] = pd.to_datetime(df['timestamp'], unit='ms')
            df = df.set_index('Date').drop(columns='timestamp')
            return coin_name, df
    except Exception as e:
        # Best effort: one failing coin must not abort the whole batch.
        print(f'Error fetching data for {coin_name} ({coin_id}): {e}')
    return None, None

In [None]:
# Function to save intermediate data to disk
def save_data(coin_data, filename='CoinGecko_px_1D_temp.pkl'):
    """Combine per-coin frames side by side and write them as pickle and CSV.

    Args:
        coin_data: Mapping of coin name to DataFrame. The frames are
            concatenated along the columns, with the mapping keys as the outer
            column level.
        filename: Path of the pickle file. The CSV is written next to it with
            the extension replaced by ``.csv``.

    Returns:
        None. When ``coin_data`` is empty nothing is written.
    """
    if not coin_data:
        # pd.concat raises ValueError when given no objects; treat "nothing yet" as a no-op.
        print('No data to save; skipping.')
        return

    combined_df = pd.concat(coin_data, axis=1)
    combined_df.columns = pd.MultiIndex.from_tuples(combined_df.columns)

    # Derive the CSV path from the extension instead of a substring replace, so a
    # filename without '.pkl' can never make the CSV overwrite the pickle.
    root, ext = os.path.splitext(filename)
    csv_filename = filename + '.csv' if ext.lower() == '.csv' else root + '.csv'

    combined_df.to_pickle(filename)
    combined_df.to_csv(csv_filename)
    print(f'Data saved to {filename}')

In [None]:
# Your CoinGecko API key. It is read from the COINGECKO_API_KEY environment
# variable so the secret is never stored in the notebook; '' is used when unset.
api_key = os.environ.get('COINGECKO_API_KEY', '')

# Define the start and end dates for the data
start_date = pd.Timestamp('2018-01-01')
# Use the current UTC wall time (kept timezone-naive). The request helper converts
# these dates with .timestamp(), and pandas interprets a naive Timestamp as UTC, so
# a local-time Timestamp.now() would move the range end by the local UTC offset.
end_date = pd.Timestamp.now(tz='UTC').tz_localize(None)
interval = 'daily'

# Get the list of the top 1000 coins sorted by volume
coins = get_top_1000_coins(api_key)

# Initialize a dictionary to store the data (coin name -> DataFrame)
coin_data = {}

# Load the checkpoint if it exists: one coin id per line, written by the fetch
# loop for every coin whose data was retrieved successfully.
checkpoint_file = 'checkpoint.txt'
if os.path.exists(checkpoint_file):
    with open(checkpoint_file, 'r') as f:
        processed_coins = set(f.read().splitlines())
else:
    processed_coins = set()

In [None]:
# Use ThreadPoolExecutor to fetch data concurrently
output_file = 'CoinGecko_px_vol_1D.pkl'

# Resume support: coins listed in the checkpoint are skipped below, so their frames
# must be reloaded from the last save; otherwise the next save would overwrite the
# file with only the newly fetched coins.
if processed_coins and os.path.exists(output_file):
    # NOTE: unpickling can execute arbitrary code; only load a file this notebook wrote.
    saved_df = pd.read_pickle(output_file)
    for symbol in saved_df.columns.get_level_values(0).unique():
        if symbol not in coin_data:
            # Saving aligns every coin on one shared index; drop the all-NaN filler rows.
            coin_data[symbol] = saved_df[symbol].dropna(how='all')

# Coin ids fetched successfully but not yet written to disk.
unsaved_ids = []

# NOTE: frames are keyed by upper-cased symbol, so two coins sharing a symbol
# overwrite each other in coin_data.
with ThreadPoolExecutor(max_workers=10) as executor:
    futures = {}
    for coin in coins:
        coin_id = coin['id']
        # Skip only coins that are checkpointed AND whose data is actually in memory;
        # a checkpoint entry without saved data is fetched again.
        if coin_id in processed_coins and coin['symbol'].upper() in coin_data:
            continue  # Skip already processed coins
        future = executor.submit(fetch_coin_data, coin, start_date, end_date, interval, api_key)
        futures[future] = coin_id

    for i, future in enumerate(as_completed(futures), 1):
        coin_name, df = future.result()
        if coin_name and df is not None:
            coin_data[coin_name] = df
            unsaved_ids.append(futures[future])

        # Periodically save the data (every 50 completed requests), provided there
        # is something to write.
        if i % 50 == 0 and coin_data:
            save_data(coin_data, filename=output_file)
            # Record checkpoint entries only after the data is on disk, so an
            # interruption can never mark a coin as done without its data saved.
            with open(checkpoint_file, 'a') as f:
                f.writelines(done_id + '\n' for done_id in unsaved_ids)
            unsaved_ids.clear()

In [None]:
# Final save of all data
if coin_data:
    save_data(coin_data, filename='CoinGecko_px_vol_1D.pkl')

    # Clean up checkpoint file: the run finished, so there is nothing left to resume.
    if os.path.exists(checkpoint_file):
        os.remove(checkpoint_file)

    print('Data fetching and saving complete.')
else:
    # Concatenating an empty mapping raises inside save_data; report it instead,
    # and keep the checkpoint so an earlier partial run can still be resumed.
    print('No coin data was fetched; nothing was saved.')