## Robust Data Downloader
- Resume from the last stopped position

- Skip already downloaded products

- Save a list of failed ASINs

In [None]:
import pickle
import os
import keepa
import time
import datetime
import pandas as pd
import numpy as np
from tqdm import tqdm
import json
import logging

# API Setup
# SECURITY NOTE(review): an access key committed in source is a leaked credential.
# Prefer supplying it through the KEEPA_ACCESS_KEY environment variable; the
# literal below is kept only as a backward-compatible fallback and should be
# rotated and removed.
accesskey = os.environ.get('KEEPA_ACCESS_KEY') or 'bijj0cgn2715qdc3hftclnv9s04282ialobgj7gau50f5atp0qe3qklfvdjxxxxx'
api = keepa.Keepa(accesskey)

# File paths setup
# Everything the downloader reads or writes lives under base_path.
base_path = '/Users/data/'
raw_data_path = os.path.join(base_path, 'raw_data')  # one pickle per downloaded ASIN
processed_data_path = os.path.join(base_path, 'processed_data')
asin_file_path = os.path.join(base_path, 'pet_asin_list.txt')  # Path to your ASIN list file
progress_file = os.path.join(base_path, 'download_progress.json')  # resume state (JSON)

2025-01-09 21:56:46,989 - INFO - Connecting to keepa using key ending in j2uehv
2025-01-09 21:56:47,460 - DEBUG - 0 tokens consumed
2025-01-09 21:56:47,462 - INFO - 60 tokens remain


In [None]:


# Logging configuration: INFO and above is written to both a log file in the
# current working directory and the console stream.
log_handlers = [
    logging.FileHandler('keepa_download.log'),
    logging.StreamHandler(),
]
logging.basicConfig(
    handlers=log_handlers,
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)


# Ensure the output directories are present before anything is written to them
for path in (raw_data_path, processed_data_path):
    os.makedirs(path, exist_ok=True)

def load_progress():
    """
    Load download progress from file
    Returns:
        dict: Progress with the keys 'completed' (list of ASINs), 'failed'
            (list of ASINs) and 'last_index' (int). Keys missing from the
            saved file are filled with empty defaults; any extra saved keys
            are kept. A missing, unreadable or corrupt file yields empty
            progress instead of raising.
    """
    progress = {'completed': [], 'failed': [], 'last_index': 0}
    try:
        with open(progress_file, 'r') as f:
            saved = json.load(f)
    except FileNotFoundError:
        # First run: nothing has been saved yet
        return progress
    except (OSError, ValueError) as e:
        # json.JSONDecodeError is a ValueError; a truncated file (e.g. from an
        # interrupted write) must not prevent the download from resuming.
        logging.warning(f"Could not read progress file {progress_file}: {e} - starting with empty progress")
        return progress

    if not isinstance(saved, dict):
        logging.warning(f"Progress file {progress_file} has unexpected content - starting with empty progress")
        return progress

    # Overlay saved values on the defaults so callers can rely on every key
    progress.update(saved)
    return progress

def save_progress(progress):
    """
    Save download progress to file
    Args:
        progress (dict): JSON-serializable progress state to persist
    The data is written to a temporary sibling file and then moved over the
    progress file, so an interrupted or failed write leaves the previously
    saved progress intact instead of a truncated file.
    """
    tmp_path = f'{progress_file}.tmp'
    try:
        with open(tmp_path, 'w') as f:
            json.dump(progress, f)
        # os.replace swaps the file in a single step on the same filesystem
        os.replace(tmp_path, progress_file)
    finally:
        # Only still present when the dump or the replace failed
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

def read_asin_list(file_path):
    """
    Read ASINs from a text file, one ASIN per line
    Args:
        file_path (str): Path to the text file containing ASINs
    Returns:
        list: List of ASINs in file order; an empty list if the file
            cannot be read
    """
    asins = []
    try:
        # utf-8-sig reads plain ASCII/UTF-8 and also drops a leading byte-order
        # mark, which would otherwise make the first ASIN 11 characters long.
        # Undecodable bytes are replaced so one bad line cannot abort the load.
        with open(file_path, 'r', encoding='utf-8-sig', errors='replace') as f:
            for line in f:
                # Clean the ASIN: remove whitespace plus any leading/trailing
                # backslashes and braces
                asin = line.strip().strip('\\{}').strip()
                if not asin:
                    # Blank lines are dropped silently, they are not invalid ASINs
                    continue
                if len(asin) == 10:  # Valid ASINs are 10 characters
                    asins.append(asin)
                else:
                    logging.warning(f"Skipping invalid ASIN: {asin}")
    except (OSError, ValueError) as e:
        logging.error(f"Error reading ASIN file: {str(e)}")
        return []

    logging.info(f"Successfully loaded {len(asins)} valid ASINs from {file_path}")
    return asins

def _write_pickle_atomically(obj, path):
    """Pickle obj to path through a temporary file so path never holds a partial dump."""
    tmp_path = f'{path}.tmp'
    try:
        with open(tmp_path, 'wb') as f:
            pickle.dump(obj, f)
        os.replace(tmp_path, path)
    finally:
        # Only still present when the dump or the replace failed
        if os.path.exists(tmp_path):
            os.remove(tmp_path)


def _mark_completed(progress, completed, asin, index):
    """Record asin as done, clear any earlier failure for it, and persist the resume point."""
    if asin not in completed:
        completed.add(asin)
        progress['completed'].append(asin)
    if asin in progress['failed']:
        progress['failed'].remove(asin)
    # Resume from the item after this one
    progress['last_index'] = index + 1
    save_progress(progress)


def download_product_data(asins, max_retries=3, retry_delay=60, request_delay=60):
    """
    Download and save product data from Keepa API with retry mechanism
    Each product is pickled to '<asin>_raw.pkl' under raw_data_path, and the
    progress file is updated after every ASIN that is finished, so an
    interrupted run can be resumed by calling this function again.
    Args:
        asins (list): List of Amazon ASINs to download
        max_retries (int): Maximum number of attempts per ASIN
        retry_delay (int): Delay in seconds between retries
        request_delay (int): Delay in seconds after each successful download;
            the default matches the previously hard-coded 60 seconds
    """
    # Load previous progress
    progress = load_progress()
    progress.setdefault('completed', [])
    progress.setdefault('failed', [])
    start_index = progress.setdefault('last_index', 0)
    # Set copy of the completed list for O(1) membership tests
    completed = set(progress['completed'])

    total = len(asins)
    for index, asin in enumerate(asins[start_index:], start=start_index):
        if asin in completed:
            logging.info(f"[{index+1}/{total}] ASIN {asin} already downloaded, skipping")
            continue

        for attempt in range(1, max_retries + 1):
            try:
                # Check if data already exists
                pickle_path = os.path.join(raw_data_path, f'{asin}_raw.pkl')

                if os.path.exists(pickle_path):
                    logging.info(f"[{index+1}/{total}] Data for ASIN {asin} already exists, skipping")
                    _mark_completed(progress, completed, asin, index)
                    break

                # Query product data
                products = api.query(asin, history=True, offers=20)
                product = products[0]

                # Save raw data using pickle; written atomically so an
                # interrupted run cannot leave a partial file that the
                # existence check above would later accept as downloaded
                _write_pickle_atomically(product, pickle_path)

                logging.info(f"[{index+1}/{total}] Data saved for ASIN {asin}")

                # Update progress
                _mark_completed(progress, completed, asin, index)

                # Sleep to respect API limits (1 token per minute); there is
                # nothing left to wait for after the final ASIN
                if index + 1 < total:
                    time.sleep(request_delay)
                break

            except Exception as e:
                # Best-effort boundary: any failure for one ASIN is logged and
                # retried rather than aborting the whole run
                error_msg = f"[{index+1}/{total}] Error processing ASIN {asin} (Attempt {attempt}/{max_retries}): {str(e)}"
                if attempt < max_retries:
                    logging.warning(f"{error_msg} - Retrying in {retry_delay} seconds...")
                    time.sleep(retry_delay)
                else:
                    logging.error(error_msg)
                    if asin not in progress['failed']:
                        progress['failed'].append(asin)
                    save_progress(progress)

    # Final report
    logging.info("\nDownload Summary:")
    logging.info(f"Total ASINs: {total}")
    logging.info(f"Successfully downloaded: {len(progress['completed'])}")
    logging.info(f"Failed: {len(progress['failed'])}")
    if progress['failed']:
        logging.info("\nFailed ASINs:")
        for asin in progress['failed']:
            logging.info(asin)



In [None]:
# Run the download
# Entry point: load the ASIN list and download every product in it. Nothing is
# downloaded when the list is empty, which is also what read_asin_list returns
# when the ASIN file cannot be read.
if __name__ == "__main__":
    asins = read_asin_list(asin_file_path)
    if asins:
        logging.info(f"Starting download for {len(asins)} ASINs...")
        download_product_data(asins)
        logging.info("Download process completed!")

2025-01-09 21:57:46,993 - INFO - Successfully loaded 7512 valid ASINs from /Users/takedownccp/Documents/Cursor/DDU/data/pet_asin_list.txt
2025-01-09 21:57:46,994 - INFO - Starting download for 7512 ASINs...
2025-01-09 21:57:46,995 - INFO - [67/7512] ASIN B07RVD63J1 already downloaded, skipping
2025-01-09 21:57:46,999 - DEBUG - Executing single product query
2025-01-09 21:57:47,000 - DEBUG - Estimated time to complete 1 request(s) is 0.50 minutes
2025-01-09 21:57:47,000 - DEBUG - 	with a refill rate of 1 token(s) per minute
  0%|          | 0/1 [00:00<?, ?it/s]2025-01-09 21:57:47,415 - DEBUG - 0 tokens consumed
2025-01-09 21:57:49,144 - DEBUG - 1 tokens consumed
100%|██████████| 1/1 [00:02<00:00,  2.18s/it]
2025-01-09 21:57:49,213 - INFO - [68/7512] Data saved for ASIN B089PM6FRC
2025-01-09 21:58:49,226 - DEBUG - Executing single product query
2025-01-09 21:58:49,241 - DEBUG - Estimated time to complete 1 request(s) is 0.50 minutes
2025-01-09 21:58:49,242 - DEBUG - 	with a refill rate o

Response from server: NOT_ENOUGH_TOKEN


2025-01-10 00:42:43,529 - DEBUG - 0 tokens consumed
2025-01-10 00:42:48,659 - DEBUG - 0 tokens consumed
2025-01-10 00:42:56,359 - DEBUG - 6 tokens consumed
100%|██████████| 1/1 [11:05<00:00, 665.54s/it]
2025-01-10 00:42:56,473 - INFO - [97/7512] Data saved for ASIN B0081KTPQQ
2025-01-10 00:43:56,485 - DEBUG - Executing single product query
2025-01-10 00:43:56,504 - DEBUG - Estimated time to complete 1 request(s) is 5.92 minutes
2025-01-10 00:43:56,504 - DEBUG - 	with a refill rate of 1 token(s) per minute
  0%|          | 0/1 [00:00<?, ?it/s]2025-01-10 00:43:56,932 - DEBUG - 0 tokens consumed
2025-01-10 00:48:45,449 - DEBUG - 0 tokens consumed
2025-01-10 00:48:46,824 - DEBUG - 5 tokens consumed
100%|██████████| 1/1 [04:50<00:00, 290.38s/it]
2025-01-10 00:48:46,930 - INFO - [98/7512] Data saved for ASIN B01IRO7WR4
2025-01-10 00:49:46,938 - DEBUG - Executing single product query
2025-01-10 00:49:46,941 - DEBUG - Estimated time to complete 1 request(s) is 4.99 minutes
2025-01-10 00:49:46,

Response from server: NOT_ENOUGH_TOKEN


2025-01-10 02:49:10,730 - DEBUG - 0 tokens consumed
2025-01-10 02:50:21,090 - DEBUG - Executing single product query
2025-01-10 02:50:21,112 - DEBUG - Estimated time to complete 1 request(s) is 0.50 minutes
2025-01-10 02:50:21,114 - DEBUG - 	with a refill rate of 1 token(s) per minute
2025-01-10 02:50:21,713 - DEBUG - 0 tokens consumed
2025-01-10 02:55:12,375 - DEBUG - 0 tokens consumed
  0%|          | 0/1 [16:56<?, ?it/s]
2025-01-10 02:55:13,231 - DEBUG - 5 tokens consumed
100%|██████████| 1/1 [04:52<00:00, 292.18s/it]
2025-01-10 02:55:13,339 - INFO - [115/7512] Data saved for ASIN B08XY7D7BL
2025-01-10 02:56:13,371 - DEBUG - Executing single product query
2025-01-10 02:56:13,377 - DEBUG - Estimated time to complete 1 request(s) is 4.99 minutes
2025-01-10 02:56:13,378 - DEBUG - 	with a refill rate of 1 token(s) per minute
  0%|          | 0/1 [00:00<?, ?it/s]2025-01-10 02:56:14,012 - DEBUG - 0 tokens consumed
2025-01-10 03:00:13,679 - DEBUG - 0 tokens consumed
2025-01-10 03:00:15,617

Response from server: NOT_ENOUGH_TOKEN


2025-01-10 05:23:43,730 - DEBUG - 0 tokens consumed
2025-01-10 05:23:45,640 - DEBUG - 0 tokens consumed
  0%|          | 0/1 [17:08<?, ?it/s]
2025-01-10 05:23:56,206 - DEBUG - 12 tokens consumed
100%|██████████| 1/1 [11:03<00:00, 663.09s/it]
2025-01-10 05:23:56,300 - INFO - [136/7512] Data saved for ASIN B07QSCFSVH
2025-01-10 05:24:56,328 - DEBUG - Executing single product query
2025-01-10 05:24:56,338 - DEBUG - Estimated time to complete 1 request(s) is 11.97 minutes
2025-01-10 05:24:56,339 - DEBUG - 	with a refill rate of 1 token(s) per minute
  0%|          | 0/1 [00:00<?, ?it/s]2025-01-10 05:24:56,756 - DEBUG - 0 tokens consumed
2025-01-10 05:35:45,594 - DEBUG - 0 tokens consumed


Response from server: NOT_ENOUGH_TOKEN


2025-01-10 05:35:46,394 - DEBUG - 0 tokens consumed
2025-01-10 05:35:48,289 - DEBUG - 0 tokens consumed
2025-01-10 05:35:56,467 - DEBUG - 6 tokens consumed
100%|██████████| 1/1 [11:00<00:00, 660.19s/it]
2025-01-10 05:35:56,576 - INFO - [137/7512] Data saved for ASIN B085M9NBKX
2025-01-10 05:36:56,590 - DEBUG - Executing single product query
2025-01-10 05:36:56,598 - DEBUG - Estimated time to complete 1 request(s) is 5.97 minutes
2025-01-10 05:36:56,599 - DEBUG - 	with a refill rate of 1 token(s) per minute
  0%|          | 0/1 [00:00<?, ?it/s]2025-01-10 05:36:57,014 - DEBUG - 0 tokens consumed
2025-01-10 05:41:49,135 - DEBUG - 0 tokens consumed
2025-01-10 05:41:50,629 - DEBUG - 5 tokens consumed
100%|██████████| 1/1 [04:54<00:00, 294.06s/it]
2025-01-10 05:41:50,686 - INFO - [138/7512] Data saved for ASIN B09F8NLGCD
2025-01-10 05:42:50,709 - DEBUG - Executing single product query
2025-01-10 05:42:50,735 - DEBUG - Estimated time to complete 1 request(s) is 4.98 minutes
2025-01-10 05:42:5

Response from server: NOT_ENOUGH_TOKEN


2025-01-10 06:48:01,529 - DEBUG - 0 tokens consumed
2025-01-10 06:49:02,863 - DEBUG - 0 tokens consumed
2025-01-10 06:49:04,970 - DEBUG - 1 tokens consumed
100%|██████████| 1/1 [11:54<00:00, 714.81s/it]
2025-01-10 06:49:05,052 - INFO - [146/7512] Data saved for ASIN B01FZ6N0MI
2025-01-10 06:50:05,072 - DEBUG - Executing single product query
2025-01-10 06:50:05,078 - DEBUG - Estimated time to complete 1 request(s) is 0.50 minutes
2025-01-10 06:50:05,080 - DEBUG - 	with a refill rate of 1 token(s) per minute
  0%|          | 0/1 [00:00<?, ?it/s]2025-01-10 06:50:05,512 - DEBUG - 0 tokens consumed
2025-01-10 06:50:06,215 - DEBUG - 5 tokens consumed
100%|██████████| 1/1 [00:01<00:00,  1.17s/it]
2025-01-10 06:50:06,272 - INFO - [147/7512] Data saved for ASIN B07QG84VC8
2025-01-10 06:51:06,304 - DEBUG - Executing single product query
2025-01-10 06:51:06,331 - DEBUG - Estimated time to complete 1 request(s) is 3.94 minutes
2025-01-10 06:51:06,332 - DEBUG - 	with a refill rate of 1 token(s) per

Response from server: NOT_ENOUGH_TOKEN


2025-01-10 07:41:12,731 - DEBUG - 0 tokens consumed
2025-01-10 07:41:21,791 - DEBUG - 6 tokens consumed
100%|██████████| 1/1 [11:04<00:00, 664.19s/it]
2025-01-10 07:41:21,857 - INFO - [155/7512] Data saved for ASIN B0C1HG6XC8
2025-01-10 07:42:21,873 - DEBUG - Executing single product query
2025-01-10 07:42:21,883 - DEBUG - Estimated time to complete 1 request(s) is 6.00 minutes
2025-01-10 07:42:21,884 - DEBUG - 	with a refill rate of 1 token(s) per minute
  0%|          | 0/1 [00:00<?, ?it/s]2025-01-10 07:42:22,323 - DEBUG - 0 tokens consumed
2025-01-10 07:47:14,201 - DEBUG - 0 tokens consumed
2025-01-10 07:47:22,255 - DEBUG - 6 tokens consumed
100%|██████████| 1/1 [05:00<00:00, 300.41s/it]
2025-01-10 07:47:22,336 - INFO - [156/7512] Data saved for ASIN B006HX5PNQ
2025-01-10 07:48:22,351 - DEBUG - Executing single product query
2025-01-10 07:48:22,356 - DEBUG - Estimated time to complete 1 request(s) is 5.99 minutes
2025-01-10 07:48:22,357 - DEBUG - 	with a refill rate of 1 token(s) pe

KeyboardInterrupt: 