In [1]:
import pandas as pd

# Company tax IDs to crawl, stored as strings.
tax_ids = [
    "30487219",
    "13304871",
    "34470937",
]

# Single-column frame; the crawl cell reads its `tax_id` column back as a list.
df = pd.DataFrame(tax_ids, columns=["tax_id"])

In [4]:
from fin_groups.db import OwnershipDB
from fin_groups.crawler import CompanyCrawler
import os
import time
import random
from typing import List

# 1. Initialize
# Build the ownership DB handle (constructed from "ownership.db") and a crawler
# that receives that handle.
# NOTE(review): run_company_crawler() is called below without this `crawler`
# object and builds its own CompanyCrawler from `db`, so this module-level
# instance is not used by any code visible in this cell.
db = OwnershipDB("ownership.db")
crawler = CompanyCrawler(db=db)

def _normalize_tax_id(tid) -> str:
    """Return the tax ID as a string, left-padded with zeros to 8 characters."""
    return str(tid).zfill(8)


def _load_processed_ids(cache_file: str) -> set:
    """Read the cache file (one ID per line) into a set, ignoring blank lines."""
    if not os.path.exists(cache_file):
        return set()
    with open(cache_file, "r", encoding="utf-8") as f:
        # Blank lines would otherwise show up as an "" ID and inflate the count.
        return {line.strip() for line in f if line.strip()}


def run_company_crawler(
    db,
    tax_ids: List[str],
    cache_file: str = "processed_ids.txt",
    delay_range: tuple = (0.5, 1.5),
    crawler: "CompanyCrawler | None" = None,
    max_retries: int = 3,
    backoff_base: float = 5,
):
    """
    Runs the crawler with a local cache file to prevent redundant calls after a crash.

    Every ID is normalized to an 8-character zero-padded string. IDs already
    listed in ``cache_file`` are skipped, and duplicates inside ``tax_ids`` are
    crawled only once. An ID is appended to the cache as soon as its crawl
    returns, whether or not any owners were found, so confirmed-empty companies
    are not re-scraped on the next run. IDs whose crawl keeps raising are NOT
    cached and will be retried on the next run.

    Args:
        db: Database handle passed to ``CompanyCrawler`` when ``crawler`` is None.
        tax_ids: Tax IDs to crawl (anything ``str()`` can render).
        cache_file: Path of the text file holding one processed ID per line.
        delay_range: ``(low, high)`` bounds, in seconds, of the random pause
            taken before every crawl attempt.
        crawler: Optional object exposing ``crawl_company(tax_id)``. When
            omitted, a new ``CompanyCrawler(db=db)`` is created (the original
            behavior).
        max_retries: Maximum number of crawl attempts per ID (default 3).
        backoff_base: Seconds multiplied by the attempt number to get the wait
            after a failed attempt (default 5 -> 5s, 10s, ...).

    Returns:
        dict: Counters with keys ``"success"``, ``"failed"`` and ``"empty"``.

    Raises:
        ValueError: If ``max_retries`` is smaller than 1.
    """
    if max_retries < 1:
        raise ValueError("max_retries must be at least 1")

    if crawler is None:
        crawler = CompanyCrawler(db=db)

    # 1. Load already processed IDs from cache
    processed_ids = _load_processed_ids(cache_file)

    # 2. Filter the input list: drop cached IDs and in-run duplicates, keep order
    seen = set(processed_ids)
    pending_ids = []
    for tid in tax_ids:
        tid_str = _normalize_tax_id(tid)
        if tid_str not in seen:
            seen.add(tid_str)
            pending_ids.append(tid_str)

    total_original = len(tax_ids)
    total_pending = len(pending_ids)

    print(f"?? Cache: {len(processed_ids)} already processed.")
    print(f"?? Starting crawl for {total_pending} remaining companies (Total: {total_original})...\n")

    stats = {"success": 0, "failed": 0, "empty": 0}

    for index, tid_str in enumerate(pending_ids, 1):
        owners = None
        crawled = False

        # Only the crawl itself is retried; bookkeeping happens afterwards so a
        # cache-write problem can neither trigger a re-crawl nor double-count.
        for attempt in range(1, max_retries + 1):
            try:
                time.sleep(random.uniform(*delay_range))
                owners = crawler.crawl_company(tid_str)
            except Exception as e:
                # Deliberate best-effort: any crawl error is retried, then reported.
                if attempt < max_retries:
                    print(f"[{index}/{total_pending}] ID: {tid_str} ?? Error: {e}. Retrying...")
                    time.sleep(backoff_base * attempt)
                else:
                    print(f"[{index}/{total_pending}] ID: {tid_str} ? Failed: {e}")
                    stats["failed"] += 1
            else:
                crawled = True
                break

        if not crawled:
            continue

        # We consider it "processed" if it succeeded OR if it's confirmed empty
        # because we don't want to re-scrape empty companies every time.
        if owners:
            print(f"[{index}/{total_pending}] ID: {tid_str} ? Success ({len(owners)} owners)")
            stats["success"] += 1
        else:
            print(f"[{index}/{total_pending}] ID: {tid_str} ??  No owners found")
            stats["empty"] += 1

        # 3. Update Cache File immediately after success
        try:
            with open(cache_file, "a", encoding="utf-8") as f:
                f.write(f"{tid_str}\n")
        except OSError as e:
            # The crawl result is already counted; losing the cache entry only
            # means this ID may be crawled again on a later run.
            print(f"[{index}/{total_pending}] ID: {tid_str} cache write failed: {e}")

    print(f"\n{'='*30}\n?? CRAWL COMPLETE\n? New: {stats['success']}\n??  Empty: {stats['empty']}\n? Failed: {stats['failed']}\n{'='*30}")

    return stats

# Execution: crawl every tax ID held in the `tax_id` column of `df`.
run_company_crawler(db, tax_ids=df["tax_id"].tolist())

?? Cache: 0 already processed.
?? Starting crawl for 3 remaining companies (Total: 3)...

[{'name': 'СКАДІ ХОЛДІНГС ЛІМІТЕД', 'profile_link': None, 'country': 'Кіпр', 'role': 'Засновник', 'amount_uah': 3309719, 'share_percent': 33}, {'name': 'СОНАТО ХОЛДІНГС ЛІМІТЕД', 'profile_link': None, 'country': 'Кіпр', 'role': 'Засновник', 'amount_uah': 2780442, 'share_percent': 28}, {'name': 'БІ ДЖИ ВІ ГРУП ЛІМІТЕД', 'profile_link': None, 'country': 'Кіпр', 'role': 'Засновник', 'amount_uah': 2780442, 'share_percent': 28}, {'name': 'Буткевич Геннадій Владиславович', 'profile_link': 'https://opendatabot.ua/p/butkevych-hennadii-vladyslavovych-cfeIGNgWbGpK_W_P71qNGA', 'country': 'Україна', 'role': 'Засновник', 'amount_uah': 530270, 'share_percent': 34}, {'name': 'Єрмаков Євгеній Петрович', 'profile_link': 'https://opendatabot.ua/p/yermakov-yevhenii-petrovych-jh08OBDGSgm0BUaAYQGldw', 'country': 'Україна', 'role': 'Засновник', 'amount_uah': 529277, 'share_percent': 33}, {'name': 'Буткевич Геннадій Вла