In [2]:
import os
import time
import json
import requests
import pandas as pd
import numpy as np

# Show up to 60 columns when a DataFrame is displayed in the notebook.
pd.options.display.max_columns = 60


In [3]:
# Token master list: one row per token. Later cells read the coingecko_id,
# token_name, symbol and contract_address columns (tier / category / chain
# are read with a default, so they are optional).
tm = pd.read_csv("config/token_master.csv")

# The API key is kept in a separate JSON file instead of being hard-coded in
# the notebook. JSON text is UTF-8, so decode it explicitly rather than
# relying on the platform's locale encoding.
with open("config/secrets.json", "r", encoding="utf-8") as f:
    secrets = json.load(f)

# Raises KeyError if the secrets file has no "ETHERSCAN_API_KEY" entry.
ETHERSCAN_API_KEY = secrets["ETHERSCAN_API_KEY"]



In [5]:
# ===============================
# Helper Functions (Etherscan v2)
# ===============================

# Etherscan API v2 endpoint; es_get() sends every request to this URL.
ETHERSCAN_BASE = "https://api.etherscan.io/v2/api"
# Sent by es_get() as the `chainid` query parameter on every request.
ETH_CHAIN_ID = 1  # Ethereum mainnet


# ---- Address validation ----
def is_valid_eth_address(addr: str) -> bool:
    """
    Return True if ``addr`` looks like an Ethereum address.

    An address is accepted when, after stripping surrounding whitespace, it
    is the prefix ``0x`` followed by exactly 40 hexadecimal digits (either
    letter case). Non-string input (None, NaN, numbers) is rejected.

    This is a syntactic check only; the EIP-55 mixed-case checksum is not
    verified.
    """
    if not isinstance(addr, str):
        return False

    a = addr.strip()
    if len(a) != 42 or not a.startswith("0x"):
        return False

    # Length and prefix alone would let strings such as "0x" + "z" * 40
    # through, so also require every remaining character to be a hex digit.
    hex_digits = set("0123456789abcdefABCDEF")
    return all(ch in hex_digits for ch in a[2:])


# ---- Etherscan request wrapper (v2-safe) ----
def es_get(params, base_sleep=0.35, max_retries=6):
    """
    Robust Etherscan API v2 GET with retry & rate-limit handling.

    The API key and chain id are added to a copy of ``params`` (the caller's
    dict is not modified). An attempt is retried with exponential backoff
    (2s, 4s, 8s, ...) when:

    * the connection fails or the request times out,
    * the server answers HTTP 429, or
    * the JSON body has status "0" and ``result`` mentions "rate limit".

    Parameters
    ----------
    params : dict
        Query parameters (module, action, ...).
    base_sleep : float
        Seconds to pause after a successful call, to stay under the rate limit.
    max_retries : int
        Maximum number of attempts.

    Returns
    -------
    dict
        The decoded JSON body. It may still carry status "0" for errors that
        are not rate-limit related; callers must inspect it.

    Raises
    ------
    requests.HTTPError
        Immediately, for any HTTP error status other than 429.
    Exception
        When every attempt ended in a retryable failure.
    """
    query = dict(params)
    query["apikey"] = ETHERSCAN_API_KEY
    query["chainid"] = ETH_CHAIN_ID

    last_failure = None

    for attempt in range(max_retries):
        retry_reason = None
        data = None

        try:
            r = requests.get(ETHERSCAN_BASE, params=query, timeout=30)
        except (requests.exceptions.ConnectionError,
                requests.exceptions.Timeout) as exc:
            # Transient network trouble: retry instead of aborting the run.
            retry_reason = exc
        else:
            if r.status_code == 429:
                retry_reason = "HTTP 429"
            else:
                r.raise_for_status()
                data = r.json()

                status = str(data.get("status", ""))
                result = data.get("result", "")

                # Etherscan reports throttling inside a 200 response.
                if status == "0" and "rate limit" in str(result).lower():
                    retry_reason = str(result)

        if retry_reason is None:
            time.sleep(base_sleep)
            return data

        last_failure = retry_reason
        # No point in backing off after the final attempt; fail right away.
        if attempt < max_retries - 1:
            time.sleep(2 * (2 ** attempt))

    cause = last_failure if isinstance(last_failure, BaseException) else None
    raise Exception("Etherscan repeatedly failed (rate limit or network).") from cause


# ---- Contract verification ----
def check_contract_verified(contract_address):
    """
    Ask Etherscan whether source code is published for a contract.

    Returns (verified: bool, note: str)

    * ``result`` is a string      -> (False, that string)
    * ``result`` is a non-empty list -> verified is True when the first
      entry's "SourceCode" is non-blank; the note is ""
    * anything else               -> (False, "No source code info returned")
    """
    query = {
        "module": "contract",
        "action": "getsourcecode",
        "address": contract_address
    }
    payload = es_get(query).get("result")

    # Etherscan reports errors as a plain string in `result`.
    if isinstance(payload, str):
        return False, payload

    if isinstance(payload, list) and len(payload) > 0:
        source = payload[0].get("SourceCode", "")
        has_source = not (source is None or str(source).strip() == "")
        return has_source, ""

    return False, "No source code info returned"


# ---- Fetch recent ERC20 transfers (paginated) ----
def fetch_tokentx_recent_pages(contract_address, pages=5, offset=1000, sort="desc"):
    """
    Pull up to pages * offset most recent ERC20 transfers.

    Pages are requested one at a time starting from page 1. Paging stops
    early when a page comes back short (fewer than ``offset`` rows), empty,
    or with status "0".

    Parameters
    ----------
    contract_address : str
        Token contract whose transfers are listed.
    pages : int
        Maximum number of pages to request.
    offset : int
        Rows per page.
    sort : str
        Passed through as the ``sort`` query parameter.

    Returns
    -------
    (rows, note) : (list, str)
        ``rows`` are the transfer records as returned by the API. ``note``
        is "" unless a page came back with status "0" for a reason other
        than "No transactions found", in which case it carries the error
        text.
    """
    all_rows = []
    notes = []

    for page in range(1, pages + 1):
        data = es_get({
            "module": "account",
            "action": "tokentx",
            "contractaddress": contract_address,
            "page": page,
            "offset": offset,
            "sort": sort
        })

        status = str(data.get("status"))
        result = data.get("result")

        if status == "0":
            message = str(data.get("message") or "")
            # "No transactions found" arrives in `message` with an empty
            # `result` list, so both fields must be inspected; looking at
            # `result` alone records a bogus "[]" note for empty tokens.
            nothing_found = (
                "No transactions found" in message
                or "No transactions found" in str(result)
            )
            if not nothing_found:
                # Prefer the detailed text in `result`; fall back to `message`.
                notes.append(str(result) if result else (message or str(result)))
            break

        if isinstance(result, list) and len(result) > 0:
            all_rows.extend(result)
            # A short page means there is nothing further to fetch.
            if len(result) < offset:
                break
        else:
            break

    return all_rows, " | ".join(notes)


def activity_distribution_features_recent(contract_address, pages=5, offset=1000):
    """
    Summarise who sends and receives a token in its most recent transfers.

    Fetches up to pages * offset transfers (newest first) and returns a dict:

    * tx_sampled            - number of transfers fetched
    * unique_senders        - distinct "from" addresses
    * unique_receivers      - distinct "to" addresses
    * unique_addresses      - distinct addresses across both columns
    * top10_sender_share    - share of transfers sent by the 10 busiest senders
    * top10_receiver_share  - share of transfers received by the 10 busiest receivers
    * note                  - note from the fetch step, or "No transfers returned"
                              when nothing came back and there is no note

    With no transfers, the counts are 0 and both shares are NaN.
    """
    transfers, fetch_note = fetch_tokentx_recent_pages(
        contract_address, pages=pages, offset=offset, sort="desc"
    )

    if not transfers:
        empty = dict.fromkeys(
            ("tx_sampled", "unique_senders", "unique_receivers", "unique_addresses"), 0
        )
        empty["top10_sender_share"] = np.nan
        empty["top10_receiver_share"] = np.nan
        empty["note"] = fetch_note or "No transfers returned"
        return empty

    frame = pd.DataFrame(transfers)
    senders = frame["from"]
    receivers = frame["to"]

    def top10_share(addresses):
        # Fraction of all transfers attributable to the 10 most frequent addresses.
        counts = addresses.value_counts()
        total = counts.sum()
        if not total:
            return np.nan
        return float(counts.head(10).sum() / total)

    features = {}
    features["tx_sampled"] = int(len(frame))
    features["unique_senders"] = int(senders.nunique())
    features["unique_receivers"] = int(receivers.nunique())
    features["unique_addresses"] = int(pd.concat([senders, receivers]).nunique())
    features["top10_sender_share"] = top10_share(senders)
    features["top10_receiver_share"] = top10_share(receivers)
    features["note"] = fetch_note
    return features


In [6]:
# Smoke test: compute the features for the first token in the master list.
addr = tm.at[0, "contract_address"]
activity_distribution_features_recent(addr, pages=5, offset=1000)


{'tx_sampled': 5000,
 'unique_senders': 860,
 'unique_receivers': 1083,
 'unique_addresses': 1309,
 'top10_sender_share': 0.4282,
 'top10_receiver_share': 0.4318,
 'note': ''}

In [7]:
# ===============================
# Build Distribution MVP Dataset
# ===============================

def _nan_features(verify_note, note):
    """Feature columns for a token with no usable Etherscan data: every metric is NaN."""
    return {
        "verified_contract": np.nan,
        "verify_note": verify_note,
        "tx_sampled": np.nan,
        "unique_senders": np.nan,
        "unique_receivers": np.nan,
        "unique_addresses": np.nan,
        "top10_sender_share": np.nan,
        "top10_receiver_share": np.nan,
        "note": note
    }


results = []      # one record per token, in token-master order
run_errors = []   # tokens whose Etherscan pull raised an exception

for _, row in tm.iterrows():
    cid = str(row["coingecko_id"]).strip()
    name = row["token_name"]
    sym = row["symbol"]
    addr = str(row["contract_address"]).strip()

    # Identity columns copied from the token master; tier / category / chain
    # default to "" when the column is absent.
    record = {
        "coingecko_id": cid,
        "token_name": name,
        "symbol": sym,
        "contract_address": addr,
        "tier": row.get("tier", ""),
        "category": row.get("category", ""),
        "chain": row.get("chain", "")
    }

    if is_valid_eth_address(addr):
        try:
            verified, verify_note = check_contract_verified(addr)
            # pages * offset = 5 * 1000 -> 5000 tx window
            feats = activity_distribution_features_recent(addr, pages=5, offset=1000)

            record.update({
                "verified_contract": bool(verified),
                "verify_note": verify_note,
                **feats
            })
        except Exception as e:
            # Keep the token in the output with NaN features and log the failure.
            record.update(_nan_features("Error during Etherscan pull", str(e)))
            run_errors.append({"coingecko_id": cid, "error": str(e)})
    else:
        # Skip invalid addresses safely: no API call is made for them.
        record.update(_nan_features("Missing/invalid contract address", "Skipped"))

    results.append(record)

# Build DataFrame
dist_mvp = pd.DataFrame(results)

dist_mvp.head(), pd.DataFrame(run_errors)


(  coingecko_id token_name symbol                            contract_address  \
 0      uniswap    Uniswap    UNI  0x1f9840a85d5af5bf1d1762f925bdaddc4201f984   
 1    chainlink  Chainlink   LINK  0x514910771af9ca656af840dff83e8264ecf986ca   
 2         aave       Aave   AAVE  0x7fc66500c84a76ad7e9c93437bfc5ac33e2ddae9   
 3        maker      Maker    MKR  0x9f8f72aa9304c8b593d555f12ef6589cc3a579a2   
 4     lido-dao   Lido Dao    LDO  0x5a98fcbea516cf06857215779fd812ca3bef1b32   
 
   tier                     category     chain  verified_contract verify_note  \
 0    A                          DEX  ethereum               True               
 1    A  Infrastructure & Middleware  ethereum               True               
 2    A          Lending & Borrowing  ethereum               True               
 3    A          Lending & Borrowing  ethereum               True               
 4    A               Liquid staking  ethereum               True               
 
    tx_sampled  unique_s

In [8]:
# Rebuild the frame from `results`, which the loop in the previous cell
# filled in; this cell does not re-run the loop itself.
dist_mvp = pd.DataFrame(results)

# Sanity check: distribution of the number of transfers sampled per token.
dist_mvp["tx_sampled"].describe()


count      20.0
mean     5000.0
std         0.0
min      5000.0
25%      5000.0
50%      5000.0
75%      5000.0
max      5000.0
Name: tx_sampled, dtype: float64

In [9]:
# Persist the dataset, creating the output folder on first run.
# `os` is already imported in the notebook's first cell.
out_path = "data/processed/distribution_mvp_etherscan.csv"
os.makedirs(os.path.dirname(out_path), exist_ok=True)

dist_mvp.to_csv(out_path, index=False)
out_path


'data/processed/distribution_mvp_etherscan.csv'