In [1]:
# alpha_discoveryV2.1
import os
import json
import random
import warnings
import functools
import itertools
import numpy as np
import pandas as pd
from joblib import Parallel, delayed

warnings.filterwarnings('ignore')

# =========================
# DEFINITIONS & CONFIG
# =========================

# Reproducibility
RANDOM_SEED = 67
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

# Tradable tickers (what is actually traded/evaluated)
TRADABLE_TICKERS = [
    'QQQ US Equity', 'SPY US Equity', 'XLK US Equity', 'XLF US Equity',
    'XLE US Equity', 'ARKK US Equity', 'VIX Index', 'GLD US Equity',
    'NBIS US Equity', 'LLY US Equity', 'TSLA US Equity', 'AAPL US Equity',
    'NVDA US Equity'
]

# Single-ticker dev mode (None for all)
SINGLE_TICKER_MODE = None
if SINGLE_TICKER_MODE:
    print(f"\n--- RUNNING IN SINGLE TICKER MODE FOR: {SINGLE_TICKER_MODE} ---")
    if SINGLE_TICKER_MODE not in TRADABLE_TICKERS:
        raise ValueError(f"Ticker '{SINGLE_TICKER_MODE}' not found in TRADABLE_TICKERS.")
    TRADABLE_TICKERS = [SINGLE_TICKER_MODE]

# Macro tickers (not traded but used for discovery)
MACRO_TICKERS = [
    'DXY Curncy', 'USGG10YR Index', 'USGG2YR Index', 'CPI YOY Index',
    'INJCJC Index', 'FFA Comdty', 'LF94TRUU Index', 'CPI CHNG Index',
    'NFP TCH Index', 'JOBS US Equity', 'CTII10 Govt', 'USSW10 Curncy',
    'MLCX3CRT Index', 'FARBAST Index', 'BSPGCPUS Index', 'SPCSUSA Index',
    'SPCS20SM Index', 'CONSSENT Index', 'CO1 Comdty'
]

# Files
MAIN_DATA_FILE = 'All_tickers_new.xlsx'
MACRO_DATA_FILE = 'Macro_tickers_new.xlsx'

# GA config
NUM_GENERATIONS = 20
POPULATION_SIZE = 100
SETUP_LENGTHS_TO_EXPLORE = [2, 3]
ELITISM_RATE = 0.1
MUTATION_RATE = 0.20

# General
MIN_INITIAL_SUPPORT_FILTER = 5
OPTION_SIM_HORIZONS_DAYS = [1, 3, 10, 21]
RISK_FREE_RATE = 0.01
RECENCY_WINDOW = 10                         # number of most recent triggers for recency Sharpe
HOLDOUT_START_DATE = '2023-08-12'           # to compute OOS Sharpe on holdout

# ----- Export breadth controls -----
KEEP_TOP_RANKS = 5     # widen beyond rank-1 Pareto (try 5–7 if you want hundreds)
MIN_SORTINO   = 0.40   # quality floor (lower -> more setups)
MIN_SUPPORT   = 8      # avoid unicorns (lower -> more setups)
MAX_RESULTS   = 1000    # cap exported pool (raise to 1000 if needed)
EXPORT_PARETO_FRONT = True      # keep true
EXPORT_WIDENED_POOL = True      # add: export the candidate pool snapshot too


# Final results filters (actionability)
RESULTS_LAST_TRIGGER_DAYS = None
RESULTS_DATE_RANGE = {'start':'2025-06-01','end':'2025-08-12'}

# Output: per-run folder tagged with seed + timestamp (and SINGLE_TICKER_MODE if set)
RUN_TAG = f"seed{RANDOM_SEED}_{pd.Timestamp.now():%Y%m%d_%H%M%S}"
if SINGLE_TICKER_MODE:
    RUN_TAG += f"_{SINGLE_TICKER_MODE.replace(' ', '_').replace('/', '-')}"
OUTPUT_DIR = os.path.join("runs", RUN_TAG)
os.makedirs(OUTPUT_DIR, exist_ok=True)
with open(os.path.join(OUTPUT_DIR, 'run_config.json'), 'w') as _f:
    json.dump({
        "RANDOM_SEED": RANDOM_SEED,
        "NUM_GENERATIONS": NUM_GENERATIONS,
        "POPULATION_SIZE": POPULATION_SIZE,
        "SETUP_LENGTHS_TO_EXPLORE": SETUP_LENGTHS_TO_EXPLORE,
        "ELITISM_RATE": ELITISM_RATE,
        "MUTATION_RATE": MUTATION_RATE,
        "MIN_INITIAL_SUPPORT_FILTER": MIN_INITIAL_SUPPORT_FILTER,
        "RECENCY_WINDOW": RECENCY_WINDOW,
        "HOLDOUT_START_DATE": HOLDOUT_START_DATE,
        "TRADABLE_TICKERS": TRADABLE_TICKERS,
    }, _f, indent=2)

print('Loading raw workbooks…')

# =========================
# DATA LOADING
# =========================

def load_and_merge_excel(file_path, header_row, existing_df=None):
    """Loads an Excel file, prepends sheet names to columns (except Date), and merges."""
    try:
        xls = pd.ExcelFile(file_path)
        current_df = existing_df.copy() if existing_df is not None else None
        for sh_name in xls.sheet_names:
            df = pd.read_excel(xls, sheet_name=sh_name, header=header_row)
            if 'Dates' in df.columns and 'Date' not in df.columns:
                df.rename(columns={'Dates': 'Date'}, inplace=True)
            if 'Date' not in df.columns:
                print(f"Warning: Sheet '{sh_name}' in '{file_path}' missing 'Date'/'Dates' column. Skipping.")
                continue
            df.columns = [f"{sh_name}_{col}" if col != 'Date' else col for col in df.columns]
            if current_df is None:
                current_df = df
            else:
                df = df.loc[:, ~df.columns.duplicated()]
                current_df = current_df.merge(df, on='Date', how='outer')
        return current_df
    except FileNotFoundError:
        print(f"Error: '{file_path}' not found.")
        return existing_df
    except Exception as e:
        print(f"Unexpected error during Excel loading of '{file_path}': {e}")
        return existing_df

# Load main then macro
raw = load_and_merge_excel(MAIN_DATA_FILE, header_row=1)
if raw is not None and not raw.empty:
    raw = load_and_merge_excel(MACRO_DATA_FILE, header_row=4, existing_df=raw)
else:
    print("Main data could not be loaded, skipping macro data.")
    raw = pd.DataFrame()

# Clean/index
if not raw.empty:
    raw = raw.sort_values('Date').reset_index(drop=True)
    raw.fillna(method='ffill', inplace=True)
    raw['Date'] = pd.to_datetime(raw['Date'])
    raw = raw.drop_duplicates(subset=['Date'], keep='last')
    raw.set_index('Date', inplace=True)
    raw.index = pd.to_datetime(raw.index)
    raw.sort_index(inplace=True)
else:
    print("No data loaded. Raw DataFrame is empty.")
print('Raw shape:', raw.shape)

# Ticker identification
all_column_prefixes = sorted(list(set([c.split('_')[0] for c in raw.columns if '_' in c])))
COMMON_FEATURE_PREFIXES = ['Last', 'Open', 'High', 'Low', 'VWAP', 'Volume', 'IVOL', 'Implied', 'Total', '30', '10',
                           '60', 'Hist.', '1st', 'Put', 'Dates', 'CHG', 'FFA', 'INJCJC', 'NFP', 'JOBS', 'CPI', 'CTII10',
                           'LF94TRUU', 'SPX', 'USSW10', 'MLCX3CRT', 'FARBAST', 'BSPGCPUS', 'SPCSUSA', 'SPCS20SM',
                           'CONSSENT']
actual_ticker_prefixes = [p for p in all_column_prefixes if p not in COMMON_FEATURE_PREFIXES]
all_tickers = sorted(list(set(TRADABLE_TICKERS + actual_ticker_prefixes + MACRO_TICKERS)))
print(f'\nIdentified all relevant prefixes/tickers for feature engineering: {len(all_tickers)}')

# =========================
# HELPERS
# =========================

def first_col_containing(ticker_full_name, substr=''):
    if substr == 'PX_LAST':
        for potential_col in [f"{ticker_full_name}_Last_Price_PX_LAST", f"{ticker_full_name}_PX_LAST"]:
            if potential_col in raw.columns:
                return potential_col
    for c in raw.columns:
        if c.startswith(ticker_full_name) and substr in c:
            return c
    return None

_series_cache = {}
def safe_series(col_name, use_cache=True):
    if use_cache and col_name in _series_cache:
        return _series_cache[col_name]
    series = raw[col_name] if col_name and col_name in raw.columns else pd.Series(index=raw.index, dtype=float)
    if use_cache:
        _series_cache[col_name] = series
    return series

def zscore_rolling(s, win=252, eps=1e-9, minp=60):
    m = s.rolling(win, min_periods=minp).mean()
    v = s.rolling(win, min_periods=minp).std()
    return (s - m) / (v.replace(0, eps))

def mad_z(s, win=252, c=1.4826, minp=60):
    med = s.rolling(win, min_periods=minp).median()
    mad = (s - med).abs().rolling(win, min_periods=minp).median()
    return (s - med) / (c * mad.replace(0, np.nan))

def rolling_pct_of_last(s, win=252, minp=60):
    def pct_last(a):
        a = a[~np.isnan(a)]
        if a.size == 0:
            return np.nan
        return np.sum(a <= a[-1]) / a.size
    return s.rolling(win, min_periods=minp).apply(pct_last, raw=True)

def norm_feature(s, method='z', win=252):
    if s.empty or s.isna().all():
        return s
    if method == 'z':
        return zscore_rolling(s, win)
    if method == 'madz':
        return mad_z(s, win)
    if method == 'pct':
        return rolling_pct_of_last(s, win)
    return s

def returns(px, periods=1):
    return px.pct_change(periods)

def to_bp(yield_series):
    q95 = yield_series.dropna().quantile(0.95)
    multiplier = 100.0 if (q95 is not None and q95 <= 20.0) else 10000.0
    return yield_series * multiplier

def align2(s1, s2):
    df = pd.concat([s1, s2], axis=1).dropna()
    if df.empty:
        return pd.Series(dtype=float), pd.Series(dtype=float)
    return df.iloc[:, 0], df.iloc[:, 1]

def rolling_corr_fisher(s1, s2, win=20, minp_corr=20):
    r1, r2 = align2(s1, s2)
    if r1.empty:
        return pd.Series(dtype=float, index=s1.index), pd.Series(dtype=float, index=s1.index)
    r = r1.rolling(win, min_periods=minp_corr).corr(r2).clip(-0.999999, 0.999999)
    f = 0.5 * np.log((1 + r) / (1 - r))
    return f, r

def rolling_corr_fisher_z(s1, s2, win_corr=20, win_z=60, minp_corr=20, minp_z=40):
    f, _ = rolling_corr_fisher(s1, s2, win=win_corr, minp_corr=minp_corr)
    return zscore_rolling(f, win=win_z, minp=minp_z)

def beta_rolling(s1, s2, win=60, eps=1e-12, minp=40):
    r1, r2 = align2(s1, s2)
    if r1.empty:
        return pd.Series(dtype=float, index=s1.index)
    cov = r1.rolling(win, min_periods=minp).cov(r2)
    var = r2.rolling(win, min_periods=minp).var()
    return cov / var.replace(0, eps)

def cooccur(a, b, window=2):
    return a.rolling(window).max().astype(bool) & b.rolling(window).max().astype(bool)

def get_realized_vol(price_series, win=30):
    log_returns = np.log(price_series / price_series.shift(1))
    return log_returns.rolling(win).std() * np.sqrt(252)

def event_delta(series):
    s_ffill = series.ffill()
    change = s_ffill.diff()
    return change.where(series.notna())

def coverage_activation(feat_df, hi=1.5):
    rows = []
    for c in feat_df.columns:
        s = feat_df[c].dropna()
        if s.empty:
            continue
        cov = len(s) / len(feat_df)
        z_col_name = f"{c}_z"
        if z_col_name in feat_df.columns:
            z = feat_df[z_col_name].dropna()
        else:
            is_z_like = '_z' in c or (s.mean() < 0.1 and 0.5 < s.std() < 1.5)
            z = s if is_z_like else norm_feature(s)
        act = (z.abs() > hi).mean() if not z.empty else 0
        rows.append((c, round(cov, 3), round(act, 3)))
    report = pd.DataFrame(rows, columns=['feature', 'coverage_pct', 'activation_pct']).sort_values('activation_pct', ascending=False)
    print("--- Feature Coverage & Activation Report ---")
    print(report.to_string())
    return report

def frac_diff(series, d=0.5, window=100):
    weights = [1.0]
    for k in range(1, len(series)):
        weights.append(-weights[-1] * (d - k + 1) / k)
    weights = np.array(weights[::-1])
    output = pd.Series(index=series.index, dtype=float)
    for i in range(window, len(series)):
        subset = series.iloc[i - window + 1: i + 1]
        if len(subset) == len(weights[-window:]):
            output.iloc[i] = np.dot(weights[-window:], subset)
    return output.dropna()

def block_bootstrap_sharpe(returns_series, block_size, num_iterations=1000, annualize=True, trading_days_per_year=252):
    returns_series = returns_series.dropna()
    if len(returns_series) < block_size or len(returns_series) < 2:
        return 0.0, 0.0, 0.0
    blocks = [returns_series.iloc[i: i + block_size] for i in range(0, len(returns_series), block_size) if not returns_series.iloc[i: i + block_size].empty]
    if not blocks:
        return 0.0, 0.0, 0.0
    n_blocks_to_sample = int(np.ceil(len(returns_series) / block_size))
    sharpes = []
    for _ in range(num_iterations):
        resampled_returns_list = [blocks[i] for i in np.random.choice(len(blocks), n_blocks_to_sample, replace=True)]
        resampled_returns = pd.concat(resampled_returns_list).iloc[:len(returns_series)]
        if resampled_returns.std() > 1e-9:
            sharpes.append((resampled_returns.mean() / resampled_returns.std()) * (np.sqrt(trading_days_per_year) if annualize else 1))
        else:
            sharpes.append(0.0)
    if not sharpes:
        return 0.0, 0.0, 0.0
    return np.median(sharpes), np.percentile(sharpes, 5), np.percentile(sharpes, 95)

def calculate_sortino_ratio(returns_series, annualize=True, trading_days_per_year=252):
    returns_series = returns_series.dropna()
    if len(returns_series) < 2:
        return 0.0
    target_return = 0
    downside_returns = returns_series[returns_series < target_return]
    if len(downside_returns) == 0:
        return np.inf
    expected_return = returns_series.mean()
    downside_std = downside_returns.std()
    if downside_std == 0:
        return np.inf
    sortino = (expected_return - target_return) / downside_std
    return sortino * np.sqrt(trading_days_per_year) if annualize else sortino

def calculate_calmar_ratio(returns_series, annualize=True, trading_days_per_year=252):
    returns_series = returns_series.dropna()
    if len(returns_series) < 2:
        return 0.0
    cumulative_returns = (1 + returns_series).cumprod()
    peak = cumulative_returns.cummax()
    drawdown = (cumulative_returns - peak) / peak
    max_drawdown = drawdown.min()
    if max_drawdown == 0:
        return np.inf
    total_return = cumulative_returns.iloc[-1] - 1
    if (1 + total_return) < 0:
        return -99
    num_years = len(returns_series) / trading_days_per_year
    annualized_return = (1 + total_return) ** (1 / num_years) - 1
    calmar = annualized_return / abs(max_drawdown)
    return calmar

# Option sim helpers
def estimate_atm_premium(price, ivol, days, option_type):
    T = days / 365.25
    if T <= 0 or price <= 0 or ivol <= 0:
        return 0
    return 0.4 * price * ivol * np.sqrt(T)

def simulate_option_pnl_detailed(current_price, future_price, ivol_at_entry, horizon_days, entry_direction):
    underlying_return = (future_price - current_price) / current_price if current_price and pd.notna(current_price) and pd.notna(future_price) else np.nan
    nan_result = {'pnl_per_share': np.nan, 'option_type': None, 'strike_price': np.nan, 'entry_premium': np.nan,
                  'exit_value': np.nan, 'pnl_dollars': np.nan, 'pnl_pct': np.nan, 'skipped_reason': 'None',
                  'Underlying_Exit_Price': future_price if pd.notna(future_price) else np.nan, 'Return_Underlying': underlying_return}
    if pd.isna(current_price) or current_price <= 0:
        nan_result['skipped_reason'] = 'Invalid Entry Price'; return nan_result
    if pd.isna(ivol_at_entry) or ivol_at_entry <= 0:
        nan_result['skipped_reason'] = 'Invalid IVOL'; return nan_result
    if pd.isna(future_price):
        nan_result['skipped_reason'] = 'Missing Future Price'; return nan_result
    if entry_direction not in ['long', 'short']:
        nan_result['skipped_reason'] = 'Invalid Entry Direction'; return nan_result

    scaled_ivol = ivol_at_entry / 100.0 if ivol_at_entry > 1.0 else ivol_at_entry
    strike_price = current_price
    option_type = 'call' if entry_direction == 'long' else 'put'
    entry_premium = estimate_atm_premium(current_price, scaled_ivol, horizon_days, option_type)

    if option_type == 'call':
        exit_value = max(future_price - strike_price, 0)
    else:
        exit_value = max(strike_price - future_price, 0)

    pnl_per_share = exit_value - entry_premium
    pnl_dollars = pnl_per_share * 100
    pnl_pct = (pnl_per_share / entry_premium) * 100 if entry_premium > 0 else np.nan

    return {'pnl_per_share': pnl_per_share, 'option_type': option_type, 'strike_price': strike_price,
            'entry_premium': entry_premium, 'exit_value': exit_value, 'pnl_dollars': pnl_dollars,
            'pnl_pct': pnl_pct, 'skipped_reason': 'None', 'Underlying_Exit_Price': future_price,
            'Return_Underlying': underlying_return}

# =========================
# FEATURE SPECS
# =========================

print('\n--- Defining ALL Feature Specifications ---')
feature_specs = []

# Volatility Features
for ticker in all_tickers:
    f60 = '60_Day_Call_Implied_Volatility'; f10 = '10_Day_Call_Implied_Volatility'
    feature_specs.append({'type': 'ivol_term_structure', 'assets': [ticker], 'params': {'f_long': f60, 'f_short': f10},
                          'unique_id': f'term_structure_{f60}-{f10}__{ticker}', 'display_name': f"diff({f60}, {f10})__{ticker}"})
    put50 = '1st_Month_Put_Imp_Vol_50_Delta'; call40 = '1st_Month_Call_Imp_Vol_40_Delta'
    feature_specs.append({'type': 'ivol_skew', 'assets': [ticker], 'params': {'put': put50, 'call': call40},
                          'unique_id': f'skew_{put50}-{call40}__{ticker}', 'display_name': f"diff({put50}, {call40})__{ticker}"})
    for suffix in ['IVOL_SIGMA', 'CALL_IMP_VOL_30D', 'PUT_IMP_VOL_30D']:
        feature_specs.append({'type': 'ivol_shock', 'assets': [ticker], 'params': {'ivol_suffix': suffix, 'window': 30},
                              'unique_id': f'zscore_{suffix}_30d__{ticker}', 'display_name': f"zscore_{suffix}_30d__{ticker}"})
        feature_specs.append({'type': 'ivol_div_volume', 'assets': [ticker], 'params': {'ivol_suffix': suffix, 'vol_suffix': 'VOLUME'},
                              'unique_id': f'div_{suffix}_by_VOLUME__{ticker}', 'display_name': f"div({suffix}, VOLUME)__{ticker}"})

# Deriv Flow & Sentiment
for ticker in all_tickers:
    pc_ratio_col = 'PUT_CALL_VOLUME_RATIO_CUR_DAY'
    feature_specs.append({'type': 'put_call_ratio_ema', 'assets': [ticker], 'params': {'span': 5, 'col': pc_ratio_col},
                          'unique_id': f'ema5_{pc_ratio_col}__{ticker}', 'display_name': f"ema5_{pc_ratio_col}__{ticker}"})
    oi_col = 'OPEN_INT_TOTAL_CALL'
    feature_specs.append({'type': 'open_interest_change', 'assets': [ticker], 'params': {'days': 3, 'col': oi_col},
                          'unique_id': f'pct_change_{oi_col}_3d__{ticker}', 'display_name': f"pct_change_{oi_col}_3d__{ticker}"})
    vol_col = 'Volume_-Realtime_VOLUME'
    feature_specs.append({'type': 'volume_zscore', 'assets': [ticker], 'params': {'window': 30, 'col': vol_col},
                          'unique_id': f'zscore_{vol_col}_30d__{ticker}', 'display_name': f"zscore_{vol_col}_30d__{ticker}"})
    sm_oi = 'OPEN_INT_TOTAL_CALL'; sm_ivol = '10_Day_Call_Implied_Volatility'
    feature_specs.append({'type': 'smart_money_flag', 'assets': [ticker], 'params': {'oi_col': sm_oi, 'ivol_col': sm_ivol},
                          'unique_id': f'smart_money_{sm_oi}_{sm_ivol}__{ticker}', 'display_name': f"smart_money(pct_change({sm_oi}) > 0 AND pct_change({sm_ivol}) > 0)__{ticker}"})

# Generic Z-Scores for sequential patterns
for ticker in all_tickers:
    for col in ['PX_LAST', 'IVOL_SIGMA', 'Volume_-Realtime_VOLUME']:
        for window in [30, 60]:
            feature_specs.append({'type': 'generic_zscore', 'assets': [ticker], 'params': {'col': col, 'window': window},
                                  'unique_id': f'zscore_{col}_{window}d__{ticker}', 'display_name': f"zscore({col}, {window}d)__{ticker}"})

# Cross-Asset Correlations (price)
price_col = 'PX_LAST'
correlation_pairs = list(set(itertools.combinations(all_tickers, 2)))
for t1, t2 in correlation_pairs:
    for window in [20, 60]:
        feature_specs.append({'type': 'correlation', 'assets': [t1, t2], 'params': {'window': window, 'col': price_col},
                              'unique_id': f'corr_{t1}:{price_col}_{t2}:{price_col}_{window}d', 'display_name': f"corr({t1}:{price_col}, {t2}:{price_col}, {window}d)"})
    feature_specs.append({'type': 'correlation_zscore', 'assets': [t1, t2], 'params': {'col': price_col, 'window': 60},
                          'unique_id': f'zscore_corr20d_{t1}:{price_col}_{t2}:{price_col}_60d', 'display_name': f"zscore_corr(20d)({t1}:{price_col}, {t2}:{price_col}, 60d)"})
    feature_specs.append({'type': 'correlation_delta', 'assets': [t1, t2], 'params': {'col': price_col},
                          'unique_id': f'corr_delta_{t1}:{price_col}_{t2}:{price_col}', 'display_name': f"corr_delta(20d-60d)({t1}:{price_col}, {t2}:{price_col})"})
    feature_specs.append({'type': 'rolling_beta', 'assets': [t1, t2], 'params': {'window': 60, 'col': price_col},
                          'unique_id': f'beta_{t1}:{price_col}_{t2}:{price_col}_60d', 'display_name': f"beta({t1}:{price_col}, {t2}:{price_col}, 60d)"})

# Advanced Correlations (specific combos)
adv_corr_defs = [
    {'t1': 'QQQ US Equity', 'f1': 'IVOL_SIGMA', 't2': 'SPY US Equity', 'f2': 'IVOL_SIGMA', 'win': 30},
    {'t1': 'TSLA US Equity', 'f1': 'Volume_-Realtime_VOLUME', 't2': 'VIX Index', 'f2': 'IVOL_SIGMA', 'win': 20},
    {'t1': 'CO1 Comdty', 'f1': 'PX_LAST', 't2': 'XLE US Equity', 'f2': 'IVOL_SIGMA', 'win': 30},
    {'t1': 'USGG10YR Index', 'f1': 'PX_LAST', 't2': 'XLF US Equity', 'f2': 'IVOL_SIGMA', 'win': 30}
]
for d in adv_corr_defs:
    feature_specs.append({
        'type': 'advanced_correlation',
        'assets': [d['t1'], d['t2']],
        'params': {'window': d['win'], 'col1': d['f1'], 'col2': d['f2']},
        'unique_id': f"corr_{d['t1']}:{d['f1']}_{d['t2']}:{d['f2']}_{d['win']}d",
        'display_name': f"corr({d['t1']}:{d['f1']}, {d['t2']}:{d['f2']}, {d['win']}d)"
    })

# Macro Features
feature_specs.extend([
    {'type': 'macro_mpi', 'assets': ['DXY Curncy', 'USGG10YR Index'], 'unique_id': 'macro_mpi', 'display_name': 'Macro Pressure Index'},
    {'type': 'macro_fear_overdrive', 'assets': ['VIX Index', 'DXY Curncy', 'SPY US Equity'], 'unique_id': 'macro_fear_overdrive', 'display_name': 'Fear Overdrive'},
    {'type': 'macro_sector_rotation', 'assets': ['XLK US Equity', 'XLE US Equity'], 'unique_id': 'macro_xlk_xle_rotation', 'display_name': 'Sector Rotation (XLK-XLE)'},
    {'type': 'macro_yield_spread', 'assets': ['USGG10YR Index', 'USGG2YR Index'], 'unique_id': 'macro_10y2y_spread', 'display_name': 'Yield Spread (10Y-2Y)'},
    {'type': 'macro_cpi_zscore', 'assets': ['CPI YOY Index'], 'unique_id': 'macro_cpi_z', 'display_name': 'CPI Z-Score'},
    {'type': 'macro_injcjc_shock', 'assets': ['INJCJC Index'], 'unique_id': 'macro_jobless_claims_shock', 'display_name': 'Jobless Claims Shock'},
    {'type': 'macro_ffa_spread', 'assets': ['FFA Comdty', 'USGG2YR Index'], 'unique_id': 'macro_ffa_spread', 'display_name': 'Fed Funds Spread'},
    {'type': 'macro_lf94truu_vol_signal', 'assets': ['LF94TRUU Index'], 'unique_id': 'macro_hyg_vol_signal', 'display_name': 'HYG Vol Signal'}
])
for t in ['CPI YOY Index', 'CPI CHNG Index', 'NFP TCH Index', 'JOBS US Equity']:
    feature_specs.append({'type': 'macro_generic_mom', 'assets': [t], 'params': {'days': 3},
                          'unique_id': f'macro_mom3_{t}', 'display_name': f'Macro Mom3d({t})'})
for t in ['CTII10 Govt', 'USSW10 Curncy', 'MLCX3CRT Index', 'FARBAST Index', 'BSPGCPUS Index', 'SPCSUSA Index', 'SPCS20SM Index', 'CONSSENT Index']:
    feature_specs.append({'type': 'macro_generic_chg', 'assets': [t],
                          'unique_id': f'macro_chg_{t}', 'display_name': f'Macro Chg({t})'})

# Momentum / fractal
for ticker in all_tickers:
    feature_specs.append({'type': 'mom_div_vol', 'assets': [ticker], 'params': {'price_col': price_col, 'mom_win': 5, 'vol_win': 20},
                          'unique_id': f'mom_div_vol_{price_col}_5d_20d__{ticker}', 'display_name': f"mom_div_vol({price_col}, 5d, 20d)__{ticker}"})
    feature_specs.append({'type': 'bollinger_pctB', 'assets': [ticker], 'params': {'window': 20, 'price_col': price_col},
                          'unique_id': f'pctB_{price_col}_20d__{ticker}', 'display_name': f"%B({price_col}, 20d)__{ticker}"})
    feature_specs.append({'type': 'fractional_differencing', 'assets': [ticker], 'params': {'d': 0.5, 'window': 100, 'price_col': price_col},
                          'unique_id': f'frac_diff_{price_col}_d0.5_100w__{ticker}', 'display_name': f"frac_diff({price_col}, d=0.5, win=100)__{ticker}"})

# Regime & interaction
feature_specs.append({'type': 'regime_filter', 'assets': ['VIX Index'], 'params': {'threshold': 25, 'col': 'PX_LAST'},
                      'unique_id': 'REGIME_IS_HIGH_VOL', 'display_name': 'REGIME_IS_HIGH_VOL (VIX > 25)'})
feature_specs.append({'type': 'interaction', 'assets': [],
                      'params': {'feature1': 'zscore_IVOL_SIGMA_30d__AAPL US Equity', 'feature2': 'REGIME_IS_HIGH_VOL'},
                      'unique_id': 'zscore_IVOL_SIGMA_30d__AAPL US Equity_IN_HIGH_VOL',
                      'display_name': 'zscore(IVOL_SIGMA, 30d)__AAPL US Equity IN_HIGH_VOL'})

print(f"Defined {len(feature_specs)} total feature specifications.")

# =========================
# FEATURE CALC ENGINE
# =========================

print('--- Building raw feature set... ---')
feat_raw = pd.DataFrame(index=raw.index)
_series_cache = {}  # clear

for spec in feature_specs:
    feature_id = spec['unique_id']
    try:
        # --- Volatility ---
        if spec['type'] == 'ivol_term_structure':
            s_long = safe_series(first_col_containing(spec['assets'][0], spec['params']['f_long']))
            s_short = safe_series(first_col_containing(spec['assets'][0], spec['params']['f_short']))
            raw_val = s_long - s_short
            feat_raw[feature_id] = raw_val
            feat_raw[f"{feature_id}_z"] = norm_feature(raw_val, method='z')

        elif spec['type'] == 'ivol_skew':
            s_put = safe_series(first_col_containing(spec['assets'][0], spec['params']['put']))
            s_call = safe_series(first_col_containing(spec['assets'][0], spec['params']['call']))
            raw_val = s_put - s_call
            feat_raw[feature_id] = raw_val
            feat_raw[f"{feature_id}_z"] = norm_feature(raw_val, method='madz')

        elif spec['type'] == 'ivol_shock':
            ivol_s = safe_series(first_col_containing(spec['assets'][0], spec['params']['ivol_suffix']))
            dv = ivol_s.diff()
            std = dv.ewm(span=spec['params']['window'], min_periods=10).std().replace(0, np.nan)
            raw_val = (dv - dv.rolling(spec['params']['window']).mean()) / std
            feat_raw[feature_id] = raw_val

        elif spec['type'] == 'ivol_div_volume':
            ivol_s = safe_series(first_col_containing(spec['assets'][0], spec['params']['ivol_suffix']))
            vol_s = safe_series(first_col_containing(spec['assets'][0], spec['params']['vol_suffix']))
            feat_raw[feature_id] = norm_feature(ivol_s) - norm_feature(vol_s)

        # --- Derivatives Flow & Sentiment ---
        elif spec['type'] == 'put_call_ratio_ema':
            pc = safe_series(first_col_containing(spec['assets'][0], spec['params']['col']))
            raw_val = pc.ewm(span=spec['params']['span']).mean()
            feat_raw[feature_id] = raw_val
            feat_raw[f"{feature_id}_z"] = norm_feature(raw_val)

        elif spec['type'] == 'open_interest_change':
            oi = safe_series(first_col_containing(spec['assets'][0], spec['params']['col']))
            raw_val = returns(oi, periods=spec['params']['days'])
            clipped_val = raw_val.clip(raw_val.quantile(0.01), raw_val.quantile(0.99))
            feat_raw[feature_id] = clipped_val
            feat_raw[f"{feature_id}_z"] = norm_feature(clipped_val)

        elif spec['type'] == 'volume_zscore':
            vol = safe_series(first_col_containing(spec['assets'][0], spec['params']['col']))
            feat_raw[feature_id] = norm_feature(vol, win=spec['params']['window'])

        elif spec['type'] == 'smart_money_flag':
            oi = safe_series(first_col_containing(spec['assets'][0], spec['params']['oi_col']))
            ivol = safe_series(first_col_containing(spec['assets'][0], spec['params']['ivol_col']))
            oi_up = (returns(oi) > 0)
            iv_up = (returns(ivol) > 0)
            feat_raw[feature_id] = cooccur(oi_up, iv_up, window=2).astype(int)

        # --- Generic & Cross-Asset ---
        elif spec['type'] == 'generic_zscore':
            s = safe_series(first_col_containing(spec['assets'][0], spec['params']['col']))
            feat_raw[feature_id] = norm_feature(s, win=spec['params']['window'])

        elif spec['type'] == 'correlation':
            t1, t2, win = spec['assets'][0], spec['assets'][1], spec['params']['window']
            r1 = returns(safe_series(first_col_containing(t1, 'PX_LAST')))
            r2 = returns(safe_series(first_col_containing(t2, 'PX_LAST')))
            f, r = rolling_corr_fisher(r1, r2, win=win)
            feat_raw[feature_id] = r
            feat_raw[f"{feature_id}_z"] = zscore_rolling(f, win=60)

        elif spec['type'] == 'advanced_correlation':
            t1, t2, win = spec['assets'][0], spec['assets'][1], spec['params']['window']
            s1 = norm_feature(safe_series(first_col_containing(t1, spec['params']['col1'])))
            s2 = norm_feature(safe_series(first_col_containing(t2, spec['params']['col2'])))
            f, r = rolling_corr_fisher(s1, s2, win=win)
            feat_raw[feature_id] = r
            feat_raw[f"{feature_id}_z"] = zscore_rolling(f, win=60)

        elif spec['type'] == 'correlation_zscore':
            t1, t2 = spec['assets']
            r1 = returns(safe_series(first_col_containing(t1, 'PX_LAST')))
            r2 = returns(safe_series(first_col_containing(t2, 'PX_LAST')))
            f_20, _ = rolling_corr_fisher(r1, r2, win=20)
            feat_raw[feature_id] = zscore_rolling(f_20, win=60)

        elif spec['type'] == 'correlation_delta':
            t1, t2 = spec['assets']
            r1 = returns(safe_series(first_col_containing(t1, 'PX_LAST')))
            r2 = returns(safe_series(first_col_containing(t2, 'PX_LAST')))
            f20, _ = rolling_corr_fisher(r1, r2, win=20)
            f60, _ = rolling_corr_fisher(r1, r2, win=60)
            raw_val = f20 - f60
            feat_raw[feature_id] = raw_val
            feat_raw[f"{feature_id}_z"] = norm_feature(raw_val)

        elif spec['type'] == 'rolling_beta':
            t1, t2, win = spec['assets'][0], spec['assets'][1], spec['params']['window']
            r1 = returns(safe_series(first_col_containing(t1, 'PX_LAST')))
            r2 = returns(safe_series(first_col_containing(t2, 'PX_LAST')))
            raw_val = beta_rolling(r1, r2, win=win)
            feat_raw[feature_id] = raw_val
            feat_raw[f"{feature_id}_z"] = norm_feature(raw_val)

        # --- Macro (fixed logic) ---
        elif spec['type'] == 'macro_mpi':
            dxy_px = safe_series(first_col_containing(spec['assets'][0], 'PX_LAST'))
            ust10_yield = safe_series(first_col_containing(spec['assets'][1], 'PX_LAST'))
            leg1 = norm_feature(returns(dxy_px).rolling(3).sum())
            leg2 = norm_feature(to_bp(ust10_yield).diff().rolling(3).sum())
            feat_raw[feature_id] = leg1 + leg2

        elif spec['type'] == 'macro_fear_overdrive':
            vix_z = norm_feature(safe_series(first_col_containing(spec['assets'][0], 'PX_LAST')))
            dxy_z = norm_feature(returns(safe_series(first_col_containing(spec['assets'][1], 'PX_LAST'))))
            spy_px = safe_series(first_col_containing(spec['assets'][2], 'PX_LAST'))
            spy_z = norm_feature(spy_px - spy_px.rolling(20).mean())
            score = (vix_z > 0.7).astype(int) + (dxy_z > 0.5).astype(int) + (spy_z < -0.5).astype(int)
            feat_raw[feature_id] = score

        elif spec['type'] == 'macro_sector_rotation':
            r_xlk = returns(safe_series(first_col_containing(spec['assets'][0], 'PX_LAST')), periods=5)
            r_xle = returns(safe_series(first_col_containing(spec['assets'][1], 'PX_LAST')), periods=5)
            raw_val = r_xlk - r_xle
            feat_raw[feature_id] = raw_val
            feat_raw[f"{feature_id}_z"] = norm_feature(raw_val)

        elif spec['type'] == 'macro_yield_spread':
            s10y = safe_series(first_col_containing(spec['assets'][0], 'PX_LAST'))
            s2y = safe_series(first_col_containing(spec['assets'][1], 'PX_LAST'))
            raw_val = to_bp(s10y) - to_bp(s2y)
            feat_raw[feature_id] = raw_val
            feat_raw[f"{feature_id}_z"] = norm_feature(raw_val)

        elif spec['type'] == 'macro_cpi_zscore':
            cpi = safe_series(first_col_containing(spec['assets'][0], 'PX_LAST')).dropna()
            feat_raw[feature_id] = norm_feature(cpi, win=12)

        elif spec['type'] == 'macro_injcjc_shock':
            injcjc = safe_series(first_col_containing(spec['assets'][0], 'PX_LAST')).dropna()
            change = event_delta(injcjc)
            feat_raw[feature_id] = (change > change.rolling(20).std() * 1.5).astype(int)

        elif spec['type'] == 'macro_ffa_spread':
            ffa_z = norm_feature(safe_series(first_col_containing(spec['assets'][0], 'PX_LAST')))
            ust2_z = norm_feature(safe_series(first_col_containing(spec['assets'][1], 'PX_LAST')))
            feat_raw[feature_id] = ffa_z - ust2_z

        elif spec['type'] == 'macro_generic_mom' or spec['type'] == 'macro_generic_chg':
            px = safe_series(first_col_containing(spec['assets'][0], 'PX_LAST'))
            change = event_delta(px)
            # FIX: actually respect 'days' param for macro_generic_mom
            if spec['type'] == 'macro_generic_mom':
                d = int(spec.get('params', {}).get('days', 1))
                feat_raw[feature_id] = change.rolling(d).sum()
            else:
                feat_raw[feature_id] = change
            feat_raw[f"{feature_id}_z"] = norm_feature(feat_raw[feature_id])

        elif spec['type'] == 'macro_lf94truu_vol_signal':
            s = safe_series(first_col_containing(spec['assets'][0], 'VOLATILITY_30D'))
            if s.dropna().empty:
                s = get_realized_vol(safe_series(first_col_containing(spec['assets'][0], 'PX_LAST')))
            raw_val = s / s.rolling(60).mean()
            feat_raw[feature_id] = raw_val
            feat_raw[f"{feature_id}_z"] = norm_feature(raw_val)

        # --- Momentum & Fractal ---
        elif spec['type'] == 'mom_div_vol':
            px = safe_series(first_col_containing(spec['assets'][0], 'PX_LAST'))
            mom5 = returns(px, 5)
            vol5 = returns(px).rolling(5).std().replace(0, np.nan)
            raw_val = mom5 / vol5
            feat_raw[feature_id] = raw_val
            feat_raw[f"{feature_id}_z"] = norm_feature(raw_val)

        elif spec['type'] == 'bollinger_pctB':
            px = safe_series(first_col_containing(spec['assets'][0], 'PX_LAST'))
            ma = px.rolling(20).mean()
            std = px.rolling(20).std().replace(0, 1e-9)
            raw_val = (px - (ma - 2 * std)) / (4 * std)
            feat_raw[feature_id] = raw_val.clip(-0.5, 1.5)

        elif spec['type'] == 'fractional_differencing':
            # FIX: indentation
            px = safe_series(first_col_containing(spec['assets'][0], 'PX_LAST'))
            feat_raw[feature_id] = frac_diff(px, d=spec['params']['d'], window=spec['params']['window'])

        # --- Regime & Interaction ---
        elif spec['type'] == 'regime_filter':
            px = safe_series(first_col_containing(spec['assets'][0], spec['params']['col']))
            feat_raw[feature_id] = (px > spec['params']['threshold']).astype(int)

        elif spec['type'] == 'interaction':
            # interaction is built later after base features exist; skip here
            continue

    except Exception as e:
        print(f"Could not build feature '{feature_id}': {e}")

# Sequential & interaction features (after raw exists)
print("--- Building sequential & interaction features... ---")
# Interaction features
for spec in feature_specs:
    if spec['type'] == 'interaction':
        f1_id, f2_id = spec['params']['feature1'], spec['params']['feature2']
        if f1_id in feat_raw.columns and f2_id in feat_raw.columns:
            feat_raw[spec['unique_id']] = feat_raw[f1_id] * feat_raw[f2_id]

# Sequential examples
try:
    # VIX spike -> QQQ-SPY corr drop
    vix_z = norm_feature(safe_series(first_col_containing('VIX Index', 'PX_LAST')))
    qqq_r = returns(safe_series(first_col_containing('QQQ US Equity', 'PX_LAST')))
    spy_r = returns(safe_series(first_col_containing('SPY US Equity', 'PX_LAST')))
    corr_z = rolling_corr_fisher_z(qqq_r, spy_r)
    if all(s.notna().any() for s in [vix_z, corr_z]):
        event_A = (vix_z.shift(1) > 1.5)
        event_B = (corr_z < -1.5)
        feat_raw['SEQ_VIX_SPIKE_THEN_CORR_DROP'] = (event_A & event_B).astype(int)

    # Yield drop -> GLD vol spike
    yield_z = norm_feature(safe_series(first_col_containing('USGG10YR Index', 'PX_LAST')))
    gold_vol = get_realized_vol(safe_series(first_col_containing('GLD US Equity', 'PX_LAST')))
    gold_vol_z = norm_feature(gold_vol)
    if all(s.notna().any() for s in [yield_z, gold_vol_z]):
        event_A = (yield_z.shift(1) < -1.5)
        event_B = (gold_vol_z > 1.5)
        feat_raw['SEQ_YIELD_DROP_THEN_GOLD_VOL_SPIKE'] = (event_A & event_B).astype(int)

    # NVDA volume spike -> QQQ price rise
    nvda_vol = safe_series(first_col_containing('NVDA US Equity', 'Volume_-Realtime_VOLUME'))
    nvda_vol_z = norm_feature(nvda_vol)
    qqq_px = safe_series(first_col_containing('QQQ US Equity', 'PX_LAST'))
    qqq_px_z = norm_feature(qqq_px)
    if all(s.notna().any() for s in [nvda_vol_z, qqq_px_z]):
        event_A = (nvda_vol_z.shift(1) > 1.5)
        event_B = (qqq_px_z > 1.5)
        feat_raw['SEQ_NVDA_VOL_SPIKE_THEN_QQQ_PRICE_RISE'] = (event_A & event_B).astype(int)

    print("Successfully built sequential features.")
except Exception as e:
    print(f"Could not build sequential features: {e}")

# Diagnostics (pre-shift)
_ = coverage_activation(feat_raw)

# Final global shift to prevent leakage
feat = feat_raw.shift(1)
_series_cache = {}
print(f"Completed global shift. Final feature shape for GA: {feat.shape}")

# =========================
# PRIMITIVE SIGNALS
# =========================

print('--- Defining Primitive Signals ---')
primitive_signals = []
signal_series = {}
signal_id_counter = 0
for feature_id in feat.columns:
    s = feat[feature_id].replace([np.inf, -np.inf], np.nan).dropna()
    if s.empty:
        continue

    is_boolean_like = set(s.unique()).issubset({0, 1, True, False})
    if is_boolean_like:
        if s.std() == 0:
            continue
        sig_id = f"SIG_{signal_id_counter}"; signal_id_counter += 1
        primitive_signals.append({'signal_id': sig_id, 'feature_id': feature_id, 'condition_type': 'boolean', 'operator': '==', 'value': True})
        signal_series[sig_id] = (s == True)
        continue

    if s.std() == 0:
        continue

    # Percentile signals
    for op, val in [('>', 0.8), ('<', 0.2)]:
        sig_id = f"SIG_{signal_id_counter}"; signal_id_counter += 1
        primitive_signals.append({'signal_id': sig_id, 'feature_id': feature_id, 'condition_type': 'percentile', 'operator': op, 'value': val})
        signal_series[sig_id] = s.rank(pct=True).apply(lambda x, v=val, o=op: x > v if o == '>' else x < v)

    # Z-score signals for non-z features
    if not ('_z' in feature_id or 'zscore' in feature_id):
        rolling_std = s.rolling(60).std()
        valid_std_mask = rolling_std > 1e-9
        if not valid_std_mask.any():
            continue
        z = pd.Series(np.nan, index=s.index)
        z[valid_std_mask] = (s - s.rolling(60).mean())[valid_std_mask] / rolling_std[valid_std_mask]
        for op, val in [('>', 1.5), ('<', -1.5)]:
            sig_id = f"SIG_{signal_id_counter}"; signal_id_counter += 1
            primitive_signals.append({'signal_id': sig_id, 'feature_id': feature_id, 'condition_type': 'z_score', 'operator': op, 'value': val})
            signal_series[sig_id] = z.apply(lambda x, v=val, o=op: x > v if o == '>' else x < v)

print(f"Defined {len(primitive_signals)} primitive signals.")

# =========================
# TARGETS (FORWARD RETURNS)
# =========================

# Prepare forward returns (AVOID collision with helper `returns()`)
price_cols_for_returns = [first_col_containing(t, 'PX_LAST') for t in TRADABLE_TICKERS if first_col_containing(t, 'PX_LAST')]
prices = raw[price_cols_for_returns].copy()
fwd_returns = {h: prices.pct_change(h).shift(-h) for h in [1, 3, 5, 10, 21]}

# =========================
# GA HELPERS
# =========================

def get_setup_dna(setup):
    return tuple(sorted([s['signal_id'] for s in setup['signal_definitions']]))

def crossover(parent1, parent2):
    child_signals = [random.choice(parent1['signal_definitions']), random.choice(parent2['signal_definitions'])]
    if len(parent1['signal_definitions']) > 1 and len(parent2['signal_definitions']) > 1:
        child_signals.append(random.choice(parent1['signal_definitions']))
        child_signals.append(random.choice(parent2['signal_definitions']))
    child_signals = list({s['signal_id']: s for s in child_signals}.values())
    if len(child_signals) > max(SETUP_LENGTHS_TO_EXPLORE):
        child_signals = random.sample(child_signals, max(SETUP_LENGTHS_TO_EXPLORE))
    return {'id': 'child', 'signal_definitions': child_signals}

def mutate(setup, all_signal_ids, mutation_rate):
    if random.random() < mutation_rate:
        idx_to_mutate = random.randint(0, len(setup['signal_definitions']) - 1)
        new_signal_id = random.choice(all_signal_ids)
        new_signal_def = next(p for p in primitive_signals if p['signal_id'] == new_signal_id)
        if new_signal_def['signal_id'] not in [s['signal_id'] for s in setup['signal_definitions']]:
            setup['signal_definitions'][idx_to_mutate] = new_signal_def
    return setup

def non_dominated_sort(population):
    for ind1 in population:
        ind1['domination_count'] = 0
        ind1['dominated_solutions'] = []
        for ind2 in population:
            if ind1 is ind2:
                continue
            is_dominant = (
                (ind1['objectives'][0] >= ind2['objectives'][0] and
                 ind1['objectives'][1] >= ind2['objectives'][1] and
                 ind1['objectives'][2] >= ind2['objectives'][2]) and
                (ind1['objectives'][0] > ind2['objectives'][0] or
                 ind1['objectives'][1] > ind2['objectives'][1] or
                 ind1['objectives'][2] > ind2['objectives'][2])
            )
            if is_dominant:
                ind1['dominated_solutions'].append(ind2)
            elif (
                (ind2['objectives'][0] >= ind1['objectives'][0] and
                 ind2['objectives'][1] >= ind1['objectives'][1] and
                 ind2['objectives'][2] >= ind1['objectives'][2]) and
                (ind2['objectives'][0] > ind1['objectives'][0] or
                 ind2['objectives'][1] > ind1['objectives'][1] or
                 ind2['objectives'][2] > ind1['objectives'][2])
            ):
                ind1['domination_count'] += 1

    fronts = []
    rank = 1
    front1 = [ind for ind in population if ind['domination_count'] == 0]
    for ind in front1:
        ind['rank'] = rank
    current_front = front1
    while current_front:
        fronts.append(current_front)
        next_front = []
        for ind1 in current_front:
            for ind2 in ind1['dominated_solutions']:
                ind2['domination_count'] -= 1
                if ind2['domination_count'] == 0:
                    ind2['rank'] = rank + 1
                    next_front.append(ind2)
        rank += 1
        current_front = next_front
    sorted_population = [ind for front in fronts for ind in front]
    return sorted_population

def calculate_crowding_distance(front):
    if not front:
        return
    num_objectives = len(front[0]['objectives'])
    for ind in front:
        ind['crowding_distance'] = 0
    for i in range(num_objectives):
        front.sort(key=lambda x: x['objectives'][i])
        front[0]['crowding_distance'] = float('inf')
        front[-1]['crowding_distance'] = float('inf')
        obj_min = front[0]['objectives'][i]
        obj_max = front[-1]['objectives'][i]
        if obj_max == obj_min:
            continue
        for j in range(1, len(front) - 1):
            front[j]['crowding_distance'] += (front[j + 1]['objectives'][i] - front[j - 1]['objectives'][i]) / (obj_max - obj_min)

def selection_operator(population, k=2):
    tournament_contenders = random.sample(population, k)
    contender1, contender2 = tournament_contenders[0], tournament_contenders[1]
    if contender1['rank'] < contender2['rank']:
        return contender1
    elif contender2['rank'] < contender1['rank']:
        return contender2
    else:
        if contender1['crowding_distance'] > contender2['crowding_distance']:
            return contender1
        else:
            return contender2

def generate_english_description(setup_id, signal_defs, feature_specs_list):
    clauses = []
    for s_def in signal_defs:
        feat_name = next((f_spec['display_name'] for f_spec in feature_specs_list if f_spec['unique_id'] == s_def['feature_id']), s_def.get('feature_id', 'unknown_feature'))
        if s_def['condition_type'] == 'boolean':
            clauses.append(f"{feat_name} is true")
        elif s_def['condition_type'] == 'percentile':
            level = "is very high" if s_def['operator'] == '>' else "is very low"
            clauses.append(f"{feat_name} {level}")
        else:
            level = "surges unexpectedly" if s_def['operator'] == '>' else "drops sharply"
            clauses.append(f"{feat_name} {level}")
    description = f"When {clauses[0]}"
    if len(clauses) > 1:
        description += f" and {' and '.join(clauses[1:])}"
    direction_score = sum(1 if s['operator'] == '>' else -1 for s in signal_defs)
    bias = 'a bullish' if direction_score > 0 else 'a bearish' if direction_score < 0 else 'an uncertain'
    description += f", it may indicate {bias} outlook."
    return {'setup_id': setup_id, 'description': description, 'explained_description': "DEPRECATED"}

def evaluate_one_setup(setup, returns_dict):
    """Evaluates a single setup and returns objectives and trigger dates."""
    sid, signal_defs = setup['id'], setup['signal_definitions']
    CAP_VALUE = 100.0  # cap for risk metrics

    try:
        mask = functools.reduce(lambda a, b: a & b, [signal_series[s['signal_id']] for s in signal_defs])
        dates = mask[mask].index
    except (KeyError, TypeError):
        return {'id': sid, 'signal_definitions': signal_defs, 'objectives': (-99, -99, 0), 'metrics_by_ticker': {}, 'trigger_dates': pd.Index([])}

    support = len(dates)
    if support < MIN_INITIAL_SUPPORT_FILTER:
        return {'id': sid, 'signal_definitions': signal_defs, 'objectives': (-99, -99, support), 'metrics_by_ticker': {}, 'trigger_dates': pd.Index([])}

    # Determine direction
    direction_score = sum(1 if s['operator'] == '>' else -1 for s in signal_defs if s['condition_type'] != 'boolean')
    if direction_score == 0 and any(s['condition_type'] != 'boolean' for s in signal_defs):
        return {'id': sid, 'signal_definitions': signal_defs, 'objectives': (-99, -99, support), 'metrics_by_ticker': {}, 'trigger_dates': dates}
    elif all(s['condition_type'] == 'boolean' for s in signal_defs):
        direction_score = 1
    entry_direction = 'long' if direction_score > 0 else 'short'

    all_sortinos, all_calmars, metrics_by_ticker = [], [], {}
    perf_horizon = 10

    for tk_col in price_cols_for_returns:
        tk_symbol = next((ticker for ticker in TRADABLE_TICKERS if tk_col.startswith(ticker)), "Unknown")
        r_ticker = returns_dict[perf_horizon][tk_col].reindex(dates).dropna()
        if entry_direction == 'short':
            r_ticker = -r_ticker
        if len(r_ticker) >= 5 and r_ticker.std() > 1e-9:
            sortino = calculate_sortino_ratio(r_ticker)
            calmar = calculate_calmar_ratio(r_ticker)
            all_sortinos.append(sortino)
            all_calmars.append(calmar)
            metrics_by_ticker[tk_symbol] = {'sortino': sortino, 'calmar': calmar}

    median_sortino = np.nan_to_num(np.median(all_sortinos) if all_sortinos else -99, nan=-99.0, posinf=CAP_VALUE, neginf=-999.0)
    median_calmar = np.nan_to_num(np.median(all_calmars) if all_calmars else -99, nan=-99.0, posinf=CAP_VALUE, neginf=-999.0)

    median_sortino = min(median_sortino, CAP_VALUE)
    median_calmar = min(median_calmar, CAP_VALUE)

    return {
        'id': sid,
        'signal_definitions': signal_defs,
        'objectives': (median_sortino, median_calmar, support),
        'metrics_by_ticker': metrics_by_ticker,
        'entry_direction': entry_direction,
        'trigger_dates': dates,
        'first_trigger_date': dates.min() if not dates.empty else pd.NaT,
        'last_trigger_date': dates.max() if not dates.empty else pd.NaT
    }

# =========================
# INITIAL POPULATION
# =========================

print('\n--- GENETIC ALGORITHM: Creating Initial Population (Generation 0) ---')
all_signal_ids = [s['signal_id'] for s in primitive_signals]
current_population = []
setup_id_counter = 0
existing_dna = set()
perf_horizon = 10

def is_setup_viable(signal_defs, min_trades=5):
    """Pre-evaluation to ensure viable setups."""
    try:
        mask = functools.reduce(lambda a, b: a & b, [signal_series[s['signal_id']] for s in signal_defs])
        dates = mask[mask].index
        if len(dates) < MIN_INITIAL_SUPPORT_FILTER:
            return False
        for tk_col in price_cols_for_returns:
            r_ticker = fwd_returns[perf_horizon][tk_col].reindex(dates).dropna()
            if len(r_ticker) >= min_trades:
                return True
        return False
    except (KeyError, TypeError):
        return False

print("Seeding with pre-validated single and pair setups...")
num_to_create = int(POPULATION_SIZE * 0.8)
max_attempts = len(primitive_signals) * 5
attempts = 0

for p_signal in primitive_signals:
    if len(current_population) >= num_to_create:
        break
    if is_setup_viable([p_signal]):
        setup = {'id': f'S{setup_id_counter:04d}', 'signal_definitions': [p_signal]}
        dna = get_setup_dna(setup)
        if dna not in existing_dna:
            current_population.append(setup)
            existing_dna.add(dna)
            setup_id_counter += 1

while len(current_population) < num_to_create and attempts < max_attempts:
    attempts += 1
    p_signal_1, p_signal_2 = random.choice(primitive_signals), random.choice(primitive_signals)
    if p_signal_1['signal_id'] == p_signal_2['signal_id']:
        continue
    sig_defs = [p_signal_1, p_signal_2]
    if is_setup_viable(sig_defs):
        setup = {'id': f'S{setup_id_counter:04d}', 'signal_definitions': sig_defs}
        dna = get_setup_dna(setup)
        if dna not in existing_dna:
            current_population.append(setup)
            existing_dna.add(dna)
            setup_id_counter += 1

print(f"  - Created {len(current_population)} pre-validated setups.")

print("Filling remainder of population with pre-validated random setups...")
max_attempts, attempts = POPULATION_SIZE * 100, 0
while len(current_population) < POPULATION_SIZE and attempts < max_attempts:
    attempts += 1
    k = random.choice(SETUP_LENGTHS_TO_EXPLORE)
    sig_id_list = random.sample(all_signal_ids, k)
    sig_defs = [p for p in primitive_signals if p['signal_id'] in sig_id_list]
    dna = get_setup_dna({'signal_definitions': sig_defs})
    if dna in existing_dna:
        continue
    if is_setup_viable(sig_defs):
        temp_setup = {'id': f'S{setup_id_counter:04d}', 'signal_definitions': sig_defs}
        current_population.append(temp_setup)
        existing_dna.add(dna)
        setup_id_counter += 1

if attempts >= max_attempts:
    print(f"Warning: Population filling stopped after {max_attempts} attempts.")
if not current_population:
    raise SystemExit("FATAL: Could not create any viable setups.")
print(f"Created initial population of {len(current_population)} guaranteed viable setups.")

# =========================
# MAIN EVOLUTION LOOP
# =========================

hall_of_fame = []
latest_candidate_pool = []  # widened pool snapshot

for generation in range(NUM_GENERATIONS):
    print(f"\n--- Evaluating Generation {generation + 1}/{NUM_GENERATIONS} ---")

    # Evaluate current population
    evaluated_population = Parallel(n_jobs=-1)(
        delayed(evaluate_one_setup)(setup, fwd_returns) for setup in current_population
    )
    combined_population = evaluated_population

    # Children after gen 0
    if generation > 0:
        children = []
        ranked_population = non_dominated_sort(evaluated_population)
        if ranked_population:
            # crowding distance per front
            front_num = 1
            while True:
                current_front = [ind for ind in ranked_population if ind.get('rank') == front_num]
                if not current_front:
                    break
                calculate_crowding_distance(current_front)
                front_num += 1
            # fill children
            while len(children) < POPULATION_SIZE:
                parent1 = selection_operator(ranked_population)
                parent2 = selection_operator(ranked_population)
                child = crossover(parent1, parent2)
                child = mutate(child, all_signal_ids, MUTATION_RATE)
                child['id'] = f'S{setup_id_counter:04d}'; setup_id_counter += 1
                children.append(child)
            evaluated_children = Parallel(n_jobs=-1)(
                delayed(evaluate_one_setup)(setup, fwd_returns) for setup in children
            )
            combined_population += evaluated_children

    # Phenotype de-duplication
    def _pheno_hash(dates):
        if dates is None:
            return None
        try:
            if not isinstance(dates, pd.DatetimeIndex):
                dates = pd.to_datetime(pd.Index(dates))
            if dates.empty:
                return None
            return hash(dates.asi8.tobytes())
        except Exception:
            return None

    if generation > 0 and combined_population:
        pheno_map = {}
        for ind in combined_population:
            key = _pheno_hash(ind.get('trigger_dates'))
            if key is not None:
                pheno_map[key] = ind
        unique_phenotype_population = list(pheno_map.values())
    else:
        unique_phenotype_population = combined_population

    if not unique_phenotype_population:
        print("Population extinct. Stopping.")
        break



    # ----- Widened pool -----
    ranked_all = non_dominated_sort(unique_phenotype_population)

    candidate_pool = [
        ind for ind in ranked_all
        if ind.get('rank', 999) <= KEEP_TOP_RANKS
        and ind['objectives'][0] >= MIN_SORTINO
        and ind['objectives'][2] >= MIN_SUPPORT
]

    # phenotype de-dupe
    _tmp = {}
    for ind in candidate_pool:
        key = _pheno_hash(ind.get('trigger_dates')) or ('NA', ind.get('id'))
        _tmp[key] = ind
    candidate_pool = list(_tmp.values())

# sort and cap using the global MAX_RESULTS
    candidate_pool.sort(
        key=lambda x: (x['objectives'][0], x['objectives'][1], x['objectives'][2]),
        reverse=True
)
    candidate_pool = candidate_pool[:MAX_RESULTS]
    latest_candidate_pool = candidate_pool


    # Survivor selection
    sorted_population = non_dominated_sort(unique_phenotype_population)
    next_generation_population = []
    front_num = 1
    while len(next_generation_population) < POPULATION_SIZE:
        current_front = [ind for ind in sorted_population if ind['rank'] == front_num]
        if not current_front:
            break
        calculate_crowding_distance(current_front)
        if len(next_generation_population) + len(current_front) <= POPULATION_SIZE:
            next_generation_population.extend(current_front)
        else:
            current_front.sort(key=lambda x: x['crowding_distance'], reverse=True)
            num_needed = POPULATION_SIZE - len(next_generation_population)
            next_generation_population.extend(current_front[:num_needed])
        front_num += 1

    if not next_generation_population:
        print("Warning: Could not form next generation. Stopping.")
        break

    current_population = next_generation_population

    # Update Hall of Fame
    current_best_front = [ind for ind in sorted_population if ind['rank'] == 1]
    if current_best_front:
        hall_of_fame_candidates = non_dominated_sort(hall_of_fame + current_best_front)
        hof_pheno_dict = {
            _pheno_hash(ind.get('trigger_dates')): ind
            for ind in hall_of_fame_candidates
            if ind.get('trigger_dates') is not None and not pd.Index(ind['trigger_dates']).empty
        }
        hall_of_fame = [ind for ind in hof_pheno_dict.values() if ind.get('rank') == 1]

    if hall_of_fame:
        hall_of_fame.sort(key=lambda x: x['objectives'][0], reverse=True)
        best_of_gen = hall_of_fame[0]
        print(f"Generation {generation + 1} Complete. Unique Phenotypes: {len(unique_phenotype_population)}. "
              f"Hall of Fame: {len(hall_of_fame)}. Best: (S:{best_of_gen['objectives'][0]:.2f}, "
              f"C:{best_of_gen['objectives'][1]:.2f}, Sup:{best_of_gen['objectives'][2]})")
    else:
        print(f"Generation {generation + 1} Complete. No valid solutions in Hall of Fame.")


# =========================
# FINAL REPORTING
# =========================

print("\n--- Genetic Algorithm Complete. Generating Final Report from Hall of Fame ---")

if not hall_of_fame:
    print("Discovery complete. The Hall of Fame is empty; no valid setups were found.")
    final_summary_df = pd.DataFrame()
    trade_ledger_df = pd.DataFrame()
else:
    final_pareto_front_df = pd.DataFrame(hall_of_fame).drop_duplicates(subset=['id'], keep='first')
    final_pareto_front = final_pareto_front_df.to_dict('records')

    # ---------- dataset-aware recency + optional auto-relax ----------
    # Use the dataset's end date as the recency anchor (not wall-clock today)
    DATA_END = (raw.index.max() if isinstance(raw.index, pd.DatetimeIndex) and len(raw.index) > 0
                else pd.Timestamp.today().normalize())

    def _in_date_filters(ind, data_end=DATA_END, recency_days=RESULTS_LAST_TRIGGER_DAYS, date_range=RESULTS_DATE_RANGE):
        """
        Keep a setup if it passes recency/date-range filters.
        - Recency is measured against the *data's* end date, not wall-clock today.
        - date_range uses RESULTS_DATE_RANGE if provided.
        """
        if ind is None:
            return False

        ltd = pd.to_datetime(ind.get('last_trigger_date')) if ind.get('last_trigger_date') is not None else pd.NaT

        # Date range filter (unchanged)
        if date_range:
            start = pd.to_datetime(date_range.get('start')) if date_range.get('start') else None
            end = pd.to_datetime(date_range.get('end')) if date_range.get('end') else None
            if start and (pd.isna(ltd) or ltd < start):
                return False
            if end and (pd.isna(ltd) or ltd > end):
                return False

        # Recency filter vs data_end
        if recency_days is not None:
            cutoff = pd.to_datetime(data_end).normalize() - pd.Timedelta(days=int(recency_days))
            if pd.isna(ltd) or ltd < cutoff:
                return False

        return True

    # Diagnostics pre-filter
    _pre_df = pd.DataFrame(final_pareto_front)
    if not _pre_df.empty:
        _ltd = pd.to_datetime(_pre_df['last_trigger_date'], errors='coerce')
        print(f"[Diagnostics] DATA_END (recency anchor): {DATA_END.date() if not pd.isna(DATA_END) else DATA_END}")
        print(f"[Diagnostics] HoF pre-filter count: {len(_pre_df)}; ltd span: "
              f"{_ltd.min().date() if _ltd.notna().any() else 'NaT'} -> "
              f"{_ltd.max().date() if _ltd.notna().any() else 'NaT'}")
        if RESULTS_LAST_TRIGGER_DAYS is not None:
            _cut = pd.to_datetime(DATA_END).normalize() - pd.Timedelta(days=int(RESULTS_LAST_TRIGGER_DAYS))
            print(f"[Diagnostics] Recency cutoff: {_cut.date()} (window={RESULTS_LAST_TRIGGER_DAYS}d)")

    # First-pass filter (dataset-aware)
    final_pareto_front = [s for s in final_pareto_front if _in_date_filters(s)]
    latest_candidate_pool = [s for s in latest_candidate_pool if _in_date_filters(s)]

    # Auto-relax if too few remain (do NOT mutate your global config)
    if len(final_pareto_front) < 10 and RESULTS_LAST_TRIGGER_DAYS is not None:
        relaxed_days = max(int(RESULTS_LAST_TRIGGER_DAYS) * 3, 90)
        print(f"[Auto-Relax] Only {len(final_pareto_front)} setups after recency={RESULTS_LAST_TRIGGER_DAYS}d. "
              f"Re-evaluating with {relaxed_days}d.")
        # Rebuild from HoF and re-apply with relaxed window
        _hof_unique = pd.DataFrame(hall_of_fame).drop_duplicates(subset=['id'], keep='first').to_dict('records')
        final_pareto_front = [s for s in _hof_unique if _in_date_filters(s, recency_days=relaxed_days)]
        latest_candidate_pool = [s for s in latest_candidate_pool if _in_date_filters(s, recency_days=relaxed_days)]

    print(f"Final unique Pareto Front contains {len(final_pareto_front)} non-dominated solutions.")

    # ---------- merge HoF with widened pool (phenotype-dedupe so breadth knobs matter) ----------
    def _pheno_hash(dates):
        if dates is None:
            return None
        try:
            idx = pd.to_datetime(pd.Index(dates))
            if idx.empty:
                return None
            return hash(idx.asi8.tobytes())
        except Exception:
            return None

    combined_final_pool = (final_pareto_front or []) + (latest_candidate_pool or [])

    # Phenotype de-dup: keep the better solution by (Sortino, Calmar, Support)
    _pheno_map = {}
    for ind in combined_final_pool:
        key = _pheno_hash(ind.get('trigger_dates')) or ('NA', ind.get('id'))
        if key not in _pheno_map:
            _pheno_map[key] = ind
        else:
            a, b = _pheno_map[key], ind
            if (b['objectives'][0], b['objectives'][1], b['objectives'][2]) > (a['objectives'][0], a['objectives'][1], a['objectives'][2]):
                _pheno_map[key] = b

    final_pareto_front = list(_pheno_map.values())
    print(f"Final unique solutions after merging HoF+Pool (phenotype-dedupe): {len(final_pareto_front)}")

    # ---------- audit keep/drop by recency for transparency ----------
    def _recency_cut_reason(ind, data_end):
        if ind is None:
            return 'none'
        ltd = pd.to_datetime(ind.get('last_trigger_date'), errors='coerce')
        if RESULTS_LAST_TRIGGER_DAYS is None:
            return 'kept_no_recency'
        cutoff = pd.to_datetime(data_end).normalize() - pd.Timedelta(days=int(RESULTS_LAST_TRIGGER_DAYS))
        return 'dropped_recency' if (pd.isna(ltd) or ltd < cutoff) else 'kept'

    if hall_of_fame:
        _audit_df = pd.DataFrame(hall_of_fame)
        _audit_df['reason'] = _audit_df.apply(lambda r: _recency_cut_reason(r, DATA_END), axis=1)
        print("[Audit] HoF recency keep/drop counts:")
        print(_audit_df['reason'].value_counts(dropna=False).to_string())

    def _resolve_entry_ivol(tk_symbol, d):
        """
        Try multiple implied-vol columns; if none, fall back to realized vol or VIX.
        Returns a float in [0.05, 3.0].
        """
        candidates = [
            '30_Day_Call_Implied_Volatility',
            'CALL_IMP_VOL_30D',
            'IVOL_SIGMA',
            '1st_Month_Call_Imp_Vol_40_Delta',
            '1st_Month_Put_Imp_Vol_50_Delta',
        ]
        for c in candidates:
            cn = first_col_containing(tk_symbol, c)
            if cn and cn in raw.columns:
                v = raw[cn].get(d, np.nan)
                if pd.notna(v) and float(v) > 0:
                    v = float(v) / 100.0 if float(v) > 1.5 else float(v)
                    return float(np.clip(v, 0.05, 3.0))

        px_col = first_col_containing(tk_symbol, 'PX_LAST')
        if px_col and px_col in raw.columns:
            rv = get_realized_vol(raw[px_col], win=20)
            v = rv.get(d, np.nan)
            if pd.notna(v) and float(v) > 0:
                return float(np.clip(float(v), 0.05, 3.0))

        vix_col = first_col_containing('VIX Index', 'PX_LAST')
        if vix_col and vix_col in raw.columns:
            v = raw[vix_col].get(d, np.nan)
            if pd.notna(v) and float(v) > 0:
                v = float(v) / 100.0
                return float(np.clip(v, 0.05, 3.0))

        return 0.20  # conservative default

    print(f"\n--- Step 1: Building Trade Ledger for all {len(final_pareto_front)} solutions ---")
    print(f"\n--- Step 1: Building Trade Ledger for all {len(final_pareto_front)} solutions ---")

    # Build trade ledger (ATM heuristic)
    all_trade_ledger_rows = []
    for setup_solution in final_pareto_front:
        setup_id = setup_solution['id']
        dates = pd.Index(setup_solution['trigger_dates'])
        if dates.empty:
            continue
        for tk_symbol in TRADABLE_TICKERS:
            tk_col = first_col_containing(tk_symbol, 'PX_LAST')
            if not tk_col:
                continue
            entry_px_series = raw[tk_col].reindex(dates)
            for d in dates:
                entry_px = entry_px_series.loc[d]
                ivol = _resolve_entry_ivol(tk_symbol, d)   # <— use robust resolver
                for h_opt in OPTION_SIM_HORIZONS_DAYS:
                    exit_date = d + pd.Timedelta(days=h_opt)
                    future_px_series = raw.loc[raw.index >= exit_date, tk_col]
                    final_exit_px = future_px_series.iloc[0] if not future_px_series.empty else np.nan
                    pnl_detail = simulate_option_pnl_detailed(entry_px, final_exit_px, ivol, h_opt, setup_solution['entry_direction'])
                    if pnl_detail['skipped_reason'] == 'None':
                        all_trade_ledger_rows.append({'setup_id': setup_id, 'trigger_date': d, 'target_ticker': tk_symbol, 'horizon_days': h_opt, **pnl_detail})


    trade_ledger_df = pd.DataFrame(all_trade_ledger_rows)

    if trade_ledger_df.empty:
        print("\n--- WARNING: No valid trades could be simulated for any of the final setups. ---")
        print("This is likely due to trigger dates being too close to the end of the dataset.")
        print("Final reports will be empty.")
        final_summary_df = pd.DataFrame()
    else:
        setups_with_ledger_data = set(trade_ledger_df['setup_id'].unique())
        print(f"\n--- Step 2: Found {len(setups_with_ledger_data)} setups with valid trade data. Building final summary. ---")

        summary_rows = []
        all_description_records = []

        # Forward-returns dict shim
        RETURNS_DICT = globals().get('fwd_returns', globals().get('returns', None))
        if RETURNS_DICT is None:
            raise NameError("Forward-returns dict not found. Create fwd_returns earlier.")

        for setup_solution in final_pareto_front:
            setup_id = setup_solution['id']
            if setup_id not in setups_with_ledger_data:
                continue

            setup_def = setup_solution['signal_definitions']
            all_description_records.append(generate_english_description(setup_id, setup_def, feature_specs))
            dates = pd.Index(setup_solution['trigger_dates'])

            best_ticker, best_sortino = "N/A", -999
            if setup_solution.get('metrics_by_ticker'):
                for ticker, metrics in setup_solution['metrics_by_ticker'].items():
                    if metrics.get('sortino', -999) > best_sortino:
                        best_sortino, best_ticker = metrics['sortino'], ticker

            # Recency Sharpe over last RECENCY_WINDOW triggers
            recency_sharpe = np.nan
            if len(dates) >= RECENCY_WINDOW:
                recent_dates = dates[-RECENCY_WINDOW:]
                h = 10
                recent_sharpes = []
                for tk_symbol in TRADABLE_TICKERS:
                    price_col_name = first_col_containing(tk_symbol, 'PX_LAST')
                    if not price_col_name:
                        continue
                    r_recent = RETURNS_DICT[h][price_col_name].reindex(recent_dates).dropna()
                    if setup_solution['entry_direction'] == 'short':
                        r_recent = -r_recent
                    if r_recent.std() > 1e-9 and len(r_recent) > 2:
                        recent_sharpes.append((r_recent.mean() / r_recent.std()) * np.sqrt(252 / h))
                if recent_sharpes:
                    recency_sharpe = float(np.nanmedian(recent_sharpes))

            # Holdout OOS Sharpe (if holdout date is set)
            oos_sharpe = np.nan
            if HOLDOUT_START_DATE:
                holdout_date = pd.to_datetime(HOLDOUT_START_DATE)
                test_dates = dates[dates >= holdout_date]
                if len(test_dates) >= 2:
                    h = 10
                    oos_sharpes = []
                    for tk_symbol in TRADABLE_TICKERS:
                        price_col_name = first_col_containing(tk_symbol, 'PX_LAST')
                        if not price_col_name:
                            continue
                        r_oos = RETURNS_DICT[h][price_col_name].reindex(test_dates).dropna()
                        if setup_solution['entry_direction'] == 'short':
                            r_oos = -r_oos
                        if r_oos.std() > 1e-9 and len(r_oos) > 2:
                            oos_sharpes.append((r_oos.mean() / r_oos.std()) * np.sqrt(252 / h))
                    if oos_sharpes:
                        oos_sharpe = float(np.nanmedian(oos_sharpes))

            summary_rows.append({
                'setup_id': setup_id,
                'rank': setup_solution.get('rank'),
                'best_performing_ticker': best_ticker,
                'obj_sortino': setup_solution['objectives'][0],
                'obj_calmar': setup_solution['objectives'][1],
                'obj_support': setup_solution['objectives'][2],
                'entry_direction': setup_solution['entry_direction'],
                'first_trigger_date': setup_solution.get('first_trigger_date'),
                'last_trigger_date': setup_solution.get('last_trigger_date'),
                'recency_sharpe': recency_sharpe,
                'oos_sharpe': oos_sharpe,
            })

        summary_df = pd.DataFrame(summary_rows)
        description_df = pd.DataFrame(all_description_records).drop_duplicates(subset=['setup_id'])

        # Add average option PnL across horizons
        for h_opt in OPTION_SIM_HORIZONS_DAYS:
            pnl_dollars_map = trade_ledger_df[trade_ledger_df['horizon_days'] == h_opt].groupby('setup_id')['pnl_dollars'].mean()
            pnl_pct_map = trade_ledger_df[trade_ledger_df['horizon_days'] == h_opt].groupby('setup_id')['pnl_pct'].mean()
            summary_df[f'avg_option_pnl_dollars_{h_opt}d'] = summary_df['setup_id'].map(pnl_dollars_map)
            summary_df[f'avg_option_pnl_pct_{h_opt}d'] = summary_df['setup_id'].map(pnl_pct_map)

        numeric_cols = summary_df.select_dtypes(include=np.number).columns
        summary_df[numeric_cols] = summary_df[numeric_cols].round(4)
        trade_ledger_df = trade_ledger_df.round({'pnl_pct': 4, 'pnl_dollars': 2})

        final_summary_df = pd.merge(summary_df, description_df[['setup_id', 'description']], on='setup_id', how='left')
        final_summary_df.sort_values(by=['obj_sortino', 'obj_calmar', 'obj_support'], ascending=[False, False, False], inplace=True)
        # ===== Guarantee the ledger covers every setup in the SUMMARY =====
        def _build_ledger_for_setups(setup_records):
            rows = []
            for setup_solution in setup_records:
                setup_id = setup_solution['id']
                dates = pd.Index(setup_solution['trigger_dates'])
                if dates.empty:
                    continue
                for tk_symbol in TRADABLE_TICKERS:
                    tk_col = first_col_containing(tk_symbol, 'PX_LAST')
                    if not tk_col:
                        continue
                    ivol_col = (first_col_containing(tk_symbol, '30_Day_Call_Implied_Volatility') or
                                first_col_containing(tk_symbol, 'IVOL_SIGMA'))
                    ivol_series = raw[ivol_col].reindex(dates) if ivol_col and ivol_col in raw.columns else pd.Series(np.nan, index=dates)
                    entry_px_series = raw[tk_col].reindex(dates)
                    for d in dates:
                        entry_px = entry_px_series.loc[d]
                        ivol = ivol_series.loc[d] if not ivol_series.empty and pd.notna(d) and d in ivol_series.index else np.nan
                        for h_opt in OPTION_SIM_HORIZONS_DAYS:
                            exit_date = d + pd.Timedelta(days=h_opt)
                            future_px_series = raw.loc[raw.index >= exit_date, tk_col]
                            final_exit_px = future_px_series.iloc[0] if not future_px_series.empty else np.nan
                            pnl_detail = simulate_option_pnl_detailed(entry_px, final_exit_px, ivol, h_opt, setup_solution['entry_direction'])
                            if pnl_detail['skipped_reason'] == 'None':
                                rows.append({'setup_id': setup_id, 'trigger_date': d, 'target_ticker': tk_symbol, 'horizon_days': h_opt, **pnl_detail})
            return pd.DataFrame(rows)

        # Build missing ledger entries for any SUMMARY setup_ids not present in the ledger yet
        summary_setup_ids = set(final_summary_df['setup_id'].unique())
        have_setup_ids = set(trade_ledger_df['setup_id'].unique())
        missing_ids = summary_setup_ids - have_setup_ids
        if missing_ids:
            id2setup = {s['id']: s for s in final_pareto_front}
            missing_setups = [id2setup[i] for i in missing_ids if i in id2setup]
            if missing_setups:
                print(f"[Ledger Backfill] {len(missing_setups)} summary setups missing from ledger. Backfilling…")
                backfill_df = _build_ledger_for_setups(missing_setups)
                trade_ledger_df = pd.concat([trade_ledger_df, backfill_df], ignore_index=True)

        # After backfill, keep ledger strictly to SUMMARY setups so files align 1:1
        trade_ledger_df = trade_ledger_df[trade_ledger_df['setup_id'].isin(summary_setup_ids)].copy()

        # ===== Avg option PnL over the LAST 10 TRIGGERS for each horizon =====
        # Aggregate to per-setup per-date per-horizon (mean across tickers for that date)
        _daily = (trade_ledger_df
                  .groupby(['setup_id', 'horizon_days', 'trigger_date'])[['pnl_dollars', 'pnl_pct']]
                  .mean()
                  .reset_index())

        # Within each (setup, horizon), take the last 10 dates chronologically
        _daily = _daily.sort_values(['setup_id', 'horizon_days', 'trigger_date'])
        _last10 = (_daily
                   .groupby(['setup_id', 'horizon_days'], as_index=False)
                   .apply(lambda g: g.tail(10))
                   .reset_index(drop=True))

        # Compute averages over those last 10 triggers
        _last10_avg = (_last10
                       .groupby(['setup_id', 'horizon_days'])[['pnl_dollars', 'pnl_pct']]
                       .mean()
                       .reset_index())

        # Map into wide columns on summary_df, one pair per horizon
        for h_opt in OPTION_SIM_HORIZONS_DAYS:
            _m_d = _last10_avg[_last10_avg['horizon_days'] == h_opt].set_index('setup_id')['pnl_dollars']
            _m_p = _last10_avg[_last10_avg['horizon_days'] == h_opt].set_index('setup_id')['pnl_pct']
            summary_df[f'avg_option_pnl_dollars_last10_{h_opt}d'] = summary_df['setup_id'].map(_m_d)
            summary_df[f'avg_option_pnl_pct_last10_{h_opt}d'] = summary_df['setup_id'].map(_m_p)

        # Rebuild final_summary_df to include the new columns
        final_summary_df = pd.merge(summary_df, description_df[['setup_id', 'description']], on='setup_id', how='left')
        final_summary_df.sort_values(by=['obj_sortino', 'obj_calmar', 'obj_support'], ascending=[False, False, False], inplace=True)


# =========================
# SAVE OUTPUTS
# =========================

if 'final_summary_df' in globals() and not final_summary_df.empty:
    print('\n--- Generating Final Output Files ---')
    summary_path = os.path.join(OUTPUT_DIR, 'pareto_front_summary.csv')
    ledger_path = os.path.join(OUTPUT_DIR, 'pareto_front_trade_ledger.csv')
    json_path = os.path.join(OUTPUT_DIR, 'pareto_front_setups.json')

    final_summary_df.to_csv(summary_path, index=False); print(f"Saved '{summary_path}'")
    trade_ledger_df.to_csv(ledger_path, index=False); print(f"Saved '{ledger_path}'")

    # Enriched JSON with clean date strings
    top_setups_for_json = final_summary_df.copy()
    top_setups_for_json.replace({np.nan: None, pd.NaT: None}, inplace=True)
    for col in ['first_trigger_date', 'last_trigger_date']:
        if col in top_setups_for_json.columns:
            top_setups_for_json[col] = pd.to_datetime(top_setups_for_json[col], errors='coerce').dt.strftime('%Y-%m-%d')
            top_setups_for_json[col].replace({pd.NaT: None}, inplace=True)
    top_setups_json = top_setups_for_json.to_dict('records')
    with open(json_path, 'w') as f:
        json.dump(top_setups_json, f, indent=2)
    print(f"Saved '{json_path}'")

    # ---------- JSON-safe full setups (dates serialized) ----------
    def _safe_dates(dates_idx):
        if dates_idx is None:
            return []
        try:
            idx = pd.to_datetime(pd.Index(dates_idx))
            return [d.strftime('%Y-%m-%d') for d in idx.to_pydatetime()]
        except Exception:
            return []

    enriched_records = []
    for setup in final_pareto_front:
        r = {k: v for k, v in setup.items()}

        # Normalize date-like fields into strings
        r['trigger_dates'] = _safe_dates(setup.get('trigger_dates'))

        if r.get('first_trigger_date') is not None:
            _ftd = pd.to_datetime(r['first_trigger_date'], errors='coerce')
            r['first_trigger_date'] = None if pd.isna(_ftd) else _ftd.strftime('%Y-%m-%d')

        if r.get('last_trigger_date') is not None:
            _ltd = pd.to_datetime(r['last_trigger_date'], errors='coerce')
            r['last_trigger_date'] = None if pd.isna(_ltd) else _ltd.strftime('%Y-%m-%d')

        # Keep signal_definitions as list of dicts
        r['signal_definitions'] = setup.get('signal_definitions', [])

        enriched_records.append(r)

    full_json_path = os.path.join(OUTPUT_DIR, 'pareto_front_setups_full.json')
    with open(full_json_path, 'w') as f:
        json.dump(enriched_records, f, indent=2, default=str)

    print(f"Saved '{full_json_path}'")

    # Widened pool
    cand_json_path = os.path.join(OUTPUT_DIR, 'candidate_pool_setups.json')
    with open(cand_json_path, 'w') as f:
        json.dump([dict(x) for x in latest_candidate_pool], f, indent=2)
    print(f"Saved '{cand_json_path}'")

    print('\nDiscovery complete.')
    print("\nSolutions on the Final Pareto Front (sorted by Sortino):")
    display_cols = ['setup_id', 'rank', 'best_performing_ticker', 'obj_sortino', 'obj_calmar', 'obj_support', 'recency_sharpe', 'description']
    print(final_summary_df[display_cols].head(25).to_string())
else:
    print('\nDiscovery complete (no final solutions).')



--- RUNNING IN SINGLE TICKER MODE FOR: AAPL US Equity ---
Loading raw workbooks…
Raw shape: (1984, 568)

Identified all relevant prefixes/tickers for feature engineering: 33

--- Defining ALL Feature Specifications ---
Defined 3359 total feature specifications.
--- Building raw feature set... ---
Could not build feature 'zscore_PX_LAST_30d__AAPL US Equity': min_periods 60 must be <= window 30
Could not build feature 'zscore_IVOL_SIGMA_30d__AAPL US Equity': min_periods 60 must be <= window 30
Could not build feature 'zscore_PX_LAST_30d__ARKK US Equity': min_periods 60 must be <= window 30
Could not build feature 'zscore_IVOL_SIGMA_30d__ARKK US Equity': min_periods 60 must be <= window 30
Could not build feature 'zscore_PX_LAST_30d__BSPGCPUS Index': min_periods 60 must be <= window 30
Could not build feature 'zscore_PX_LAST_30d__CO1 Comdty': min_periods 60 must be <= window 30
Could not build feature 'zscore_PX_LAST_30d__CONSSENT Index': min_periods 60 must be <= window 30
Could not bui

TypeError: Object of type DatetimeIndex is not JSON serializable