# In [7]:
# alpha_discoveryV2
import pandas as pd
import numpy as np
import random
import warnings
import functools
import os
import itertools
import json
from joblib import Parallel, delayed

# Silence every warning for the rest of the run. Note that this also hides
# pandas deprecation / performance warnings raised by the code below.
warnings.filterwarnings('ignore')

# --- DEFINITIONS AND CONFIGURATION ---
# Explicit list of tradable tickers. Forward returns are later computed only for
# the PX_LAST columns of these tickers.
TRADABLE_TICKERS = [
    'QQQ US Equity', 'SPY US Equity', 'XLK US Equity', 'XLF US Equity',
    'XLE US Equity', 'ARKK US Equity', 'VIX Index', 'GLD US Equity',
    'NBIS US Equity', 'LLY US Equity', 'TSLA US Equity', 'AAPL US Equity',
    'NVDA US Equity'
]

# Macro tickers; they are merged into `all_tickers` so that feature generation
# covers them even when they are not tradable.
MACRO_TICKERS = [
    'DXY Curncy', 'USGG10YR Index', 'USGG2YR Index', 'CPI YOY Index',
    'INJCJC Index', 'FFA Comdty', 'LF94TRUU Index', 'CPI CHNG Index',
    'NFP TCH Index', 'JOBS US Equity', 'CTII10 Govt', 'USSW10 Curncy',
    'MLCX3CRT Index', 'FARBAST Index', 'BSPGCPUS Index', 'SPCSUSA Index',
    'SPCS20SM Index', 'CONSSENT Index', 'CO1 Comdty'
]

# --- UPDATED FILE PATHS ---
# Relative paths: both workbooks are resolved against the current working directory.
MAIN_DATA_FILE = 'All_tickers_new.xlsx'
MACRO_DATA_FILE = 'Macro_tickers_new.xlsx'

# --- GENETIC ALGORITHM CONFIGURATION ---
NUM_GENERATIONS = 10       # How many evolutionary cycles to run
POPULATION_SIZE = 50      # How many setups (individuals) in each generation
SETUP_LENGTHS_TO_EXPLORE = [2, 3] # Allowed setup sizes; crossover() caps children at max() of this list
ELITISM_RATE = 0.1         # Share of the best setups to keep untouched for the next generation
MUTATION_RATE = 0.20       # Probability passed to mutate() that a setup has one signal replaced
# General Configuration
MIN_INITIAL_SUPPORT_FILTER = 5
OPTION_SIM_HORIZONS_DAYS = [1, 3, 10, 21]
RISK_FREE_RATE = 0.01
RECENCY_WINDOW = 10 # How many recent trades to check for performance decay

# --- END DEFINITIONS AND CONFIGURATION ---


# Progress message emitted before any workbook is opened.
print('Loading raw workbooks…')


# --- MODIFIED Custom Data Loading Function ---
def load_and_merge_excel(file_path, header_row, existing_df=None):
    """Load every sheet of an Excel workbook and outer-merge them on 'Date'.

    Each non-date column is renamed to ``"<sheet name>_<column>"``. A sheet whose
    date column is called 'Dates' has it renamed to 'Date'; a sheet with neither
    column is skipped with a warning.

    Args:
        file_path: Path of the workbook to read.
        header_row: Zero-based row index holding the column headers (passed to
            ``pd.read_excel`` as ``header``).
        existing_df: Optional frame (with a 'Date' column) that the sheets are
            merged into. It is copied, never modified in place.

    Returns:
        The merged DataFrame, or ``existing_df`` unchanged when the file is
        missing or any other error occurs while loading (``None`` if no
        ``existing_df`` was given). Errors are reported with ``print`` rather
        than raised.
    """
    try:
        # ExcelFile keeps an open file handle; the context manager guarantees it
        # is released even when a sheet fails to parse.
        with pd.ExcelFile(file_path) as xls:
            current_df = existing_df.copy() if existing_df is not None else None
            for sh_name in xls.sheet_names:
                # Use the specified header_row to correctly read the file
                df = pd.read_excel(xls, sheet_name=sh_name, header=header_row)

                if 'Dates' in df.columns and 'Date' not in df.columns:
                    df.rename(columns={'Dates': 'Date'}, inplace=True)
                if 'Date' not in df.columns:
                    print(f"Warning: Sheet '{sh_name}' in '{file_path}' is missing a 'Date'/'Dates' column. Skipping sheet.")
                    continue
                # Prefix with the sheet name so columns from different sheets cannot collide.
                df.columns = [f"{sh_name}_{col}" if col != 'Date' else col for col in df.columns]
                if current_df is None:
                    current_df = df
                else:
                    df = df.loc[:, ~df.columns.duplicated()]
                    current_df = current_df.merge(df, on='Date', how='outer')
        return current_df
    except FileNotFoundError:
        print(f"Error: '{file_path}' not found.")
        return existing_df
    except Exception as e:
        # Best-effort loader: report the problem and fall back to what the caller already had.
        print(f"An unexpected error occurred during Excel loading of '{file_path}': {e}")
        return existing_df


# --- MODIFIED Load main and macro data ---
# Load main tickers file, specifying headers are on Row 2 (index 1)
raw = load_and_merge_excel(MAIN_DATA_FILE, header_row=1)
if raw is not None and not raw.empty:
    # Load macro file, specifying headers are on Row 5 (index 4)
    raw = load_and_merge_excel(MACRO_DATA_FILE, header_row=4, existing_df=raw)
else:
    print("Main data could not be loaded, skipping macro data loading.")
    raw = pd.DataFrame()

# Final cleaning and indexing
if not raw.empty:
    # Convert to real timestamps BEFORE sorting so the forward-fill below runs in
    # chronological order even if some sheet delivered its dates as text.
    raw['Date'] = pd.to_datetime(raw['Date'])
    raw = raw.sort_values('Date').reset_index(drop=True)
    # DataFrame.ffill() replaces the deprecated fillna(method='ffill').
    raw = raw.ffill()
    # When several rows share a date keep the last one, which after the
    # forward-fill carries the most complete set of values.
    raw = raw.drop_duplicates(subset=['Date'], keep='last')
    raw = raw.set_index('Date').sort_index()
else:
    print("No data loaded. Raw DataFrame is empty.")
    raw = pd.DataFrame()

print('Raw shape:', raw.shape)

# --- Dynamic Ticker Identification ---
# The text before the first underscore of every merged column name is a
# candidate prefix; prefixes listed in COMMON_FEATURE_PREFIXES are filtered out
# and whatever remains is treated as a ticker.
all_column_prefixes = sorted({column.split('_')[0] for column in raw.columns if '_' in column})
COMMON_FEATURE_PREFIXES = [
    'Last', 'Open', 'High', 'Low', 'VWAP', 'Volume', 'IVOL', 'Implied',
    'Total', '30', '10', '60', 'Hist.', '1st', 'Put', 'Dates',
    'CHG', 'FFA', 'INJCJC', 'NFP', 'JOBS', 'CPI', 'CTII10', 'LF94TRUU',
    'SPX', 'USSW10', 'MLCX3CRT', 'FARBAST', 'BSPGCPUS', 'SPCSUSA', 'SPCS20SM',
    'CONSSENT',
]
actual_ticker_prefixes = [
    prefix for prefix in all_column_prefixes
    if prefix not in COMMON_FEATURE_PREFIXES
]
# Union of the configured tickers and the ones discovered in the data, sorted for a stable order.
all_tickers = sorted(set(TRADABLE_TICKERS) | set(actual_ticker_prefixes) | set(MACRO_TICKERS))
print(f'\nIdentified all relevant prefixes/tickers for feature engineering: {len(all_tickers)}')


# --- Helper functions ---
def first_col_containing(ticker_full_name, substr=''):
    """Return the first column of ``raw`` that belongs to a ticker and contains ``substr``.

    For ``substr == 'PX_LAST'`` the exact names ``<ticker>_Last_Price_PX_LAST``
    and ``<ticker>_PX_LAST`` are tried first, in that order. Otherwise (or if
    neither exists) the first column that starts with ``ticker_full_name`` and
    contains ``substr`` is returned. Returns ``None`` when nothing matches.
    """
    columns = raw.columns
    if substr == 'PX_LAST':
        preferred_names = (
            f"{ticker_full_name}_Last_Price_PX_LAST",
            f"{ticker_full_name}_PX_LAST",
        )
        for candidate in preferred_names:
            if candidate in columns:
                return candidate
    matches = (
        name for name in columns
        if name.startswith(ticker_full_name) and substr in name
    )
    return next(matches, None)

def safe_series(col_name):
    """Return ``raw[col_name]``, or an all-NaN float series on ``raw.index`` if the column is absent.

    A falsy ``col_name`` (``None`` or ``''``) is treated like a missing column.
    """
    if col_name and col_name in raw.columns:
        return raw[col_name]
    return pd.Series(index=raw.index, dtype=float)

def frac_diff(series, d=0.5, window=100):
    """Fixed-window fractional differencing of ``series``.

    The binomial weights ``w_0 = 1, w_k = -w_{k-1} * (d - k + 1) / k`` are
    applied to the last ``window`` observations, with ``w_0`` on the most recent
    one.

    Args:
        series: Input pandas Series.
        d: Differencing order.
        window: Number of observations used for every output value.

    Returns:
        A float Series holding values for positions ``window .. len(series)-1``
        of the input (position ``window - 1`` is not emitted); rows whose window
        contains NaN are dropped. Empty when the series is not longer than
        ``window`` or ``window < 1``.
    """
    output = pd.Series(index=series.index, dtype=float)
    n_obs = len(series)
    if window < 1 or n_obs <= window:
        return output.dropna()

    # Only the first `window` weights are ever used, so build exactly those once
    # instead of len(series) weights that are re-sliced on every iteration.
    weights = [1.0]
    for k in range(1, window):
        weights.append(-weights[-1] * (d - k + 1) / k)
    # Reverse so the oldest observation in a window meets the highest-lag weight.
    window_weights = np.array(weights[::-1])

    for i in range(window, n_obs):
        subset = series.iloc[i - window + 1: i + 1]
        output.iloc[i] = np.dot(window_weights, subset)
    return output.dropna()

def block_bootstrap_sharpe(returns_series, block_size, num_iterations=1000, annualize=True, trading_days_per_year=252):
    """Bootstrap the Sharpe ratio by resampling consecutive blocks of returns.

    The NaN-free series is cut into non-overlapping blocks of ``block_size``
    rows. Each iteration draws blocks with replacement (via ``np.random``),
    concatenates them, truncates to the original length and computes
    mean / std, scaled by ``sqrt(trading_days_per_year)`` when ``annualize``.
    A resample whose std is not above 1e-9 contributes a Sharpe of 0.0.

    Returns:
        ``(median, 5th percentile, 95th percentile)`` of the bootstrapped
        Sharpe ratios, or ``(0.0, 0.0, 0.0)`` when there are fewer than two
        observations, fewer observations than ``block_size``, or no iterations.
    """
    clean = returns_series.dropna()
    n_obs = len(clean)
    if n_obs < block_size or n_obs < 2:
        return 0.0, 0.0, 0.0

    blocks = []
    for start in range(0, n_obs, block_size):
        chunk = clean.iloc[start: start + block_size]
        if not chunk.empty:
            blocks.append(chunk)
    if not blocks:
        return 0.0, 0.0, 0.0

    blocks_per_draw = int(np.ceil(n_obs / block_size))
    scale = np.sqrt(trading_days_per_year) if annualize else 1
    sharpes = []
    for _ in range(num_iterations):
        picks = np.random.choice(len(blocks), blocks_per_draw, replace=True)
        sample = pd.concat([blocks[j] for j in picks]).iloc[:n_obs]
        sigma = sample.std()
        if sigma > 1e-9:
            sharpes.append((sample.mean() / sigma) * scale)
        else:
            sharpes.append(0.0)

    if not sharpes:
        return 0.0, 0.0, 0.0
    return np.median(sharpes), np.percentile(sharpes, 5), np.percentile(sharpes, 95)

def calculate_sortino_ratio(returns_series, annualize=True, trading_days_per_year=252):
    """Sortino ratio with a target return of 0.

    The denominator is the sample standard deviation of the strictly negative
    returns only.

    Returns:
        0.0 for fewer than two non-NaN returns; ``np.inf`` when there are no
        negative returns or their standard deviation is exactly 0; otherwise
        mean / downside-std, multiplied by ``sqrt(trading_days_per_year)`` when
        ``annualize`` is true. With exactly one negative return the sample
        standard deviation is NaN, so the result is NaN.
    """
    clean = returns_series.dropna()
    if len(clean) < 2:
        return 0.0

    target_return = 0
    losses = clean[clean < target_return]
    if len(losses) == 0:
        return np.inf

    downside_std = losses.std()
    if downside_std == 0:
        return np.inf

    ratio = (clean.mean() - target_return) / downside_std
    if annualize:
        return ratio * np.sqrt(trading_days_per_year)
    return ratio

def calculate_calmar_ratio(returns_series, annualize=True, trading_days_per_year=252):
    """Calmar ratio: return divided by the absolute maximum drawdown.

    Args:
        returns_series: Periodic simple returns (NaNs are dropped).
        annualize: When true (default) the numerator is the geometric
            annualized return; when false it is the total compounded return.
        trading_days_per_year: Periods per year used for annualization.

    Returns:
        0.0 for fewer than two returns, ``np.inf`` when there is no drawdown,
        -99 when the compounded equity ends below zero (the geometric return is
        undefined), otherwise the ratio described above.
    """
    returns_series = returns_series.dropna()
    if len(returns_series) < 2:
        return 0.0

    cumulative_returns = (1 + returns_series).cumprod()
    peak = cumulative_returns.cummax()
    max_drawdown = ((cumulative_returns - peak) / peak).min()
    if max_drawdown == 0:
        return np.inf

    total_return = cumulative_returns.iloc[-1] - 1

    # Guard against a negative base for the power calculation: if the total
    # return is below -100% the annualized geometric return is undefined.
    if (1 + total_return) < 0:
        return -99  # Large negative number to signify catastrophic performance

    if annualize:
        num_years = len(returns_series) / trading_days_per_year
        numerator = (1 + total_return) ** (1 / num_years) - 1
    else:
        # Previously `annualize` was accepted but ignored; honor it.
        numerator = total_return

    return numerator / abs(max_drawdown)

# --- Option Simulation Helpers ---
def estimate_atm_premium(price, ivol, days, option_type):
    """Approximate an at-the-money option premium as ``0.4 * price * ivol * sqrt(T)``.

    ``T`` is ``days / 365.25``. ``option_type`` is accepted for interface
    symmetry but does not affect the result. Returns 0 when the time to
    expiry, the price or the volatility is not positive.
    """
    years_to_expiry = days / 365.25
    if years_to_expiry <= 0 or price <= 0 or ivol <= 0:
        return 0
    time_scale = np.sqrt(years_to_expiry)
    return 0.4 * price * ivol * time_scale

def simulate_option_pnl_detailed(current_price, future_price, ivol_at_entry, horizon_days, entry_direction):
    """Simulate buying one ATM option at entry and holding it to expiry.

    A 'long' direction buys a call, 'short' buys a put, both struck at
    ``current_price``. The entry premium comes from ``estimate_atm_premium``;
    the exit value is the intrinsic value at ``future_price``. An implied
    volatility above 1.0 is interpreted as a percentage and divided by 100.

    Returns:
        A dict with keys ``pnl_per_share``, ``option_type``, ``strike_price``,
        ``entry_premium``, ``exit_value``, ``pnl_dollars`` (per-share P&L x 100),
        ``pnl_pct`` (P&L as a percentage of the premium, NaN if the premium is
        not positive), ``skipped_reason``, ``Underlying_Exit_Price`` and
        ``Return_Underlying``. When an input is invalid the numeric fields are
        NaN and ``skipped_reason`` names the first failed check; otherwise
        ``skipped_reason`` is the string ``'None'``.
    """
    if current_price and pd.notna(current_price) and pd.notna(future_price):
        underlying_return = (future_price - current_price) / current_price
    else:
        underlying_return = np.nan

    def _skipped(reason):
        # Result template for trades that cannot be simulated.
        return {
            'pnl_per_share': np.nan,
            'option_type': None,
            'strike_price': np.nan,
            'entry_premium': np.nan,
            'exit_value': np.nan,
            'pnl_dollars': np.nan,
            'pnl_pct': np.nan,
            'skipped_reason': reason,
            'Underlying_Exit_Price': future_price if pd.notna(future_price) else np.nan,
            'Return_Underlying': underlying_return,
        }

    # Validation, in the same priority order the reasons are reported.
    if pd.isna(current_price) or current_price <= 0:
        return _skipped('Invalid Entry Price')
    if pd.isna(ivol_at_entry) or ivol_at_entry <= 0:
        return _skipped('Invalid IVOL')
    if pd.isna(future_price):
        return _skipped('Missing Future Price')
    if entry_direction not in ['long', 'short']:
        return _skipped('Invalid Entry Direction')

    if ivol_at_entry > 1.0:
        scaled_ivol = ivol_at_entry / 100.0
    else:
        scaled_ivol = ivol_at_entry

    strike_price = current_price
    if entry_direction == 'long':
        option_type = 'call'
    else:
        option_type = 'put'
    entry_premium = estimate_atm_premium(current_price, scaled_ivol, horizon_days, option_type)

    if option_type == 'call':
        exit_value = max(future_price - strike_price, 0)
    else:
        exit_value = max(strike_price - future_price, 0)

    pnl_per_share = exit_value - entry_premium
    pnl_dollars = pnl_per_share * 100

    # P&L as a percentage of the premium paid
    if entry_premium > 0:
        pnl_pct = (pnl_per_share / entry_premium) * 100
    else:
        pnl_pct = np.nan

    return {
        'pnl_per_share': pnl_per_share,
        'option_type': option_type,
        'strike_price': strike_price,
        'entry_premium': entry_premium,
        'exit_value': exit_value,
        'pnl_dollars': pnl_dollars,
        'pnl_pct': pnl_pct,
        'skipped_reason': 'None',
        'Underlying_Exit_Price': future_price,
        'Return_Underlying': underlying_return,
    }
# --- 1. Define Feature Specifications ---
# Every spec is a dict with:
#   'type'         - selects the branch of the feature calculation loop,
#   'assets'       - tickers the feature is computed from,
#   'params'       - optional per-type settings,
#   'unique_id'    - used as the column name in the `feat` DataFrame,
#   'display_name' - human-readable label.
print('\n--- Defining ALL Feature Specifications ---')
feature_specs = []
# Volatility Features
for ticker in all_tickers:
    # Implied-vol term structure: 60-day minus 10-day call implied volatility.
    f60 = '60_Day_Call_Implied_Volatility'; f10 = '10_Day_Call_Implied_Volatility'
    feature_specs.append({'type': 'ivol_term_structure', 'assets': [ticker], 'params': {'f_long':f60, 'f_short':f10},
                          'unique_id': f'term_structure_{f60}-{f10}__{ticker}', 'display_name': f"diff({f60}, {f10})__{ticker}"})
    # Skew: 1st-month 50-delta put implied vol minus 40-delta call implied vol.
    put50 = '1st_Month_Put_Imp_Vol_50_Delta'; call40 = '1st_Month_Call_Imp_Vol_40_Delta'
    feature_specs.append({'type': 'ivol_skew', 'assets': [ticker], 'params': {'put':put50, 'call':call40},
                          'unique_id': f'skew_{put50}-{call40}__{ticker}', 'display_name': f"diff({put50}, {call40})__{ticker}"})
    for suffix in ['IVOL_SIGMA', 'CALL_IMP_VOL_30D', 'PUT_IMP_VOL_30D']:
        # 'ivol_shock' is the 30-day z-score of the day-over-day CHANGE in the vol column.
        feature_specs.append({'type': 'ivol_shock', 'assets': [ticker], 'params': {'ivol_suffix': suffix, 'window': 30},
                              'unique_id': f'zscore_{suffix}_30d__{ticker}', 'display_name': f"zscore_{suffix}_30d__{ticker}"})
        feature_specs.append({'type': 'ivol_div_volume', 'assets': [ticker], 'params': {'ivol_suffix': suffix, 'vol_suffix':'VOLUME'},
                              'unique_id': f'div_{suffix}_by_VOLUME__{ticker}', 'display_name': f"div({suffix}, VOLUME)__{ticker}"})
# Deriv Flow & Sentiment Features
for ticker in all_tickers:
    pc_ratio_col = 'PUT_CALL_VOLUME_RATIO_CUR_DAY'
    feature_specs.append({'type': 'put_call_ratio_ema', 'assets': [ticker], 'params': {'span': 5, 'col': pc_ratio_col},
                          'unique_id': f'ema5_{pc_ratio_col}__{ticker}', 'display_name': f"ema5_{pc_ratio_col}__{ticker}"})
    oi_col = 'OPEN_INT_TOTAL_CALL'
    feature_specs.append({'type': 'open_interest_change', 'assets': [ticker], 'params': {'days': 3, 'col': oi_col},
                          'unique_id': f'pct_change_{oi_col}_3d__{ticker}', 'display_name': f"pct_change_{oi_col}_3d__{ticker}"})
    vol_col = 'Volume_-Realtime_VOLUME'
    feature_specs.append({'type': 'volume_zscore', 'assets': [ticker], 'params': {'window': 30, 'col': vol_col},
                          'unique_id': f'zscore_{vol_col}_30d__{ticker}', 'display_name': f"zscore_{vol_col}_30d__{ticker}"})
    # Flag set when both call open interest and 10-day call implied vol rose versus the prior row.
    sm_oi = 'OPEN_INT_TOTAL_CALL'; sm_ivol='10_Day_Call_Implied_Volatility'
    feature_specs.append({'type': 'smart_money_flag', 'assets': [ticker], 'params': {'oi_col': sm_oi, 'ivol_col': sm_ivol},
                          'unique_id': f'smart_money_{sm_oi}_{sm_ivol}__{ticker}', 'display_name': f"smart_money(pct_change({sm_oi}) > 0 AND pct_change({sm_ivol}) > 0)__{ticker}"})
# Generic Z-Score feature needed for sequential patterns
# NOTE(review): for col='IVOL_SIGMA', window=30 the unique_id built here is
# 'zscore_IVOL_SIGMA_30d__<ticker>', identical to the 'ivol_shock' id above, and for
# col='Volume_-Realtime_VOLUME', window=30 it equals the 'volume_zscore' id. Because
# unique_id is the `feat` column name, the later (generic, level-based) series
# overwrites the earlier one, so the IVOL_SIGMA shock feature is silently lost.
# Confirm which definition is intended and give the other a distinct id.
for ticker in all_tickers:
    for col in ['PX_LAST', 'IVOL_SIGMA', 'Volume_-Realtime_VOLUME']:
        for window in [30, 60]:
            feature_specs.append({'type': 'generic_zscore', 'assets': [ticker], 'params': {'col': col, 'window': window},
                                  'unique_id': f'zscore_{col}_{window}d__{ticker}', 'display_name': f"zscore({col}, {window}d)__{ticker}"})
# Cross-Asset Correlation Features
price_col = 'PX_LAST'
# Build every unordered ticker pair once. The pairs are sorted because iterating
# a plain set of string tuples follows hash order, which changes between
# interpreter runs; that made the order of the feature columns (and therefore
# the SIG_n ids assigned from that order) non-reproducible.
correlation_pairs = sorted(set(itertools.combinations(all_tickers, 2)))
for t1, t2 in correlation_pairs:
    # Rolling price correlations. The 20d and 60d versions must be defined
    # before the z-score/delta specs below, which look them up by unique_id.
    for window in [20, 60]:
        feature_specs.append({
            'type': 'correlation', 'assets': [t1, t2],
            'params': {'window': window, 'col': price_col},
            'unique_id': f'corr_{t1}:{price_col}_{t2}:{price_col}_{window}d',
            'display_name': f"corr({t1}:{price_col}, {t2}:{price_col}, {window}d)"})
    # 60-day z-score of the 20-day correlation.
    feature_specs.append({
        'type': 'correlation_zscore', 'assets': [t1, t2],
        'params': {'col': price_col, 'window': 60},
        'unique_id': f'zscore_corr20d_{t1}:{price_col}_{t2}:{price_col}_60d',
        'display_name': f"zscore_corr(20d)({t1}:{price_col}, {t2}:{price_col}, 60d)"})
    # Difference between the 20-day and the 60-day correlation.
    feature_specs.append({
        'type': 'correlation_delta', 'assets': [t1, t2],
        'params': {'col': price_col},
        'unique_id': f'corr_delta_{t1}:{price_col}_{t2}:{price_col}',
        'display_name': f"corr_delta(20d-60d)({t1}:{price_col}, {t2}:{price_col})"})
    # 60-day rolling beta of t1 returns on t2 returns.
    feature_specs.append({
        'type': 'rolling_beta', 'assets': [t1, t2],
        'params': {'window': 60, 'col': price_col},
        'unique_id': f'beta_{t1}:{price_col}_{t2}:{price_col}_60d',
        'display_name': f"beta({t1}:{price_col}, {t2}:{price_col}, 60d)"})
# Advanced Correlations
# Hand-picked cross-field correlations: ticker t1's column f1 against ticker t2's
# column f2 over a rolling window of 'win' rows.
adv_corr_defs = [
    {'t1': 'QQQ US Equity', 'f1': 'IVOL_SIGMA', 't2': 'SPY US Equity', 'f2': 'IVOL_SIGMA', 'win': 30},
    {'t1': 'TSLA US Equity', 'f1': 'Volume_-Realtime_VOLUME', 't2': 'VIX Index', 'f2': 'IVOL_SIGMA', 'win': 20},
    {'t1': 'CO1 Comdty', 'f1': 'PX_LAST', 't2': 'XLE US Equity', 'f2': 'IVOL_SIGMA', 'win': 30},
    {'t1': 'USGG10YR Index', 'f1': 'PX_LAST', 't2': 'XLF US Equity', 'f2': 'IVOL_SIGMA', 'win': 30}
]
for d in adv_corr_defs:
    feature_specs.append({
        'type': 'advanced_correlation', 'assets': [d['t1'], d['t2']], 'params': {'window': d['win'], 'col1': d['f1'], 'col2': d['f2']},
        'unique_id': f"corr_{d['t1']}:{d['f1']}_{d['t2']}:{d['f2']}_{d['win']}d", 'display_name': f"corr({d['t1']}:{d['f1']}, {d['t2']}:{d['f2']}, {d['win']}d)"})
# Macro Features
# These specs carry no 'params'; their calculation branches hard-code the windows.
feature_specs.extend([
    {'type': 'macro_mpi', 'assets': ['DXY Curncy', 'USGG10YR Index'], 'unique_id': 'macro_mpi', 'display_name': 'Macro Pressure Index'},
    {'type': 'macro_fear_overdrive', 'assets': ['VIX Index', 'DXY Curncy', 'SPY US Equity'], 'unique_id': 'macro_fear_overdrive', 'display_name': 'Fear Overdrive'},
    {'type': 'macro_sector_rotation', 'assets': ['XLK US Equity', 'XLE US Equity'], 'unique_id': 'macro_xlk_xle_rotation', 'display_name': 'Sector Rotation (XLK-XLE)'},
    {'type': 'macro_yield_spread', 'assets': ['USGG10YR Index', 'USGG2YR Index'], 'unique_id': 'macro_10y2y_spread', 'display_name': 'Yield Spread (10Y-2Y)'},
    {'type': 'macro_cpi_zscore', 'assets': ['CPI YOY Index'], 'unique_id': 'macro_cpi_z', 'display_name': 'CPI Z-Score'},
    {'type': 'macro_injcjc_shock', 'assets': ['INJCJC Index'], 'unique_id': 'macro_jobless_claims_shock', 'display_name': 'Jobless Claims Shock'},
    {'type': 'macro_ffa_spread', 'assets': ['FFA Comdty', 'USGG2YR Index'], 'unique_id': 'macro_ffa_spread', 'display_name': 'Fed Funds Spread'},
    {'type': 'macro_lf94truu_vol_signal', 'assets': ['LF94TRUU Index'], 'unique_id': 'macro_hyg_vol_signal', 'display_name': 'HYG Vol Signal'}])
# 3-row percentage change of selected macro series.
for t in ['CPI YOY Index', 'CPI CHNG Index', 'NFP TCH Index', 'JOBS US Equity']:
    feature_specs.append({'type': 'macro_generic_mom', 'assets': [t], 'params': {'days': 3}, 'unique_id': f'macro_mom3_{t}', 'display_name': f'Macro Mom3d({t})'})
# 1-row percentage change of the remaining macro series.
for t in ['CTII10 Govt', 'USSW10 Curncy', 'MLCX3CRT Index', 'FARBAST Index', 'BSPGCPUS Index', 'SPCSUSA Index', 'SPCS20SM Index', 'CONSSENT Index']:
    feature_specs.append({'type': 'macro_generic_chg', 'assets': [t], 'unique_id': f'macro_chg_{t}', 'display_name': f'Macro Chg({t})'})
# Momentum / Volatility Fractal Features
# NOTE: `price_col` is the module-level name assigned in the cross-asset
# correlation section above ('PX_LAST').
for ticker in all_tickers:
    feature_specs.append({'type': 'mom_div_vol', 'assets': [ticker], 'params': {'price_col':price_col, 'mom_win':5, 'vol_win':20},
                          'unique_id': f'mom_div_vol_{price_col}_5d_20d__{ticker}', 'display_name': f"mom_div_vol({price_col}, 5d, 20d)__{ticker}"})
    feature_specs.append({'type': 'bollinger_pctB', 'assets': [ticker], 'params': {'window': 20, 'price_col':price_col},
                          'unique_id': f'pctB_{price_col}_20d__{ticker}', 'display_name': f"%B({price_col}, 20d)__{ticker}"})
    feature_specs.append({'type': 'fractional_differencing', 'assets': [ticker], 'params': {'d': 0.5, 'window': 100, 'price_col':price_col},
                          'unique_id': f'frac_diff_{price_col}_d0.5_100w__{ticker}', 'display_name': f"frac_diff({price_col}, d=0.5, win=100)__{ticker}"})
# Market Regime and Interaction Features
feature_specs.append({'type': 'regime_filter', 'assets': ['VIX Index'], 'params': {'threshold': 25, 'col': 'PX_LAST'},
                      'unique_id': 'REGIME_IS_HIGH_VOL', 'display_name': 'REGIME_IS_HIGH_VOL (VIX > 25)'})
# Interaction specs reference other features by unique_id ('feature1' * 'feature2'),
# so they are computed in a second pass after all primary features exist.
feature_specs.append({'type': 'interaction', 'assets': [],
                      'params': {'feature1': 'zscore_IVOL_SIGMA_30d__AAPL US Equity', 'feature2': 'REGIME_IS_HIGH_VOL'},
                      'unique_id': 'zscore_IVOL_SIGMA_30d__AAPL US Equity_IN_HIGH_VOL',
                      'display_name': 'zscore(IVOL_SIGMA, 30d)__AAPL US Equity IN_HIGH_VOL'})
print(f"Defined {len(feature_specs)} total feature specifications.")

# --- 2. Calculate Features Based on Specifications ---
# Each spec is dispatched on spec['type'] and its result stored in `feat` under
# spec['unique_id']. Any exception raised for one feature is printed and the
# loop moves on to the next spec.
#
# NOTE(review): safe_series() returns an all-NaN series on raw.index (not an
# empty one) when a column is missing, so the `not x.empty` guards below only
# trigger when `raw` itself has no rows; missing inputs yield all-NaN features.
print('--- Calculating All Features ---')
feat = pd.DataFrame(index=raw.index)
# The order of calculation matters for interaction features, so iterate twice.
# First pass for all primary features
for spec in feature_specs:
    if spec['type'] == 'interaction': continue # Skip interaction features on the first pass
    feature_id = spec['unique_id']
    try:
        if spec['type'] == 'ivol_term_structure':
            # Long-dated minus short-dated implied volatility.
            ivol60 = safe_series(first_col_containing(spec['assets'][0], spec['params']['f_long'])); ivol10 = safe_series(first_col_containing(spec['assets'][0], spec['params']['f_short']))
            if not ivol60.empty and not ivol10.empty: feat[feature_id] = ivol60 - ivol10
        elif spec['type'] == 'ivol_skew':
            # Put implied vol minus call implied vol.
            put50 = safe_series(first_col_containing(spec['assets'][0], spec['params']['put'])); call40 = safe_series(first_col_containing(spec['assets'][0], spec['params']['call']))
            if not put50.empty and not call40.empty: feat[feature_id] = put50 - call40
        elif spec['type'] == 'ivol_shock':
            # Rolling z-score of the one-row change in implied volatility.
            ivol_s = safe_series(first_col_containing(spec['assets'][0], spec['params']['ivol_suffix']))
            if not ivol_s.empty: feat[feature_id] = (ivol_s.diff() - ivol_s.diff().rolling(spec['params']['window']).mean()) / ivol_s.diff().rolling(spec['params']['window']).std()
        elif spec['type'] == 'ivol_div_volume':
            # Implied vol divided by volume; zero volume becomes NaN to avoid division by zero.
            ivol_s = safe_series(first_col_containing(spec['assets'][0], spec['params']['ivol_suffix'])); vol_s = safe_series(first_col_containing(spec['assets'][0], spec['params']['vol_suffix']))
            if not ivol_s.empty and not vol_s.empty: feat[feature_id] = ivol_s / vol_s.replace(0, np.nan)
        elif spec['type'] == 'put_call_ratio_ema':
            pc = safe_series(first_col_containing(spec['assets'][0], spec['params']['col']))
            if not pc.empty: feat[feature_id] = pc.ewm(span=spec['params']['span'], adjust=False).mean()
        elif spec['type'] == 'open_interest_change':
            oi = safe_series(first_col_containing(spec['assets'][0], spec['params']['col']))
            if not oi.empty: feat[feature_id] = oi.pct_change(spec['params']['days'])
        elif spec['type'] == 'volume_zscore':
            vol = safe_series(first_col_containing(spec['assets'][0], spec['params']['col']))
            if not vol.empty: feat[feature_id] = (vol - vol.rolling(spec['params']['window']).mean()) / vol.rolling(spec['params']['window']).std()
        elif spec['type'] == 'smart_money_flag':
            # 1 when both open interest and implied vol increased versus the previous row, else 0.
            oi = safe_series(first_col_containing(spec['assets'][0], spec['params']['oi_col'])).pct_change() > 0; ivol = safe_series(first_col_containing(spec['assets'][0], spec['params']['ivol_col'])).pct_change() > 0
            if not oi.empty and not ivol.empty: feat[feature_id] = (oi & ivol).astype(int)
        elif spec['type'] == 'generic_zscore':
            s = safe_series(first_col_containing(spec['assets'][0], spec['params']['col']))
            if not s.empty: feat[feature_id] = (s - s.rolling(spec['params']['window']).mean()) / s.rolling(spec['params']['window']).std()
        elif spec['type'] == 'correlation':
            # Rolling correlation over rows where both price series are present;
            # the assignment re-aligns the result to feat's index.
            t1, t2 = spec['assets']; p1, p2 = first_col_containing(t1, spec['params']['col']), first_col_containing(t2, spec['params']['col'])
            if p1 and p2:
                aligned = pd.DataFrame({'s1': safe_series(p1), 's2': safe_series(p2)}).dropna()
                if len(aligned) > spec['params']['window']: feat[feature_id] = aligned['s1'].rolling(spec['params']['window']).corr(aligned['s2'])
        elif spec['type'] == 'advanced_correlation':
            # Same as 'correlation' but each asset may use a different column.
            t1, t2 = spec['assets']; s1_col = first_col_containing(t1, spec['params']['col1']); s2_col = first_col_containing(t2, spec['params']['col2'])
            if s1_col and s2_col:
                aligned = pd.DataFrame({'s1': safe_series(s1_col), 's2': safe_series(s2_col)}).dropna()
                if len(aligned) > spec['params']['window']: feat[feature_id] = aligned['s1'].rolling(spec['params']['window']).corr(aligned['s2'])
        elif spec['type'] in ['correlation_zscore', 'correlation_delta']:
            # Derived from the already-computed 20d/60d correlation columns, so these
            # specs must come after the matching 'correlation' specs in feature_specs.
            # Both columns must exist even for the z-score, which only uses the 20d one.
            t1, t2 = spec['assets']; price_col_name = spec['params']['col']; c20_id = f'corr_{t1}:{price_col_name}_{t2}:{price_col_name}_20d'; c60_id = f'corr_{t1}:{price_col_name}_{t2}:{price_col_name}_60d'
            c20 = feat.get(c20_id); c60 = feat.get(c60_id)
            if c20 is not None and c60 is not None:
                if spec['type'] == 'correlation_zscore': feat[feature_id] = (c20 - c20.rolling(spec['params']['window']).mean()) / c20.rolling(spec['params']['window']).std()
                else: feat[feature_id] = c20 - c60
        elif spec['type'] == 'rolling_beta':
            # Rolling cov(r1, r2) / var(r2) on one-row percentage returns.
            t1, t2 = spec['assets']; p1, p2 = first_col_containing(t1, spec['params']['col']), first_col_containing(t2, spec['params']['col'])
            if p1 and p2:
                rets = pd.DataFrame({'r1': safe_series(p1).pct_change(), 'r2': safe_series(p2).pct_change()}).dropna()
                if len(rets) > spec['params']['window']: feat[feature_id] = rets['r1'].rolling(spec['params']['window']).cov(rets['r2']) / rets['r2'].rolling(spec['params']['window']).var()
        elif spec['type'] == 'macro_mpi':
            # Sum of the 3-row cumulative percentage changes of the two assets.
            dxy, ust10 = safe_series(first_col_containing(spec['assets'][0], 'PX_LAST')), safe_series(first_col_containing(spec['assets'][1], 'PX_LAST'))
            if not dxy.empty and not ust10.empty: feat[feature_id] = dxy.pct_change().rolling(3).sum() + ust10.pct_change().rolling(3).sum()
        elif spec['type'] == 'macro_fear_overdrive':
            # 1 when VIX > 20, the dollar index rose, and SPY is below its 20-row mean.
            vix, dxy, spy = safe_series(first_col_containing(spec['assets'][0], 'PX_LAST')), safe_series(first_col_containing(spec['assets'][1], 'PX_LAST')), safe_series(first_col_containing(spec['assets'][2], 'PX_LAST'))
            if not vix.empty and not dxy.empty and not spy.empty: feat[feature_id] = ((vix > 20) & (dxy.pct_change() > 0) & (spy < spy.rolling(20).mean())).astype(int)
        elif spec['type'] == 'macro_sector_rotation':
            xlk, xle = safe_series(first_col_containing(spec['assets'][0], 'PX_LAST')), safe_series(first_col_containing(spec['assets'][1], 'PX_LAST'))
            if not xlk.empty and not xle.empty: feat[feature_id] = xlk.pct_change(5) - xle.pct_change(5)
        elif spec['type'] == 'macro_yield_spread':
            ust10, ust2 = safe_series(first_col_containing(spec['assets'][0], 'PX_LAST')), safe_series(first_col_containing(spec['assets'][1], 'PX_LAST'))
            if not ust10.empty and not ust2.empty: feat[feature_id] = ust10 - ust2
        elif spec['type'] == 'macro_generic_mom':
            px = safe_series(first_col_containing(spec['assets'][0], 'PX_LAST'))
            if not px.empty: feat[feature_id] = px.pct_change(spec['params']['days'])
        elif spec['type'] == 'macro_generic_chg':
            px = safe_series(first_col_containing(spec['assets'][0], 'PX_LAST'))
            if not px.empty: feat[feature_id] = px.pct_change()
        elif spec['type'] == 'macro_cpi_zscore':
            # Z-score over a 12-ROW window of the (forward-filled) series.
            cpi = safe_series(first_col_containing(spec['assets'][0], 'PX_LAST'))
            if not cpi.empty: feat[feature_id] = (cpi - cpi.rolling(12).mean()) / cpi.rolling(12).std()
        elif spec['type'] == 'macro_injcjc_shock':
            # 1 when the one-row change exceeds twice the 20-row std of changes.
            injcjc = safe_series(first_col_containing(spec['assets'][0], 'PX_LAST'))
            if not injcjc.empty: feat[feature_id] = (injcjc.diff() > injcjc.diff().rolling(20).std() * 2).astype(int)
        elif spec['type'] == 'macro_ffa_spread':
            ffa, ust2 = safe_series(first_col_containing(spec['assets'][0], 'PX_LAST')), safe_series(first_col_containing(spec['assets'][1], 'PX_LAST'))
            if not ffa.empty and not ust2.empty: feat[feature_id] = ffa - ust2
        elif spec['type'] == 'macro_lf94truu_vol_signal':
            # 30-day volatility relative to its own 60-row mean.
            vol = safe_series(first_col_containing(spec['assets'][0], 'VOLATILITY_30D'))
            if not vol.empty: feat[feature_id] = vol / vol.rolling(60).mean()
        elif spec['type'] == 'regime_filter':
            # Boolean series; rows with NaN price compare as False.
            px = safe_series(first_col_containing(spec['assets'][0], spec['params']['col']))
            if not px.empty: feat[feature_id] = px > spec['params']['threshold']
        elif spec['type'] == 'mom_div_vol':
            # NOTE: the 5/20 windows and 'PX_LAST' are hard-coded here; the spec's
            # 'mom_win', 'vol_win' and 'price_col' params are not read.
            px = safe_series(first_col_containing(spec['assets'][0], 'PX_LAST'))
            if not px.empty: feat[feature_id] = px.pct_change(5) / px.pct_change().rolling(20).std()
        elif spec['type'] == 'bollinger_pctB':
            # %B = (price - lower band) / (upper band - lower band) with 2-std bands.
            px = safe_series(first_col_containing(spec['assets'][0], 'PX_LAST'))
            if not px.empty:
                ma = px.rolling(spec['params']['window']).mean();
                std = px.rolling(spec['params']['window']).std()
                feat[feature_id] = (px - (ma - 2 * std)) / (4 * std)
        elif spec['type'] == 'fractional_differencing':
            px = safe_series(first_col_containing(spec['assets'][0], 'PX_LAST'))
            if not px.empty and len(px) > spec['params']['window']: feat[feature_id] = frac_diff(px, d=spec['params']['d'], window=spec['params']['window'])
    except Exception as e:
        print(f"Could not calculate feature '{feature_id}': {e}")
# Second pass for interaction features: each one is the element-wise product of
# two feature columns computed in the first pass, and is silently skipped when
# either input column is absent.
for spec in feature_specs:
    if spec['type'] != 'interaction':
        continue
    feature_id = spec['unique_id']
    try:
        f1 = spec['params']['feature1']
        f2 = spec['params']['feature2']
        inputs_available = f1 in feat.columns and f2 in feat.columns
        if inputs_available:
            feat[feature_id] = feat[f1] * feat[f2]
    except Exception as e:
        print(f"Could not calculate interaction feature '{feature_id}': {e}")
# Lag every feature by one row so a row only carries values from the previous row.
feat = feat.shift(1)
print(f"Calculated {feat.shape[1]} feature series.")

# Sequential Features
# A sequential feature is True on a row when event B holds on that row and
# event A held on the previous row. Events are threshold tests on z-score
# features: '>' means "above 1.5", '<' means "below -1.5".
sequential_feature_defs = [
    # (new feature name, event A feature, A test, event B feature, B test)
    ('SEQ_VIX_SPIKE_THEN_CORR_DROP',
     'zscore_IVOL_SIGMA_30d__VIX Index', '>',
     'zscore_corr20d_QQQ US Equity:PX_LAST_SPY US Equity:PX_LAST_60d', '<'),
    ('SEQ_YIELD_DROP_THEN_GOLD_VOL_SPIKE',
     'zscore_PX_LAST_60d__USGG10YR Index', '<',
     'zscore_IVOL_SIGMA_30d__GLD US Equity', '>'),
    ('SEQ_NVDA_VOL_SPIKE_THEN_QQQ_PRICE_RISE',
     'zscore_Volume_-Realtime_VOLUME_30d__NVDA US Equity', '>',
     'zscore_PX_LAST_60d__QQQ US Equity', '>'),
]
for sequential_feature_name, event_a_feature, event_a_op, event_b_feature, event_b_op in sequential_feature_defs:
    # Each feature gets its own try block so that one missing component no
    # longer prevents the remaining, independent features from being created.
    try:
        event_A_series = (feat[event_a_feature] > 1.5) if event_a_op == '>' else (feat[event_a_feature] < -1.5)
        event_B_series = (feat[event_b_feature] > 1.5) if event_b_op == '>' else (feat[event_b_feature] < -1.5)
        # fill_value=False keeps the shifted series boolean; the first row has no
        # previous row and therefore counts as "event A did not happen".
        feat[sequential_feature_name] = event_B_series & event_A_series.shift(1, fill_value=False)
        print(f"Successfully created sequential feature: '{sequential_feature_name}'")
    except KeyError as e:
        print(f"Warning: Could not create sequential feature. A component feature was not found: {e}")

# --- 3. Define Primitive Signals from Features ---
# For every feature column this builds boolean "primitive signals":
#   * boolean-like features -> one signal (feature == True)
#   * numeric features      -> percentile-rank signals (> 0.8, < 0.2) and
#                              60-row rolling z-score signals (> 1.5, < -1.5)
# `primitive_signals` holds the definitions, `signal_series` maps each signal id
# to its boolean series (indexed by the feature's non-NaN, finite rows).
print('--- Defining Primitive Signals ---')
primitive_signals = []
signal_series = {}
signal_id_counter = 0
for feature_id in feat.columns:
    s = feat[feature_id].replace([np.inf, -np.inf], np.nan).dropna()
    if s.empty: continue

    # Robust check for boolean-like features: the series contains only values
    # equivalent to True/False or 1/0.
    is_boolean_like = set(s.unique()).issubset({0, 1, True, False})

    if is_boolean_like:
        if s.std() == 0: continue # Skip if all values are the same (e.g., all True or all False)
        sig_id = f"SIG_{signal_id_counter}"; signal_id_counter += 1
        primitive_signals.append({'signal_id': sig_id, 'feature_id': feature_id, 'condition_type': 'boolean', 'operator': '==', 'value': True})
        signal_series[sig_id] = (s == True)
        continue # IMPORTANT: Skip numeric signal generation for boolean features

    if s.std() == 0: continue # Skip non-boolean features with no variance

    # --- Generate percentile signals ---
    # The rank is computed once and compared vectorised instead of being
    # recomputed per signal and tested element by element with .apply().
    pct_rank = s.rank(pct=True)
    for op, val in [('>', 0.8), ('<', 0.2)]:
        sig_id = f"SIG_{signal_id_counter}"; signal_id_counter += 1
        primitive_signals.append({'signal_id': sig_id, 'feature_id': feature_id, 'condition_type': 'percentile', 'operator': op, 'value': val})
        signal_series[sig_id] = (pct_rank > val) if op == '>' else (pct_rank < val)

    # --- Generate z-score signals ---
    rolling_std = s.rolling(60).std()
    valid_std_mask = rolling_std > 1e-9
    if not valid_std_mask.any(): continue

    # z stays NaN wherever the rolling std is missing or (near) zero; NaN
    # compares False against both thresholds, so those rows never fire.
    z = pd.Series(np.nan, index=s.index)
    z[valid_std_mask] = (s - s.rolling(60).mean())[valid_std_mask] / rolling_std[valid_std_mask]

    for op, val in [('>', 1.5), ('<', -1.5)]:
        sig_id = f"SIG_{signal_id_counter}"; signal_id_counter += 1
        primitive_signals.append({'signal_id': sig_id, 'feature_id': feature_id, 'condition_type': 'z_score', 'operator': op, 'value': val})
        signal_series[sig_id] = (z > val) if op == '>' else (z < val)

print(f"Defined {len(primitive_signals)} primitive signals.")

# --- Prepare Returns for Evaluation ---
# Resolve the PX_LAST column of every tradable ticker (tickers without one are dropped).
price_cols_for_returns = list(filter(None, (first_col_containing(ticker_name, 'PX_LAST') for ticker_name in TRADABLE_TICKERS)))
prices = raw[price_cols_for_returns].copy()
# Forward returns: for horizon h, row t holds the percentage change from t to t+h.
returns = {
    horizon: prices.pct_change(horizon).shift(-horizon)
    for horizon in (1, 3, 5, 10, 21)
}

# --- 4. GENETIC ALGORITHM: Evolve Powerful Setups ---

# --- GENETIC ALGORITHM HELPERS ---
def get_setup_dna(setup):
    """Return an order-independent, hashable fingerprint of a setup.

    The fingerprint is the tuple of the setup's signal ids in sorted order, so two
    setups that contain the same signals in a different order compare equal.
    """
    signal_ids = [definition['signal_id'] for definition in setup['signal_definitions']]
    signal_ids.sort()
    return tuple(signal_ids)

def crossover(parent1, parent2):
    """Breed a child setup from the signal definitions of two parents.

    One signal is drawn at random from each parent; when both parents carry more
    than one signal, a second draw is made from each. Draws that repeat a signal id
    collapse to a single entry, and the result is randomly trimmed to the largest
    length in SETUP_LENGTHS_TO_EXPLORE. The child is returned with the placeholder
    id 'child'.
    """
    genes1 = parent1['signal_definitions']
    genes2 = parent2['signal_definitions']

    drawn = [random.choice(genes1), random.choice(genes2)]
    if len(genes1) > 1 and len(genes2) > 1:
        # Extra draws give the child a chance to reach the larger setup sizes.
        drawn.append(random.choice(genes1))
        drawn.append(random.choice(genes2))

    # De-duplicate by signal id (first position kept, last definition wins).
    unique_by_id = {}
    for gene in drawn:
        unique_by_id[gene['signal_id']] = gene
    child_signals = list(unique_by_id.values())

    longest_allowed = max(SETUP_LENGTHS_TO_EXPLORE)
    if len(child_signals) > longest_allowed:
        child_signals = random.sample(child_signals, longest_allowed)

    return {'id': 'child', 'signal_definitions': child_signals}

def mutate(setup, all_signal_ids, mutation_rate):
    """With probability `mutation_rate`, swap one signal of `setup` for a random one.

    The setup is modified in place and also returned. If the randomly chosen
    replacement is already part of the setup, nothing is changed.
    """
    if not (random.random() < mutation_rate):
        return setup

    genes = setup['signal_definitions']
    slot = random.randint(0, len(genes) - 1)
    replacement_id = random.choice(all_signal_ids)
    # Look up the full definition for the chosen id in the global primitive list.
    replacement = next(p for p in primitive_signals if p['signal_id'] == replacement_id)

    ids_in_setup = [gene['signal_id'] for gene in genes]
    if replacement['signal_id'] not in ids_in_setup:
        genes[slot] = replacement
    return setup

def tournament_selection(population_df, k=5):
    """Pick a parent: sample up to `k` random rows and return the fittest as a dict.

    The tournament size is capped at the population size. The winner is the sampled
    row with the highest 'fitness' value.
    """
    tournament_size = min(k, len(population_df))
    entrants = population_df.sample(n=tournament_size)
    ranked = entrants.sort_values('fitness', ascending=False)
    return ranked.iloc[0].to_dict()

# --- English Description Generator ---
def generate_english_description(setup_id, signal_defs, feature_specs_list):
    """Build a one-sentence English description of a setup.

    Args:
        setup_id: Identifier copied into the returned record.
        signal_defs: Signal definition dicts with 'feature_id', 'condition_type'
            ('boolean', 'percentile' or anything else, treated as z-score style)
            and 'operator'.
        feature_specs_list: Feature spec dicts; the 'display_name' of the first spec
            whose 'unique_id' matches a signal's 'feature_id' is used as its name,
            falling back to the raw feature id.

    Returns:
        dict with 'setup_id', 'description' and 'explained_description'
        (always "DEPRECATED").

    The bullish/bearish wording follows the same rule as evaluate_one_setup:
    boolean signals carry no direction, '>' counts +1 and any other operator -1,
    and a setup made only of boolean signals is treated as bullish (long).
    """
    clauses = []
    for s_def in signal_defs:
        feat_name = next(
            (f_spec['display_name'] for f_spec in feature_specs_list
             if f_spec['unique_id'] == s_def['feature_id']),
            s_def.get('feature_id', 'unknown_feature'),
        )
        if s_def['condition_type'] == 'boolean':
            clauses.append(f"{feat_name} is true")
        elif s_def['condition_type'] == 'percentile':
            level = "is very high" if s_def['operator'] == '>' else "is very low"
            clauses.append(f"{feat_name} {level}")
        else:
            level = "surges unexpectedly" if s_def['operator'] == '>' else "drops sharply"
            clauses.append(f"{feat_name} {level}")

    if not clauses:
        # An empty setup has nothing to describe; avoid indexing into an empty list.
        return {'setup_id': setup_id,
                'description': "No signal conditions are defined for this setup.",
                'explained_description': "DEPRECATED"}

    description = "When " + " and ".join(clauses)

    # Boolean signals use the '==' operator and must not be counted as bearish votes.
    directional_defs = [s for s in signal_defs if s['condition_type'] != 'boolean']
    if directional_defs:
        direction_score = sum(1 if s['operator'] == '>' else -1 for s in directional_defs)
    else:
        direction_score = 1

    if direction_score > 0:
        bias = 'a bullish'
    elif direction_score < 0:
        bias = 'a bearish'
    else:
        bias = 'an uncertain'
    description += f", it may indicate {bias} outlook."
    return {'setup_id': setup_id, 'description': description, 'explained_description': "DEPRECATED"}

# --- Parallel Setup Evaluation Function ---
# === START BLOCK TO REPLACE: evaluate_one_setup function ===
# --- Parallel Setup Evaluation Function (MODIFIED FOR STABILITY) ---
def evaluate_one_setup(setup, returns, feature_specs_list):
    """
    Score a single setup against every tradable ticker and return summary rows only.

    Returns None when the setup cannot be evaluated (unknown signal, no signals,
    too few trigger dates, or directional signals that cancel out). Otherwise
    returns a list with one summary dict per tradable ticker. The trade ledger is
    built separately for top performers later.
    """
    sid = setup['id']
    signal_defs = setup['signal_definitions']

    # AND all per-signal masks together; trigger dates are the rows where every signal is True.
    try:
        component_masks = [signal_series[sig['signal_id']] for sig in signal_defs]
        combined_mask = functools.reduce(lambda left, right: left & right, component_masks)
        dates = combined_mask[combined_mask].index
    except (KeyError, TypeError):
        return None

    if len(dates) < MIN_INITIAL_SUPPORT_FILTER:
        return None

    # Direction vote: '>' is +1, any other operator is -1; boolean signals abstain.
    directional_defs = [sig for sig in signal_defs if sig['condition_type'] != 'boolean']
    if directional_defs:
        direction_score = sum(1 if sig['operator'] == '>' else -1 for sig in directional_defs)
        if direction_score == 0:
            return None
    else:
        # Only boolean signals present: default to a long bias.
        direction_score = 1
    entry_direction = 'long' if direction_score > 0 else 'short'

    # Most frequent feature 'type' among the specs matching this setup's signals.
    feature_types = []
    for sig in signal_defs:
        wanted_feature = sig.get('feature_id')
        for spec in feature_specs_list:
            if spec.get('unique_id') == wanted_feature:
                feature_types.append(spec.get('type', 'unknown'))
    if feature_types:
        dominant_signal_type = max(set(feature_types), key=feature_types.count)
    elif any('SEQ' in sig['feature_id'] for sig in signal_defs):
        dominant_signal_type = 'sequential'
    else:
        dominant_signal_type = 'unknown'

    price_cols_for_returns = []
    for ticker in TRADABLE_TICKERS:
        px_col = first_col_containing(ticker, 'PX_LAST')
        if px_col:
            price_cols_for_returns.append(px_col)

    perf_horizons = [1, 3, 10, 21]
    summary_rows_for_setup = []

    for tk_col in price_cols_for_returns:
        # Recover the ticker symbol that owns this price column.
        tk_symbol = None
        for ticker in TRADABLE_TICKERS:
            if tk_col.startswith(ticker):
                tk_symbol = ticker
                break
        if tk_symbol is None:
            continue

        summary_row = {
            'setup_id': sid, 'target_ticker': tk_symbol,
            'support': len(dates), 'entry_direction': entry_direction, 'dominant_signal_type': dominant_signal_type,
            'first_trigger_date': dates.min(), 'last_trigger_date': dates.max()
        }

        # Returns on the trigger dates for every horizon, sign-flipped for short setups.
        directional_returns = {}
        for h in perf_horizons:
            horizon_returns = returns[h][tk_col].reindex(dates)
            directional_returns[h] = -horizon_returns if entry_direction == 'short' else horizon_returns

        horizon_sharpes = {}
        for h, r_h in directional_returns.items():
            has_enough_data = (not r_h.empty) and len(r_h.dropna()) > 5
            if not has_enough_data:
                summary_row[f'sharpe_{h}d'] = 0.0
                continue
            if r_h.std() < 1e-9:
                sharpe = 0.0
            else:
                sharpe, _, _ = block_bootstrap_sharpe(r_h, block_size=max(1, h // 2))
            summary_row[f'sharpe_{h}d'] = sharpe
            horizon_sharpes[f'{h}d'] = sharpe

        if horizon_sharpes:
            summary_row['best_sharpe_horizon'] = max(horizon_sharpes, key=horizon_sharpes.get)
        else:
            summary_row['best_sharpe_horizon'] = 'N/A'
        summary_rows_for_setup.append(summary_row)

    return summary_rows_for_setup
# === END BLOCK TO REPLACE: evaluate_one_setup function ===

# === START BLOCK TO REPLACE: The entire section from `Create Initial Population` to the end of the script ===
# --- Step 1: Create Initial Population (Generation 0) ---
# Placeholder champion; it is re-initialised again right before the evolutionary loop.
hall_of_fame_setup = {'id': 'HOF', 'signal_definitions': [], 'fitness': -999}
print('\n--- GENETIC ALGORITHM: Creating Initial Population (Generation 0) ---')
all_signal_ids = [s['signal_id'] for s in primitive_signals]
current_population = []
setup_id_counter = 0
existing_dna = set()

# Reserve 10% of the population each for setups built around sequential and regime features.
seed_quotas = {
    'sequential': {'signals': [s['signal_id'] for s in primitive_signals if 'SEQ' in s['feature_id']], 'count': int(POPULATION_SIZE * 0.1)},
    'regime': {'signals': [s['signal_id'] for s in primitive_signals if 'REGIME' in s['feature_id'] or 'IN_HIGH_VOL' in s['feature_id']], 'count': int(POPULATION_SIZE * 0.1)},
}
print("Seeding initial population with diverse feature types...")
for seed_type, info in seed_quotas.items():
    created_count = 0
    attempts = 0
    # Partner signals are the ones OUTSIDE the seeded family. info['signals'] holds
    # signal ids, so the membership test must compare ids (not whole signal dicts).
    seed_signal_ids = set(info['signals'])
    simple_signals = [s['signal_id'] for s in primitive_signals if s['signal_id'] not in seed_signal_ids]
    if not simple_signals: continue # Skip if no simple signals to pair with
    while created_count < info['count'] and attempts < 500:
        if not info['signals']: break
        # Count every draw, including duplicate-DNA retries, so the loop always terminates.
        attempts += 1
        seed_signal_id = random.choice(info['signals'])
        partner_signal_id = random.choice(simple_signals)
        sig_id_list = [seed_signal_id, partner_signal_id]
        temp_setup = {'signal_definitions': [p for p in primitive_signals if p['signal_id'] in sig_id_list]}
        dna = get_setup_dna(temp_setup)
        if dna in existing_dna: continue
        try:
            mask = functools.reduce(lambda a, b: a & b, [signal_series[sid] for sid in sig_id_list])
            if mask.sum() >= MIN_INITIAL_SUPPORT_FILTER:
                temp_setup['id'] = f'S{setup_id_counter:04d}'
                current_population.append(temp_setup); existing_dna.add(dna); setup_id_counter += 1; created_count += 1
        except (KeyError, TypeError): pass  # best effort: skip candidates whose signals cannot be combined
    print(f"- Created {created_count} setups with '{seed_type}' features.")

print("Filling rest of population randomly...")
# Bound the random search so sparse signals cannot make this loop spin forever.
max_fill_attempts = POPULATION_SIZE * 1000
fill_attempts = 0
while len(current_population) < POPULATION_SIZE and fill_attempts < max_fill_attempts:
    fill_attempts += 1
    k = random.choice(SETUP_LENGTHS_TO_EXPLORE)
    sig_id_list = random.sample(all_signal_ids, k)
    temp_setup = {'signal_definitions': [p for p in primitive_signals if p['signal_id'] in sig_id_list]}
    dna = get_setup_dna(temp_setup)
    if dna in existing_dna: continue
    try:
        mask = functools.reduce(lambda a, b: a & b, [signal_series[sid] for sid in sig_id_list])
        if mask.sum() >= MIN_INITIAL_SUPPORT_FILTER:
            temp_setup['id'] = f'S{setup_id_counter:04d}'
            current_population.append(temp_setup); existing_dna.add(dna); setup_id_counter += 1
    except (KeyError, TypeError): pass  # best effort: skip candidates whose signals cannot be combined
if len(current_population) < POPULATION_SIZE:
    print(f"Warning: only {len(current_population)} of {POPULATION_SIZE} setups could be created after {fill_attempts} attempts.")
print(f"Created initial population of {len(current_population)} unique setups.")

# --- Step 2: The Main Evolutionary Loop ---
hall_of_fame_setup = {'id': 'HOF_CHAMPION', 'signal_definitions': [], 'fitness': -999}


def get_dominant_type(setup):
    """Return the most frequent feature 'type' among a setup's signals, or 'unknown'."""
    feature_types = [spec.get('type', 'unknown') for sig_def in setup['signal_definitions'] for spec in feature_specs if spec.get('unique_id') == sig_def.get('feature_id')]
    return max(set(feature_types), key=feature_types.count) if feature_types else 'unknown'


for generation in range(NUM_GENERATIONS):
    print(f"\n--- Evaluating Generation {generation + 1}/{NUM_GENERATIONS} ---")
    results = Parallel(n_jobs=-1)(delayed(evaluate_one_setup)(setup, returns, feature_specs) for setup in current_population)
    population_df = pd.DataFrame(current_population)

    # Fitness of a setup = its best Sharpe over all tickers and horizons; -99 marks "not evaluable".
    setup_fitness = {}
    for setup, summary_rows in zip(current_population, results):
        if summary_rows:
            all_sharpes = [row.get(f'sharpe_{h}d', -99) for row in summary_rows for h in [1, 3, 10, 21]]
            valid_sharpes = [s for s in all_sharpes if s is not None]
            setup_fitness[setup['id']] = np.max(valid_sharpes) if valid_sharpes else -99
        else:
            setup_fitness[setup['id']] = -99
    population_df['fitness'] = population_df['id'].map(setup_fitness).fillna(-99)

    # Niche penalty: feature types that dominate the top 20% get a stronger penalty.
    population_df['dominant_type'] = population_df.apply(get_dominant_type, axis=1)
    niche_count = population_df.nlargest(int(POPULATION_SIZE * 0.2), 'fitness')['dominant_type'].value_counts()
    population_df['niche_count'] = population_df['dominant_type'].map(niche_count).fillna(1)
    penalty = 1 / (1 + np.log1p(population_df['niche_count']))
    raw_fitness = population_df['fitness']
    # The penalty is < 1. Multiplying a negative fitness by it would RAISE the score,
    # so non-positive fitness is divided instead; the penalty always lowers the ranking.
    population_df['adjusted_fitness'] = np.where(raw_fitness > 0, raw_fitness * penalty, raw_fitness / penalty)
    population_df = population_df.sort_values('adjusted_fitness', ascending=False).reset_index(drop=True)

    best_of_gen_df = population_df.sort_values('fitness', ascending=False).iloc[0]
    print(f"Generation {generation + 1} Complete. Best Fitness (Raw Sharpe): {best_of_gen_df['fitness']:.2f}")

    if best_of_gen_df['fitness'] > hall_of_fame_setup['fitness']:
        print(f"--- New Hall of Fame Champion Found! Fitness: {best_of_gen_df['fitness']:.2f} ---")
        hall_of_fame_setup.update(best_of_gen_df.to_dict())

    # Elites (by niche-adjusted fitness) survive unchanged; the rest is bred from tournaments.
    next_generation = population_df.head(int(POPULATION_SIZE * ELITISM_RATE)).to_dict('records')
    existing_dna = {get_setup_dna(s) for s in next_generation}
    # Bound the breeding search so a converged population cannot loop forever on duplicates.
    max_breeding_attempts = POPULATION_SIZE * 1000
    breeding_attempts = 0
    while len(next_generation) < POPULATION_SIZE and breeding_attempts < max_breeding_attempts:
        breeding_attempts += 1
        parent1 = tournament_selection(population_df, k=3)
        parent2 = tournament_selection(population_df, k=3)
        child = crossover(parent1, parent2)
        child = mutate(child, all_signal_ids, MUTATION_RATE)
        dna = get_setup_dna(child)
        if dna in existing_dna: continue
        try:
            mask = functools.reduce(lambda a, b: a & b, [signal_series[s['signal_id']] for s in child['signal_definitions']])
            if mask.sum() >= MIN_INITIAL_SUPPORT_FILTER:
                child['id'] = f'S{setup_id_counter:04d}'; setup_id_counter += 1
                next_generation.append(child); existing_dna.add(dna)
        except (KeyError, TypeError): continue
    if len(next_generation) < POPULATION_SIZE:
        print(f"Warning: only {len(next_generation)} of {POPULATION_SIZE} setups could be bred after {breeding_attempts} attempts.")
    current_population = next_generation

# --- Final Evaluation and Output Generation ---
print("\n--- Genetic Algorithm Complete. Running Final Evaluation ---")
final_population = current_population
# Re-add the all-time champion if evolution dropped it. The untouched placeholder
# (no signal definitions) is skipped because it cannot be evaluated anyway.
if hall_of_fame_setup['signal_definitions'] and get_setup_dna(hall_of_fame_setup) not in {get_setup_dna(s) for s in final_population}:
    print("Hall of Fame champion was not in the final generation. Adding it for final evaluation.")
    final_population.append(hall_of_fame_setup)

final_results = Parallel(n_jobs=-1)(delayed(evaluate_one_setup)(setup, returns, feature_specs) for setup in final_population)
summary_rows = [row for res in final_results if res for row in res]
if not summary_rows:
    print("Discovery complete. No valid setups were found.")
else:
    summary_df = pd.DataFrame(summary_rows)
    # Setup-level fitness = best Sharpe over all of the setup's tickers and horizons.
    sharpe_cols = ['sharpe_1d', 'sharpe_3d', 'sharpe_10d', 'sharpe_21d']
    row_best_sharpe = summary_df[sharpe_cols].max(axis=1)
    final_fitness_map = row_best_sharpe.groupby(summary_df['setup_id']).max()
    summary_df['fitness'] = summary_df['setup_id'].map(final_fitness_map)
    # All rows of a setup share the same fitness, so break ties by the row's own best
    # Sharpe. This makes the first row per setup the ticker that produced the fitness.
    summary_df['_row_best_sharpe'] = row_best_sharpe
    summary_df = (
        summary_df.sort_values(['fitness', '_row_best_sharpe'], ascending=False)
        .drop(columns='_row_best_sharpe')
        .reset_index(drop=True)
    )

    # --- Build Ledger & Tier 1 Metrics for Top N Setups ---
    TOP_N_FOR_DEEP_DIVE = 50 # Build ledger only for the top 50 setups
    top_setups_df = summary_df.drop_duplicates(subset=['setup_id']).head(TOP_N_FOR_DEEP_DIVE)

    print(f"\n--- Performing deep dive on Top {len(top_setups_df)} setups (building ledger, recency...) ---")
    all_trade_ledger_rows = []
    all_description_records = []
    recency_sharpe_map = {}

    # This loop is serial and targeted at the top setups only
    for _, setup_row in top_setups_df.iterrows():
        setup_id = setup_row['setup_id']
        setup_def = next((p for p in final_population if p['id'] == setup_id), None)
        if not setup_def: continue

        # Generate description
        all_description_records.append(generate_english_description(setup_id, setup_def['signal_definitions'], feature_specs))

        # Re-create mask and dates to ensure data integrity
        mask = functools.reduce(lambda a, b: a & b, [signal_series[s['signal_id']] for s in setup_def['signal_definitions']])
        dates = mask[mask].index

        # --- Recency Metric: Sharpe of the last RECENCY_WINDOW triggers at the best horizon ---
        if len(dates) >= RECENCY_WINDOW:
            recent_dates = dates[-RECENCY_WINDOW:]
            best_h_str = setup_row['best_sharpe_horizon']
            if best_h_str != 'N/A':
                best_h = int(best_h_str.replace('d', ''))

                # Use the helper function to find the correct price column
                price_col_name = first_col_containing(setup_row['target_ticker'], 'PX_LAST')
                if price_col_name:
                    r_recent = returns[best_h][price_col_name].reindex(recent_dates)
                    if setup_row['entry_direction'] == 'short': r_recent = -r_recent

                    if r_recent.std() > 1e-9 and len(r_recent.dropna()) > 2:
                        recency_sharpe_map[setup_id] = (r_recent.mean() / r_recent.std()) * np.sqrt(252)
                    else:
                        recency_sharpe_map[setup_id] = 0

        # --- Build Trade Ledger (Serial) ---
        tk_symbol = setup_row['target_ticker']
        tk_col = first_col_containing(tk_symbol, 'PX_LAST')
        ivol_col = (first_col_containing(tk_symbol, '30_Day_Call_Implied_Volatility') or first_col_containing(tk_symbol, 'IVOL_SIGMA'))
        ivol_series = raw[ivol_col].reindex(dates) if ivol_col else pd.Series(np.nan, index=dates)
        price_series = raw[tk_col]
        entry_px_series = price_series.reindex(dates)

        for i, d in enumerate(dates):
            entry_px = entry_px_series.iloc[i]
            ivol = ivol_series.iloc[i] if not ivol_series.empty else np.nan
            for h_opt in OPTION_SIM_HORIZONS_DAYS:
                # Exit on the first available row ON OR AFTER the target date (bfill).
                # 'nearest' could snap backwards, e.g. Friday + 1 day -> Saturday -> Friday,
                # which would use the entry price itself as the exit price.
                exit_px_for_h = price_series.reindex([d + pd.Timedelta(days=h_opt)], method='bfill', tolerance=pd.Timedelta(days=3))
                final_exit_px = exit_px_for_h.iloc[0] if not exit_px_for_h.empty else np.nan
                pnl_detail = simulate_option_pnl_detailed(entry_px, final_exit_px, ivol, h_opt, setup_row['entry_direction'])
                all_trade_ledger_rows.append({'setup_id': setup_id, 'trigger_date': d, 'target_ticker': tk_symbol, 'horizon_days': h_opt, **pnl_detail})

    # --- Final Assembly and Output ---
    trade_ledger_df = pd.DataFrame(all_trade_ledger_rows)
    description_df = pd.DataFrame(all_description_records).drop_duplicates(subset=['setup_id'])

    # Add new metrics to summary_df and format for readability
    summary_df['recency_sharpe'] = summary_df['setup_id'].map(recency_sharpe_map)

    if not trade_ledger_df.empty:
        for h_opt in OPTION_SIM_HORIZONS_DAYS:
            ledger_for_h = trade_ledger_df[trade_ledger_df['horizon_days'] == h_opt]
            pnl_dollars_map = ledger_for_h.groupby('setup_id')['pnl_dollars'].mean()
            pnl_pct_map = ledger_for_h.groupby('setup_id')['pnl_pct'].mean()
            summary_df[f'avg_option_pnl_dollars_{h_opt}d'] = summary_df['setup_id'].map(pnl_dollars_map)
            summary_df[f'avg_option_pnl_pct_{h_opt}d'] = summary_df['setup_id'].map(pnl_pct_map)

    # Round numeric columns for readability
    numeric_cols = summary_df.select_dtypes(include=np.number).columns
    summary_df[numeric_cols] = summary_df[numeric_cols].round(4)
    if 'pnl_pct' in trade_ledger_df.columns:
        trade_ledger_df['pnl_pct'] = trade_ledger_df['pnl_pct'].round(4)
    if 'pnl_dollars' in trade_ledger_df.columns:
        trade_ledger_df['pnl_dollars'] = trade_ledger_df['pnl_dollars'].round(2)

    # Merge descriptions into summary
    final_summary_df = pd.merge(summary_df, description_df[['setup_id', 'description']], on='setup_id', how='left')

    print('\n--- Generating Final Output Files ---')
    final_summary_df.to_csv('setup_results_summary.csv', index=False)
    print("Saved 'setup_results_summary.csv'")
    trade_ledger_df.to_csv('trade_ledger.csv', index=False)
    print("Saved 'trade_ledger.csv'")

    # --- Save and Print Top Setups ---
    print("\n--- Generating Final JSON and Summary ---")

    # Create a copy for JSON to avoid changing the main dataframe's dtypes
    top_setups_for_json = final_summary_df.head(TOP_N_FOR_DEEP_DIVE).copy()

    # Convert Timestamp columns to strings for JSON serialization
    date_cols = ['first_trigger_date', 'last_trigger_date']
    for col in date_cols:
        if col in top_setups_for_json.columns:
            top_setups_for_json[col] = pd.to_datetime(top_setups_for_json[col]).dt.strftime('%Y-%m-%d')

    # Round-trip through DataFrame.to_json so missing values become null (plain
    # json.dump would emit the non-standard token NaN) and NumPy scalars become
    # native JSON numbers.
    top_setups_json = json.loads(top_setups_for_json.to_json(orient='records'))
    with open('top_setups.json', 'w') as f:
        json.dump(top_setups_json, f, indent=2)
    print("Saved 'top_setups.json'")

    print('\nDiscovery complete.')
    print("\nTop Setups by Final Fitness Score (Max Sharpe):")
    display_cols = ['setup_id', 'target_ticker', 'support', 'fitness', 'recency_sharpe', 'description']
    print(final_summary_df[display_cols].head())


Loading raw workbooks…
Raw shape: (1978, 568)

Identified all relevant prefixes/tickers for feature engineering: 33

--- Defining ALL Feature Specifications ---
Defined 3359 total feature specifications.
--- Calculating All Features ---
Calculated 3132 feature series.
Successfully created sequential feature: 'SEQ_VIX_SPIKE_THEN_CORR_DROP'
Successfully created sequential feature: 'SEQ_YIELD_DROP_THEN_GOLD_VOL_SPIKE'
Successfully created sequential feature: 'SEQ_NVDA_VOL_SPIKE_THEN_QQQ_PRICE_RISE'
--- Defining Primitive Signals ---
Defined 10325 primitive signals.

--- GENETIC ALGORITHM: Creating Initial Population (Generation 0) ---
Seeding initial population with diverse feature types...
- Created 5 setups with 'sequential' features.
- Created 5 setups with 'regime' features.
Filling rest of population randomly...
Created initial population of 50 unique setups.

--- Evaluating Generation 1/10 ---
Generation 1 Complete. Best Fitness (Raw Sharpe): 102.21
--- New Hall of Fame Champion Fou

KeyboardInterrupt: 

In [7]:
# Feature Engineering + Discovery Engine with FULLY Migrated Structured Features
import pandas as pd
import numpy as np
import random
import warnings
import functools
import os
import itertools
import json
from joblib import Parallel, delayed

warnings.filterwarnings('ignore')

# --- DEFINITIONS AND CONFIGURATION ---

# --- PRIORITY 1: IMPLEMENT FULL REPRODUCIBILITY ---
# Seed Python's `random` module and NumPy's global RNG with one fixed value so that
# repeated runs of this script draw the same random numbers in this process.
RANDOM_SEED = 44
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
# NOTE(review): the seeds above are set in this process only. Whether functions run
# through joblib.Parallel produce reproducible random draws depends on those functions
# being deterministic or seeding themselves — confirm against the evaluation code.

# Explicit list of tradable tickers (the instruments setups are evaluated against)
TRADABLE_TICKERS = [
    'QQQ US Equity', 'SPY US Equity', 'XLK US Equity', 'XLF US Equity',
    'XLE US Equity', 'ARKK US Equity', 'VIX Index', 'GLD US Equity',
    'NBIS US Equity', 'LLY US Equity', 'TSLA US Equity', 'AAPL US Equity',
    'NVDA US Equity'
]

# Macro tickers listed explicitly to ensure their inclusion in feature generation
MACRO_TICKERS = [
    'DXY Curncy', 'USGG10YR Index', 'USGG2YR Index', 'CPI YOY Index',
    'INJCJC Index', 'FFA Comdty', 'LF94TRUU Index', 'CPI CHNG Index',
    'NFP TCH Index', 'JOBS US Equity', 'CTII10 Govt', 'USSW10 Curncy',
    'MLCX3CRT Index', 'FARBAST Index', 'BSPGCPUS Index', 'SPCSUSA Index',
    'SPCS20SM Index', 'CONSSENT Index', 'CO1 Comdty'
]

# --- UPDATED FILE PATHS ---
# Excel workbooks read by the loading code below (paths relative to the working directory).
MAIN_DATA_FILE = 'All_tickers_new.xlsx'
MACRO_DATA_FILE = 'Macro_tickers_new.xlsx'

# --- GENETIC ALGORITHM CONFIGURATION ---
NUM_GENERATIONS = 50     # How many evolutionary cycles to run
POPULATION_SIZE = 250     # How many setups (individuals) in each generation
SETUP_LENGTHS_TO_EXPLORE = [2, 3] # Allow setups of 2 or 3 conditions
ELITISM_RATE = 0.1       # Fraction of the best setups kept untouched for the next generation
MUTATION_RATE = 0.20     # ---GA v2.1: More aggressive mutation rate---
# General Configuration
MIN_INITIAL_SUPPORT_FILTER = 5  # presumably the minimum number of trigger dates a setup needs — verify against usage
OPTION_SIM_HORIZONS_DAYS = [1, 3, 10, 21]  # horizons, in days, for the option simulation
RISK_FREE_RATE = 0.01  # NOTE(review): no use of this constant is visible in this part of the file
RECENCY_WINDOW = 10 # How many recent trades to check for performance decay

# --- END DEFINITIONS AND CONFIGURATION ---


print('Loading raw workbooks…')


# --- MODIFIED Custom Data Loading Function ---
def load_and_merge_excel(file_path, header_row, existing_df=None):
    """Load every sheet of an Excel workbook and outer-merge them on 'Date'.

    Each sheet's columns are prefixed with the sheet name (except 'Date'). A column
    called 'Dates' is renamed to 'Date'. Sheets without a date column are skipped
    with a warning.

    Args:
        file_path: Path of the workbook to read.
        header_row: Zero-based row index holding the column headers.
        existing_df: Optional DataFrame to merge the sheets into (a copy is used).

    Returns:
        The merged DataFrame, or `existing_df` unchanged if the file is missing or
        any other error occurs while loading (best effort; the error is printed).
    """
    try:
        # Context manager guarantees the workbook handle is closed, even on errors.
        with pd.ExcelFile(file_path) as xls:
            current_df = existing_df.copy() if existing_df is not None else None
            for sh_name in xls.sheet_names:
                # Use the specified header_row to correctly read the file
                df = pd.read_excel(xls, sheet_name=sh_name, header=header_row)

                if 'Dates' in df.columns and 'Date' not in df.columns:
                    df = df.rename(columns={'Dates': 'Date'})
                if 'Date' not in df.columns:
                    print(f"Warning: Sheet '{sh_name}' in '{file_path}' is missing a 'Date'/'Dates' column. Skipping sheet.")
                    continue
                df.columns = [f"{sh_name}_{col}" if col != 'Date' else col for col in df.columns]
                # Drop repeated column names for every sheet (including the first one)
                # so later merges never see ambiguous labels.
                df = df.loc[:, ~df.columns.duplicated()]
                if current_df is None:
                    current_df = df
                else:
                    current_df = current_df.merge(df, on='Date', how='outer')
        return current_df
    except FileNotFoundError:
        print(f"Error: '{file_path}' not found.")
        return existing_df
    except Exception as e:
        # Deliberate best effort: report the problem and fall back to what we had.
        print(f"An unexpected error occurred during Excel loading of '{file_path}': {e}")
        return existing_df


# --- MODIFIED Load main and macro data ---
# Load main tickers file, specifying headers are on Row 2 (index 1)
raw = load_and_merge_excel(MAIN_DATA_FILE, header_row=1)
if raw is not None and not raw.empty:
    # Load macro file, specifying headers are on Row 5 (index 4)
    raw = load_and_merge_excel(MACRO_DATA_FILE, header_row=4, existing_df=raw)
else:
    print("Main data could not be loaded, skipping macro data loading.")
    raw = pd.DataFrame()

# Final cleaning and indexing
if not raw.empty:
    has_date_column = 'Date' in raw.columns
    if has_date_column:
        # Parse dates BEFORE sorting so rows are ordered chronologically.
        raw['Date'] = pd.to_datetime(raw['Date'])
        raw = raw.sort_values('Date').reset_index(drop=True)
    # Forward-fill gaps; DataFrame.ffill() replaces the deprecated fillna(method='ffill').
    raw = raw.ffill()
    if has_date_column:
        raw = raw.drop_duplicates(subset=['Date'], keep='last')
        raw = raw.set_index('Date')
        raw.index = pd.to_datetime(raw.index)
        raw = raw.sort_index()
else:
    print("No data loaded. Raw DataFrame is empty.")
    raw = pd.DataFrame()

print('Raw shape:', raw.shape)

# --- Dynamic Ticker Identification ---
# Candidate prefixes: the text before the first underscore of every column name that has one.
all_column_prefixes = sorted({col.split('_')[0] for col in raw.columns if '_' in col})
# Prefixes that are excluded from the ticker candidates.
COMMON_FEATURE_PREFIXES = [
    'Last', 'Open', 'High', 'Low', 'VWAP', 'Volume', 'IVOL', 'Implied', 'Total', '30', '10',
    '60', 'Hist.', '1st', 'Put', 'Dates', 'CHG', 'FFA', 'INJCJC', 'NFP', 'JOBS', 'CPI', 'CTII10',
    'LF94TRUU', 'SPX', 'USSW10', 'MLCX3CRT', 'FARBAST', 'BSPGCPUS', 'SPCSUSA', 'SPCS20SM',
    'CONSSENT',
]
actual_ticker_prefixes = [prefix for prefix in all_column_prefixes if prefix not in COMMON_FEATURE_PREFIXES]
# Union of the configured tickers and the prefixes discovered in the data, de-duplicated and sorted.
all_tickers = sorted(set(TRADABLE_TICKERS) | set(actual_ticker_prefixes) | set(MACRO_TICKERS))
print(f'\nIdentified all relevant prefixes/tickers for feature engineering: {len(all_tickers)}')


# --- Helper functions ---
def first_col_containing(ticker_full_name, substr=''):
    """Return the first column of `raw` for a ticker that contains `substr`, else None.

    For substr == 'PX_LAST' the two exact names '<ticker>_Last_Price_PX_LAST' and
    '<ticker>_PX_LAST' are tried first (in that order). Otherwise, or if neither
    exists, the first column that starts with the ticker name and contains
    `substr` is returned.
    """
    columns = raw.columns
    if substr == 'PX_LAST':
        preferred_names = (
            f"{ticker_full_name}_Last_Price_PX_LAST",
            f"{ticker_full_name}_PX_LAST",
        )
        for name in preferred_names:
            if name in columns:
                return name
    matches = (col for col in columns if col.startswith(ticker_full_name) and substr in col)
    return next(matches, None)

def safe_series(col_name):
    """Return raw[col_name], or an empty float Series on raw's index if the column is missing."""
    if col_name and col_name in raw.columns:
        return raw[col_name]
    return pd.Series(index=raw.index, dtype=float)

def frac_diff(series, d=0.5, window=100):
    """Apply a fixed-window fractional difference of order `d` to `series`.

    Weights follow the recurrence w_0 = 1, w_k = -w_{k-1} * (d - k + 1) / k. For
    every position from `window` onwards, the output is the dot product of the
    first `window` weights (most recent observation weighted by w_0) with the
    trailing `window` observations. Positions without a value are dropped.
    """
    n_obs = len(series)

    coefficients = [1.]
    for lag in range(1, n_obs):
        coefficients.append(-coefficients[-1] * (d - lag + 1) / lag)
    # Reverse so the newest observation lines up with w_0, then keep the trailing window.
    kernel = np.array(coefficients[::-1])[-window:]

    result = pd.Series(index=series.index, dtype=float)
    for end in range(window, n_obs):
        trailing = series.iloc[end - window + 1: end + 1]
        if len(trailing) == len(kernel):
            result.iloc[end] = np.dot(kernel, trailing)
    return result.dropna()

def block_bootstrap_sharpe(returns_series, block_size, num_iterations=1000, annualize=True, trading_days_per_year=252):
    """Estimate a Sharpe ratio and its 5%/95% band with a block bootstrap.

    The non-NaN returns are cut into consecutive, non-overlapping blocks of
    `block_size` observations. Each iteration draws blocks with replacement,
    concatenates them, truncates to the original length and computes
    mean / sample-std (scaled by sqrt(trading_days_per_year) when `annualize`).

    Args:
        returns_series: pandas Series of returns (NaNs are dropped).
        block_size: Number of consecutive observations per block.
        num_iterations: Number of bootstrap resamples.
        annualize: Whether to scale by sqrt(trading_days_per_year).
        trading_days_per_year: Annualization factor.

    Returns:
        (median, 5th percentile, 95th percentile) of the resampled Sharpe ratios,
        or (0.0, 0.0, 0.0) when there are fewer than 2 observations, fewer
        observations than `block_size`, or nothing could be resampled.
    """
    # Work on a plain float array: concatenating Series objects on every iteration
    # dominated the runtime of this function.
    values = returns_series.dropna().to_numpy(dtype=float)
    n_obs = len(values)
    if n_obs < block_size or n_obs < 2:
        return 0.0, 0.0, 0.0

    blocks = [values[start: start + block_size] for start in range(0, n_obs, block_size)]
    if not blocks:
        return 0.0, 0.0, 0.0

    n_blocks_to_sample = int(np.ceil(n_obs / block_size))
    scale = np.sqrt(trading_days_per_year) if annualize else 1

    sharpes = []
    for _ in range(num_iterations):
        picked = np.random.choice(len(blocks), n_blocks_to_sample, replace=True)
        resampled = np.concatenate([blocks[i] for i in picked])[:n_obs]
        resampled_std = resampled.std(ddof=1)  # sample std, matching pandas' default
        if resampled_std > 1e-9:
            sharpes.append((resampled.mean() / resampled_std) * scale)
        else:
            sharpes.append(0.0)

    if not sharpes:
        return 0.0, 0.0, 0.0
    return np.median(sharpes), np.percentile(sharpes, 5), np.percentile(sharpes, 95)

def calculate_sortino_ratio(returns_series, annualize=True, trading_days_per_year=252):
    """Calculate the Sortino ratio of a return series against a target return of 0.

    The denominator is the downside deviation: the root mean square of the
    shortfalls below the target, taken over ALL observations (returns at or above
    the target contribute zero). This stays defined when only a single return is
    negative, where a sample standard deviation of the negative returns would be NaN.

    Args:
        returns_series: pandas Series of returns (NaNs are dropped).
        annualize: Whether to scale by sqrt(trading_days_per_year).
        trading_days_per_year: Annualization factor.

    Returns:
        0.0 for fewer than 2 observations, np.inf when no return falls below the
        target, otherwise the (optionally annualized) Sortino ratio.
    """
    returns_series = returns_series.dropna()
    if len(returns_series) < 2: return 0.0

    target_return = 0
    # Shortfall is negative where the return is below target and 0 elsewhere.
    shortfall = (returns_series - target_return).clip(upper=0)

    if not (shortfall < 0).any(): return np.inf

    expected_return = returns_series.mean()
    downside_deviation = np.sqrt((shortfall ** 2).mean())

    # Guard against shortfalls so tiny that their squares underflow to zero.
    if downside_deviation == 0: return np.inf

    sortino = (expected_return - target_return) / downside_deviation
    return sortino * np.sqrt(trading_days_per_year) if annualize else sortino

def calculate_calmar_ratio(returns_series, annualize=True, trading_days_per_year=252):
    """Calculate the Calmar ratio: return divided by the absolute maximum drawdown.

    Args:
        returns_series: pandas Series of periodic returns (NaNs are dropped).
        annualize: If True (default), the numerator is the annualized geometric
            return; if False, it is the total compounded return of the series.
        trading_days_per_year: Number of periods per year used for annualization.

    Returns:
        0.0 for fewer than 2 observations, np.inf when there is no drawdown,
        -99 when annualizing and the compounded value is negative (the geometric
        return is undefined), otherwise the Calmar ratio.
    """
    returns_series = returns_series.dropna()
    if len(returns_series) < 2: return 0.0

    cumulative_returns = (1 + returns_series).cumprod()
    peak = cumulative_returns.cummax()
    drawdown = (cumulative_returns - peak) / peak
    max_drawdown = drawdown.min()

    if max_drawdown == 0: return np.inf

    total_return = cumulative_returns.iloc[-1] - 1

    if not annualize:
        # Honor the flag: use the raw compounded return instead of annualizing it.
        return total_return / abs(max_drawdown)

    # Guard against a negative base for the fractional power below.
    if (1 + total_return) < 0:
        return -99 # Return a large negative number to signify catastrophic performance

    num_years = len(returns_series) / trading_days_per_year
    annualized_return = (1 + total_return)**(1/num_years) - 1

    calmar = annualized_return / abs(max_drawdown)
    return calmar

# --- Option Simulation Helpers ---
def estimate_atm_premium(price, ivol, days, option_type):
    """Approximate an at-the-money option premium as 0.4 * price * ivol * sqrt(T).

    T is `days` expressed in years (days / 365.25). Returns 0 when the time to
    expiry, price or volatility is zero or negative. `option_type` is accepted
    but does not influence the result.
    """
    years_to_expiry = days / 365.25
    if any(quantity <= 0 for quantity in (years_to_expiry, price, ivol)):
        return 0
    return 0.4 * price * ivol * np.sqrt(years_to_expiry)

def simulate_option_pnl_detailed(current_price, future_price, ivol_at_entry, horizon_days, entry_direction):
    """Simulates buying one at-the-money option at entry and valuing it at the horizon.

    A 'long' direction buys a call and a 'short' direction buys a put, both
    struck at `current_price`. The entry premium comes from
    `estimate_atm_premium`; the exit value is the option's intrinsic value at
    `future_price`.

    Returns:
        A dict with the keys pnl_per_share, option_type, strike_price,
        entry_premium, exit_value, pnl_dollars (per-share P&L times 100),
        pnl_pct (P&L as a percentage of the premium), skipped_reason,
        Underlying_Exit_Price and Return_Underlying. When an input is unusable
        the numeric fields are NaN and skipped_reason names the problem;
        otherwise skipped_reason is the string 'None'.
    """
    exit_price_known = pd.notna(future_price)
    if current_price and pd.notna(current_price) and exit_price_known:
        underlying_return = (future_price - current_price) / current_price
    else:
        underlying_return = np.nan

    def _skipped(reason):
        # Result used for every rejected trade; only the reason differs.
        return {
            'pnl_per_share': np.nan,
            'option_type': None,
            'strike_price': np.nan,
            'entry_premium': np.nan,
            'exit_value': np.nan,
            'pnl_dollars': np.nan,
            'pnl_pct': np.nan,
            'skipped_reason': reason,
            'Underlying_Exit_Price': future_price if exit_price_known else np.nan,
            'Return_Underlying': underlying_return,
        }

    # Input validation, in the same precedence order as the reasons below.
    if pd.isna(current_price) or current_price <= 0:
        return _skipped('Invalid Entry Price')
    if pd.isna(ivol_at_entry) or ivol_at_entry <= 0:
        return _skipped('Invalid IVOL')
    if pd.isna(future_price):
        return _skipped('Missing Future Price')
    if entry_direction not in ['long', 'short']:
        return _skipped('Invalid Entry Direction')

    # Values above 1.0 are treated as percentages and converted to a fraction.
    if ivol_at_entry > 1.0:
        scaled_ivol = ivol_at_entry / 100.0
    else:
        scaled_ivol = ivol_at_entry

    strike_price = current_price
    if entry_direction == 'long':
        option_type = 'call'
    else:
        option_type = 'put'
    entry_premium = estimate_atm_premium(current_price, scaled_ivol, horizon_days, option_type)

    # Intrinsic value at the horizon, floored at zero.
    if option_type == 'call':
        exit_value = max(future_price - strike_price, 0)
    else:
        exit_value = max(strike_price - future_price, 0)

    pnl_per_share = exit_value - entry_premium
    pnl_dollars = pnl_per_share * 100

    # P&L as a percentage of premium paid; undefined when no premium was paid.
    if entry_premium > 0:
        pnl_pct = (pnl_per_share / entry_premium) * 100
    else:
        pnl_pct = np.nan

    return {
        'pnl_per_share': pnl_per_share,
        'option_type': option_type,
        'strike_price': strike_price,
        'entry_premium': entry_premium,
        'exit_value': exit_value,
        'pnl_dollars': pnl_dollars,
        'pnl_pct': pnl_pct,
        'skipped_reason': 'None',
        'Underlying_Exit_Price': future_price,
        'Return_Underlying': underlying_return,
    }
# --- 1. Define Feature Specifications ---
print('\n--- Defining ALL Feature Specifications ---')
feature_specs = []
# Volatility Features
# Per ticker: one implied-vol term-structure spread, one put/call skew spread,
# and for each IVOL field an "ivol_shock" spec plus an IVOL-over-VOLUME spec.
# NOTE(review): for suffix 'IVOL_SIGMA' the ivol_shock unique_id
# ('zscore_IVOL_SIGMA_30d__<ticker>') is the same string as the 30d
# generic_zscore unique_id defined further down, so the two specs target the
# same feature column -- confirm which one is intended to win.
for ticker in all_tickers:
    f60 = '60_Day_Call_Implied_Volatility'
    f10 = '10_Day_Call_Implied_Volatility'
    put50 = '1st_Month_Put_Imp_Vol_50_Delta'
    call40 = '1st_Month_Call_Imp_Vol_40_Delta'
    feature_specs.extend([
        {
            'type': 'ivol_term_structure',
            'assets': [ticker],
            'params': {'f_long': f60, 'f_short': f10},
            'unique_id': f'term_structure_{f60}-{f10}__{ticker}',
            'display_name': f"diff({f60}, {f10})__{ticker}",
        },
        {
            'type': 'ivol_skew',
            'assets': [ticker],
            'params': {'put': put50, 'call': call40},
            'unique_id': f'skew_{put50}-{call40}__{ticker}',
            'display_name': f"diff({put50}, {call40})__{ticker}",
        },
    ])
    for suffix in ['IVOL_SIGMA', 'CALL_IMP_VOL_30D', 'PUT_IMP_VOL_30D']:
        feature_specs.extend([
            {
                'type': 'ivol_shock',
                'assets': [ticker],
                'params': {'ivol_suffix': suffix, 'window': 30},
                'unique_id': f'zscore_{suffix}_30d__{ticker}',
                'display_name': f"zscore_{suffix}_30d__{ticker}",
            },
            {
                'type': 'ivol_div_volume',
                'assets': [ticker],
                'params': {'ivol_suffix': suffix, 'vol_suffix': 'VOLUME'},
                'unique_id': f'div_{suffix}_by_VOLUME__{ticker}',
                'display_name': f"div({suffix}, VOLUME)__{ticker}",
            },
        ])
# Deriv Flow & Sentiment Features
# Per ticker: put/call ratio EMA, 3-day open-interest change, 30-day volume
# z-score, and a flag spec built from call open interest and 10-day call IV.
for ticker in all_tickers:
    pc_ratio_col = 'PUT_CALL_VOLUME_RATIO_CUR_DAY'
    oi_col = 'OPEN_INT_TOTAL_CALL'
    vol_col = 'Volume_-Realtime_VOLUME'
    sm_oi = 'OPEN_INT_TOTAL_CALL'
    sm_ivol = '10_Day_Call_Implied_Volatility'
    feature_specs.extend([
        {
            'type': 'put_call_ratio_ema',
            'assets': [ticker],
            'params': {'span': 5, 'col': pc_ratio_col},
            'unique_id': f'ema5_{pc_ratio_col}__{ticker}',
            'display_name': f"ema5_{pc_ratio_col}__{ticker}",
        },
        {
            'type': 'open_interest_change',
            'assets': [ticker],
            'params': {'days': 3, 'col': oi_col},
            'unique_id': f'pct_change_{oi_col}_3d__{ticker}',
            'display_name': f"pct_change_{oi_col}_3d__{ticker}",
        },
        {
            'type': 'volume_zscore',
            'assets': [ticker],
            'params': {'window': 30, 'col': vol_col},
            'unique_id': f'zscore_{vol_col}_30d__{ticker}',
            'display_name': f"zscore_{vol_col}_30d__{ticker}",
        },
        {
            'type': 'smart_money_flag',
            'assets': [ticker],
            'params': {'oi_col': sm_oi, 'ivol_col': sm_ivol},
            'unique_id': f'smart_money_{sm_oi}_{sm_ivol}__{ticker}',
            'display_name': f"smart_money(pct_change({sm_oi}) > 0 AND pct_change({sm_ivol}) > 0)__{ticker}",
        },
    ])
# Generic Z-Score feature needed for sequential patterns
# One spec per (ticker, column, window); product() iterates in the same
# ticker -> column -> window order as three nested loops.
# NOTE(review): 'zscore_IVOL_SIGMA_30d__<ticker>' and
# 'zscore_Volume_-Realtime_VOLUME_30d__<ticker>' are also produced as
# unique_ids by the ivol_shock / volume_zscore specs earlier in this file.
for ticker, col, window in itertools.product(
        all_tickers,
        ['PX_LAST', 'IVOL_SIGMA', 'Volume_-Realtime_VOLUME'],
        [30, 60]):
    feature_specs.append({
        'type': 'generic_zscore',
        'assets': [ticker],
        'params': {'col': col, 'window': window},
        'unique_id': f'zscore_{col}_{window}d__{ticker}',
        'display_name': f"zscore({col}, {window}d)__{ticker}",
    })
# Cross-Asset Correlation Features
# For every unordered ticker pair: 20d and 60d rolling correlation, a z-score
# of the 20d correlation, the 20d-60d correlation delta, and a 60d rolling beta.
price_col = 'PX_LAST'
# dict.fromkeys() drops repeated pairs while keeping the order produced by
# itertools.combinations. A set would also dedupe, but it iterates in hash
# order, which for strings can differ between interpreter sessions; that made
# the order of these specs (and everything numbered from it) non-reproducible.
correlation_pairs = list(dict.fromkeys(itertools.combinations(all_tickers, 2)))
for t1, t2 in correlation_pairs:
    for window in [20, 60]:
        feature_specs.append({'type': 'correlation', 'assets': [t1, t2], 'params': {'window': window, 'col': price_col},
                              'unique_id': f'corr_{t1}:{price_col}_{t2}:{price_col}_{window}d', 'display_name': f"corr({t1}:{price_col}, {t2}:{price_col}, {window}d)"})
    # The two specs below are derived from the 20d/60d correlation specs above,
    # so they must stay after them in the list.
    feature_specs.append({'type': 'correlation_zscore', 'assets': [t1, t2], 'params':{'col':price_col, 'window':60},
                          'unique_id': f'zscore_corr20d_{t1}:{price_col}_{t2}:{price_col}_60d', 'display_name': f"zscore_corr(20d)({t1}:{price_col}, {t2}:{price_col}, 60d)"})
    feature_specs.append({'type': 'correlation_delta', 'assets': [t1, t2], 'params':{'col':price_col},
                          'unique_id': f'corr_delta_{t1}:{price_col}_{t2}:{price_col}', 'display_name': f"corr_delta(20d-60d)({t1}:{price_col}, {t2}:{price_col})"})
    feature_specs.append({'type': 'rolling_beta', 'assets': [t1, t2], 'params': {'window': 60, 'col':price_col},
                          'unique_id': f'beta_{t1}:{price_col}_{t2}:{price_col}_60d', 'display_name': f"beta({t1}:{price_col}, {t2}:{price_col}, 60d)"})
# Advanced Correlations
# Hand-picked pairs of (ticker, field) correlated over a rolling window.
# Each row is (t1, f1, t2, f2, win).
adv_corr_defs = [
    dict(zip(('t1', 'f1', 't2', 'f2', 'win'), row))
    for row in (
        ('QQQ US Equity', 'IVOL_SIGMA', 'SPY US Equity', 'IVOL_SIGMA', 30),
        ('TSLA US Equity', 'Volume_-Realtime_VOLUME', 'VIX Index', 'IVOL_SIGMA', 20),
        ('CO1 Comdty', 'PX_LAST', 'XLE US Equity', 'IVOL_SIGMA', 30),
        ('USGG10YR Index', 'PX_LAST', 'XLF US Equity', 'IVOL_SIGMA', 30),
    )
]
for d in adv_corr_defs:
    left_leg = f"{d['t1']}:{d['f1']}"
    right_leg = f"{d['t2']}:{d['f2']}"
    feature_specs.append({
        'type': 'advanced_correlation',
        'assets': [d['t1'], d['t2']],
        'params': {'window': d['win'], 'col1': d['f1'], 'col2': d['f2']},
        'unique_id': f"corr_{left_leg}_{right_leg}_{d['win']}d",
        'display_name': f"corr({left_leg}, {right_leg}, {d['win']}d)",
    })
# Macro Features
# Fixed macro specs as (type, assets, unique_id, display_name); these carry no 'params'.
macro_fixed_specs = (
    ('macro_mpi', ['DXY Curncy', 'USGG10YR Index'], 'macro_mpi', 'Macro Pressure Index'),
    ('macro_fear_overdrive', ['VIX Index', 'DXY Curncy', 'SPY US Equity'], 'macro_fear_overdrive', 'Fear Overdrive'),
    ('macro_sector_rotation', ['XLK US Equity', 'XLE US Equity'], 'macro_xlk_xle_rotation', 'Sector Rotation (XLK-XLE)'),
    ('macro_yield_spread', ['USGG10YR Index', 'USGG2YR Index'], 'macro_10y2y_spread', 'Yield Spread (10Y-2Y)'),
    ('macro_cpi_zscore', ['CPI YOY Index'], 'macro_cpi_z', 'CPI Z-Score'),
    ('macro_injcjc_shock', ['INJCJC Index'], 'macro_jobless_claims_shock', 'Jobless Claims Shock'),
    ('macro_ffa_spread', ['FFA Comdty', 'USGG2YR Index'], 'macro_ffa_spread', 'Fed Funds Spread'),
    ('macro_lf94truu_vol_signal', ['LF94TRUU Index'], 'macro_hyg_vol_signal', 'HYG Vol Signal'),
)
feature_specs.extend(
    {'type': spec_type, 'assets': spec_assets, 'unique_id': spec_uid, 'display_name': spec_label}
    for spec_type, spec_assets, spec_uid, spec_label in macro_fixed_specs
)
# 3-day momentum specs for selected macro series.
for t in ['CPI YOY Index', 'CPI CHNG Index', 'NFP TCH Index', 'JOBS US Equity']:
    feature_specs.append({
        'type': 'macro_generic_mom',
        'assets': [t],
        'params': {'days': 3},
        'unique_id': f'macro_mom3_{t}',
        'display_name': f'Macro Mom3d({t})',
    })
# Generic change specs for the remaining macro series.
for t in ['CTII10 Govt', 'USSW10 Curncy', 'MLCX3CRT Index', 'FARBAST Index',
          'BSPGCPUS Index', 'SPCSUSA Index', 'SPCS20SM Index', 'CONSSENT Index']:
    feature_specs.append({
        'type': 'macro_generic_chg',
        'assets': [t],
        'unique_id': f'macro_chg_{t}',
        'display_name': f'Macro Chg({t})',
    })
# Momentum / Volatility Fractal Features
# Per ticker: momentum-over-volatility, Bollinger %B and fractional differencing specs.
for ticker in all_tickers:
    feature_specs.extend([
        {
            'type': 'mom_div_vol',
            'assets': [ticker],
            'params': {'price_col': price_col, 'mom_win': 5, 'vol_win': 20},
            'unique_id': f'mom_div_vol_{price_col}_5d_20d__{ticker}',
            'display_name': f"mom_div_vol({price_col}, 5d, 20d)__{ticker}",
        },
        {
            'type': 'bollinger_pctB',
            'assets': [ticker],
            'params': {'window': 20, 'price_col': price_col},
            'unique_id': f'pctB_{price_col}_20d__{ticker}',
            'display_name': f"%B({price_col}, 20d)__{ticker}",
        },
        {
            'type': 'fractional_differencing',
            'assets': [ticker],
            'params': {'d': 0.5, 'window': 100, 'price_col': price_col},
            'unique_id': f'frac_diff_{price_col}_d0.5_100w__{ticker}',
            'display_name': f"frac_diff({price_col}, d=0.5, win=100)__{ticker}",
        },
    ])
# Market Regime and Interaction Features
# The interaction spec names two other features by unique_id; it carries no assets of its own.
feature_specs.extend([
    {
        'type': 'regime_filter',
        'assets': ['VIX Index'],
        'params': {'threshold': 25, 'col': 'PX_LAST'},
        'unique_id': 'REGIME_IS_HIGH_VOL',
        'display_name': 'REGIME_IS_HIGH_VOL (VIX > 25)',
    },
    {
        'type': 'interaction',
        'assets': [],
        'params': {
            'feature1': 'zscore_IVOL_SIGMA_30d__AAPL US Equity',
            'feature2': 'REGIME_IS_HIGH_VOL',
        },
        'unique_id': 'zscore_IVOL_SIGMA_30d__AAPL US Equity_IN_HIGH_VOL',
        'display_name': 'zscore(IVOL_SIGMA, 30d)__AAPL US Equity IN_HIGH_VOL',
    },
])
print(f"Defined {len(feature_specs)} total feature specifications.")

# --- 2. Calculate Features Based on Specifications ---
# Builds the `feat` DataFrame (indexed like `raw`) with one column per spec,
# keyed by the spec's 'unique_id'. A spec whose inputs are missing/empty is
# silently skipped; any exception is caught per spec and printed.
# NOTE(review): `first_col_containing` presumably returns a column name (or a
# falsy value when nothing matches) and `safe_series` the matching Series (or
# an empty one) -- both are defined elsewhere; confirm before relying on this.
# NOTE(review): columns are written as feat[unique_id], so two specs sharing a
# unique_id write the same column and the later one replaces the earlier.
print('--- Calculating All Features ---')
feat = pd.DataFrame(index=raw.index)
# The order of calculation matters for interaction features, so iterate twice.
# First pass for all primary features
for spec in feature_specs:
    if spec['type'] == 'interaction': continue # Skip interaction features on the first pass
    feature_id = spec['unique_id']
    try:
        # Long-dated minus short-dated implied volatility.
        if spec['type'] == 'ivol_term_structure':
            ivol60 = safe_series(first_col_containing(spec['assets'][0], spec['params']['f_long'])); ivol10 = safe_series(first_col_containing(spec['assets'][0], spec['params']['f_short']))
            if not ivol60.empty and not ivol10.empty: feat[feature_id] = ivol60 - ivol10
        # Put implied vol minus call implied vol.
        elif spec['type'] == 'ivol_skew':
            put50 = safe_series(first_col_containing(spec['assets'][0], spec['params']['put'])); call40 = safe_series(first_col_containing(spec['assets'][0], spec['params']['call']))
            if not put50.empty and not call40.empty: feat[feature_id] = put50 - call40
        # Rolling z-score of the one-row change (diff) in implied vol.
        elif spec['type'] == 'ivol_shock':
            ivol_s = safe_series(first_col_containing(spec['assets'][0], spec['params']['ivol_suffix']))
            if not ivol_s.empty: feat[feature_id] = (ivol_s.diff() - ivol_s.diff().rolling(spec['params']['window']).mean()) / ivol_s.diff().rolling(spec['params']['window']).std()
        # Implied vol divided by volume; zero volume becomes NaN to avoid division by zero.
        elif spec['type'] == 'ivol_div_volume':
            ivol_s = safe_series(first_col_containing(spec['assets'][0], spec['params']['ivol_suffix'])); vol_s = safe_series(first_col_containing(spec['assets'][0], spec['params']['vol_suffix']))
            if not ivol_s.empty and not vol_s.empty: feat[feature_id] = ivol_s / vol_s.replace(0, np.nan)
        # Exponential moving average of the put/call ratio.
        elif spec['type'] == 'put_call_ratio_ema':
            pc = safe_series(first_col_containing(spec['assets'][0], spec['params']['col']))
            if not pc.empty: feat[feature_id] = pc.ewm(span=spec['params']['span'], adjust=False).mean()
        # Percentage change in open interest over params['days'] rows.
        elif spec['type'] == 'open_interest_change':
            oi = safe_series(first_col_containing(spec['assets'][0], spec['params']['col']))
            if not oi.empty: feat[feature_id] = oi.pct_change(spec['params']['days'])
        # Rolling z-score of the volume level.
        elif spec['type'] == 'volume_zscore':
            vol = safe_series(first_col_containing(spec['assets'][0], spec['params']['col']))
            if not vol.empty: feat[feature_id] = (vol - vol.rolling(spec['params']['window']).mean()) / vol.rolling(spec['params']['window']).std()
        # 1 when both open interest and implied vol rose versus the previous row, else 0.
        elif spec['type'] == 'smart_money_flag':
            oi = safe_series(first_col_containing(spec['assets'][0], spec['params']['oi_col'])).pct_change() > 0; ivol = safe_series(first_col_containing(spec['assets'][0], spec['params']['ivol_col'])).pct_change() > 0
            if not oi.empty and not ivol.empty: feat[feature_id] = (oi & ivol).astype(int)
        # Rolling z-score of the level of an arbitrary column.
        elif spec['type'] == 'generic_zscore':
            s = safe_series(first_col_containing(spec['assets'][0], spec['params']['col']))
            if not s.empty: feat[feature_id] = (s - s.rolling(spec['params']['window']).mean()) / s.rolling(spec['params']['window']).std()
        # Rolling correlation of the same column for two assets, computed on
        # rows where both are present; needs more rows than the window.
        elif spec['type'] == 'correlation':
            t1, t2 = spec['assets']; p1, p2 = first_col_containing(t1, spec['params']['col']), first_col_containing(t2, spec['params']['col'])
            if p1 and p2:
                aligned = pd.DataFrame({'s1': safe_series(p1), 's2': safe_series(p2)}).dropna()
                if len(aligned) > spec['params']['window']: feat[feature_id] = aligned['s1'].rolling(spec['params']['window']).corr(aligned['s2'])
        # Same as 'correlation' but each asset may use a different column.
        elif spec['type'] == 'advanced_correlation':
            t1, t2 = spec['assets']; s1_col = first_col_containing(t1, spec['params']['col1']); s2_col = first_col_containing(t2, spec['params']['col2'])
            if s1_col and s2_col:
                aligned = pd.DataFrame({'s1': safe_series(s1_col), 's2': safe_series(s2_col)}).dropna()
                if len(aligned) > spec['params']['window']: feat[feature_id] = aligned['s1'].rolling(spec['params']['window']).corr(aligned['s2'])
        # Derived from the 20d/60d correlation columns already written to `feat`
        # for this pair; skipped when either of those columns is absent.
        elif spec['type'] in ['correlation_zscore', 'correlation_delta']:
            t1, t2 = spec['assets']; price_col_name = spec['params']['col']; c20_id = f'corr_{t1}:{price_col_name}_{t2}:{price_col_name}_20d'; c60_id = f'corr_{t1}:{price_col_name}_{t2}:{price_col_name}_60d'
            c20 = feat.get(c20_id); c60 = feat.get(c60_id)
            if c20 is not None and c60 is not None:
                if spec['type'] == 'correlation_zscore': feat[feature_id] = (c20 - c20.rolling(spec['params']['window']).mean()) / c20.rolling(spec['params']['window']).std()
                else: feat[feature_id] = c20 - c60
        # Rolling covariance of the two return series divided by the rolling
        # variance of the second asset's returns.
        elif spec['type'] == 'rolling_beta':
            t1, t2 = spec['assets']; p1, p2 = first_col_containing(t1, spec['params']['col']), first_col_containing(t2, spec['params']['col'])
            if p1 and p2:
                rets = pd.DataFrame({'r1': safe_series(p1).pct_change(), 'r2': safe_series(p2).pct_change()}).dropna()
                if len(rets) > spec['params']['window']: feat[feature_id] = rets['r1'].rolling(spec['params']['window']).cov(rets['r2']) / rets['r2'].rolling(spec['params']['window']).var()
        # Sum of the 3-row rolling sums of percentage changes of the two assets.
        elif spec['type'] == 'macro_mpi':
            dxy, ust10 = safe_series(first_col_containing(spec['assets'][0], 'PX_LAST')), safe_series(first_col_containing(spec['assets'][1], 'PX_LAST'))
            if not dxy.empty and not ust10.empty: feat[feature_id] = dxy.pct_change().rolling(3).sum() + ust10.pct_change().rolling(3).sum()
        # 1 when asset 0 > 20, asset 1 rose on the row, and asset 2 is below its 20-row mean.
        elif spec['type'] == 'macro_fear_overdrive':
            vix, dxy, spy = safe_series(first_col_containing(spec['assets'][0], 'PX_LAST')), safe_series(first_col_containing(spec['assets'][1], 'PX_LAST')), safe_series(first_col_containing(spec['assets'][2], 'PX_LAST'))
            if not vix.empty and not dxy.empty and not spy.empty: feat[feature_id] = ((vix > 20) & (dxy.pct_change() > 0) & (spy < spy.rolling(20).mean())).astype(int)
        # Difference of the two assets' 5-row percentage changes.
        elif spec['type'] == 'macro_sector_rotation':
            xlk, xle = safe_series(first_col_containing(spec['assets'][0], 'PX_LAST')), safe_series(first_col_containing(spec['assets'][1], 'PX_LAST'))
            if not xlk.empty and not xle.empty: feat[feature_id] = xlk.pct_change(5) - xle.pct_change(5)
        # Level spread: first asset minus second asset.
        elif spec['type'] == 'macro_yield_spread':
            ust10, ust2 = safe_series(first_col_containing(spec['assets'][0], 'PX_LAST')), safe_series(first_col_containing(spec['assets'][1], 'PX_LAST'))
            if not ust10.empty and not ust2.empty: feat[feature_id] = ust10 - ust2
        # Percentage change over params['days'] rows.
        elif spec['type'] == 'macro_generic_mom':
            px = safe_series(first_col_containing(spec['assets'][0], 'PX_LAST'))
            if not px.empty: feat[feature_id] = px.pct_change(spec['params']['days'])
        # One-row percentage change.
        elif spec['type'] == 'macro_generic_chg':
            px = safe_series(first_col_containing(spec['assets'][0], 'PX_LAST'))
            if not px.empty: feat[feature_id] = px.pct_change()
        # Z-score over a fixed 12-row rolling window.
        elif spec['type'] == 'macro_cpi_zscore':
            cpi = safe_series(first_col_containing(spec['assets'][0], 'PX_LAST'))
            if not cpi.empty: feat[feature_id] = (cpi - cpi.rolling(12).mean()) / cpi.rolling(12).std()
        # 1 when the one-row change exceeds twice the 20-row rolling std of changes.
        elif spec['type'] == 'macro_injcjc_shock':
            injcjc = safe_series(first_col_containing(spec['assets'][0], 'PX_LAST'))
            if not injcjc.empty: feat[feature_id] = (injcjc.diff() > injcjc.diff().rolling(20).std() * 2).astype(int)
        # Level spread: first asset minus second asset.
        elif spec['type'] == 'macro_ffa_spread':
            ffa, ust2 = safe_series(first_col_containing(spec['assets'][0], 'PX_LAST')), safe_series(first_col_containing(spec['assets'][1], 'PX_LAST'))
            if not ffa.empty and not ust2.empty: feat[feature_id] = ffa - ust2
        # 'VOLATILITY_30D' column relative to its own 60-row rolling mean.
        elif spec['type'] == 'macro_lf94truu_vol_signal':
            vol = safe_series(first_col_containing(spec['assets'][0], 'VOLATILITY_30D'))
            if not vol.empty: feat[feature_id] = vol / vol.rolling(60).mean()
        # Boolean column: value above the configured threshold.
        elif spec['type'] == 'regime_filter':
            px = safe_series(first_col_containing(spec['assets'][0], spec['params']['col']))
            if not px.empty: feat[feature_id] = px > spec['params']['threshold']
        # 5-row percentage change over the 20-row rolling std of one-row changes.
        # NOTE(review): the windows and 'PX_LAST' are hard-coded here; the spec's
        # 'mom_win' / 'vol_win' / 'price_col' params are not read.
        elif spec['type'] == 'mom_div_vol':
            px = safe_series(first_col_containing(spec['assets'][0], 'PX_LAST'))
            if not px.empty: feat[feature_id] = px.pct_change(5) / px.pct_change().rolling(20).std()
        # Bollinger %B: position of price between the (mean - 2*std) and
        # (mean + 2*std) bands, i.e. (px - lower) / (upper - lower).
        elif spec['type'] == 'bollinger_pctB':
            px = safe_series(first_col_containing(spec['assets'][0], 'PX_LAST'))
            if not px.empty:
                ma = px.rolling(spec['params']['window']).mean();
                std = px.rolling(spec['params']['window']).std()
                feat[feature_id] = (px - (ma - 2 * std)) / (4 * std)
        # Delegates to `frac_diff` (defined elsewhere); needs more rows than the window.
        elif spec['type'] == 'fractional_differencing':
            px = safe_series(first_col_containing(spec['assets'][0], 'PX_LAST'))
            if not px.empty and len(px) > spec['params']['window']: feat[feature_id] = frac_diff(px, d=spec['params']['d'], window=spec['params']['window'])
    except Exception as e:
        # Best-effort: report the failing feature and carry on with the next spec.
        print(f"Could not calculate feature '{feature_id}': {e}")
# Second pass for interaction features
# An interaction column is the element-wise product of two columns that are
# already present in `feat`; it is skipped when either one is missing.
for spec in feature_specs:
    if spec['type'] != 'interaction':
        continue
    feature_id = spec['unique_id']
    try:
        interaction_params = spec['params']
        f1 = interaction_params['feature1']
        f2 = interaction_params['feature2']
        if all(name in feat.columns for name in (f1, f2)):
            feat[feature_id] = feat[f1] * feat[f2]
    except Exception as e:
        print(f"Could not calculate interaction feature '{feature_id}': {e}")
# Lag every feature column by one row before anything downstream reads it.
feat = feat.shift(1)
print(f"Calculated {feat.shape[1]} feature series.")

# Sequential Features
# A sequential feature is True on a row where event B holds and event A held
# on the previous row. Each definition is:
# (output column, event-A feature, A operator, A threshold,
#  event-B feature, B operator, B threshold)
sequential_feature_defs = [
    ('SEQ_VIX_SPIKE_THEN_CORR_DROP',
     'zscore_IVOL_SIGMA_30d__VIX Index', '>', 1.5,
     'zscore_corr20d_QQQ US Equity:PX_LAST_SPY US Equity:PX_LAST_60d', '<', -1.5),
    ('SEQ_YIELD_DROP_THEN_GOLD_VOL_SPIKE',
     'zscore_PX_LAST_60d__USGG10YR Index', '<', -1.5,
     'zscore_IVOL_SIGMA_30d__GLD US Equity', '>', 1.5),
    ('SEQ_NVDA_VOL_SPIKE_THEN_QQQ_PRICE_RISE',
     'zscore_Volume_-Realtime_VOLUME_30d__NVDA US Equity', '>', 1.5,
     'zscore_PX_LAST_60d__QQQ US Equity', '>', 1.5),
]
for (sequential_feature_name, event_a_name, event_a_op, event_a_threshold,
     event_b_name, event_b_op, event_b_threshold) in sequential_feature_defs:
    # Each feature is handled on its own so that one missing component does
    # not prevent the remaining sequential features from being created.
    try:
        event_a_values = feat[event_a_name]
        event_b_values = feat[event_b_name]
    except KeyError as e:
        print(f"Warning: Could not create sequential feature. A component feature was not found: {e}")
        continue
    event_A_series = (event_a_values > event_a_threshold) if event_a_op == '>' else (event_a_values < event_a_threshold)
    event_B_series = (event_b_values > event_b_threshold) if event_b_op == '>' else (event_b_values < event_b_threshold)
    # fill_value=False keeps the shifted series boolean: the first row has no
    # previous row, so "event A happened before" is False there.
    feat[sequential_feature_name] = event_B_series & event_A_series.shift(1, fill_value=False)
    print(f"Successfully created sequential feature: '{sequential_feature_name}'")

# --- 3. Define Primitive Signals from Features ---
# For every feature column this registers boolean trigger series in
# `signal_series` (keyed by 'SIG_<n>') with matching metadata dicts in
# `primitive_signals`:
#   * boolean-like features -> one "is True" signal;
#   * other features        -> two percentile signals and two z-score signals.
# Signal ids are numbered in feature-column order.
print('--- Defining Primitive Signals ---')
primitive_signals = []
signal_series = {}
signal_id_counter = 0
for feature_id in feat.columns:
    # Infinite values are treated as missing; signals only cover the remaining rows.
    s = feat[feature_id].replace([np.inf, -np.inf], np.nan).dropna()
    if s.empty: continue

    # --- FIX: Robust check for boolean-like features ---
    # This checks if the series contains only values equivalent to True/False or 1/0.
    is_boolean_like = set(s.unique()).issubset({0, 1, True, False})

    if is_boolean_like:
        if s.std() == 0: continue # Skip if all values are the same (e.g., all True or all False)
        sig_id = f"SIG_{signal_id_counter}"; signal_id_counter += 1
        primitive_signals.append({'signal_id': sig_id, 'feature_id': feature_id, 'condition_type': 'boolean', 'operator': '==', 'value': True})
        signal_series[sig_id] = (s == True)
        continue # IMPORTANT: Skip numeric signal generation for boolean features

    if s.std() == 0: continue # Skip non-boolean features with no variance

    # --- Generate percentile signals ---
    # The percentile rank is the same for both thresholds, so compute it once
    # and compare with vectorised operators instead of a per-element lambda.
    # NOTE(review): the rank is taken over the whole series, so each row's
    # percentile depends on values from later rows as well.
    pct_rank = s.rank(pct=True)
    for op, val in [('>', 0.8), ('<', 0.2)]:
        sig_id = f"SIG_{signal_id_counter}"; signal_id_counter += 1
        primitive_signals.append({'signal_id': sig_id, 'feature_id': feature_id, 'condition_type': 'percentile', 'operator': op, 'value': val})
        signal_series[sig_id] = (pct_rank > val) if op == '>' else (pct_rank < val)

    # --- Generate z-score signals ---
    # 60-row rolling z-score; rows whose rolling std is (near) zero or not yet
    # available stay NaN and therefore never trigger.
    rolling_std = s.rolling(60).std()
    valid_std_mask = rolling_std > 1e-9
    if not valid_std_mask.any(): continue

    z = pd.Series(np.nan, index=s.index)
    z[valid_std_mask] = (s - s.rolling(60).mean())[valid_std_mask] / rolling_std[valid_std_mask]

    for op, val in [('>', 1.5), ('<', -1.5)]:
        sig_id = f"SIG_{signal_id_counter}"; signal_id_counter += 1
        primitive_signals.append({'signal_id': sig_id, 'feature_id': feature_id, 'condition_type': 'z_score', 'operator': op, 'value': val})
        signal_series[sig_id] = (z > val) if op == '>' else (z < val)

print(f"Defined {len(primitive_signals)} primitive signals.")

# --- Prepare Returns for Evaluation ---
# Price columns of the tradable tickers (tickers without a match are dropped).
price_cols_for_returns = [
    px_col
    for px_col in (first_col_containing(t, 'PX_LAST') for t in TRADABLE_TICKERS)
    if px_col
]
prices = raw[price_cols_for_returns].copy()
# Forward returns per horizon h: the h-row percentage change, shifted back by
# h rows so that each row holds the return over the following h rows.
returns = {}
for h in [1, 3, 5, 10, 21]:
    returns[h] = prices.pct_change(h).shift(-h)

# --- 4. GENETIC ALGORITHM: Evolve Powerful Setups ---

# --- GENETIC ALGORITHM HELPERS ---
def get_setup_dna(setup):
    """Returns the setup's signal ids as a sorted tuple (a hashable, order-independent key)."""
    signal_ids = [definition['signal_id'] for definition in setup['signal_definitions']]
    signal_ids.sort()
    return tuple(signal_ids)

def crossover(parent1, parent2):
    """Builds a child setup from signals drawn at random from two parents.

    One signal is drawn from each parent; when both parents hold more than one
    signal, a second signal is drawn from each. Duplicate signal ids are
    collapsed, and the child is randomly trimmed to the largest length in
    SETUP_LENGTHS_TO_EXPLORE. Returns a plain dict with id 'child'.
    """
    genes1 = parent1['signal_definitions']
    genes2 = parent2['signal_definitions']

    picks = [random.choice(genes1), random.choice(genes2)]
    if len(genes1) > 1 and len(genes2) > 1:
        # Extra draws give the child a chance to reach a larger setup size.
        picks.append(random.choice(genes1))
        picks.append(random.choice(genes2))

    # Keep one entry per signal id, in first-seen order.
    unique_by_id = {}
    for gene in picks:
        unique_by_id[gene['signal_id']] = gene
    child_signals = list(unique_by_id.values())

    longest_allowed = max(SETUP_LENGTHS_TO_EXPLORE)
    if len(child_signals) > longest_allowed:
        child_signals = random.sample(child_signals, longest_allowed)

    # Child is a dictionary, not a DataFrame row yet
    return {'id': 'child', 'signal_definitions': child_signals}


def mutate(setup, all_signal_ids, mutation_rate):
    """With probability `mutation_rate`, swaps one signal of the setup for a random one.

    The setup is modified in place and also returned. The swap is skipped when
    the randomly chosen replacement is already part of the setup.
    """
    if not (random.random() < mutation_rate):
        return setup

    genes = setup['signal_definitions']
    slot = random.randint(0, len(genes) - 1)
    candidate_id = random.choice(all_signal_ids)
    # Look up the full definition of the chosen signal id.
    candidate = next(p for p in primitive_signals if p['signal_id'] == candidate_id)

    ids_in_setup = [gene['signal_id'] for gene in genes]
    if candidate['signal_id'] not in ids_in_setup:
        genes[slot] = candidate
    return setup

# --- PRIORITY 2: NSGA-II Core Logic Helpers ---
def _dominates(objectives_a, objectives_b):
    """Returns True if `objectives_a` Pareto-dominates `objectives_b`.

    All objectives are maximised: A dominates B when A is no worse than B in
    every objective and strictly better in at least one.
    """
    no_worse_anywhere = all(a >= b for a, b in zip(objectives_a, objectives_b))
    return no_worse_anywhere and any(a > b for a, b in zip(objectives_a, objectives_b))


def non_dominated_sort(population):
    """
    Performs non-dominated sorting on the population.
    Assigns a 'rank' to each individual (setup).

    Each individual is a dict with an 'objectives' sequence in which higher
    values are better; any number of objectives is supported. The function
    writes 'domination_count', 'dominated_solutions' and 'rank' onto every
    individual (rank 1 is the non-dominated front) and returns a new list
    with the individuals ordered front by front.
    """
    for ind1 in population:
        ind1['domination_count'] = 0
        ind1['dominated_solutions'] = []
        for ind2 in population:
            if ind1 is ind2:
                continue
            if _dominates(ind1['objectives'], ind2['objectives']):
                ind1['dominated_solutions'].append(ind2)
            elif _dominates(ind2['objectives'], ind1['objectives']):
                ind1['domination_count'] += 1

    fronts = []
    rank = 1
    # Front 1: individuals that nobody dominates.
    current_front = [ind for ind in population if ind['domination_count'] == 0]
    for ind in current_front:
        ind['rank'] = rank

    while current_front:
        fronts.append(current_front)
        next_front = []
        for ind1 in current_front:
            # Removing this front releases the individuals it dominated; those
            # with no remaining dominators form the next front.
            for ind2 in ind1['dominated_solutions']:
                ind2['domination_count'] -= 1
                if ind2['domination_count'] == 0:
                    ind2['rank'] = rank + 1
                    next_front.append(ind2)
        rank += 1
        current_front = next_front

    # Flatten the list of fronts back into a single population list
    return [ind for front in fronts for ind in front]

def calculate_crowding_distance(front):
    """
    Assigns a 'crowding_distance' to every individual of a Pareto front.

    For each objective the front is sorted in place by that objective; the two
    boundary individuals get an infinite distance and every interior individual
    accumulates the gap between its neighbours, normalised by the objective's
    range. Objectives with zero range contribute nothing. Returns None.
    """
    if not front:
        return

    for member in front:
        member['crowding_distance'] = 0

    objective_count = len(front[0]['objectives'])
    for obj_idx in range(objective_count):
        front.sort(key=lambda member: member['objectives'][obj_idx])

        lowest, highest = front[0], front[-1]
        # Boundary solutions are always kept: infinite distance.
        lowest['crowding_distance'] = float('inf')
        highest['crowding_distance'] = float('inf')

        span = highest['objectives'][obj_idx] - lowest['objectives'][obj_idx]
        if highest['objectives'][obj_idx] == lowest['objectives'][obj_idx]:
            continue

        # Walk interior members together with their left and right neighbours.
        for left, member, right in zip(front, front[1:], front[2:]):
            member['crowding_distance'] += (right['objectives'][obj_idx] - left['objectives'][obj_idx]) / span

def selection_operator(population, k=2):
    """
    Selects a parent using tournament selection based on rank and crowding distance.

    `k` individuals are sampled without replacement (k=2 is a binary
    tournament) and all of them compete: a lower 'rank' wins; between equal
    ranks the larger 'crowding_distance' wins, and on a full tie the later
    sampled contender is kept.

    Raises:
        ValueError: if k < 1, or (from random.sample) if k exceeds the
            population size.
    """
    if k < 1:
        raise ValueError("Tournament size k must be at least 1")

    tournament_contenders = random.sample(population, k)
    winner = tournament_contenders[0]
    for challenger in tournament_contenders[1:]:
        # Lower rank is better
        if challenger['rank'] < winner['rank']:
            winner = challenger
        elif winner['rank'] < challenger['rank']:
            continue
        # If ranks are equal, higher crowding distance is better (more diversity)
        elif not (winner['crowding_distance'] > challenger['crowding_distance']):
            winner = challenger
    return winner

# --- English Description Generator ---
def generate_english_description(setup_id, signal_defs, feature_specs_list):
    """Builds a one-sentence English description of a setup.

    Args:
        setup_id: Identifier copied into the result.
        signal_defs: Signal definition dicts with 'feature_id',
            'condition_type' and 'operator'.
        feature_specs_list: Feature specs used to map a feature's 'unique_id'
            to its 'display_name'; the raw feature id is used when no spec
            matches.

    Returns:
        A dict with 'setup_id', 'description' and 'explained_description'
        (always "DEPRECATED").

    The stated bias follows the same rule as the setup evaluation: only
    non-boolean signals vote ('>' bullish, anything else bearish), and a setup
    made solely of boolean signals is treated as bullish.
    """
    if not signal_defs:
        # Nothing to describe; avoid indexing into an empty clause list.
        return {'setup_id': setup_id, 'description': "No conditions defined.", 'explained_description': "DEPRECATED"}

    clauses = []
    for s_def in signal_defs:
        # First spec whose unique_id matches wins; fall back to the feature id itself.
        feat_name = next((f_spec['display_name'] for f_spec in feature_specs_list if f_spec['unique_id'] == s_def['feature_id']), s_def.get('feature_id', 'unknown_feature'))
        if s_def['condition_type'] == 'boolean':
            clauses.append(f"{feat_name} is true")
        elif s_def['condition_type'] == 'percentile':
            level = "is very high" if s_def['operator'] == '>' else "is very low"
            clauses.append(f"{feat_name} {level}")
        else:
            level = "surges unexpectedly" if s_def['operator'] == '>' else "drops sharply"
            clauses.append(f"{feat_name} {level}")

    description = "When " + " and ".join(clauses)

    # Boolean signals use the '==' operator and carry no direction, so they
    # must not be counted as bearish votes.
    directional_defs = [s for s in signal_defs if s['condition_type'] != 'boolean']
    if directional_defs:
        direction_score = sum(1 if s['operator'] == '>' else -1 for s in directional_defs)
    else:
        direction_score = 1
    bias = 'a bullish' if direction_score > 0 else 'a bearish' if direction_score < 0 else 'an uncertain'

    description += f", it may indicate {bias} outlook."
    return {'setup_id': setup_id, 'description': description, 'explained_description': "DEPRECATED"}

# === START BLOCK TO REPLACE: evaluate_one_setup function ===
def evaluate_one_setup(setup, returns_dict):
    """
    Evaluates a single setup and also returns the trigger dates for phenotype hashing.

    The setup's signals are AND-ed into one boolean mask; the dates where the
    mask is True are the trigger dates. For every tradable ticker with a
    PX_LAST column, the 10-day-horizon returns on those dates are scored with
    the Sortino and Calmar ratios, and the medians across tickers become the
    first two objectives. Support (number of trigger dates) is the third.

    Every return path yields the same set of keys, so rejected setups (which
    can still reach the Pareto front through the support objective) no longer
    lack 'entry_direction' / 'first_trigger_date' / 'last_trigger_date'.
    Rejected setups carry ``entry_direction=None``.

    Args:
        setup: dict with 'id' and 'signal_definitions'.
        returns_dict: mapping horizon -> DataFrame of returns keyed by price column.

    Returns:
        dict with keys 'id', 'signal_definitions', 'objectives',
        'metrics_by_ticker', 'entry_direction', 'trigger_dates',
        'first_trigger_date', 'last_trigger_date'.
    """
    sid, signal_defs = setup['id'], setup['signal_definitions']

    def _rejected(support, trigger_dates):
        # Sentinel result for setups that cannot be scored (-99 on both ratios).
        has_dates = len(trigger_dates) > 0
        return {
            'id': sid,
            'signal_definitions': signal_defs,
            'objectives': (-99, -99, support),
            'metrics_by_ticker': {},
            'entry_direction': None,
            'trigger_dates': trigger_dates,
            'first_trigger_date': trigger_dates.min() if has_dates else pd.NaT,
            'last_trigger_date': trigger_dates.max() if has_dates else pd.NaT,
        }

    try:
        mask = functools.reduce(lambda a, b: a & b, [signal_series[s['signal_id']] for s in signal_defs])
        dates = mask[mask].index
    except (KeyError, TypeError):
        # Unknown signal id, or an empty signal list (reduce without initial value).
        return _rejected(0, pd.Index([]))

    support = len(dates)
    if support < MIN_INITIAL_SUPPORT_FILTER:
        return _rejected(support, pd.Index([]))

    # Direction is the net vote of the non-boolean signals ('>' = +1, otherwise -1).
    has_directional = any(s['condition_type'] != 'boolean' for s in signal_defs)
    direction_score = sum(1 if s['operator'] == '>' else -1 for s in signal_defs if s['condition_type'] != 'boolean')
    if direction_score == 0 and has_directional:
        # Conflicting directional signals: no tradable direction.
        return _rejected(support, dates)
    if not has_directional:
        # Boolean-only setups default to long.
        direction_score = 1
    entry_direction = 'long' if direction_score > 0 else 'short'

    # Resolve each ticker's price column once, keeping the symbol alongside it.
    ticker_price_cols = []
    for ticker in TRADABLE_TICKERS:
        price_col = first_col_containing(ticker, 'PX_LAST')
        if price_col:
            ticker_price_cols.append((ticker, price_col))

    all_sortinos = []
    all_calmars = []
    metrics_by_ticker = {}
    perf_horizon = 10

    for tk_symbol, tk_col in ticker_price_cols:
        r_ticker = returns_dict[perf_horizon][tk_col].reindex(dates).dropna()

        if entry_direction == 'short':
            r_ticker = -r_ticker

        # >= 5 keeps this consistent with the seeder's viability check.
        if len(r_ticker) >= 5 and r_ticker.std() > 1e-9:
            sortino = calculate_sortino_ratio(r_ticker)
            calmar = calculate_calmar_ratio(r_ticker)
            all_sortinos.append(sortino)
            all_calmars.append(calmar)
            metrics_by_ticker[tk_symbol] = {'sortino': sortino, 'calmar': calmar}

    median_sortino = np.median(all_sortinos) if all_sortinos else -99
    median_calmar = np.median(all_calmars) if all_calmars else -99
    # Clamp NaN/inf so the objectives stay comparable during non-dominated sorting.
    median_sortino = np.nan_to_num(median_sortino, nan=-99.0, posinf=999.0, neginf=-999.0)
    median_calmar = np.nan_to_num(median_calmar, nan=-99.0, posinf=999.0, neginf=-999.0)

    return {
        'id': sid,
        'signal_definitions': signal_defs,
        'objectives': (median_sortino, median_calmar, support),
        'metrics_by_ticker': metrics_by_ticker,
        'entry_direction': entry_direction,
        'trigger_dates': dates,
        'first_trigger_date': dates.min() if not dates.empty else pd.NaT,
        'last_trigger_date': dates.max() if not dates.empty else pd.NaT
    }
# === END BLOCK TO REPLACE: evaluate_one_setup function ===

# === START BLOCK TO REPLACE: The entire section from `Create Initial Population` to the end of the script ===
# --- PRIORITY 3: Robust Initial Population Generator (v3 - Pre-Validation) ---
# Module-level state shared by the seeding stages and the evolutionary loop below.
print('\n--- GENETIC ALGORITHM: Creating Initial Population (Generation 0) ---')
# Ids of every primitive signal; sampled from when building random setups and passed to mutate().
all_signal_ids = [s['signal_id'] for s in primitive_signals]
# Setups (dicts with 'id' and 'signal_definitions') that make up the current generation.
current_population = []
# Monotonic counter used to mint setup ids of the form 'S0000', 'S0001', ...
setup_id_counter = 0
# DNA fingerprints (from get_setup_dna) of setups already added, used to reject duplicates.
existing_dna = set()
perf_horizon = 10 # Use the same horizon as the main evaluation for consistency

def is_setup_viable(signal_defs, min_trades=5):
    """
    Cheap pre-check that a setup is worth a full evaluation.

    A setup is viable when its combined signal mask fires on at least
    MIN_INITIAL_SUPPORT_FILTER dates and at least one price column has
    ``min_trades`` or more non-NaN returns (at ``perf_horizon``) on those dates.
    Unknown signal ids or an empty signal list count as not viable.
    """
    try:
        component_masks = [signal_series[s['signal_id']] for s in signal_defs]
        combined = functools.reduce(lambda left, right: left & right, component_masks)
        trigger_dates = combined[combined].index
        if len(trigger_dates) < MIN_INITIAL_SUPPORT_FILTER:
            return False

        # One ticker with enough valid return observations is sufficient.
        return any(
            len(returns[perf_horizon][price_col].reindex(trigger_dates).dropna()) >= min_trades
            for price_col in price_cols_for_returns
        )
    except (KeyError, TypeError):
        return False

# Stage 1 & 2: Seed with single signals and simple pairs that pass the viability test
print("Seeding with pre-validated single and pair setups...")
# Stages 1 and 2 together may fill at most 80% of the population; stage 3 fills the rest.
num_to_create = int(POPULATION_SIZE * 0.8) # Let's create more high-quality seeds
# Attempt budget for the random pair search (stage 2) so the loop cannot spin forever.
max_attempts = len(primitive_signals) * 5
attempts = 0

# Try creating single-signal setups first
# Signals are visited in primitive_signals order, so the earliest viable ones are taken.
for p_signal in primitive_signals:
    if len(current_population) >= num_to_create: break
    if is_setup_viable([p_signal]):
        setup = {'id': f'S{setup_id_counter:04d}', 'signal_definitions': [p_signal]}
        dna = get_setup_dna(setup)
        # Skip setups whose DNA fingerprint has already been added.
        if dna not in existing_dna:
            current_population.append(setup)
            existing_dna.add(dna)
            setup_id_counter += 1

# Try creating paired setups
# Only runs if the single-signal pass left room below num_to_create.
while len(current_population) < num_to_create and attempts < max_attempts:
    attempts += 1
    p_signal_1 = random.choice(primitive_signals)
    p_signal_2 = random.choice(primitive_signals)
    # Drawing the same signal twice is a wasted attempt (it still counts against the budget).
    if p_signal_1['signal_id'] == p_signal_2['signal_id']: continue

    sig_defs = [p_signal_1, p_signal_2]
    if is_setup_viable(sig_defs):
        setup = {'id': f'S{setup_id_counter:04d}', 'signal_definitions': sig_defs}
        dna = get_setup_dna(setup)
        if dna not in existing_dna:
            current_population.append(setup)
            existing_dna.add(dna)
            setup_id_counter += 1

print(f"  - Created {len(current_population)} pre-validated setups.")

# Stage 3: Fill the remainder randomly, still using the viability test
print("Filling remainder of population with pre-validated random setups...")
# Fresh, larger attempt budget for the random fill.
max_attempts = POPULATION_SIZE * 100
attempts = 0
while len(current_population) < POPULATION_SIZE and attempts < max_attempts:
    attempts += 1
    k = random.choice(SETUP_LENGTHS_TO_EXPLORE)
    sig_id_list = random.sample(all_signal_ids, k)
    # Resulting definitions follow primitive_signals order, not the sampled order.
    sig_defs = [p for p in primitive_signals if p['signal_id'] in sig_id_list]

    # Check the DNA before the (more expensive) viability test.
    dna = get_setup_dna({'signal_definitions': sig_defs})
    if dna in existing_dna: continue

    if is_setup_viable(sig_defs):
        temp_setup = {'id': f'S{setup_id_counter:04d}', 'signal_definitions': sig_defs}
        current_population.append(temp_setup)
        existing_dna.add(dna)
        setup_id_counter += 1

# NOTE: this also fires if the population filled up exactly on the final attempt.
if attempts >= max_attempts:
    print(f"Warning: Population filling stopped after {max_attempts} attempts.")

if not current_population:
    raise SystemExit("FATAL: Could not create any viable setups for the initial population. Check data or filter criteria.")

print(f"Created initial population of {len(current_population)} guaranteed viable setups.")

# --- The Main Evolutionary Loop (NSGA-II) ---
# Each generation: evaluate the current population, (from generation 2 on) breed and
# evaluate children, drop duplicate phenotypes, select the next generation by Pareto
# front and crowding distance, then merge the best front into the Hall of Fame.
hall_of_fame = []

for generation in range(NUM_GENERATIONS):
    print(f"\n--- Evaluating Generation {generation + 1}/{NUM_GENERATIONS} ---")

    # Evaluate the current population
    # NOTE: survivors carried over from the previous generation are evaluated again here.
    evaluated_population = Parallel(n_jobs=-1)(delayed(evaluate_one_setup)(setup, returns) for setup in current_population)

    # For the first generation, the combined population is just the initial evaluated population
    combined_population = evaluated_population

    # After the first generation, create children and add them to the population
    if generation > 0:
        children = []
        # non_dominated_sort is defined elsewhere; the code below relies on it
        # having set a 'rank' on each individual (fronts numbered from 1).
        ranked_population = non_dominated_sort(evaluated_population)
        front_num = 1
        # Walk the fronts in order so every individual gets a crowding distance
        # before tournament selection reads it.
        while True:
            current_front = [ind for ind in ranked_population if ind.get('rank') == front_num]
            if not current_front: break
            calculate_crowding_distance(current_front)
            front_num += 1

        # Ensure we have a valid population to select from
        if ranked_population:
            # Breed exactly POPULATION_SIZE children via tournament selection,
            # crossover and mutation; each child gets a fresh setup id.
            while len(children) < POPULATION_SIZE:
                parent1 = selection_operator(ranked_population)
                parent2 = selection_operator(ranked_population)
                child = crossover(parent1, parent2)
                child = mutate(child, all_signal_ids, MUTATION_RATE)
                child['id'] = f'S{setup_id_counter:04d}'
                setup_id_counter += 1
                children.append(child)

            evaluated_children = Parallel(n_jobs=-1)(delayed(evaluate_one_setup)(setup, returns) for setup in children)
            # Parents and children compete together for the next generation.
            combined_population = evaluated_population + evaluated_children

    ### FIX: Apply the strict phenotype filter ONLY AFTER the first generation ###
    if generation > 0 and combined_population:
        # Two setups with identical trigger dates are the same phenotype; keep the
        # first one encountered. Individuals with no trigger dates are dropped.
        phenotype_dict = {}
        for ind in combined_population:
            trigger_dates = ind.get('trigger_dates')
            if trigger_dates is None or trigger_dates.empty:
                continue
            fingerprint = tuple(trigger_dates)
            if fingerprint not in phenotype_dict:
                phenotype_dict[fingerprint] = ind
        unique_phenotype_population = list(phenotype_dict.values())
    else:
        # For Gen 0, we allow functional duplicates to ensure a healthy gene pool for creating offspring
        unique_phenotype_population = combined_population

    if not unique_phenotype_population:
        print("Population extinct. No valid individuals to select for the next generation. Stopping.")
        break

    # NSGA-II Selection for the next generation
    sorted_population = non_dominated_sort(unique_phenotype_population)
    next_generation_population = []
    front_num = 1
    # Take whole fronts while they fit; the first front that overflows is
    # truncated by descending crowding distance.
    while len(next_generation_population) < POPULATION_SIZE:
        current_front = [ind for ind in sorted_population if ind['rank'] == front_num]
        if not current_front:
            break

        calculate_crowding_distance(current_front)
        if len(next_generation_population) + len(current_front) <= POPULATION_SIZE:
            next_generation_population.extend(current_front)
        else:
            current_front.sort(key=lambda x: x['crowding_distance'], reverse=True)
            num_needed = POPULATION_SIZE - len(next_generation_population)
            next_generation_population.extend(current_front[:num_needed])
        front_num += 1

    # If the next generation is empty, stop the process.
    if not next_generation_population:
        print(f"Warning: Could not form next generation from {len(unique_phenotype_population)} unique individuals. Stopping.")
        break

    current_population = next_generation_population

    # Update Hall of Fame
    current_best_front = [ind for ind in sorted_population if ind['rank'] == 1]

    if current_best_front:
        # Re-rank the old Hall of Fame together with this generation's best front,
        # de-duplicate by trigger-date fingerprint, and keep only rank-1 members.
        hall_of_fame_candidates = non_dominated_sort(hall_of_fame + current_best_front)
        hof_phenotype_dict = {}
        for ind in hall_of_fame_candidates:
            trigger_dates = ind.get('trigger_dates')
            if trigger_dates is None or trigger_dates.empty: continue
            fingerprint = tuple(trigger_dates)
            if fingerprint not in hof_phenotype_dict:
                 hof_phenotype_dict[fingerprint] = ind
        hall_of_fame = [ind for ind in hof_phenotype_dict.values() if ind['rank'] == 1]

    # Report generation statistics
    if hall_of_fame:
        # "Best" is the Hall of Fame member with the highest first objective (median Sortino).
        hall_of_fame.sort(key=lambda x: x['objectives'][0], reverse=True)
        best_of_gen = hall_of_fame[0]
        print(f"Generation {generation + 1} Complete. Unique Phenotypes: {len(unique_phenotype_population)}. Hall of Fame: {len(hall_of_fame)}. Best: (S:{best_of_gen['objectives'][0]:.2f}, C:{best_of_gen['objectives'][1]:.2f}, Sup:{best_of_gen['objectives'][2]})")
    else:
        print(f"Generation {generation + 1} Complete. No valid solutions in Hall of Fame.")
# --- PRIORITY 4: Final Evaluation and Multi-Objective Report ---
# For every Hall of Fame setup: build an English description, a summary row
# (objectives, best ticker, recency Sharpe) and a simulated option trade ledger,
# then write the summary CSV, the ledger CSV and a JSON version of the summary.
print("\n--- Genetic Algorithm Complete. Generating Final Report from Hall of Fame ---")

if not hall_of_fame:
    print("Discovery complete. The Hall of Fame is empty; no valid setups were found.")
else:
    ### FIX: Create a final, unique list of solutions to process. ###
    # This is a robust, "belt-and-suspenders" approach to ensure no duplicates make it to the final report.
    final_pareto_front_df = pd.DataFrame(hall_of_fame)
    final_pareto_front_df.drop_duplicates(subset=['id'], keep='first', inplace=True)
    final_pareto_front = final_pareto_front_df.to_dict('records')

    print(f"Final unique Pareto Front contains {len(final_pareto_front)} non-dominated solutions.")

    print(f"\n--- Performing deep dive on {len(final_pareto_front)} unique solutions ---")
    all_trade_ledger_rows = []
    all_description_records = []
    summary_rows = []

    # Iterate over the cleaned, unique list of solutions
    for setup_solution in final_pareto_front:
        setup_id = setup_solution['id']
        setup_def = setup_solution['signal_definitions']

        all_description_records.append(generate_english_description(setup_id, setup_def, feature_specs))

        # Recompute the trigger dates from the signal masks.
        mask = functools.reduce(lambda a, b: a & b, [signal_series[s['signal_id']] for s in setup_def])
        dates = mask[mask].index

        # Best ticker = highest per-ticker Sortino recorded during evaluation.
        best_ticker = "N/A"
        best_sortino = -999
        if setup_solution.get('metrics_by_ticker'):
            for ticker, metrics in setup_solution['metrics_by_ticker'].items():
                if metrics.get('sortino', -999) > best_sortino:
                    best_sortino = metrics['sortino']
                    best_ticker = ticker

        # Recency Sharpe: median across tickers of the annualised Sharpe over the
        # last RECENCY_WINDOW triggers, to spot performance decay.
        recency_sharpe = np.nan
        if len(dates) >= RECENCY_WINDOW:
            recent_dates = dates[-RECENCY_WINDOW:]
            h = 10
            recent_sharpes = []
            for tk_symbol in TRADABLE_TICKERS:
                price_col_name = first_col_containing(tk_symbol, 'PX_LAST')
                if price_col_name:
                    r_recent = returns[h][price_col_name].reindex(recent_dates).dropna()
                    if setup_solution['entry_direction'] == 'short': r_recent = -r_recent
                    if r_recent.std() > 1e-9 and len(r_recent) > 2:
                        recent_sharpes.append((r_recent.mean() / r_recent.std()) * np.sqrt(252/h))
            if recent_sharpes:
                recency_sharpe = np.nanmedian(recent_sharpes)

        summary_rows.append({
            'setup_id': setup_id,
            'rank': setup_solution['rank'],
            'best_performing_ticker': best_ticker,
            'obj_sortino': setup_solution['objectives'][0],
            'obj_calmar': setup_solution['objectives'][1],
            'obj_support': setup_solution['objectives'][2],
            'entry_direction': setup_solution['entry_direction'],
            'first_trigger_date': setup_solution.get('first_trigger_date'),
            'last_trigger_date': setup_solution.get('last_trigger_date'),
            'recency_sharpe': recency_sharpe,
        })

        # Build Trade Ledger for this specific setup
        for tk_symbol in TRADABLE_TICKERS:
            tk_col = first_col_containing(tk_symbol, 'PX_LAST')
            if not tk_col: continue

            # Prefer the 30-day call implied vol column, falling back to IVOL_SIGMA; NaN if neither exists.
            ivol_col = (first_col_containing(tk_symbol, '30_Day_Call_Implied_Volatility') or first_col_containing(tk_symbol, 'IVOL_SIGMA'))
            ivol_series = raw[ivol_col].reindex(dates) if ivol_col and ivol_col in raw.columns else pd.Series(np.nan, index=dates)
            entry_px_series = raw[tk_col].reindex(dates)

            for d in dates:
                entry_px = entry_px_series.loc[d]
                ivol = ivol_series.loc[d] if not ivol_series.empty and pd.notna(d) and d in ivol_series.index else np.nan
                for h_opt in OPTION_SIM_HORIZONS_DAYS:
                    # Horizon is in calendar days; exit at the first available row on or after that date.
                    exit_date = d + pd.Timedelta(days=h_opt)
                    future_px_series = raw.loc[raw.index >= exit_date, tk_col]
                    final_exit_px = future_px_series.iloc[0] if not future_px_series.empty else np.nan
                    pnl_detail = simulate_option_pnl_detailed(entry_px, final_exit_px, ivol, h_opt, setup_solution['entry_direction'])
                    all_trade_ledger_rows.append({'setup_id': setup_id, 'trigger_date': d, 'target_ticker': tk_symbol, 'horizon_days': h_opt, **pnl_detail})

    # --- Final Assembly and Output ---
    summary_df = pd.DataFrame(summary_rows)
    trade_ledger_df = pd.DataFrame(all_trade_ledger_rows)
    description_df = pd.DataFrame(all_description_records).drop_duplicates(subset=['setup_id'])

    # Attach the average simulated option P&L per setup for every horizon.
    if not trade_ledger_df.empty:
        for h_opt in OPTION_SIM_HORIZONS_DAYS:
            pnl_dollars_map = trade_ledger_df[trade_ledger_df['horizon_days'] == h_opt].groupby('setup_id')['pnl_dollars'].mean()
            pnl_pct_map = trade_ledger_df[trade_ledger_df['horizon_days'] == h_opt].groupby('setup_id')['pnl_pct'].mean()
            summary_df[f'avg_option_pnl_dollars_{h_opt}d'] = summary_df['setup_id'].map(pnl_dollars_map)
            summary_df[f'avg_option_pnl_pct_{h_opt}d'] = summary_df['setup_id'].map(pnl_pct_map)

    numeric_cols = summary_df.select_dtypes(include=np.number).columns
    summary_df[numeric_cols] = summary_df[numeric_cols].round(4)
    if not trade_ledger_df.empty:
        trade_ledger_df = trade_ledger_df.round({'pnl_pct': 4, 'pnl_dollars': 2})

    final_summary_df = pd.merge(summary_df, description_df[['setup_id', 'description']], on='setup_id', how='left')
    final_summary_df.sort_values(by=['obj_sortino', 'obj_calmar', 'obj_support'], ascending=[False, False, False], inplace=True)

    print('\n--- Generating Final Output Files ---')
    final_summary_df.to_csv('pareto_front_summary.csv', index=False)
    print("Saved 'pareto_front_summary.csv'")
    trade_ledger_df.to_csv('pareto_front_trade_ledger.csv', index=False)
    print("Saved 'pareto_front_trade_ledger.csv'")

    print("\n--- Generating Final JSON and Summary ---")
    top_setups_for_json = final_summary_df.copy()
    top_setups_for_json.replace({np.nan: None, pd.NaT: None}, inplace=True)
    date_cols = ['first_trigger_date', 'last_trigger_date']
    for col in date_cols:
        if col in top_setups_for_json.columns:
            formatted_dates = pd.to_datetime(top_setups_for_json[col], errors='coerce').dt.strftime('%Y-%m-%d')
            # Assign the cleaned values back explicitly: an inplace replace on a
            # column selection is not guaranteed to modify the frame.
            top_setups_for_json[col] = [None if pd.isna(value) else value for value in formatted_dates]
    top_setups_json = top_setups_for_json.to_dict(orient='records')
    json_output_path = 'pareto/pareto_front_setups.json'
    # Create the target folder first so a missing directory cannot abort the
    # run after all of the expensive work is done.
    os.makedirs(os.path.dirname(json_output_path), exist_ok=True)
    with open(json_output_path, 'w') as f:
        json.dump(top_setups_json, f, indent=2)
    print(f"Saved '{json_output_path}'")

    print('\nDiscovery complete.')
    print("\nSolutions on the Final Pareto Front (sorted by Sortino):")
    display_cols = ['setup_id', 'rank', 'best_performing_ticker', 'obj_sortino', 'obj_calmar', 'obj_support', 'recency_sharpe', 'description']
    print(final_summary_df[display_cols].head(15).to_string())

Loading raw workbooks…
Raw shape: (1978, 568)

Identified all relevant prefixes/tickers for feature engineering: 33

--- Defining ALL Feature Specifications ---
Defined 3359 total feature specifications.
--- Calculating All Features ---
Calculated 3132 feature series.
Successfully created sequential feature: 'SEQ_VIX_SPIKE_THEN_CORR_DROP'
Successfully created sequential feature: 'SEQ_YIELD_DROP_THEN_GOLD_VOL_SPIKE'
Successfully created sequential feature: 'SEQ_NVDA_VOL_SPIKE_THEN_QQQ_PRICE_RISE'
--- Defining Primitive Signals ---
Defined 10325 primitive signals.

--- GENETIC ALGORITHM: Creating Initial Population (Generation 0) ---
Seeding with pre-validated single and pair setups...
  - Created 200 pre-validated setups.
Filling remainder of population with pre-validated random setups...
Created initial population of 250 guaranteed viable setups.

--- Evaluating Generation 1/50 ---
Generation 1 Complete. Unique Phenotypes: 250. Hall of Fame: 9. Best: (S:999.00, C:999.00, Sup:9)

--- Ev

In [2]:
# Feature Engineering + Discovery Engine with FULLY Migrated Structured Features
import pandas as pd
import numpy as np
import random
import warnings
import functools
import os
import itertools
import json
from joblib import Parallel, delayed

warnings.filterwarnings('ignore')

# --- DEFINITIONS AND CONFIGURATION ---

# --- PRIORITY 1: IMPLEMENT FULL REPRODUCIBILITY ---
# Global seed for all sources of randomness to ensure script reproducibility.
RANDOM_SEED = 50
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
# Note: joblib.Parallel's reproducibility is handled by ensuring the functions it calls
# are deterministic or are themselves seeded, which our new evaluation logic ensures.

# Define the explicit list of tradable tickers
TRADABLE_TICKERS = [
    'QQQ US Equity', 'SPY US Equity', 'XLK US Equity', 'XLF US Equity',
    'XLE US Equity', 'ARKK US Equity', 'VIX Index', 'GLD US Equity',
    'NBIS US Equity', 'LLY US Equity', 'TSLA US Equity', 'AAPL US Equity',
    'NVDA US Equity'
]

# --- Single Ticker Mode Filter ---
# Set to a specific ticker (e.g., 'SPY US Equity') to focus the entire discovery engine.
# Set to None to run on all tradable tickers.
# This is the ONLY place SINGLE_TICKER_MODE is assigned. A second assignment used to
# follow the GA settings and overwrote the value after TRADABLE_TICKERS had already
# been filtered, leaving the constant out of sync with the tickers actually traded.
SINGLE_TICKER_MODE = 'QQQ US Equity'

if SINGLE_TICKER_MODE:
    print(f"\n--- RUNNING IN SINGLE TICKER MODE FOR: {SINGLE_TICKER_MODE} ---")
    if SINGLE_TICKER_MODE not in TRADABLE_TICKERS:
        raise ValueError(f"Ticker '{SINGLE_TICKER_MODE}' not found in the original TRADABLE_TICKERS list.")
    TRADABLE_TICKERS = [SINGLE_TICKER_MODE]

# Define list of macro tickers to ensure their inclusion in feature generation
MACRO_TICKERS = [
    'DXY Curncy', 'USGG10YR Index', 'USGG2YR Index', 'CPI YOY Index',
    'INJCJC Index', 'FFA Comdty', 'LF94TRUU Index', 'CPI CHNG Index',
    'NFP TCH Index', 'JOBS US Equity', 'CTII10 Govt', 'USSW10 Curncy',
    'MLCX3CRT Index', 'FARBAST Index', 'BSPGCPUS Index', 'SPCSUSA Index',
    'SPCS20SM Index', 'CONSSENT Index', 'CO1 Comdty'
]

# --- UPDATED FILE PATHS ---
MAIN_DATA_FILE = 'All_tickers_new.xlsx'
MACRO_DATA_FILE = 'Macro_tickers_new.xlsx'

# --- GENETIC ALGORITHM CONFIGURATION ---
NUM_GENERATIONS = 10     # How many evolutionary cycles to run
POPULATION_SIZE = 50     # How many setups (individuals) in each generation
SETUP_LENGTHS_TO_EXPLORE = [2, 3] # Allow setups of 2 or 3 conditions
ELITISM_RATE = 0.1       # Percentage of the best setups to keep untouched for the next generation
MUTATION_RATE = 0.20     # ---GA v2.1: More aggressive mutation rate---
# General Configuration
MIN_INITIAL_SUPPORT_FILTER = 5
OPTION_SIM_HORIZONS_DAYS = [1, 3, 10, 21]
RISK_FREE_RATE = 0.01
RECENCY_WINDOW = 10 # How many recent trades to check for performance decay
# --- END DEFINITIONS AND CONFIGURATION ---


print('Loading raw workbooks…')


# --- MODIFIED Custom Data Loading Function ---
def load_and_merge_excel(file_path, header_row, existing_df=None):
    """
    Loads an Excel file, prepends sheet names to columns (except Date), and merges.

    Every sheet is read with ``header_row`` as its header. A 'Dates' column is
    renamed to 'Date'; sheets without either are skipped with a warning. All
    other columns are prefixed with "<sheet name>_" and the sheets are
    outer-merged on 'Date', starting from a copy of ``existing_df`` if given.

    The workbook is opened in a ``with`` block so its file handle is released
    even when a sheet fails to parse.

    Args:
        file_path: Path of the workbook to read.
        header_row: Zero-based row index holding the column headers.
        existing_df: Optional DataFrame (with a 'Date' column) to merge into.

    Returns:
        The merged DataFrame. On a missing file or any other error a message is
        printed and ``existing_df`` is returned unchanged (possibly None).
    """
    try:
        with pd.ExcelFile(file_path) as xls:
            current_df = existing_df.copy() if existing_df is not None else None
            for sh_name in xls.sheet_names:
                # Use the specified header_row to correctly read the file
                df = pd.read_excel(xls, sheet_name=sh_name, header=header_row)

                if 'Dates' in df.columns and 'Date' not in df.columns:
                    df.rename(columns={'Dates': 'Date'}, inplace=True)
                if 'Date' not in df.columns:
                    print(f"Warning: Sheet '{sh_name}' in '{file_path}' is missing a 'Date'/'Dates' column. Skipping sheet.")
                    continue
                df.columns = [f"{sh_name}_{col}" if col != 'Date' else col for col in df.columns]
                if current_df is None:
                    current_df = df
                else:
                    # Drop repeated column labels within the sheet before merging.
                    df = df.loc[:, ~df.columns.duplicated()]
                    current_df = current_df.merge(df, on='Date', how='outer')
        return current_df
    except FileNotFoundError:
        print(f"Error: '{file_path}' not found.")
        return existing_df
    except Exception as e:
        # Deliberate best-effort: report the problem and fall back to the input frame.
        print(f"An unexpected error occurred during Excel loading of '{file_path}': {e}")
        return existing_df


# --- MODIFIED Load main and macro data ---
# Load main tickers file, specifying headers are on Row 2 (index 1)
raw = load_and_merge_excel(MAIN_DATA_FILE, header_row=1)
if raw is not None and not raw.empty:
    # Load macro file, specifying headers are on Row 5 (index 4)
    raw = load_and_merge_excel(MACRO_DATA_FILE, header_row=4, existing_df=raw)
else:
    print("Main data could not be loaded, skipping macro data loading.")
    raw = pd.DataFrame()

# Final cleaning and indexing
if not raw.empty:
    raw = raw.sort_values('Date').reset_index(drop=True)
    # Forward-fill gaps in date order. DataFrame.ffill replaces the deprecated
    # fillna(method='ffill') spelling and fills identically.
    raw.ffill(inplace=True)
    if 'Date' in raw.columns:
        raw['Date'] = pd.to_datetime(raw['Date'])
        # Keep the last row per date, then index by date in ascending order.
        raw = raw.drop_duplicates(subset=['Date'], keep='last')
        raw.set_index('Date', inplace=True)
        raw.index = pd.to_datetime(raw.index)
        raw.sort_index(inplace=True)
else:
    print("No data loaded. Raw DataFrame is empty.")
    raw = pd.DataFrame()

print('Raw shape:', raw.shape)

# --- Dynamic Ticker Identification ---
# The text before the first underscore of each column name is a candidate prefix.
all_column_prefixes = sorted({col.split('_')[0] for col in raw.columns if '_' in col})
# Prefixes that denote a field/feature name rather than a ticker.
COMMON_FEATURE_PREFIXES = [
    'Last', 'Open', 'High', 'Low', 'VWAP', 'Volume', 'IVOL', 'Implied', 'Total', '30', '10',
    '60', 'Hist.', '1st', 'Put', 'Dates', 'CHG', 'FFA', 'INJCJC', 'NFP', 'JOBS', 'CPI', 'CTII10',
    'LF94TRUU', 'SPX', 'USSW10', 'MLCX3CRT', 'FARBAST', 'BSPGCPUS', 'SPCSUSA', 'SPCS20SM',
    'CONSSENT',
]
actual_ticker_prefixes = [prefix for prefix in all_column_prefixes if prefix not in COMMON_FEATURE_PREFIXES]
# Union of configured tradable/macro tickers and the prefixes discovered in the data.
all_tickers = sorted({*TRADABLE_TICKERS, *actual_ticker_prefixes, *MACRO_TICKERS})
print(f'\nIdentified all relevant prefixes/tickers for feature engineering: {len(all_tickers)}')


# --- Helper functions ---
def first_col_containing(ticker_full_name, substr=''):
    """
    Returns the first column of ``raw`` that belongs to a ticker and contains ``substr``.

    For ``substr == 'PX_LAST'`` the exact names "<ticker>_Last_Price_PX_LAST"
    and "<ticker>_PX_LAST" are tried first, in that order. Otherwise (or if
    neither exists) the first column that starts with the ticker name and
    contains ``substr`` is returned. Returns None when nothing matches.
    """
    columns = raw.columns
    if substr == 'PX_LAST':
        preferred_names = (
            f"{ticker_full_name}_Last_Price_PX_LAST",
            f"{ticker_full_name}_PX_LAST",
        )
        exact_match = next((name for name in preferred_names if name in columns), None)
        if exact_match is not None:
            return exact_match
    return next(
        (name for name in columns if name.startswith(ticker_full_name) and substr in name),
        None,
    )

# --- PRODUCTION-GRADE HELPER TOOLKIT (DEFINITIVE) ---
_series_cache = {}
def safe_series(col_name, use_cache=True):
    """
    Fetches a column of ``raw`` without raising when it is missing.

    A falsy or unknown ``col_name`` yields an all-NaN float Series on ``raw``'s
    index. With ``use_cache`` the result (including the NaN placeholder) is
    memoised in ``_series_cache`` under ``col_name``.
    """
    if use_cache:
        try:
            return _series_cache[col_name]
        except KeyError:
            pass

    if col_name and col_name in raw.columns:
        series = raw[col_name]
    else:
        series = pd.Series(index=raw.index, dtype=float)

    if use_cache:
        _series_cache[col_name] = series
    return series

def zscore_rolling(s, win=252, eps=1e-9, minp=60):
    """
    Rolling z-score of ``s``: (value - rolling mean) / rolling std.

    Uses a ``win``-row window with at least ``minp`` observations. A rolling
    std of exactly 0 is replaced by ``eps`` to avoid division by zero.
    """
    window = s.rolling(win, min_periods=minp)
    rolling_mean = window.mean()
    rolling_std = window.std().replace(0, eps)
    return (s - rolling_mean) / rolling_std

def mad_z(s, win=252, c=1.4826, minp=60):
    """
    Robust rolling z-score based on the Median Absolute Deviation (MAD).

    The deviation from the rolling median is divided by ``c`` times the rolling
    median of absolute deviations. A MAD of 0 becomes NaN, so the result is NaN
    there rather than infinite.
    """
    rolling_median = s.rolling(win, min_periods=minp).median()
    deviation = s - rolling_median
    rolling_mad = deviation.abs().rolling(win, min_periods=minp).median()
    scale = c * rolling_mad.replace(0, np.nan)
    return deviation / scale

def rolling_pct_of_last(s, win=252, minp=60):
    """
    Rolling percentile rank of the current (last) value within its window.

    For each window of ``win`` rows with at least ``minp`` non-NaN values, the
    result is the fraction of non-NaN values that are <= the window's last
    value. If the last value itself is NaN the result is NaN; previously the
    most recent non-NaN value was ranked instead, which reported a stale rank
    for a row that has no data.
    """
    def pct_last(window_values):
        current = window_values[-1]
        if np.isnan(current):
            return np.nan
        valid = window_values[~np.isnan(window_values)]
        return np.sum(valid <= current) / valid.size
    return s.rolling(win, min_periods=minp).apply(pct_last, raw=True)

def norm_feature(s, method='z', win=252):
    """
    Normalises a series with the chosen method.

    'z' -> zscore_rolling, 'madz' -> mad_z, 'pct' -> rolling_pct_of_last, each
    called with window ``win``. Empty or all-NaN input, and any other method
    name, return ``s`` unchanged.
    """
    nothing_to_normalize = s.empty or s.isna().all()
    if nothing_to_normalize:
        return s

    if method == 'z':
        normalized = zscore_rolling(s, win)
    elif method == 'madz':
        normalized = mad_z(s, win)
    elif method == 'pct':
        normalized = rolling_pct_of_last(s, win)
    else:
        normalized = s
    return normalized

def returns(px, periods=1):
    """Percentage change of ``px`` over ``periods`` rows (thin wrapper around ``pct_change``)."""
    pct_changes = px.pct_change(periods)
    return pct_changes

def to_bp(yield_series):
    """
    Scales a yield series towards basis points using a magnitude heuristic.

    The 95th percentile of the non-NaN values picks the multiplier: 100 when it
    is <= 20 (values assumed to be quoted in percent), otherwise 10000. An
    empty/all-NaN series has a NaN percentile and therefore gets 10000.
    Note that decimal-quoted yields (e.g. 0.04) also fall in the <= 20 branch
    and are multiplied by 100.
    """
    q95 = yield_series.dropna().quantile(0.95)
    if q95 is not None and q95 <= 20.0:
        multiplier = 100.0
    else:
        multiplier = 10000.0
    return yield_series * multiplier

def align2(s1, s2):
    """
    Aligns two series on their index and keeps only rows where both are non-NaN.

    Returns the two aligned series, or a pair of empty float series when no
    common non-NaN rows exist.
    """
    paired = pd.concat([s1, s2], axis=1).dropna()
    if not paired.empty:
        return paired.iloc[:, 0], paired.iloc[:, 1]
    return pd.Series(dtype=float), pd.Series(dtype=float)

def rolling_corr_fisher(s1, s2, win=20, minp_corr=20):
    """
    Rolling correlation of two series plus its Fisher transform.

    The inputs are aligned with ``align2`` first. The correlation is clipped to
    +/-0.999999 so the transform 0.5 * ln((1 + r) / (1 - r)) stays finite.

    Returns:
        (fisher, corr). If the aligned data is empty, two empty float series
        on ``s1``'s index are returned.
    """
    left, right = align2(s1, s2)
    if left.empty:
        return pd.Series(dtype=float, index=s1.index), pd.Series(dtype=float, index=s1.index)

    corr = left.rolling(win, min_periods=minp_corr).corr(right)
    corr = corr.clip(-0.999999, 0.999999)
    fisher = 0.5 * np.log((1 + corr) / (1 - corr))
    return fisher, corr

def rolling_corr_fisher_z(s1, s2, win_corr=20, win_z=60, minp_corr=20, minp_z=40):
    """
    Rolling z-score of the Fisher-transformed rolling correlation of two series.

    ``win_corr``/``minp_corr`` control the correlation window and
    ``win_z``/``minp_z`` the z-score window.
    """
    fisher_and_corr = rolling_corr_fisher(s1, s2, win=win_corr, minp_corr=minp_corr)
    fisher = fisher_and_corr[0]
    return zscore_rolling(fisher, win=win_z, minp=minp_z)

def beta_rolling(s1, s2, win=60, eps=1e-12, minp=40):
    """
    Rolling beta of ``s1`` against ``s2``: rolling cov(s1, s2) / rolling var(s2).

    Inputs are aligned with ``align2``; a variance of exactly 0 is replaced by
    ``eps``. Returns an empty float series on ``s1``'s index when the aligned
    data is empty.
    """
    asset, benchmark = align2(s1, s2)
    if asset.empty:
        return pd.Series(dtype=float, index=s1.index)

    covariance = asset.rolling(win, min_periods=minp).cov(benchmark)
    benchmark_variance = benchmark.rolling(win, min_periods=minp).var().replace(0, eps)
    return covariance / benchmark_variance

def cooccur(a, b, window=2):
    """
    Checks for co-occurrence of two boolean series within a rolling window.

    A row is True when both ``a`` and ``b`` were True at least once in the last
    ``window`` rows (current row included).

    The first ``window - 1`` rows are evaluated on the rows available so far.
    Previously their rolling max was NaN, and NaN converts to True under
    ``astype(bool)``, so every series began with spurious co-occurrences.
    """
    def _fired_recently(flags):
        # min_periods=1 avoids NaN warm-up rows; fillna(0) maps any remaining
        # NaN (all-missing window) to False rather than True.
        recent_max = flags.rolling(window, min_periods=1).max()
        return recent_max.fillna(0).astype(bool)

    return _fired_recently(a) & _fired_recently(b)

def get_realized_vol(price_series, win=30):
    """
    Realized volatility: rolling std of one-row log returns over ``win`` rows,
    scaled by sqrt(252).
    """
    price_ratio = price_series / price_series.shift(1)
    log_rets = np.log(price_ratio)
    rolling_sd = log_rets.rolling(win).std()
    return rolling_sd * np.sqrt(252)

def event_delta(series):
    """
    Change versus the previous known value, reported only on event days.

    The series is forward-filled and differenced; rows where the original
    value was NaN (no event) are set to NaN in the result.
    """
    is_event_day = series.notna()
    step_change = series.ffill().diff()
    return step_change.where(is_event_day)

def coverage_activation(feat_df, hi=1.5):
    """
    Prints and returns a per-feature coverage and activation report.

    Coverage is the share of non-NaN rows. Activation is the share of z-values
    with absolute value above ``hi``. The z-values come from the column's
    "<name>_z" twin when present; otherwise the column itself is used if it
    looks like a z-score already, and ``norm_feature`` is applied if not.
    Columns with no data are skipped.

    Returns:
        DataFrame with columns 'feature', 'coverage_pct', 'activation_pct',
        sorted by activation in descending order.
    """
    total_rows = len(feat_df)
    records = []
    for col in feat_df.columns:
        observed = feat_df[col].dropna()
        if observed.empty:
            continue
        coverage = len(observed) / total_rows

        # Prefer the explicit _z twin for the activation calculation if it exists
        twin_name = f"{col}_z"
        if twin_name in feat_df.columns:
            z_values = feat_df[twin_name].dropna()
        else:
            # Otherwise compute it, but only if the column is not z-like already
            looks_like_z = '_z' in col or (observed.mean() < 0.1 and 0.5 < observed.std() < 1.5)
            z_values = observed if looks_like_z else norm_feature(observed)

        if z_values.empty:
            activation = 0
        else:
            activation = (z_values.abs() > hi).mean()
        records.append((col, round(coverage, 3), round(activation, 3)))

    report = pd.DataFrame(records, columns=['feature', 'coverage_pct', 'activation_pct'])
    report = report.sort_values('activation_pct', ascending=False)
    print("--- Feature Coverage & Activation Report ---")
    print(report.to_string())
    return report

def frac_diff(series, d=0.5, window=100):
    """
    Fixed-window fractional differencing of order ``d``.

    Weights follow w_0 = 1, w_k = -w_{k-1} * (d - k + 1) / k. Each output is
    the dot product of the last ``window`` weights (newest observation gets
    w_0) with the ``window`` most recent values. Outputs start at position
    ``window``; rows without a value are dropped from the result.
    """
    n_obs = len(series)
    coeffs = [1.]
    for lag in range(1, n_obs):
        coeffs.append(-coeffs[-1] * (d - lag + 1) / lag)
    # Reverse so the weights line up with chronologically ordered slices.
    kernel = np.array(coeffs[::-1])[-window:]

    result = pd.Series(index=series.index, dtype=float)
    for end in range(window, n_obs):
        chunk = series.iloc[end - window + 1: end + 1]
        if len(chunk) == len(kernel):
            result.iloc[end] = np.dot(kernel, chunk)
    return result.dropna()

def block_bootstrap_sharpe(returns_series, block_size, num_iterations=1000, annualize=True, trading_days_per_year=252):
    """Bootstrap the Sharpe ratio by resampling non-overlapping blocks.

    The NaN-free returns are cut into consecutive blocks of ``block_size``
    rows (the last one may be shorter). Each iteration draws blocks with
    replacement via ``np.random.choice``, concatenates them, truncates to the
    original length and computes mean / std (0.0 when std <= 1e-9).

    Args:
        returns_series: pandas Series of periodic returns.
        block_size: Rows per block.
        num_iterations: Number of bootstrap paths.
        annualize: Multiply each Sharpe by sqrt(trading_days_per_year).
        trading_days_per_year: Annualization base.

    Returns:
        Tuple ``(median, 5th percentile, 95th percentile)`` of the bootstrap
        Sharpe ratios, or ``(0.0, 0.0, 0.0)`` when there is too little data.
    """
    no_estimate = (0.0, 0.0, 0.0)
    clean = returns_series.dropna()
    n_obs = len(clean)
    if n_obs < block_size or n_obs < 2:
        return no_estimate

    blocks = []
    for start in range(0, n_obs, block_size):
        chunk = clean.iloc[start: start + block_size]
        if not chunk.empty:
            blocks.append(chunk)
    if not blocks:
        return no_estimate

    draws_per_path = int(np.ceil(n_obs / block_size))
    scale = np.sqrt(trading_days_per_year) if annualize else 1

    sharpes = []
    for _ in range(num_iterations):
        picked = np.random.choice(len(blocks), draws_per_path, replace=True)
        path = pd.concat([blocks[j] for j in picked]).iloc[:n_obs]
        sigma = path.std()
        if sigma > 1e-9:
            sharpes.append((path.mean() / sigma) * scale)
        else:
            sharpes.append(0.0)

    if not sharpes:
        return no_estimate
    return np.median(sharpes), np.percentile(sharpes, 5), np.percentile(sharpes, 95)

def calculate_sortino_ratio(returns_series, annualize=True, trading_days_per_year=252):
    """Calculates the Sortino Ratio against a target return of 0.

    The denominator is the target downside deviation,
    ``sqrt(mean(min(r - target, 0) ** 2))`` taken over *all* observations.
    (The sample std of the negative returns is not used: it is 0 for
    identical losses, which produced an infinite ratio for a losing series,
    and NaN when there is exactly one loss.)

    Args:
        returns_series: pandas Series of periodic returns (NaNs are dropped).
        annualize: Multiply the ratio by sqrt(trading_days_per_year).
        trading_days_per_year: Annualization base.

    Returns:
        0.0 when fewer than two observations remain, ``np.inf`` when no
        return falls below the target, otherwise the (optionally annualized)
        Sortino ratio.
    """
    returns_series = returns_series.dropna()
    if len(returns_series) < 2:
        return 0.0

    target_return = 0
    shortfall = np.minimum(returns_series - target_return, 0.0)
    downside_dev = np.sqrt((shortfall ** 2).mean())

    # No observation below target -> no downside risk to divide by.
    if downside_dev == 0:
        return np.inf

    sortino = (returns_series.mean() - target_return) / downside_dev
    return sortino * np.sqrt(trading_days_per_year) if annualize else sortino

def calculate_calmar_ratio(returns_series, annualize=True, trading_days_per_year=252):
    """Calculates the Calmar Ratio (annualized return / max drawdown).

    Drawdown is measured against the running equity peak *including the
    starting equity of 1.0*, so a series that opens with a loss registers
    that loss as drawdown instead of reporting none.

    Args:
        returns_series: pandas Series of periodic simple returns (NaNs are
            dropped).
        annualize: Accepted for signature parity with the other ratio
            helpers; it is not read -- the numerator is always the
            annualized compound return.
        trading_days_per_year: Rows per year used for annualization.

    Returns:
        0.0 with fewer than two observations, ``np.inf`` when there is no
        drawdown, -99 when compounded equity is negative, otherwise the
        Calmar ratio.
    """
    returns_series = returns_series.dropna()
    if len(returns_series) < 2:
        return 0.0

    cumulative_returns = (1 + returns_series).cumprod()
    # Floor the peak at the initial equity (1.0) so early losses count.
    peak = cumulative_returns.cummax().clip(lower=1.0)
    drawdown = (cumulative_returns - peak) / peak
    max_drawdown = drawdown.min()

    if max_drawdown == 0:
        return np.inf

    total_return = cumulative_returns.iloc[-1] - 1

    # A negative equity base cannot be raised to a fractional power.
    if (1 + total_return) < 0:
        return -99

    num_years = len(returns_series) / trading_days_per_year
    annualized_return = (1 + total_return) ** (1 / num_years) - 1

    return annualized_return / abs(max_drawdown)

# --- Option Simulation Helpers ---
def estimate_atm_premium(price, ivol, days, option_type):
    """Approximate an at-the-money option premium as 0.4 * S * sigma * sqrt(T).

    Args:
        price: Underlying price.
        ivol: Implied volatility as a decimal (e.g. 0.25).
        days: Days to expiry; converted to years with a 365.25-day year.
        option_type: Not read; the same value is returned for any type.

    Returns:
        0 when the time to expiry, price or volatility is not positive,
        otherwise the approximate premium per share.
    """
    T = days / 365.25
    has_nonpositive_input = any(x <= 0 for x in (T, price, ivol))
    if has_nonpositive_input:
        return 0
    root_T = np.sqrt(T)
    return 0.4 * price * ivol * root_T

def simulate_option_pnl_detailed(current_price, future_price, ivol_at_entry, horizon_days, entry_direction):
    """Simulate buying one ATM option at entry and holding it to the horizon.

    A 'long' direction buys a call, 'short' buys a put, both struck at
    ``current_price``. The entry premium comes from ``estimate_atm_premium``
    and the exit value is the intrinsic value at ``future_price``.

    Args:
        current_price: Underlying price at entry.
        future_price: Underlying price at the end of the horizon.
        ivol_at_entry: Implied volatility at entry; values above 1.0 are
            treated as percentages and divided by 100.
        horizon_days: Holding period passed to the premium estimate.
        entry_direction: 'long' or 'short'.

    Returns:
        Dict with PnL per share / in dollars (x100) / in percent of premium,
        the option type, strike, premium, exit value, the underlying's exit
        price and return, and ``skipped_reason`` ('None' when the trade was
        simulated; otherwise the reason, with the PnL fields set to NaN).
    """
    if current_price and pd.notna(current_price) and pd.notna(future_price):
        underlying_return = (future_price - current_price) / current_price
    else:
        underlying_return = np.nan

    def _skipped(reason):
        # Result template for trades that cannot be simulated.
        return {
            'pnl_per_share': np.nan, 'option_type': None, 'strike_price': np.nan,
            'entry_premium': np.nan, 'exit_value': np.nan, 'pnl_dollars': np.nan,
            'pnl_pct': np.nan, 'skipped_reason': reason,
            'Underlying_Exit_Price': future_price if pd.notna(future_price) else np.nan,
            'Return_Underlying': underlying_return,
        }

    if pd.isna(current_price) or current_price <= 0:
        return _skipped('Invalid Entry Price')
    if pd.isna(ivol_at_entry) or ivol_at_entry <= 0:
        return _skipped('Invalid IVOL')
    if pd.isna(future_price):
        return _skipped('Missing Future Price')
    if entry_direction not in ['long', 'short']:
        return _skipped('Invalid Entry Direction')

    quoted_in_percent = ivol_at_entry > 1.0
    scaled_ivol = ivol_at_entry / 100.0 if quoted_in_percent else ivol_at_entry
    strike_price = current_price
    if entry_direction == 'long':
        option_type = 'call'
    else:
        option_type = 'put'
    entry_premium = estimate_atm_premium(current_price, scaled_ivol, horizon_days, option_type)

    if option_type == 'call':
        intrinsic = future_price - strike_price
    else:
        intrinsic = strike_price - future_price
    exit_value = max(intrinsic, 0)

    pnl_per_share = exit_value - entry_premium
    pnl_dollars = pnl_per_share * 100
    if entry_premium > 0:
        pnl_pct = (pnl_per_share / entry_premium) * 100
    else:
        pnl_pct = np.nan

    return {
        'pnl_per_share': pnl_per_share, 'option_type': option_type, 'strike_price': strike_price,
        'entry_premium': entry_premium, 'exit_value': exit_value, 'pnl_dollars': pnl_dollars,
        'pnl_pct': pnl_pct, 'skipped_reason': 'None', 'Underlying_Exit_Price': future_price,
        'Return_Underlying': underlying_return,
    }
# --- END HELPER FUNCTIONS ---

# --- 1. Define Feature Specifications ---
# Each spec is a dict: 'type' selects the builder branch, 'assets' lists the
# ticker(s), 'params' carries builder arguments, 'unique_id' becomes the
# feature column name and 'display_name' is the human-readable label.
# unique_id values must be distinct: the builder writes each feature to the
# column named by its unique_id, so a repeated id overwrites the earlier one.
print('\n--- Defining ALL Feature Specifications ---')
feature_specs = []
# Volatility Features
for ticker in all_tickers:
    f60 = '60_Day_Call_Implied_Volatility'
    f10 = '10_Day_Call_Implied_Volatility'
    feature_specs.append({'type': 'ivol_term_structure', 'assets': [ticker], 'params': {'f_long': f60, 'f_short': f10},
                          'unique_id': f'term_structure_{f60}-{f10}__{ticker}', 'display_name': f"diff({f60}, {f10})__{ticker}"})
    put50 = '1st_Month_Put_Imp_Vol_50_Delta'
    call40 = '1st_Month_Call_Imp_Vol_40_Delta'
    feature_specs.append({'type': 'ivol_skew', 'assets': [ticker], 'params': {'put': put50, 'call': call40},
                          'unique_id': f'skew_{put50}-{call40}__{ticker}', 'display_name': f"diff({put50}, {call40})__{ticker}"})
    for suffix in ['IVOL_SIGMA', 'CALL_IMP_VOL_30D', 'PUT_IMP_VOL_30D']:
        # The shock id carries its own prefix. The previous id,
        # 'zscore_{suffix}_30d__{ticker}', was identical to the generic
        # z-score id for IVOL_SIGMA at window 30 defined further down, so
        # the shock column was silently overwritten by the level z-score.
        feature_specs.append({'type': 'ivol_shock', 'assets': [ticker], 'params': {'ivol_suffix': suffix, 'window': 30},
                              'unique_id': f'ivol_shock_zscore_{suffix}_30d__{ticker}', 'display_name': f"ivol_shock_zscore_{suffix}_30d__{ticker}"})
        feature_specs.append({'type': 'ivol_div_volume', 'assets': [ticker], 'params': {'ivol_suffix': suffix, 'vol_suffix': 'VOLUME'},
                              'unique_id': f'div_{suffix}_by_VOLUME__{ticker}', 'display_name': f"div({suffix}, VOLUME)__{ticker}"})
# Deriv Flow & Sentiment Features
for ticker in all_tickers:
    pc_ratio_col = 'PUT_CALL_VOLUME_RATIO_CUR_DAY'
    feature_specs.append({'type': 'put_call_ratio_ema', 'assets': [ticker], 'params': {'span': 5, 'col': pc_ratio_col},
                          'unique_id': f'ema5_{pc_ratio_col}__{ticker}', 'display_name': f"ema5_{pc_ratio_col}__{ticker}"})
    oi_col = 'OPEN_INT_TOTAL_CALL'
    feature_specs.append({'type': 'open_interest_change', 'assets': [ticker], 'params': {'days': 3, 'col': oi_col},
                          'unique_id': f'pct_change_{oi_col}_3d__{ticker}', 'display_name': f"pct_change_{oi_col}_3d__{ticker}"})
    vol_col = 'Volume_-Realtime_VOLUME'
    # NOTE: this id equals the generic z-score id for the same column at
    # window 30 below; both specs pass the same column and window.
    feature_specs.append({'type': 'volume_zscore', 'assets': [ticker], 'params': {'window': 30, 'col': vol_col},
                          'unique_id': f'zscore_{vol_col}_30d__{ticker}', 'display_name': f"zscore_{vol_col}_30d__{ticker}"})
    sm_oi = 'OPEN_INT_TOTAL_CALL'
    sm_ivol = '10_Day_Call_Implied_Volatility'
    feature_specs.append({'type': 'smart_money_flag', 'assets': [ticker], 'params': {'oi_col': sm_oi, 'ivol_col': sm_ivol},
                          'unique_id': f'smart_money_{sm_oi}_{sm_ivol}__{ticker}', 'display_name': f"smart_money(pct_change({sm_oi}) > 0 AND pct_change({sm_ivol}) > 0)__{ticker}"})
# Generic Z-Score feature needed for sequential patterns
for ticker in all_tickers:
    for col in ['PX_LAST', 'IVOL_SIGMA', 'Volume_-Realtime_VOLUME']:
        for window in [30, 60]:
            feature_specs.append({'type': 'generic_zscore', 'assets': [ticker], 'params': {'col': col, 'window': window},
                                  'unique_id': f'zscore_{col}_{window}d__{ticker}', 'display_name': f"zscore({col}, {window}d)__{ticker}"})
# Cross-Asset Correlation Features
price_col = 'PX_LAST'
# De-duplicate pairs while keeping a deterministic order. A set() here made
# the pair order (and therefore the feature column order and the numbering of
# the signals derived from it) depend on the interpreter's string hash seed.
correlation_pairs = list(dict.fromkeys(itertools.combinations(all_tickers, 2)))
for t1, t2 in correlation_pairs:
    # Rolling return correlation at two horizons.
    for window in [20, 60]:
        feature_specs.append({'type': 'correlation', 'assets': [t1, t2], 'params': {'window': window, 'col': price_col},
                              'unique_id': f'corr_{t1}:{price_col}_{t2}:{price_col}_{window}d', 'display_name': f"corr({t1}:{price_col}, {t2}:{price_col}, {window}d)"})
    feature_specs.append({'type': 'correlation_zscore', 'assets': [t1, t2], 'params': {'col': price_col, 'window': 60},
                          'unique_id': f'zscore_corr20d_{t1}:{price_col}_{t2}:{price_col}_60d', 'display_name': f"zscore_corr(20d)({t1}:{price_col}, {t2}:{price_col}, 60d)"})
    feature_specs.append({'type': 'correlation_delta', 'assets': [t1, t2], 'params': {'col': price_col},
                          'unique_id': f'corr_delta_{t1}:{price_col}_{t2}:{price_col}', 'display_name': f"corr_delta(20d-60d)({t1}:{price_col}, {t2}:{price_col})"})
    feature_specs.append({'type': 'rolling_beta', 'assets': [t1, t2], 'params': {'window': 60, 'col': price_col},
                          'unique_id': f'beta_{t1}:{price_col}_{t2}:{price_col}_60d', 'display_name': f"beta({t1}:{price_col}, {t2}:{price_col}, 60d)"})
# Advanced Correlations
# Hand-picked cross-field pairs: each entry correlates field f1 of ticker t1
# with field f2 of ticker t2 over a rolling window of `win` rows.
adv_corr_defs = [
    {'t1': 'QQQ US Equity', 'f1': 'IVOL_SIGMA', 't2': 'SPY US Equity', 'f2': 'IVOL_SIGMA', 'win': 30},
    {'t1': 'TSLA US Equity', 'f1': 'Volume_-Realtime_VOLUME', 't2': 'VIX Index', 'f2': 'IVOL_SIGMA', 'win': 20},
    {'t1': 'CO1 Comdty', 'f1': 'PX_LAST', 't2': 'XLE US Equity', 'f2': 'IVOL_SIGMA', 'win': 30},
    {'t1': 'USGG10YR Index', 'f1': 'PX_LAST', 't2': 'XLF US Equity', 'f2': 'IVOL_SIGMA', 'win': 30}
]
for d in adv_corr_defs:
    feature_specs.append({
        'type': 'advanced_correlation', 'assets': [d['t1'], d['t2']], 'params': {'window': d['win'], 'col1': d['f1'], 'col2': d['f2']},
        'unique_id': f"corr_{d['t1']}:{d['f1']}_{d['t2']}:{d['f2']}_{d['win']}d", 'display_name': f"corr({d['t1']}:{d['f1']}, {d['t2']}:{d['f2']}, {d['win']}d)"})
# Macro Features
# These specs carry no 'params' key; each is identified by its 'type' and the
# order of tickers in 'assets'.
feature_specs.extend([
    {'type': 'macro_mpi', 'assets': ['DXY Curncy', 'USGG10YR Index'], 'unique_id': 'macro_mpi', 'display_name': 'Macro Pressure Index'},
    {'type': 'macro_fear_overdrive', 'assets': ['VIX Index', 'DXY Curncy', 'SPY US Equity'], 'unique_id': 'macro_fear_overdrive', 'display_name': 'Fear Overdrive'},
    {'type': 'macro_sector_rotation', 'assets': ['XLK US Equity', 'XLE US Equity'], 'unique_id': 'macro_xlk_xle_rotation', 'display_name': 'Sector Rotation (XLK-XLE)'},
    {'type': 'macro_yield_spread', 'assets': ['USGG10YR Index', 'USGG2YR Index'], 'unique_id': 'macro_10y2y_spread', 'display_name': 'Yield Spread (10Y-2Y)'},
    {'type': 'macro_cpi_zscore', 'assets': ['CPI YOY Index'], 'unique_id': 'macro_cpi_z', 'display_name': 'CPI Z-Score'},
    {'type': 'macro_injcjc_shock', 'assets': ['INJCJC Index'], 'unique_id': 'macro_jobless_claims_shock', 'display_name': 'Jobless Claims Shock'},
    {'type': 'macro_ffa_spread', 'assets': ['FFA Comdty', 'USGG2YR Index'], 'unique_id': 'macro_ffa_spread', 'display_name': 'Fed Funds Spread'},
    {'type': 'macro_lf94truu_vol_signal', 'assets': ['LF94TRUU Index'], 'unique_id': 'macro_hyg_vol_signal', 'display_name': 'HYG Vol Signal'}])
# Generic per-release macro features: one "mom" spec per ticker in the first
# list, one "chg" spec per ticker in the second.
for t in ['CPI YOY Index', 'CPI CHNG Index', 'NFP TCH Index', 'JOBS US Equity']:
    feature_specs.append({'type': 'macro_generic_mom', 'assets': [t], 'params': {'days': 3}, 'unique_id': f'macro_mom3_{t}', 'display_name': f'Macro Mom3d({t})'})
for t in ['CTII10 Govt', 'USSW10 Curncy', 'MLCX3CRT Index', 'FARBAST Index', 'BSPGCPUS Index', 'SPCSUSA Index', 'SPCS20SM Index', 'CONSSENT Index']:
    feature_specs.append({'type': 'macro_generic_chg', 'assets': [t], 'unique_id': f'macro_chg_{t}', 'display_name': f'Macro Chg({t})'})
# Momentum / Volatility Fractal Features
# price_col is the module-level name assigned in the correlation section above.
for ticker in all_tickers:
    feature_specs.append({'type': 'mom_div_vol', 'assets': [ticker], 'params': {'price_col':price_col, 'mom_win':5, 'vol_win':20},
                          'unique_id': f'mom_div_vol_{price_col}_5d_20d__{ticker}', 'display_name': f"mom_div_vol({price_col}, 5d, 20d)__{ticker}"})
    feature_specs.append({'type': 'bollinger_pctB', 'assets': [ticker], 'params': {'window': 20, 'price_col':price_col},
                          'unique_id': f'pctB_{price_col}_20d__{ticker}', 'display_name': f"%B({price_col}, 20d)__{ticker}"})
    feature_specs.append({'type': 'fractional_differencing', 'assets': [ticker], 'params': {'d': 0.5, 'window': 100, 'price_col':price_col},
                          'unique_id': f'frac_diff_{price_col}_d0.5_100w__{ticker}', 'display_name': f"frac_diff({price_col}, d=0.5, win=100)__{ticker}"})
# Market Regime and Interaction Features
feature_specs.append({'type': 'regime_filter', 'assets': ['VIX Index'], 'params': {'threshold': 25, 'col': 'PX_LAST'},
                      'unique_id': 'REGIME_IS_HIGH_VOL', 'display_name': 'REGIME_IS_HIGH_VOL (VIX > 25)'})
# The interaction spec refers to two other features by their unique_id
# strings, so 'feature1' / 'feature2' must match ids defined elsewhere exactly.
feature_specs.append({'type': 'interaction', 'assets': [],
                      'params': {'feature1': 'zscore_IVOL_SIGMA_30d__AAPL US Equity', 'feature2': 'REGIME_IS_HIGH_VOL'},
                      'unique_id': 'zscore_IVOL_SIGMA_30d__AAPL US Equity_IN_HIGH_VOL',
                      'display_name': 'zscore(IVOL_SIGMA, 30d)__AAPL US Equity IN_HIGH_VOL'})
print(f"Defined {len(feature_specs)} total feature specifications.")

# --- 2. REBUILT Feature Calculation Engine (DEFINITIVE) ---
# Walks feature_specs in order and writes one column per spec (plus a
# "<id>_z" normalized twin for most raw-valued features) into feat_raw.
# Every spec is built inside its own try/except so a missing input column
# only drops that feature. Spec types with no branch here (e.g.
# 'interaction') fall through untouched.
print('--- Building raw feature set... ---')

feat_raw = pd.DataFrame(index=raw.index)
_series_cache = {} # Clear cache before starting

# --- Feature Calculation Loop ---
for spec in feature_specs:
    feature_id = spec['unique_id']
    try:
        # --- Volatility Features ---
        if spec['type'] == 'ivol_term_structure':
            s_long = safe_series(first_col_containing(spec['assets'][0], spec['params']['f_long']))
            s_short = safe_series(first_col_containing(spec['assets'][0], spec['params']['f_short']))
            raw_val = s_long - s_short
            feat_raw[feature_id] = raw_val
            feat_raw[f"{feature_id}_z"] = norm_feature(raw_val, method='z')

        elif spec['type'] == 'ivol_skew':
            s_put = safe_series(first_col_containing(spec['assets'][0], spec['params']['put']))
            s_call = safe_series(first_col_containing(spec['assets'][0], spec['params']['call']))
            raw_val = s_put - s_call
            feat_raw[feature_id] = raw_val
            feat_raw[f"{feature_id}_z"] = norm_feature(raw_val, method='madz')

        elif spec['type'] == 'ivol_shock':
            # Standardized daily change in implied vol: rolling mean for the
            # centre, exponentially weighted std for the scale.
            ivol_s = safe_series(first_col_containing(spec['assets'][0], spec['params']['ivol_suffix']))
            dv = ivol_s.diff()
            std = dv.ewm(span=spec['params']['window'], min_periods=10).std().replace(0, np.nan)
            raw_val = (dv - dv.rolling(spec['params']['window']).mean()) / std
            feat_raw[feature_id] = raw_val

        elif spec['type'] == 'ivol_div_volume':
            # Despite the "div" in the id, this is a difference of two
            # normalized series, not a ratio.
            ivol_s = safe_series(first_col_containing(spec['assets'][0], spec['params']['ivol_suffix']))
            vol_s = safe_series(first_col_containing(spec['assets'][0], spec['params']['vol_suffix']))
            feat_raw[feature_id] = norm_feature(ivol_s) - norm_feature(vol_s)

        # --- Derivatives Flow & Sentiment ---
        elif spec['type'] == 'put_call_ratio_ema':
            pc = safe_series(first_col_containing(spec['assets'][0], spec['params']['col']))
            raw_val = pc.ewm(span=spec['params']['span']).mean()
            feat_raw[feature_id] = raw_val
            feat_raw[f"{feature_id}_z"] = norm_feature(raw_val)

        elif spec['type'] == 'open_interest_change':
            oi = safe_series(first_col_containing(spec['assets'][0], spec['params']['col']))
            raw_val = returns(oi, periods=spec['params']['days'])
            # Winsorize at the 1st / 99th percentile before normalizing.
            clipped_val = raw_val.clip(raw_val.quantile(0.01), raw_val.quantile(0.99))
            feat_raw[feature_id] = clipped_val
            feat_raw[f"{feature_id}_z"] = norm_feature(clipped_val)

        elif spec['type'] == 'volume_zscore':
            vol = safe_series(first_col_containing(spec['assets'][0], spec['params']['col']))
            feat_raw[feature_id] = norm_feature(vol, win=spec['params']['window'])

        elif spec['type'] == 'smart_money_flag':
            oi = safe_series(first_col_containing(spec['assets'][0], spec['params']['oi_col']))
            ivol = safe_series(first_col_containing(spec['assets'][0], spec['params']['ivol_col']))
            oi_up = (returns(oi) > 0)
            iv_up = (returns(ivol) > 0)
            feat_raw[feature_id] = cooccur(oi_up, iv_up, window=2).astype(int)

        # --- Generic & Cross-Asset ---
        elif spec['type'] == 'generic_zscore':
            s = safe_series(first_col_containing(spec['assets'][0], spec['params']['col']))
            feat_raw[feature_id] = norm_feature(s, win=spec['params']['window'])

        elif spec['type'] == 'correlation':
            t1, t2, win = spec['assets'][0], spec['assets'][1], spec['params']['window']
            r1 = returns(safe_series(first_col_containing(t1, 'PX_LAST')))
            r2 = returns(safe_series(first_col_containing(t2, 'PX_LAST')))
            f, r = rolling_corr_fisher(r1, r2, win=win)
            feat_raw[feature_id] = r
            feat_raw[f"{feature_id}_z"] = zscore_rolling(f, win=60)

        elif spec['type'] == 'advanced_correlation':
            t1, t2, win = spec['assets'][0], spec['assets'][1], spec['params']['window']
            s1 = norm_feature(safe_series(first_col_containing(t1, spec['params']['col1'])))
            s2 = norm_feature(safe_series(first_col_containing(t2, spec['params']['col2'])))
            f, r = rolling_corr_fisher(s1, s2, win=win)
            feat_raw[feature_id] = r
            feat_raw[f"{feature_id}_z"] = zscore_rolling(f, win=60)

        elif spec['type'] == 'correlation_zscore':
            t1, t2 = spec['assets']
            r1 = returns(safe_series(first_col_containing(t1, 'PX_LAST')))
            r2 = returns(safe_series(first_col_containing(t2, 'PX_LAST')))
            f_20, _ = rolling_corr_fisher(r1, r2, win=20)
            feat_raw[feature_id] = zscore_rolling(f_20, win=60)

        elif spec['type'] == 'correlation_delta':
            t1, t2 = spec['assets']
            r1 = returns(safe_series(first_col_containing(t1, 'PX_LAST')))
            r2 = returns(safe_series(first_col_containing(t2, 'PX_LAST')))
            f20, _ = rolling_corr_fisher(r1, r2, win=20)
            f60, _ = rolling_corr_fisher(r1, r2, win=60)
            raw_val = f20 - f60
            feat_raw[feature_id] = raw_val
            feat_raw[f"{feature_id}_z"] = norm_feature(raw_val)

        elif spec['type'] == 'rolling_beta':
            t1, t2, win = spec['assets'][0], spec['assets'][1], spec['params']['window']
            r1 = returns(safe_series(first_col_containing(t1, 'PX_LAST')))
            r2 = returns(safe_series(first_col_containing(t2, 'PX_LAST')))
            raw_val = beta_rolling(r1, r2, win=win)
            feat_raw[feature_id] = raw_val
            feat_raw[f"{feature_id}_z"] = norm_feature(raw_val)

        # --- Macro Features (Full Coverage & Correct Logic) ---
        elif spec['type'] == 'macro_mpi':
            dxy_px = safe_series(first_col_containing(spec['assets'][0], 'PX_LAST'))
            ust10_yield = safe_series(first_col_containing(spec['assets'][1], 'PX_LAST'))
            leg1 = norm_feature(returns(dxy_px).rolling(3).sum())
            leg2 = norm_feature(to_bp(ust10_yield).diff().rolling(3).sum())
            feat_raw[feature_id] = leg1 + leg2

        elif spec['type'] == 'macro_fear_overdrive':
            # Integer score 0-3: one point per stressed leg.
            vix_z = norm_feature(safe_series(first_col_containing(spec['assets'][0], 'PX_LAST')))
            dxy_z = norm_feature(returns(safe_series(first_col_containing(spec['assets'][1], 'PX_LAST'))))
            spy_px = safe_series(first_col_containing(spec['assets'][2], 'PX_LAST'))
            spy_z = norm_feature(spy_px - spy_px.rolling(20).mean())
            score = (vix_z > 0.7).astype(int) + (dxy_z > 0.5).astype(int) + (spy_z < -0.5).astype(int)
            feat_raw[feature_id] = score

        elif spec['type'] == 'macro_sector_rotation':
            r_xlk = returns(safe_series(first_col_containing(spec['assets'][0], 'PX_LAST')), periods=5)
            r_xle = returns(safe_series(first_col_containing(spec['assets'][1], 'PX_LAST')), periods=5)
            raw_val = r_xlk - r_xle
            feat_raw[feature_id] = raw_val
            feat_raw[f"{feature_id}_z"] = norm_feature(raw_val)

        elif spec['type'] == 'macro_yield_spread':
            s10y = safe_series(first_col_containing(spec['assets'][0], 'PX_LAST'))
            s2y = safe_series(first_col_containing(spec['assets'][1], 'PX_LAST'))
            raw_val = to_bp(s10y) - to_bp(s2y)
            feat_raw[feature_id] = raw_val
            feat_raw[f"{feature_id}_z"] = norm_feature(raw_val)

        elif spec['type'] == 'macro_cpi_zscore':
            # dropna() first: the window of 12 then counts observations,
            # not calendar rows.
            cpi = safe_series(first_col_containing(spec['assets'][0], 'PX_LAST')).dropna()
            feat_raw[feature_id] = norm_feature(cpi, win=12)

        elif spec['type'] == 'macro_injcjc_shock':
            injcjc = safe_series(first_col_containing(spec['assets'][0], 'PX_LAST')).dropna()
            change = event_delta(injcjc)
            feat_raw[feature_id] = (change > change.rolling(20).std() * 1.5).astype(int)

        elif spec['type'] == 'macro_ffa_spread':
            ffa_z = norm_feature(safe_series(first_col_containing(spec['assets'][0], 'PX_LAST')))
            ust2_z = norm_feature(safe_series(first_col_containing(spec['assets'][1], 'PX_LAST')))
            feat_raw[feature_id] = ffa_z - ust2_z

        elif spec['type'] == 'macro_generic_mom' or spec['type'] == 'macro_generic_chg':
            px = safe_series(first_col_containing(spec['assets'][0], 'PX_LAST'))
            change = event_delta(px)
            feat_raw[feature_id] = change
            feat_raw[f"{feature_id}_z"] = norm_feature(change)

        elif spec['type'] == 'macro_lf94truu_vol_signal':
            s = safe_series(first_col_containing(spec['assets'][0], 'VOLATILITY_30D'))
            # Fall back to realized vol when the vendor field has no data.
            if s.dropna().empty:
                s = get_realized_vol(safe_series(first_col_containing(spec['assets'][0], 'PX_LAST')))
            raw_val = s / s.rolling(60).mean()
            feat_raw[feature_id] = raw_val
            feat_raw[f"{feature_id}_z"] = norm_feature(raw_val)

        # --- Momentum & Fractal ---
        elif spec['type'] == 'mom_div_vol':
            px = safe_series(first_col_containing(spec['assets'][0], 'PX_LAST'))
            # Use the windows declared in the spec (they are also encoded in
            # the feature id, "..._5d_20d"); both were hard-coded to 5 before.
            mom_win = spec['params'].get('mom_win', 5)
            vol_win = spec['params'].get('vol_win', 20)
            mom = returns(px, mom_win)
            ret_vol = returns(px).rolling(vol_win).std().replace(0, np.nan)
            raw_val = mom / ret_vol
            feat_raw[feature_id] = raw_val
            feat_raw[f"{feature_id}_z"] = norm_feature(raw_val)

        elif spec['type'] == 'bollinger_pctB':
            px = safe_series(first_col_containing(spec['assets'][0], 'PX_LAST'))
            bb_win = spec['params'].get('window', 20)
            ma = px.rolling(bb_win).mean()
            std = px.rolling(bb_win).std().replace(0, 1e-9)
            # %B: position of price between the lower (ma - 2 std) and upper
            # (ma + 2 std) bands, clipped to [-0.5, 1.5].
            raw_val = (px - (ma - 2*std)) / (4 * std)
            feat_raw[feature_id] = raw_val.clip(-0.5, 1.5)

        elif spec['type'] == 'fractional_differencing':
            px = safe_series(first_col_containing(spec['assets'][0], 'PX_LAST'))
            feat_raw[feature_id] = frac_diff(px, d=spec['params']['d'], window=spec['params']['window'])

        # --- Regime & Interaction (must be last in this loop) ---
        elif spec['type'] == 'regime_filter':
            px = safe_series(first_col_containing(spec['assets'][0], spec['params']['col']))
            feat_raw[feature_id] = (px > spec['params']['threshold']).astype(int)

    except Exception as e:
        # Deliberate best-effort: report and move on to the next feature.
        print(f"Could not build feature '{feature_id}': {e}")

# --- FIX: SEQUENTIAL & INTERACTION FEATURES (PRE-SHIFT) ---
# Everything in this section is added to feat_raw before the global one-row
# shift further down, so these columns are lagged together with the rest.
print("--- Building sequential & interaction features... ---")
# Interaction features
# Product of two existing feat_raw columns; a spec is skipped without any
# message when either referenced column is missing.
for spec in feature_specs:
    if spec['type'] == 'interaction':
        f1_id, f2_id = spec['params']['feature1'], spec['params']['feature2']
        if f1_id in feat_raw.columns and f2_id in feat_raw.columns:
            feat_raw[spec['unique_id']] = feat_raw[f1_id] * feat_raw[f2_id]

# Sequential features
# Pattern for each flag: event A on the previous row (shift(1)) AND event B on
# the current row. All three flags share one try block, so an exception while
# building one of them also skips the ones after it.
try:
    # --- VIX Spike -> Corr Drop ---
    vix_z = norm_feature(safe_series(first_col_containing('VIX Index', 'PX_LAST')))
    qqq_r = returns(safe_series(first_col_containing('QQQ US Equity', 'PX_LAST')))
    spy_r = returns(safe_series(first_col_containing('SPY US Equity', 'PX_LAST')))
    corr_z = rolling_corr_fisher_z(qqq_r, spy_r)
    # Only build the flag when both inputs contain at least one value.
    if all(s.notna().any() for s in [vix_z, corr_z]):
        event_A = (vix_z.shift(1) > 1.5)
        event_B = (corr_z < -1.5)
        feat_raw['SEQ_VIX_SPIKE_THEN_CORR_DROP'] = (event_A & event_B).astype(int)

    # --- Yield Drop -> Gold Vol Spike ---
    yield_z = norm_feature(safe_series(first_col_containing('USGG10YR Index', 'PX_LAST')))
    gold_vol = get_realized_vol(safe_series(first_col_containing('GLD US Equity', 'PX_LAST')))
    gold_vol_z = norm_feature(gold_vol)
    if all(s.notna().any() for s in [yield_z, gold_vol_z]):
        event_A = (yield_z.shift(1) < -1.5)
        event_B = (gold_vol_z > 1.5)
        feat_raw['SEQ_YIELD_DROP_THEN_GOLD_VOL_SPIKE'] = (event_A & event_B).astype(int)

    # --- NVDA Vol Spike -> QQQ Rise ---
    nvda_vol = safe_series(first_col_containing('NVDA US Equity', 'Volume_-Realtime_VOLUME'))
    nvda_vol_z = norm_feature(nvda_vol)
    qqq_px = safe_series(first_col_containing('QQQ US Equity', 'PX_LAST'))
    qqq_px_z = norm_feature(qqq_px)
    if all(s.notna().any() for s in [nvda_vol_z, qqq_px_z]):
        event_A = (nvda_vol_z.shift(1) > 1.5)
        event_B = (qqq_px_z > 1.5)
        feat_raw['SEQ_NVDA_VOL_SPIKE_THEN_QQQ_PRICE_RISE'] = (event_A & event_B).astype(int)
    print("Successfully built sequential features.")
except Exception as e:
    print(f"Could not build sequential features: {e}")

# --- MONITORING: Run diagnostics before the final shift ---
# The returned report is discarded; coverage_activation prints it itself.
_ = coverage_activation(feat_raw)

# --- FINAL GLOBAL SHIFT ---
# Lag every feature by one row: row t of `feat` holds the value computed from
# data up to row t-1.
feat = feat_raw.shift(1)
_series_cache = {}
print(f"Completed global shift. Final feature shape for GA: {feat.shape}")

# --- 3. Define Primitive Signals from Features ---
# Turns every feature column into boolean trigger series:
#   * 0/1 features      -> one "is true" signal;
#   * all other columns -> top / bottom quintile signals by percentile rank;
#   * columns not already named as z-scores -> additional +/-1.5 rolling
#     (60-observation) z-score signals.
# primitive_signals holds the definitions, signal_series maps signal_id to
# the boolean Series (indexed by the feature's non-NaN rows only).
print('--- Defining Primitive Signals ---')
primitive_signals = []
signal_series = {}
signal_id_counter = 0
for feature_id in feat.columns:
    s = feat[feature_id].replace([np.inf, -np.inf], np.nan).dropna()
    if s.empty: continue

    is_boolean_like = set(s.unique()).issubset({0, 1, True, False})

    if is_boolean_like:
        # A flag that never changes carries no information.
        if s.std() == 0: continue
        sig_id = f"SIG_{signal_id_counter}"; signal_id_counter += 1
        primitive_signals.append({'signal_id': sig_id, 'feature_id': feature_id, 'condition_type': 'boolean', 'operator': '==', 'value': True})
        signal_series[sig_id] = (s == True)
        continue

    if s.std() == 0: continue

    # Generate percentile signals
    # NOTE(review): the rank is taken over the whole sample, so each row's
    # percentile depends on later observations as well -- confirm that this
    # is acceptable for the evaluation that consumes these signals.
    pct_rank = s.rank(pct=True)  # computed once, compared vectorized
    for op, val in [('>', 0.8), ('<', 0.2)]:
        sig_id = f"SIG_{signal_id_counter}"; signal_id_counter += 1
        primitive_signals.append({'signal_id': sig_id, 'feature_id': feature_id, 'condition_type': 'percentile', 'operator': op, 'value': val})
        signal_series[sig_id] = (pct_rank > val) if op == '>' else (pct_rank < val)

    # Generate z-score signals
    if not ('_z' in feature_id or 'zscore' in feature_id): # Only generate z-signals for non-z features
        rolling_std = s.rolling(60).std()
        valid_std_mask = rolling_std > 1e-9
        if not valid_std_mask.any(): continue

        # Rows without a usable rolling std stay NaN and therefore never fire.
        z = pd.Series(np.nan, index=s.index)
        z[valid_std_mask] = (s - s.rolling(60).mean())[valid_std_mask] / rolling_std[valid_std_mask]

        for op, val in [('>', 1.5), ('<', -1.5)]:
            sig_id = f"SIG_{signal_id_counter}"; signal_id_counter += 1
            primitive_signals.append({'signal_id': sig_id, 'feature_id': feature_id, 'condition_type': 'z_score', 'operator': op, 'value': val})
            signal_series[sig_id] = (z > val) if op == '>' else (z < val)

print(f"Defined {len(primitive_signals)} primitive signals.")

# --- Prepare Returns for Evaluation ---
# Resolve the PX_LAST column of every tradable ticker (tickers without one
# are dropped) and pre-compute forward returns for each holding horizon:
# pct_change(h).shift(-h) puts the return from row t to row t+h on row t.
# NOTE(review): the assignment to `returns` below rebinds that module-level
# name to a dict. The feature-building code above calls `returns(...)` as a
# function, so any call made after this point (or a re-run of only the
# feature section) would fail -- confirm no later code relies on the helper.
price_cols_for_returns = [first_col_containing(t, 'PX_LAST') for t in TRADABLE_TICKERS if first_col_containing(t, 'PX_LAST')]
prices = raw[price_cols_for_returns].copy()
returns = {h: prices.pct_change(h).shift(-h) for h in [1, 3, 5, 10, 21]}

# --- 4. GENETIC ALGORITHM: Evolve Powerful Setups ---

# --- GENETIC ALGORITHM HELPERS ---
def get_setup_dna(setup):
    """Return an order-independent, hashable key for a setup.

    The key is the tuple of the setup's signal ids in sorted order, so two
    setups with the same signals in a different order share one key.
    """
    signal_ids = [definition['signal_id'] for definition in setup['signal_definitions']]
    signal_ids.sort()
    return tuple(signal_ids)

def crossover(parent1, parent2):
    """Creates a new child setup by combining DNA from two parents.

    One signal is drawn from each parent (two from each when both parents
    have more than one signal), duplicates are removed by signal id, and the
    result is trimmed to ``max(SETUP_LENGTHS_TO_EXPLORE)``.

    When de-duplication leaves fewer than ``min(SETUP_LENGTHS_TO_EXPLORE)``
    signals (both draws hit the same signal), the child is topped up with
    other signals from the parents where any are available, so crossover
    does not collapse a setup below the configured minimum length.

    Args:
        parent1, parent2: Setup dicts with a 'signal_definitions' list.

    Returns:
        New setup dict with id 'child'. The signal definition dicts are
        shared with the parents, not copied.
    """
    genes1 = list(parent1['signal_definitions'])
    genes2 = list(parent2['signal_definitions'])

    picks = [random.choice(genes1), random.choice(genes2)]
    if len(genes1) > 1 and len(genes2) > 1:
        picks.append(random.choice(genes1))
        picks.append(random.choice(genes2))
    # Keyed by signal_id: removes duplicates, keeps first-seen order.
    child_signals = list({s['signal_id']: s for s in picks}.values())

    min_len = min(SETUP_LENGTHS_TO_EXPLORE)
    max_len = max(SETUP_LENGTHS_TO_EXPLORE)

    if len(child_signals) < min_len:
        used_ids = {s['signal_id'] for s in child_signals}
        spare = list({
            s['signal_id']: s for s in genes1 + genes2
            if s['signal_id'] not in used_ids
        }.values())
        random.shuffle(spare)
        child_signals.extend(spare[:min_len - len(child_signals)])

    if len(child_signals) > max_len:
        child_signals = random.sample(child_signals, max_len)
    return {'id': 'child', 'signal_definitions': child_signals}


def mutate(setup, all_signal_ids, mutation_rate):
    """With probability ``mutation_rate``, swap one signal of the setup.

    A random position and a random replacement signal id are drawn; the
    replacement definition is looked up in ``primitive_signals``. When the
    replacement is already part of the setup nothing is changed.

    Args:
        setup: Setup dict; its 'signal_definitions' list is modified in place.
        all_signal_ids: Sequence of candidate signal ids.
        mutation_rate: Probability of attempting a mutation.

    Returns:
        The same ``setup`` object.
    """
    if not (random.random() < mutation_rate):
        return setup

    genes = setup['signal_definitions']
    slot = random.randint(0, len(genes) - 1)
    candidate_id = random.choice(all_signal_ids)
    candidate = next(p for p in primitive_signals if p['signal_id'] == candidate_id)

    already_present = candidate['signal_id'] in [g['signal_id'] for g in genes]
    if not already_present:
        genes[slot] = candidate
    return setup

def _dominates(objs_a, objs_b):
    """True when objs_a is no worse on every objective and better on one (maximization)."""
    no_worse = all(a >= b for a, b in zip(objs_a, objs_b))
    return no_worse and any(a > b for a, b in zip(objs_a, objs_b))


def non_dominated_sort(population):
    """Performs non-dominated sorting on the population.

    All objectives are maximized and any number of objectives is supported.
    Each individual (a dict with an 'objectives' sequence) gets three keys
    written in place: 'rank' (1 = first Pareto front), 'dominated_solutions'
    (the individuals it dominates) and 'domination_count' (a working counter
    that is decremented while the fronts are peeled off).

    Args:
        population: List of individual dicts.

    Returns:
        New list with the same individuals ordered front by front.
    """
    for ind1 in population:
        ind1['domination_count'] = 0
        ind1['dominated_solutions'] = []
        for ind2 in population:
            if ind1 is ind2:
                continue
            if _dominates(ind1['objectives'], ind2['objectives']):
                ind1['dominated_solutions'].append(ind2)
            elif _dominates(ind2['objectives'], ind1['objectives']):
                ind1['domination_count'] += 1

    fronts = []
    rank = 1
    front1 = [ind for ind in population if ind['domination_count'] == 0]
    for ind in front1:
        ind['rank'] = rank
    current_front = front1
    while current_front:
        fronts.append(current_front)
        next_front = []
        for ind1 in current_front:
            for ind2 in ind1['dominated_solutions']:
                # ind2 joins the next front once every individual that
                # dominates it has been placed in an earlier front.
                ind2['domination_count'] -= 1
                if ind2['domination_count'] == 0:
                    ind2['rank'] = rank + 1
                    next_front.append(ind2)
        rank += 1
        current_front = next_front
    sorted_population = [ind for front in fronts for ind in front]
    return sorted_population

def calculate_crowding_distance(front):
    """Assign a ``'crowding_distance'`` to every individual of one Pareto front.

    For each objective the front is sorted in place (ascending); the two
    boundary individuals receive an infinite distance and every interior
    individual accumulates the normalised gap between its two neighbours.
    Objectives on which the whole front is tied contribute nothing beyond
    the boundary marking. The function returns nothing; ``front`` is left
    sorted by its last objective.
    """
    if not front:
        return

    for member in front:
        member['crowding_distance'] = 0

    objective_count = len(front[0]['objectives'])
    for obj_idx in range(objective_count):
        front.sort(key=lambda member: member['objectives'][obj_idx])

        lowest, highest = front[0], front[-1]
        lowest['crowding_distance'] = float('inf')
        highest['crowding_distance'] = float('inf')

        obj_min = lowest['objectives'][obj_idx]
        obj_max = highest['objectives'][obj_idx]
        if obj_max == obj_min:
            # Degenerate objective: no spread to normalise by.
            continue

        spread = obj_max - obj_min
        # Walk interior members together with their left/right neighbours.
        for before, current, after in zip(front, front[1:], front[2:]):
            gap = after['objectives'][obj_idx] - before['objectives'][obj_idx]
            current['crowding_distance'] += gap / spread

def selection_operator(population, k=2):
    """Select one parent by k-way tournament selection.

    ``k`` distinct contenders are drawn at random and compared on
    ``'rank'`` (lower wins); ties on rank are broken by
    ``'crowding_distance'`` (larger wins). With the default ``k=2`` this is
    the classic binary tournament, including its tie behaviour: when two
    contenders are equal on both criteria the second one drawn is returned.

    Args:
        population: list of individual dicts carrying ``'rank'`` and, when
            ranks can tie, ``'crowding_distance'``.
        k: tournament size. Clamped to ``[1, len(population)]`` so that a
            population smaller than ``k`` no longer crashes the draw.

    Returns:
        The winning individual (the same dict object held in ``population``).

    Raises:
        ValueError: if ``population`` is empty.
    """
    if not population:
        raise ValueError("Cannot select a parent from an empty population.")

    tournament_size = min(max(k, 1), len(population))
    contenders = random.sample(population, tournament_size)

    winner = contenders[0]
    for challenger in contenders[1:]:
        if challenger['rank'] < winner['rank']:
            winner = challenger
        elif winner['rank'] < challenger['rank']:
            continue
        elif not winner['crowding_distance'] > challenger['crowding_distance']:
            # Equal rank: the challenger wins unless the incumbent is
            # strictly less crowded.
            winner = challenger
    return winner

def generate_english_description(setup_id, signal_defs, feature_specs_list):
    """Build a one-sentence English description of a setup.

    Args:
        setup_id: identifier copied into the result.
        signal_defs: list of signal dicts. Each needs ``'condition_type'``
            and ``'feature_id'``; non-boolean signals also need
            ``'operator'`` (``'>'`` is read as the "high/up" side, anything
            else as the "low/down" side).
        feature_specs_list: list of feature spec dicts with ``'unique_id'``
            and ``'display_name'``; used to turn a feature id into a
            readable name. When no spec matches, the raw feature id is used.

    Returns:
        ``{'setup_id', 'description', 'explained_description'}``.

    The stated bias follows the same rule ``evaluate_one_setup`` uses to pick
    the trade direction: only non-boolean signals vote (+1 for ``'>'``,
    -1 otherwise) and a setup made solely of boolean signals is treated as
    bullish. Boolean signals therefore do not need an ``'operator'`` key.
    """
    clauses = []
    for s_def in signal_defs:
        feat_name = next(
            (f_spec['display_name'] for f_spec in feature_specs_list
             if f_spec['unique_id'] == s_def['feature_id']),
            s_def.get('feature_id', 'unknown_feature'),
        )
        condition = s_def['condition_type']
        if condition == 'boolean':
            clauses.append(f"{feat_name} is true")
        elif condition == 'percentile':
            level = "is very high" if s_def['operator'] == '>' else "is very low"
            clauses.append(f"{feat_name} {level}")
        else:
            level = "surges unexpectedly" if s_def['operator'] == '>' else "drops sharply"
            clauses.append(f"{feat_name} {level}")

    if clauses:
        description = f"When {' and '.join(clauses)}"
    else:
        # An empty setup has nothing to describe; avoid indexing clauses[0].
        description = "When no conditions are specified"

    directional = [s for s in signal_defs if s['condition_type'] != 'boolean']
    if directional:
        direction_score = sum(1 if s['operator'] == '>' else -1 for s in directional)
    else:
        # All-boolean setups are traded long; no signals at all is undecided.
        direction_score = 1 if signal_defs else 0

    if direction_score > 0:
        bias = 'a bullish'
    elif direction_score < 0:
        bias = 'a bearish'
    else:
        bias = 'an uncertain'
    description += f", it may indicate {bias} outlook."
    return {'setup_id': setup_id, 'description': description, 'explained_description': "DEPRECATED"}

def evaluate_one_setup(setup, returns_dict):
    """Score one setup: objectives, per-ticker metrics and trigger dates.

    The setup's signal masks (looked up in the module-level ``signal_series``)
    are AND-ed together to obtain the trigger dates. Setups that cannot be
    scored come back with sentinel objectives ``(-99, -99, support)``.
    Otherwise the 10-period returns of every column in
    ``price_cols_for_returns`` are sampled on the trigger dates (sign-flipped
    for short setups) and the median Sortino and Calmar ratios, both capped
    at 100, form the objectives together with the support.

    Args:
        setup: dict with ``'id'`` and ``'signal_definitions'``.
        returns_dict: mapping ``horizon -> DataFrame`` of returns.

    Returns:
        A result dict; scored setups additionally carry ``'entry_direction'``,
        ``'first_trigger_date'`` and ``'last_trigger_date'``.
    """
    sid = setup['id']
    signal_defs = setup['signal_definitions']
    CAP_VALUE = 100.0  # upper bound applied to both risk ratios

    def _rejected(support_value, trigger_dates):
        # Sentinel record for a setup that is not scored.
        return {
            'id': sid, 'signal_definitions': signal_defs,
            'objectives': (-99, -99, support_value),
            'metrics_by_ticker': {}, 'trigger_dates': trigger_dates,
        }

    def _capped_median(values):
        # Median of the ratios with NaN/inf sanitised, then clipped from above.
        raw_median = np.median(values) if values else -99
        cleaned = np.nan_to_num(raw_median, nan=-99.0, posinf=CAP_VALUE, neginf=-999.0)
        return min(cleaned, CAP_VALUE)

    try:
        component_masks = [signal_series[s['signal_id']] for s in signal_defs]
        mask = functools.reduce(lambda left, right: left & right, component_masks)
        dates = mask[mask].index
    except (KeyError, TypeError):
        return _rejected(0, pd.Index([]))

    support = len(dates)
    if support < MIN_INITIAL_SUPPORT_FILTER:
        return _rejected(support, pd.Index([]))

    # Only non-boolean signals vote on direction; '>' votes long, anything
    # else votes short.
    directional = [s for s in signal_defs if s['condition_type'] != 'boolean']
    direction_score = sum(1 if s['operator'] == '>' else -1 for s in directional)
    if directional:
        if direction_score == 0:
            # Votes cancel out: no tradable direction.
            return _rejected(support, dates)
    else:
        # Purely boolean setups default to long.
        direction_score = 1
    entry_direction = 'long' if direction_score > 0 else 'short'

    perf_horizon = 10
    all_sortinos = []
    all_calmars = []
    metrics_by_ticker = {}

    for tk_col in price_cols_for_returns:
        tk_symbol = next((ticker for ticker in TRADABLE_TICKERS if tk_col.startswith(ticker)), "Unknown")
        r_ticker = returns_dict[perf_horizon][tk_col].reindex(dates).dropna()
        if entry_direction == 'short':
            r_ticker = -r_ticker
        if len(r_ticker) < 5 or not (r_ticker.std() > 1e-9):
            continue
        sortino = calculate_sortino_ratio(r_ticker)
        calmar = calculate_calmar_ratio(r_ticker)
        all_sortinos.append(sortino)
        all_calmars.append(calmar)
        metrics_by_ticker[tk_symbol] = {'sortino': sortino, 'calmar': calmar}

    median_sortino = _capped_median(all_sortinos)
    median_calmar = _capped_median(all_calmars)

    has_dates = not dates.empty
    return {
        'id': sid,
        'signal_definitions': signal_defs,
        'objectives': (median_sortino, median_calmar, support),
        'metrics_by_ticker': metrics_by_ticker,
        'entry_direction': entry_direction,
        'trigger_dates': dates,
        'first_trigger_date': dates.min() if has_dates else pd.NaT,
        'last_trigger_date': dates.max() if has_dates else pd.NaT,
    }

# --- Robust Initial Population Generator (v3 - Pre-Validation) ---
# Module-level state shared by the seeding code and the evolutionary loop below.
print('\n--- GENETIC ALGORITHM: Creating Initial Population (Generation 0) ---')
# Every primitive signal id; sampled from when building random setups.
all_signal_ids = [s['signal_id'] for s in primitive_signals]
# Setups accepted so far (each a dict with 'id' and 'signal_definitions').
current_population = []
# Monotonic counter used to mint setup ids of the form 'S0001'.
setup_id_counter = 0
# DNA keys (from get_setup_dna) of accepted setups, used to reject duplicates.
existing_dna = set()
# Returns horizon read by is_setup_viable; note evaluate_one_setup uses its
# own local perf_horizon of 10.
perf_horizon = 21

def is_setup_viable(signal_defs, min_trades=5):
    """Cheap pre-check that a setup can produce enough trades to be scored.

    The signal masks are AND-ed into trigger dates. The setup is viable when
    it has at least ``MIN_INITIAL_SUPPORT_FILTER`` trigger dates and at least
    one price column has ``min_trades`` or more non-missing returns (at the
    module-level ``perf_horizon``) on those dates. Any lookup or type error
    counts as "not viable".
    """
    try:
        combined = functools.reduce(
            lambda left, right: left & right,
            [signal_series[d['signal_id']] for d in signal_defs],
        )
        trigger_dates = combined[combined].index
        if len(trigger_dates) < MIN_INITIAL_SUPPORT_FILTER:
            return False
        # Stops at the first column with enough usable observations.
        return any(
            len(returns[perf_horizon][col].reindex(trigger_dates).dropna()) >= min_trades
            for col in price_cols_for_returns
        )
    except (KeyError, TypeError):
        return False

# Stage 1: seed up to 80% of the population with setups that pass
# is_setup_viable, first single-signal setups, then random pairs.
print("Seeding with pre-validated single and pair setups...")
num_to_create = int(POPULATION_SIZE * 0.8)
max_attempts = len(primitive_signals) * 5
attempts = 0

# Single-signal setups, taken in primitive_signals order until the quota is hit.
for p_signal in primitive_signals:
    if len(current_population) >= num_to_create: break
    if is_setup_viable([p_signal]):
        setup = {'id': f'S{setup_id_counter:04d}', 'signal_definitions': [p_signal]}
        dna = get_setup_dna(setup)
        if dna not in existing_dna:
            current_population.append(setup); existing_dna.add(dna); setup_id_counter += 1

# Random pairs fill whatever is left of the quota, bounded by max_attempts.
while len(current_population) < num_to_create and attempts < max_attempts:
    attempts += 1
    p_signal_1, p_signal_2 = random.choice(primitive_signals), random.choice(primitive_signals)
    # The two draws are independent, so the same signal can come up twice.
    if p_signal_1['signal_id'] == p_signal_2['signal_id']: continue
    sig_defs = [p_signal_1, p_signal_2]
    if is_setup_viable(sig_defs):
        setup = {'id': f'S{setup_id_counter:04d}', 'signal_definitions': sig_defs}
        dna = get_setup_dna(setup)
        if dna not in existing_dna:
            current_population.append(setup); existing_dna.add(dna); setup_id_counter += 1

print(f"  - Created {len(current_population)} pre-validated setups.")

# Stage 2: top up to POPULATION_SIZE with random setups of 2 or 3 signals
# (lengths come from SETUP_LENGTHS_TO_EXPLORE).
print("Filling remainder of population with pre-validated random setups...")
max_attempts, attempts = POPULATION_SIZE * 100, 0
while len(current_population) < POPULATION_SIZE and attempts < max_attempts:
    attempts += 1
    k = random.choice(SETUP_LENGTHS_TO_EXPLORE)
    sig_id_list = random.sample(all_signal_ids, k)
    # Resulting definitions follow primitive_signals order, not sample order.
    sig_defs = [p for p in primitive_signals if p['signal_id'] in sig_id_list]
    dna = get_setup_dna({'signal_definitions': sig_defs})
    # Duplicate check comes first so the viability test is skipped for known DNA.
    if dna in existing_dna: continue
    if is_setup_viable(sig_defs):
        temp_setup = {'id': f'S{setup_id_counter:04d}', 'signal_definitions': sig_defs}
        current_population.append(temp_setup); existing_dna.add(dna); setup_id_counter += 1

# NOTE(review): this warning also fires if the population was completed on
# exactly the last permitted attempt.
if attempts >= max_attempts: print(f"Warning: Population filling stopped after {max_attempts} attempts.")
if not current_population: raise SystemExit("FATAL: Could not create any viable setups.")
print(f"Created initial population of {len(current_population)} guaranteed viable setups.")

# --- The Main Evolutionary Loop (Tuned) ---
# Per generation: evaluate the current population; from the second generation
# on, breed POPULATION_SIZE children and evaluate them too; drop duplicate
# phenotypes (identical trigger dates); keep the best POPULATION_SIZE
# individuals by Pareto rank, then crowding distance; finally fold the first
# front into the hall of fame.
hall_of_fame = []
for generation in range(NUM_GENERATIONS):
    print(f"\n--- Evaluating Generation {generation + 1}/{NUM_GENERATIONS} ---")
    # Hand the workers only the genotype. Survivors of a previous generation
    # also carry 'dominated_solutions' references to other individuals, which
    # joblib would otherwise have to serialise along with every task;
    # evaluate_one_setup reads nothing but 'id' and 'signal_definitions'.
    evaluated_population = Parallel(n_jobs=-1)(
        delayed(evaluate_one_setup)(
            {'id': setup['id'], 'signal_definitions': setup['signal_definitions']}, returns
        )
        for setup in current_population
    )
    combined_population = list(evaluated_population)

    if generation > 0:
        children = []
        ranked_population = non_dominated_sort(evaluated_population)
        if ranked_population:
            # Tournament selection needs a crowding distance on every parent.
            front_num = 1
            while True:
                current_front = [ind for ind in ranked_population if ind.get('rank') == front_num]
                if not current_front: break
                calculate_crowding_distance(current_front)
                front_num += 1
            while len(children) < POPULATION_SIZE:
                parent1, parent2 = selection_operator(ranked_population), selection_operator(ranked_population)
                child = crossover(parent1, parent2)
                child = mutate(child, all_signal_ids, MUTATION_RATE)
                child['id'] = f'S{setup_id_counter:04d}'; setup_id_counter += 1
                children.append(child)
            evaluated_children = Parallel(n_jobs=-1)(
                delayed(evaluate_one_setup)(
                    {'id': setup['id'], 'signal_definitions': setup['signal_definitions']}, returns
                )
                for setup in children
            )
            combined_population.extend(evaluated_children)

    if generation > 0 and combined_population:
        # One individual per phenotype; individuals without trigger dates are
        # dropped here. On a clash the later entry (children come last) wins.
        phenotype_dict = {tuple(ind['trigger_dates']): ind for ind in combined_population if ind.get('trigger_dates') is not None and not ind['trigger_dates'].empty}
        unique_phenotype_population = list(phenotype_dict.values())
    else:
        unique_phenotype_population = combined_population

    if not unique_phenotype_population:
        print("Population extinct. Stopping."); break

    # Environmental selection: take whole fronts while they fit, then fill the
    # remaining slots from the next front, least crowded first.
    sorted_population = non_dominated_sort(unique_phenotype_population)
    next_generation_population = []
    front_num = 1
    while len(next_generation_population) < POPULATION_SIZE:
        current_front = [ind for ind in sorted_population if ind['rank'] == front_num]
        if not current_front: break
        calculate_crowding_distance(current_front)
        if len(next_generation_population) + len(current_front) <= POPULATION_SIZE:
            next_generation_population.extend(current_front)
        else:
            current_front.sort(key=lambda x: x['crowding_distance'], reverse=True)
            num_needed = POPULATION_SIZE - len(next_generation_population)
            next_generation_population.extend(current_front[:num_needed])
        front_num += 1

    if not next_generation_population:
        print(f"Warning: Could not form next generation. Stopping."); break
    current_population = next_generation_population

    # Must be collected before the hall-of-fame sort below, which re-ranks
    # these same individuals against the archive.
    current_best_front = [ind for ind in sorted_population if ind['rank'] == 1]
    if current_best_front:
        hall_of_fame_candidates = non_dominated_sort(hall_of_fame + current_best_front)
        # Keep only non-dominated candidates BEFORE de-duplicating by
        # phenotype. The sorted list puts rank 1 first, so de-duplicating
        # first would let a dominated individual with the same trigger dates
        # overwrite the non-dominated one and then be filtered out, losing
        # that phenotype from the archive altogether.
        hof_pheno_dict = {
            tuple(ind['trigger_dates']): ind
            for ind in hall_of_fame_candidates
            if ind.get('rank') == 1
            and ind.get('trigger_dates') is not None
            and not ind['trigger_dates'].empty
        }
        hall_of_fame = list(hof_pheno_dict.values())

    if hall_of_fame:
        hall_of_fame.sort(key=lambda x: x['objectives'][0], reverse=True)
        best_of_gen = hall_of_fame[0]
        print(f"Generation {generation + 1} Complete. Unique Phenotypes: {len(unique_phenotype_population)}. Hall of Fame: {len(hall_of_fame)}. Best: (S:{best_of_gen['objectives'][0]:.2f}, C:{best_of_gen['objectives'][1]:.2f}, Sup:{best_of_gen['objectives'][2]})")
    else:
        print(f"Generation {generation + 1} Complete. No valid solutions in Hall of Fame.")

# --- Final Evaluation and Multi-Objective Report ---
# Turns the hall of fame into three artefacts: a per-setup summary CSV, a
# per-trade option-simulation ledger CSV, and a JSON copy of the summary.
print("\n--- Genetic Algorithm Complete. Generating Final Report from Hall of Fame ---")
if not hall_of_fame:
    print("Discovery complete. The Hall of Fame is empty; no valid setups were found.")
else:
    # Round-tripping through a DataFrame de-duplicates by setup id; keys that
    # are missing on some individuals come back as NaN in the records.
    final_pareto_front_df = pd.DataFrame(hall_of_fame).drop_duplicates(subset=['id'], keep='first')
    final_pareto_front = final_pareto_front_df.to_dict('records')
    print(f"Final unique Pareto Front contains {len(final_pareto_front)} non-dominated solutions.")
    print(f"\n--- Performing deep dive on {len(final_pareto_front)} unique solutions ---")

    all_trade_ledger_rows, all_description_records, summary_rows = [], [], []

    for setup_solution in final_pareto_front:
        setup_id, setup_def = setup_solution['id'], setup_solution['signal_definitions']
        all_description_records.append(generate_english_description(setup_id, setup_def, feature_specs))
        dates = pd.Index(setup_solution['trigger_dates'])

        # Ticker with the highest Sortino among those scored for this setup.
        best_ticker, best_sortino = "N/A", -999
        if setup_solution.get('metrics_by_ticker'):
            for ticker, metrics in setup_solution['metrics_by_ticker'].items():
                if metrics.get('sortino', -999) > best_sortino:
                    best_sortino, best_ticker = metrics['sortino'], ticker

        # Median annualised Sharpe over the last RECENCY_WINDOW triggers,
        # computed per tradable ticker on 10-period returns.
        recency_sharpe = np.nan
        if len(dates) >= RECENCY_WINDOW:
            recent_dates, h, recent_sharpes = dates[-RECENCY_WINDOW:], 10, []
            for tk_symbol in TRADABLE_TICKERS:
                 price_col_name = first_col_containing(tk_symbol, 'PX_LAST')
                 if price_col_name:
                    r_recent = returns[h][price_col_name].reindex(recent_dates).dropna()
                    if setup_solution['entry_direction'] == 'short': r_recent = -r_recent
                    if r_recent.std() > 1e-9 and len(r_recent) > 2:
                        recent_sharpes.append((r_recent.mean() / r_recent.std()) * np.sqrt(252/h))
            if recent_sharpes: recency_sharpe = np.nanmedian(recent_sharpes)

        summary_rows.append({
            'setup_id': setup_id, 'rank': setup_solution['rank'], 'best_performing_ticker': best_ticker,
            'obj_sortino': setup_solution['objectives'][0], 'obj_calmar': setup_solution['objectives'][1],
            'obj_support': setup_solution['objectives'][2], 'entry_direction': setup_solution['entry_direction'],
            'first_trigger_date': setup_solution.get('first_trigger_date'), 'last_trigger_date': setup_solution.get('last_trigger_date'),
            'recency_sharpe': recency_sharpe,
        })

        # Trade ledger: one simulated option trade per (ticker, trigger date,
        # horizon). The exit is the first available row on or after
        # trigger date + h_opt calendar days.
        for tk_symbol in TRADABLE_TICKERS:
            tk_col = first_col_containing(tk_symbol, 'PX_LAST')
            if not tk_col: continue
            ivol_col = (first_col_containing(tk_symbol, '30_Day_Call_Implied_Volatility') or first_col_containing(tk_symbol, 'IVOL_SIGMA'))
            ivol_series = raw[ivol_col].reindex(dates) if ivol_col and ivol_col in raw.columns else pd.Series(np.nan, index=dates)
            entry_px_series = raw[tk_col].reindex(dates)
            for d in dates:
                entry_px = entry_px_series.loc[d]
                ivol = ivol_series.loc[d] if not ivol_series.empty and pd.notna(d) and d in ivol_series.index else np.nan
                for h_opt in OPTION_SIM_HORIZONS_DAYS:
                    exit_date = d + pd.Timedelta(days=h_opt)
                    future_px_series = raw.loc[raw.index >= exit_date, tk_col]
                    final_exit_px = future_px_series.iloc[0] if not future_px_series.empty else np.nan
                    pnl_detail = simulate_option_pnl_detailed(entry_px, final_exit_px, ivol, h_opt, setup_solution['entry_direction'])
                    all_trade_ledger_rows.append({'setup_id': setup_id, 'trigger_date': d, 'target_ticker': tk_symbol, 'horizon_days': h_opt, **pnl_detail})

    summary_df = pd.DataFrame(summary_rows)
    trade_ledger_df = pd.DataFrame(all_trade_ledger_rows)
    description_df = pd.DataFrame(all_description_records).drop_duplicates(subset=['setup_id'])

    # Average simulated option P&L per setup, one column pair per horizon.
    if not trade_ledger_df.empty:
        for h_opt in OPTION_SIM_HORIZONS_DAYS:
            pnl_dollars_map = trade_ledger_df[trade_ledger_df['horizon_days'] == h_opt].groupby('setup_id')['pnl_dollars'].mean()
            pnl_pct_map = trade_ledger_df[trade_ledger_df['horizon_days'] == h_opt].groupby('setup_id')['pnl_pct'].mean()
            summary_df[f'avg_option_pnl_dollars_{h_opt}d'] = summary_df['setup_id'].map(pnl_dollars_map)
            summary_df[f'avg_option_pnl_pct_{h_opt}d'] = summary_df['setup_id'].map(pnl_pct_map)

    numeric_cols = summary_df.select_dtypes(include=np.number).columns
    summary_df[numeric_cols] = summary_df[numeric_cols].round(4)
    if not trade_ledger_df.empty:
        trade_ledger_df = trade_ledger_df.round({'pnl_pct': 4, 'pnl_dollars': 2})

    final_summary_df = pd.merge(summary_df, description_df[['setup_id', 'description']], on='setup_id', how='left')
    final_summary_df.sort_values(by=['obj_sortino', 'obj_calmar', 'obj_support'], ascending=[False, False, False], inplace=True)

    print('\n--- Generating Final Output Files ---')
    final_summary_df.to_csv('pareto_front_summary.csv', index=False)
    print("Saved 'pareto_front_summary.csv'")
    trade_ledger_df.to_csv('pareto_front_trade_ledger.csv', index=False)
    print("Saved 'pareto_front_trade_ledger.csv'")

    print("\n--- Generating Final JSON and Summary ---")
    top_setups_for_json = final_summary_df.copy()
    top_setups_for_json.replace({np.nan: None, pd.NaT: None}, inplace=True)
    date_cols = ['first_trigger_date', 'last_trigger_date']
    for col in date_cols:
        if col in top_setups_for_json.columns:
            # Format dates element-wise and assign the column back. The
            # previous chained `df[col].replace(..., inplace=True)` acts on a
            # temporary Series, and it looked for NaT after strftime had
            # already turned missing dates into NaN, so they could reach the
            # JSON as the invalid token NaN instead of null.
            parsed_dates = pd.to_datetime(top_setups_for_json[col], errors='coerce')
            top_setups_for_json[col] = pd.Series(
                [stamp.strftime('%Y-%m-%d') if pd.notna(stamp) else None for stamp in parsed_dates],
                index=top_setups_for_json.index, dtype=object,
            )
    top_setups_json = top_setups_for_json.to_dict(orient='records')
    json_path = 'pareto/pareto_front_setups.json'
    # The target sub-directory is not created anywhere else; without this the
    # open() below fails on a fresh working directory.
    os.makedirs(os.path.dirname(json_path), exist_ok=True)
    with open(json_path, 'w') as f:
        json.dump(top_setups_json, f, indent=2)
    print("Saved 'pareto_front_setups.json'")

    print('\nDiscovery complete.')
    print("\nSolutions on the Final Pareto Front (sorted by Sortino):")
    display_cols = ['setup_id', 'rank', 'best_performing_ticker', 'obj_sortino', 'obj_calmar', 'obj_support', 'recency_sharpe', 'description']
    print(final_summary_df[display_cols].head(15).to_string())


--- RUNNING IN SINGLE TICKER MODE FOR: QQQ US Equity ---
Loading raw workbooks…
Raw shape: (1978, 568)

Identified all relevant prefixes/tickers for feature engineering: 33

--- Defining ALL Feature Specifications ---
Defined 3359 total feature specifications.
--- Building raw feature set... ---
Could not build feature 'zscore_PX_LAST_30d__AAPL US Equity': min_periods 60 must be <= window 30
Could not build feature 'zscore_IVOL_SIGMA_30d__AAPL US Equity': min_periods 60 must be <= window 30
Could not build feature 'zscore_PX_LAST_30d__ARKK US Equity': min_periods 60 must be <= window 30
Could not build feature 'zscore_IVOL_SIGMA_30d__ARKK US Equity': min_periods 60 must be <= window 30
Could not build feature 'zscore_PX_LAST_30d__BSPGCPUS Index': min_periods 60 must be <= window 30
Could not build feature 'zscore_PX_LAST_30d__CO1 Comdty': min_periods 60 must be <= window 30
Could not build feature 'zscore_PX_LAST_30d__CONSSENT Index': min_periods 60 must be <= window 30
Could not buil