In [None]:
# Feature Engineering + Discovery Engine with custom features
import pandas as pd
import numpy as np
import random
import warnings
import functools
import os
from scipy.stats import linregress
import itertools

# Import joblib at the very top for parallel processing
from joblib import Parallel, delayed

# Silence every warning category for the rest of the run (this also hides
# pandas deprecation warnings).
warnings.filterwarnings('ignore')

# --- DEFINITIONS AND CONFIGURATION ---
# Define the explicit list of tradable tickers (full sheet names from your All_Tickers copy.xlsx).
# Forward returns and simulated option PnL are computed only for these names.
TRADABLE_TICKERS = [
    'QQQ US Equity', 'SPY US Equity', 'XLK US Equity', 'XLF US Equity',
    'XLE US Equity', 'ARKK US Equity', 'VIX Index', 'GLD US Equity',
    'NBIS US Equity', 'LLY US Equity', 'TSLA US Equity', 'AAPL US Equity',
    'NVDA US Equity'
]

# Define the file paths (relative to the working directory)
MAIN_DATA_FILE = 'All_Tickers copy.xlsx'
MACRO_DATA_FILE = 'Macro_tickers_no_nan_cols.xlsx'

# Setup Generation Configuration
SETUP_LENGTHS_TO_EXPLORE = [2, 3, 4] # Explore setups made of 2, 3 and 4 simultaneous conditions
MIN_INITIAL_SUPPORT_FILTER = 10 # Minimum number of trigger days for a setup to be considered

# Option Simulation Configuration
OPTION_SIM_HORIZON_DAYS = 10 # Days to expiration for simulated options
RISK_FREE_RATE = 0.01 # Annual risk-free rate; default for estimate_atm_premium's r_annual, which that function currently does not use

# --- END DEFINITIONS AND CONFIGURATION ---


print('Loading raw workbooks …')
raw = None # Initialize raw to None

# --- Custom Data Loading Function to handle sheet names as prefixes ---
def load_and_merge_excel(file_path, existing_df=None):
    """Load every sheet of an Excel workbook and outer-merge them on 'Date'.

    Each sheet's columns are renamed to ``"<sheet name>_<column>"`` (the
    'Date' column keeps its name) so that identical field names from
    different sheets do not collide after the merge.

    Args:
        file_path: Path of the workbook to read.
        existing_df: Optional DataFrame (with a 'Date' column) to merge the
            sheets into. It is copied, never modified in place.

    Returns:
        The merged DataFrame. On any loading/merging error the problem is
        printed and ``existing_df`` is returned unchanged (``None`` when no
        ``existing_df`` was given) - loading is deliberately best-effort.
    """
    try:
        # The context manager guarantees the workbook handle is closed, even
        # when a sheet fails to parse or merge part-way through.
        with pd.ExcelFile(file_path) as xls:
            merged = existing_df.copy() if existing_df is not None else None

            for sheet_name in xls.sheet_names:
                sheet_df = pd.read_excel(xls, sheet_name=sheet_name)
                # Prepend sheet name to all columns except 'Date'
                sheet_df.columns = [
                    f"{sheet_name}_{col}" if col != 'Date' else col
                    for col in sheet_df.columns
                ]

                if merged is None:
                    merged = sheet_df
                else:
                    # Outer merge keeps every date that appears in any sheet
                    merged = merged.merge(sheet_df, on='Date', how='outer')
        return merged
    except FileNotFoundError:
        print(f"Error: '{file_path}' not found. Please ensure the file is in the correct directory.")
        return existing_df
    except Exception as e:
        # Top-level best-effort boundary: report and fall back to the input.
        print(f"An unexpected error occurred during Excel loading of '{file_path}': {e}")
        return existing_df

# Load main data
raw = load_and_merge_excel(MAIN_DATA_FILE)

# Load macro data if main data was loaded successfully
if raw is not None and not raw.empty:
    raw = load_and_merge_excel(MACRO_DATA_FILE, existing_df=raw)
else:
    print("Main data could not be loaded, skipping macro data loading.")
    raw = pd.DataFrame({'Date': pd.to_datetime([])}) # Fallback to empty DF if main load failed

# Final cleaning and sorting
if raw is not None and not raw.empty:
    # The 'Date' check comes first: sorting on a missing 'Date' column would
    # raise KeyError before the warning branch below could ever run.
    if 'Date' in raw.columns:
        # Convert before sorting so the order is chronological even if the
        # workbook delivered dates as text.
        raw['Date'] = pd.to_datetime(raw['Date'])
        raw = raw.sort_values('Date').reset_index(drop=True)
        # DataFrame.ffill replaces the deprecated fillna(method='ffill').
        raw.ffill(inplace=True)
        raw = raw.drop_duplicates(subset=['Date'], keep='last') # Drop duplicates before setting index
        raw.set_index('Date', inplace=True) # Set 'Date' as the DataFrame's index
        raw.index = pd.to_datetime(raw.index) # Ensure it's a DatetimeIndex
        raw.sort_index(inplace=True) # Ensure sorted index for .reindex(method='nearest')
    else:
        print("Warning: 'Date' column not found to set as index. Some lookups might be less efficient or prone to error.")
        raw.ffill(inplace=True)

else:
    print("No data loaded. Raw DataFrame is empty.")
    raw = pd.DataFrame({'Date': pd.to_datetime([])}) # Ensure raw is a DataFrame

print('Raw shape:', raw.shape)
print('Example columns after loading:')
print(raw.columns[:5].tolist()) # 'Date' is now the index, so it won't be listed here


# --- Dynamic ticker identification ---
# Merged column names look like '<sheet name>_<field>', so the text before the
# first underscore is the sheet (ticker) prefix. 'Date' is the index by now and
# therefore never shows up among the columns.
all_column_prefixes = sorted({
    column.partition('_')[0] for column in raw.columns if '_' in column
})
# Prefixes that denote a field or macro series rather than a ticker sheet.
COMMON_FEATURE_PREFIXES = [
    'Last', 'Open', 'High', 'Low', 'VWAP', 'Volume', 'IVOL', 'Implied', 'Total',
    '30', '10', '60', 'Hist.', '1st', 'Put', 'Dates', 'CHG', 'FFA', 'INJCJC',
    'NFP', 'JOBS', 'CPI', 'CTII10', 'LF94TRUU', 'SPX', 'USSW10', 'MLCX3CRT',
    'FARBAST', 'BSPGCPUS', 'SPCSUSA', 'SPCS20SM', 'CONSSENT',
]
actual_ticker_prefixes = [
    prefix for prefix in all_column_prefixes
    if prefix not in COMMON_FEATURE_PREFIXES
]
# Union of the configured tradable names and whatever the workbooks contained.
all_tickers = sorted(set(TRADABLE_TICKERS) | set(actual_ticker_prefixes))

print(f'\nIdentified all relevant prefixes/tickers for feature engineering: {all_tickers}')
print(f'Actual tradable tickers for returns: {TRADABLE_TICKERS}')


# --- Column lookup helpers (read the module-level `raw` frame) ---
def first_col_containing(ticker_full_name, substr=''):
    """Return the first column of ``raw`` belonging to a ticker and containing ``substr``.

    For ``substr == 'PX_LAST'`` two exact names are tried first, in order:
    ``'<ticker>_Last_Price_PX_LAST'`` and ``'<ticker>_PX_LAST'``. Otherwise
    (or if neither exists) the columns are scanned in order and the first one
    that starts with ``ticker_full_name`` and contains ``substr`` wins.

    Returns:
        The matching column name, or ``None`` when nothing matches.
    """
    columns = raw.columns
    if substr == 'PX_LAST':
        exact_candidates = (
            f"{ticker_full_name}_Last_Price_PX_LAST",
            f"{ticker_full_name}_PX_LAST",
        )
        for candidate in exact_candidates:
            if candidate in columns:
                return candidate
    matches = (
        name for name in columns
        if name.startswith(ticker_full_name) and substr in name
    )
    return next(matches, None)

def safe_series(col_name):
    """Fetch ``raw[col_name]``; fall back to an all-NaN float Series on ``raw.index``.

    The fallback is used when ``col_name`` is falsy (e.g. ``None``) or is not
    a column of ``raw``.
    """
    if not col_name or col_name not in raw.columns:
        return pd.Series(index=raw.index, dtype=float)
    return raw[col_name]

# --- Helper function for Block Bootstrap Sharpe ---
def block_bootstrap_sharpe(returns_series, block_size, num_iterations=1000, annualize=True,
                           trading_days_per_year=252, random_state=None):
    """Estimate a Sharpe ratio distribution with a (non-overlapping) block bootstrap.

    The NaN-free return series is cut into consecutive blocks of
    ``block_size`` observations (the last block may be shorter). Each
    iteration draws blocks with replacement, concatenates them, truncates to
    the original length and computes mean / sample-std (ddof=1).

    Args:
        returns_series: pandas Series of per-period returns.
        block_size: Number of consecutive observations per block.
        num_iterations: Number of bootstrap resamples.
        annualize: Multiply each Sharpe by ``sqrt(trading_days_per_year)``.
        trading_days_per_year: Annualisation factor.
        random_state: ``None`` (use numpy's global RNG, the historical
            behaviour), an int seed, or an object exposing ``choice`` such as
            ``np.random.RandomState`` / ``np.random.Generator``. Passing a
            seed makes the result reproducible.

    Returns:
        ``(median_sharpe, lower, upper)`` where ``lower``/``upper`` are the
        elements at positions ``int(0.05 * n)`` and ``int(0.95 * n)`` of the
        sorted bootstrap Sharpes. ``(0.0, 0.0, 0.0)`` when there is too little
        data or the arguments are degenerate.
    """
    returns_series = returns_series.dropna()
    n_obs = len(returns_series)
    # block_size < 1 used to crash inside range(); treat it like "no data".
    if block_size < 1 or num_iterations < 1 or n_obs < block_size or n_obs < 2:
        return 0.0, 0.0, 0.0

    # Plain arrays avoid building a new pandas object on every iteration.
    values = returns_series.to_numpy(dtype=float)
    blocks = [values[start:start + block_size] for start in range(0, n_obs, block_size)]
    n_blocks_to_sample = int(np.ceil(n_obs / block_size))

    if random_state is None:
        rng = np.random  # module-level functions: honours np.random.seed()
    elif hasattr(random_state, 'choice'):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    scale = np.sqrt(trading_days_per_year) if annualize else 1.0
    sharpes = []
    for _ in range(num_iterations):
        picked = rng.choice(len(blocks), n_blocks_to_sample, replace=True)
        resampled = np.concatenate([blocks[idx] for idx in picked])[:n_obs]
        # A single-observation resample has no sample std; count it as 0.0.
        sample_std = resampled.std(ddof=1) if resampled.size > 1 else 0.0
        # Near-constant resamples have no meaningful Sharpe; record 0.0.
        if sample_std > 1e-9:
            sharpes.append(resampled.mean() / sample_std * scale)
        else:
            sharpes.append(0.0)

    sharpes_sorted = sorted(sharpes)
    median_sharpe = np.median(sharpes_sorted)
    lower_ci = sharpes_sorted[int(0.05 * num_iterations)]
    upper_ci = sharpes_sorted[int(0.95 * num_iterations)]

    return median_sharpe, lower_ci, upper_ci


# --- Option Payoff Simulation Helpers ---
def estimate_atm_premium(S, ivol, T_days, r_annual=RISK_FREE_RATE, option_type='call'):
    """Rough at-the-money option premium: ``0.4 * S * ivol * sqrt(T_days / 252)``.

    ``r_annual`` and ``option_type`` are accepted but do not enter the
    formula, so calls and puts get the same premium. The result is floored
    at 0.001, which is also returned when any of ``S``, ``ivol`` or
    ``T_days`` is non-positive.

    NOTE(review): the formula treats ``ivol`` as a decimal fraction
    (0.25 == 25%); confirm the data is not quoted in percent points.
    """
    floor = 0.001
    inputs = (S, ivol, T_days)
    if any(value <= 0 for value in inputs):
        return floor
    premium_estimate = 0.4 * S * ivol * np.sqrt(T_days / 252.0)
    return max(premium_estimate, floor)

def simulate_option_pnl(current_price, future_price, ivol_at_entry, horizon_days, entry_direction):
    """PnL of buying one ATM option (strike = ``current_price``) and holding to expiry.

    ``entry_direction == 'long'`` buys a call, ``'short'`` buys a put; any
    other direction (e.g. ``'mixed'``) yields NaN, as does a missing price or
    implied volatility. PnL is intrinsic value at ``future_price`` minus the
    premium from ``estimate_atm_premium``.
    """
    required = (current_price, future_price, ivol_at_entry)
    if any(pd.isna(value) for value in required):
        return np.nan

    if entry_direction == 'long':
        contract, intrinsic = 'call', future_price - current_price
    elif entry_direction == 'short':
        contract, intrinsic = 'put', current_price - future_price
    else:
        return np.nan

    premium = estimate_atm_premium(current_price, ivol_at_entry, horizon_days, option_type=contract)
    return max(intrinsic, 0) - premium


# --- Fractional Differencing Helper ---
def frac_diff(series, d=0.5, window='full'):
    """Fractionally difference ``series`` with order ``d``.

    Weights follow the recursion ``w_0 = 1``, ``w_k = -w_{k-1} * (d - k + 1) / k``.
    Output at position ``i`` is the dot product of the most recent
    observations with ``w_0 .. w_m`` (``w_0`` on the newest value), using the
    whole history when ``window == 'full'`` or at most ``window`` points
    otherwise. Non-Series input is wrapped in a Series. NaN results are
    dropped from the returned Series.
    """
    if not isinstance(series, pd.Series):
        series = pd.Series(series)

    n_obs = series.shape[0]
    coeffs = [1.]
    for lag in range(1, n_obs):
        coeffs.append(-coeffs[-1] * (d - lag + 1) / lag)
    # Reverse so that the tail of the array lines up with the newest values.
    coeffs = np.array(coeffs[::-1])

    result = pd.Series(index=series.index, dtype=float)
    for pos in range(n_obs):
        first = 0 if window == 'full' else max(0, pos - window + 1)
        span = pos - first + 1
        chunk = series.iloc[first:pos + 1]
        tail = coeffs[-span:]
        # Lengths only differ for degenerate windows (<= 0); skip those rows.
        if len(chunk) == len(tail):
            result.iloc[pos] = np.dot(tail, chunk)
    return result.dropna()


# --- Feature Engineering ---
# All engineered columns are collected in `feat`, which shares raw's index.
print('\nEngineering custom features …')
feat = pd.DataFrame(index=raw.index) # Use raw.index directly as 'Date' is now the index

# 1. VOLATILITY-BASED FEATURES
# Built only for the tradable tickers; every feature is skipped when one of
# its input columns cannot be found in `raw`.
print("  - Volatility-based features")
for ticker in TRADABLE_TICKERS:
    ivol_10d_col = first_col_containing(ticker, '10_Day_Call_Implied_Volatility_CALL_IMP_VOL_10D')
    ivol_30d_col = first_col_containing(ticker, '30_Day_Call_Implied_Volatility_CALL_IMP_VOL_30D')
    ivol_60d_col = first_col_containing(ticker, '60_Day_Call_Implied_Volatility_CALL_IMP_VOL_60D')
    # Term-structure slope: 60-day minus 10-day call implied vol.
    if ivol_60d_col and ivol_10d_col:
        feat[f'{ticker}_IVOL_Term_Structure_Slope'] = safe_series(ivol_60d_col) - safe_series(ivol_10d_col)
    call_40d_ivol_col = first_col_containing(ticker, '1st_Month_Call_Imp_Vol_40_Delta_LIVE_1M_CALL_IMP_VOL_40DELTA_DFLT')
    put_50d_ivol_col = first_col_containing(ticker, '1st_Month_Put_Imp_Vol_50_Delta_LIVE_1M_PUT_IMP_VOL_50DELTA_DFLT')
    # Skew proxy: 50-delta put vol minus 40-delta call vol (1st month).
    if call_40d_ivol_col and put_50d_ivol_col:
        feat[f'{ticker}_IVOL_Skew_Approx'] = safe_series(put_50d_ivol_col) - safe_series(call_40d_ivol_col)
    specific_ivol_suffixes = ['IVOL_SIGMA', 'IVOL_DELTA', 'IVOL_MONEYNESS', 'CALL_IMP_VOL_30D', 'CALL_IMP_VOL_10D', 'CALL_IMP_VOL_60D', 'PUT_IMP_VOL_30D', 'PUT_IMP_VOL_10D', 'PUT_IMP_VOL_60D']
    for ivol_suffix in specific_ivol_suffixes:
        col_name = first_col_containing(ticker, ivol_suffix)
        if col_name:
            # NOTE(review): this is the same 60d-minus-10d difference as
            # '<ticker>_IVOL_Term_Structure_Slope' above, so the two columns
            # appear to be duplicates.
            if '60_Day_Call_Implied_Volatility_CALL_IMP_VOL_60D' in col_name and \
               first_col_containing(ticker, '10_Day_Call_Implied_Volatility_CALL_IMP_VOL_10D'):
                feat[f'{ticker}_IVOL_Call_Slope'] = safe_series(col_name) - \
                                                    safe_series(first_col_containing(ticker, '10_Day_Call_Implied_Volatility_CALL_IMP_VOL_10D'))
            # '_shock' = 1 when the day-over-day change is more than 2 rolling
            # (30-row) standard deviations above its rolling mean.
            diff = safe_series(col_name).diff()
            z = (diff - diff.rolling(30).mean()) / diff.rolling(30).std()
            feat[col_name+'_shock'] = (z > 2).astype(int)
            # '_div' = vol level divided by the first column whose name
            # contains 'VOLUME' for this ticker; NaN when none exists.
            vol_col_name = first_col_containing(ticker, 'VOLUME')
            if vol_col_name:
                feat[col_name+'_div'] = safe_series(col_name) / safe_series(vol_col_name)
            else:
                feat[col_name+'_div'] = np.nan

# 2. DERIV FLOW & SENTIMENT
# First pass: per-ticker smoothing/normalisation of put/call ratio, call open
# interest and volume. Second pass: the combined "smart money" flag.
print("  - Deriv Flow & Sentiment features")
for ticker in all_tickers:
    put_call_name = first_col_containing(ticker, 'PUT_CALL_VOLUME_RATIO_CUR_DAY')
    if put_call_name:
        put_call_ratio = safe_series(put_call_name)
        feat[put_call_name+'_ema5'] = put_call_ratio.ewm(span=5, adjust=False).mean()

    call_oi_name = first_col_containing(ticker, 'OPEN_INT_TOTAL_CALL')
    if call_oi_name:
        feat[call_oi_name+'_chg3'] = safe_series(call_oi_name).pct_change(3)

    volume_name = first_col_containing(ticker, 'Volume_-_Realtime_VOLUME')
    if volume_name:
        volume = safe_series(volume_name)
        volume_window = volume.rolling(30)
        # 30-row rolling z-score of traded volume
        feat[volume_name+'_z'] = (volume - volume_window.mean()) / volume_window.std()

for ticker in all_tickers:
    call_oi_name = first_col_containing(ticker, 'OPEN_INT_TOTAL_CALL')
    short_ivol_name = first_col_containing(ticker, '10_Day_Call_Implied_Volatility_CALL_IMP_VOL_10D')
    if not (call_oi_name and short_ivol_name):
        continue
    # Flag = 1 when call open interest and 10-day call implied vol both rose
    oi_rising = safe_series(call_oi_name).pct_change() > 0
    ivol_rising = safe_series(short_ivol_name).pct_change() > 0
    feat[f'{ticker}_smart_money_flag'] = (oi_rising & ivol_rising).astype(int)

# 3. CROSS-ASSET CORRELATIONS
# For each ticker pair with both price columns available: 20- and 60-row
# rolling price correlations, the z-score of the 20-row correlation, the
# 20-minus-60 gap, and a 60-row rolling beta of t1 returns on t2 returns.
print("  - Cross-Asset Correlations")
correlation_universe = sorted(list(set(TRADABLE_TICKERS + [t for t in all_tickers if t in ['DXY Curncy', 'USGG10YR Index', 'SPX Index', 'CO1 Comdty', 'USGG2YR Index']])))
# Every unordered pair of tradable tickers, in list order.
dynamic_pairs = list(itertools.combinations(TRADABLE_TICKERS, 2))
dynamic_pairs.extend([
    ('SPY US Equity', 'VIX Index'),
    ('SPY US Equity', 'USGG10YR Index'),
    ('SPY US Equity', 'DXY Curncy'),
    ('SPY US Equity', 'CO1 Comdty'),
    ('SPY US Equity', 'USGG2YR Index'),
    ('VIX Index', 'USGG10YR Index'),
    ('VIX Index', 'DXY Curncy'),
    ('VIX Index', 'CO1 Comdty'),
])
# De-duplicate while keeping insertion order. A plain set() would iterate in
# a hash-dependent order that changes between interpreter runs, which in turn
# reshuffles the feature columns and every setup id derived from them.
dynamic_pairs = list(dict.fromkeys(dynamic_pairs))
for t1, t2 in dynamic_pairs:
    p1 = first_col_containing(t1, 'PX_LAST')
    p2 = first_col_containing(t2, 'PX_LAST')
    if p1 and p2:
        s1 = safe_series(p1)
        s2 = safe_series(p2)
        if not s1.empty and not s2.empty:
            # Rolling windows count rows where both prices are present.
            aligned_data = pd.DataFrame({'s1': s1, 's2': s2}).dropna()
            if len(aligned_data) > 60:
                c20 = aligned_data['s1'].rolling(20).corr(aligned_data['s2'])
                c60 = aligned_data['s1'].rolling(60).corr(aligned_data['s2'])
                feat[f'{t1}_{t2}_c20'] = c20
                feat[f'{t1}_{t2}_c60'] = c60
                feat[f'{t1}_{t2}_cZ'] = (c20 - c20.rolling(60).mean()) / c20.rolling(60).std()
                feat[f'{t1}_{t2}_cDelta'] = c20 - c60
                ret1 = s1.pct_change().dropna()
                ret2 = s2.pct_change().dropna()
                aligned_returns = pd.DataFrame({'ret1': ret1, 'ret2': ret2}).dropna()
                if not aligned_returns.empty and aligned_returns['ret2'].var() != 0:
                    # beta = cov(ret1, ret2) / var(ret2) over 60 rows
                    rolling_beta = aligned_returns['ret1'].rolling(window=60).cov(aligned_returns['ret2']) / \
                                   aligned_returns['ret2'].rolling(window=60).var()
                    feat[f'{t1}_{t2}_rolling_beta'] = rolling_beta

# 4. MACRO TRIGGERS
# Unconditional features below rely on safe_series(), which returns an
# all-NaN Series for a missing column: a missing input therefore yields NaN
# (or 0 after a comparison + astype(int)) rather than an error.
print("  - Macro Trigger features")
dxy_px=first_col_containing('DXY Curncy','PX_LAST')
ust10_px=first_col_containing('USGG10YR Index','PX_LAST')
spy_px=first_col_containing('SPY US Equity','PX_LAST')
vix_px=first_col_containing('VIX Index','PX_LAST')
# MPI: sum of the 3-row cumulative pct changes of DXY and the 10Y yield.
# The trailing .shift(0) is a no-op.
feat['MPI']=(safe_series(dxy_px).pct_change().rolling(3).sum()+safe_series(ust10_px).pct_change().rolling(3).sum()).shift(0)
feat['VIX_gt20']=(safe_series(vix_px)>20).astype(int)
feat['DXY_rising']=(safe_series(dxy_px).pct_change()>0).astype(int)
feat['SPY_below_MA20']=(safe_series(spy_px)<safe_series(spy_px).rolling(20).mean()).astype(int)
# fear_overdrive = all three flags above are set on the same row
feat['fear_overdrive']=((feat['VIX_gt20']==1)&(feat['DXY_rising']==1)&(feat['SPY_below_MA20']==1)).astype(int)
xlk_px=first_col_containing('XLK US Equity','PX_LAST'); xle_px=first_col_containing('XLE US Equity','PX_LAST')
# 5-row return of XLK minus 5-row return of XLE
feat['sector_rotation']=safe_series(xlk_px).pct_change(5)-safe_series(xle_px).pct_change(5)

# --- NEW MACRO FEATURES from Macro_tickers_no_nan_cols.xlsx ---
# Each feature in this section is only created when its source column exists.
ust2_px = first_col_containing('USGG2YR Index', 'PX_LAST')
if ust10_px and ust2_px:
    feat['UST10Y_2Y_Spread'] = safe_series(ust10_px) - safe_series(ust2_px)
    # NOTE(review): pct_change of a spread is unstable when the spread is
    # near zero or changes sign - confirm this is intended.
    feat['UST10Y_2Y_Spread_chg'] = feat['UST10Y_2Y_Spread'].pct_change()
cpi_yoy_px = first_col_containing('CPI YOY Index', 'PX_LAST')
cpi_chng_px = first_col_containing('CPI CHNG Index', 'PX_LAST')
if cpi_yoy_px:
    feat['CPI_YOY_mom3'] = safe_series(cpi_yoy_px).pct_change(3)
    # z-score against a 12-row rolling window
    feat['CPI_YOY_z'] = (safe_series(cpi_yoy_px) - safe_series(cpi_yoy_px).rolling(12).mean()) / safe_series(cpi_yoy_px).rolling(12).std()
if cpi_chng_px:
    feat['CPI_CHNG_mom3'] = safe_series(cpi_chng_px).pct_change(3)
injcjc_px = first_col_containing('INJCJC Index', 'PX_LAST')
nfp_tch_px = first_col_containing('NFP TCH Index', 'PX_LAST')
jobs_us_equity_px = first_col_containing('JOBS US Equity', 'PX_LAST')
if injcjc_px:
    # 1 when the latest change exceeds twice the 20-row rolling std of changes
    feat['INJCJC_shock'] = (safe_series(injcjc_px).diff() > safe_series(injcjc_px).diff().rolling(20).std() * 2).astype(int)
if nfp_tch_px:
    feat['NFP_TCH_mom3'] = safe_series(nfp_tch_px).pct_change(3)
if jobs_us_equity_px:
    feat['JOBS_US_Equity_mom3'] = safe_series(jobs_us_equity_px).pct_change(3)
ffa_comdty_px = first_col_containing('FFA Comdty', 'PX_LAST')
ctii10_govt_px = first_col_containing('CTII10 Govt', 'PX_LAST')
ussw10_curncy_px = first_col_containing('USSW10 Curncy', 'PX_LAST')
mlcx3crt_index_px = first_col_containing('MLCX3CRT Index', 'PX_LAST')
farbast_index_px = first_col_containing('FARBAST Index', 'PX_LAST')
bspgcpus_index_px = first_col_containing('BSPGCPUS Index', 'PX_LAST')
spcsusa_index_px = first_col_containing('SPCSUSA Index', 'PX_LAST')
spcs20sm_index_px = first_col_containing('SPCS20SM Index', 'PX_LAST')
conssent_index_px = first_col_containing('CONSSENT Index', 'PX_LAST')
lf94truu_index_vol30d = first_col_containing('LF94TRUU Index', 'VOLATILITY_30D')
# ust2_px may be None here; safe_series then yields NaN, so FFA_Spread is all-NaN
if ffa_comdty_px: feat['FFA_Spread'] = safe_series(ffa_comdty_px) - safe_series(ust2_px)
# One-row percentage changes of the remaining macro series
if ctii10_govt_px: feat['CTII10_mom'] = safe_series(ctii10_govt_px).pct_change()
if ussw10_curncy_px: feat['USSW10_chg'] = safe_series(ussw10_curncy_px).pct_change()
if mlcx3crt_index_px: feat['MLCX3CRT_chg'] = safe_series(mlcx3crt_index_px).pct_change()
if farbast_index_px: feat['FARBAST_mom'] = safe_series(farbast_index_px).pct_change()
if bspgcpus_index_px: feat['BSPGCPUS_mom'] = safe_series(bspgcpus_index_px).pct_change()
if spcsusa_index_px: feat['SPCSUSA_mom'] = safe_series(spcsusa_index_px).pct_change()
if spcs20sm_index_px: feat['SPCS20SM_mom'] = safe_series(spcs20sm_index_px).pct_change()
if conssent_index_px: feat['CONSSENT_mom'] = safe_series(conssent_index_px).pct_change()
# Ratio of 30-day volatility to its own 60-row rolling mean
if lf94truu_index_vol30d: feat['LF94TRUU_Vol_Signal'] = safe_series(lf94truu_index_vol30d) / safe_series(lf94truu_index_vol30d).rolling(60).mean()


# 5. MOMENTUM / VOL FRACTALS
# Per ticker with a PX_LAST column: volatility-scaled 5-row momentum,
# Bollinger %B (20 rows, 2 std) and a fractionally differenced price.
print("  - Momentum/Vol Fractals")
for tk in all_tickers:
    px_col = first_col_containing(tk, 'PX_LAST')
    if px_col:
        px = safe_series(px_col)

        realised_vol = px.pct_change().rolling(20).std()
        feat[tk+'_mom5_vol20'] = px.pct_change(5) / realised_vol

        band_window = px.rolling(20)
        band_mid = band_window.mean()
        band_sd = band_window.std()
        lower_band = band_mid - 2*band_sd
        feat[tk+'_pctB'] = (px - lower_band) / (4*band_sd)

        # Fractional differencing needs more than 100 rows of history
        if len(px) > 100:
            frac_name = f'{tk}_frac_diff_0_5'
            try:
                feat[frac_name] = frac_diff(px, d=0.5, window=100)
            except Exception as e:
                print(f"Warning: Could not compute fractional differencing for {tk}: {e}")
                feat[frac_name] = np.nan


# 6. Lag all features by one row and attach basic returns
# After the shift, row t holds feature values computed from row t-1.
feat = feat.shift(1)
panel = feat.copy()

# Price columns used for evaluation: tradable tickers only
price_cols_for_returns = []
for ticker_full_name in TRADABLE_TICKERS:
    px_col_name = first_col_containing(ticker_full_name, 'PX_LAST')
    if not px_col_name:
        print(f"Warning: PX_LAST column not found for tradable ticker '{ticker_full_name}'. It will be excluded from return calculations.")
        continue
    price_cols_for_returns.append(px_col_name)

# `raw` is already indexed by Date, so the price frame inherits that index.
prices = raw[price_cols_for_returns].copy()
# returns[h] at row t is the forward return from t to t+h rows.
returns = {
    h: prices.pct_change(h).shift(-h)
    for h in [1, 3, 5, 10, 21]
}


# Add each tradable ticker's previous-row 1-period return to the panel
for tk_full_name in TRADABLE_TICKERS:
    px_col_name = first_col_containing(tk_full_name, 'PX_LAST')
    if not px_col_name or px_col_name not in raw.columns:
        continue
    lagged_return = raw[px_col_name].pct_change().shift(1)
    ret_series = lagged_return.rename(f'{tk_full_name}_ret1')
    panel = pd.concat([panel, ret_series], axis=1)

print('Engineered panel shape:',panel.shape)


# --- Primitive Signal Generation ---
print('\nBuilding primitive signals from engineered panel …')
signals={}
# Helper functions for inferring signal direction and type (defined once globally)
def get_primitive_direction(primitive_name):
    """Infer the directional bias of a primitive signal from its name.

    Bullish markers are checked before bearish ones, so a name containing
    both kinds resolves to +1.

    Args:
        primitive_name: Signal name such as ``'<feature>_z>1.5'``.

    Returns:
        +1 (long bias), -1 (short bias) or 0 (neutral / unknown).
    """
    bullish_markers = ('>80', 'z>1.5', 'ma5>ma20', 'rising')
    # 'z<-1.5' matches the '<col>_z<-1.5' signals; the previous marker
    # 'z<-1.name' was a typo that could never match any signal name.
    bearish_markers = ('<20', 'z<-1.5', 'ma5<ma20', 'below_MA')
    if any(marker in primitive_name for marker in bullish_markers):
        return 1
    if any(marker in primitive_name for marker in bearish_markers):
        return -1
    # Everything else (shock, slope, Spread, Vol_Signal, ...) is neutral.
    return 0

def get_primitive_signal_type(primitive_name):
    """Classify a primitive signal into a broad category based on its name.

    Categories are tried in the order listed below and the first one with a
    marker occurring in ``primitive_name`` wins; ``'other'`` is the fallback.
    Matching is case-sensitive substring search.
    """
    ordered_categories = (
        ('volatility', ('IVOL', 'VIX', 'vol', '_shock')),
        ('momentum', ('mom', 'pctB', '_chg', '_ret', 'rising')),
        ('correlation', ('_c20', '_c60', '_cZ', '_cDelta', '_beta')),
        ('macro', ('DXY', 'USGG', 'MPI', 'fear_overdrive', 'CPI', 'INJCJC',
                   'NFP', 'JOBS', 'FFA', 'CTII10', 'USSW10', 'MLCX3CRT',
                   'FARBAST', 'BSPGCPUS', 'SPCSUSA', 'SPCS20SM', 'CONSSENT',
                   'LF94TRUU')),
        ('sentiment', ('PUT_CALL_VOLUME_RATIO', 'smart_money_flag', 'Short_Interest_Ratio')),
        ('volume', ('VOLUME',)),
        ('open_interest', ('OPEN_INT',)),
        ('fractional_differencing', ('frac_diff',)),
    )
    for category, markers in ordered_categories:
        for marker in markers:
            if marker in primitive_name:
                return category
    return 'other'

# Turn every numeric panel column into five boolean primitive signals:
# '<col>>80', '<col><20', '<col>_z>1.5', '<col>_z<-1.5' and '<col>_ma5>ma20'.
for col in panel.columns.drop('Date',errors='ignore'): # 'Date' is now index, so 'drop' might not find it
    s=panel[col]
    if pd.api.types.is_numeric_dtype(s):
        # Drop inf/NaN rows: each signal is indexed on the surviving rows only,
        # and the rolling windows below count surviving rows, not calendar rows.
        s_clean = s.replace([np.inf, -np.inf], np.nan).dropna()
        # Skip columns with no usable values or no variation
        if s_clean.empty or s_clean.std() == 0: continue
        # Percentile rank over the whole available history of the column.
        # NOTE(review): the 80%/20% thresholds therefore use observations that
        # lie after the signal date - confirm this look-ahead is acceptable.
        rank=s_clean.rank(pct=True)
        signals[col+'>80']=rank>0.8
        signals[col+'<20']=rank<0.2
        # 60-row rolling z-score; left as NaN (signal False) where the rolling
        # std is missing or not above 1e-9
        rolling_std_60 = s_clean.rolling(60).std()
        valid_std_mask = rolling_std_60 > 1e-9
        z = pd.Series(np.nan, index=s_clean.index, dtype=float)
        z[valid_std_mask] = (s_clean - s_clean.rolling(60).mean())[valid_std_mask] / rolling_std_60[valid_std_mask]
        signals[col+'_z>1.5']=z>1.5
        signals[col+'_z<-1.5']=z<-1.5
        # Short (5-row) mean above long (20-row) mean
        ma5=s_clean.rolling(5).mean(); ma20=s_clean.rolling(20).mean()
        signals[col+'_ma5>ma20']=ma5>ma20
print('Total primitive signals:',len(signals))

# --- NEW ADDITION: Primitive Signal Pre-filtering for Combinations ---
# NOTE: the pre-filter below is currently DISABLED (commented out). As written
# it would keep the TOP_N_PRIMITIVES primitives with the best bootstrap Sharpe
# of the 10-row forward return. While it stays disabled, every primitive
# signal is passed on to setup generation (see `primitive_names` below).
#print(f"\nEvaluating individual primitive signal performance for pre-selection...")
#primitive_performances = {}
#for col_name, signal_series in signals.items():
    #if signal_series.sum() >= MIN_INITIAL_SUPPORT_FILTER:
        #primitive_trigger_dates = signal_series[signal_series].index
        #if not primitive_trigger_dates.empty:
         #   mean_returns_on_trigger = returns[10].loc[primitive_trigger_dates].mean(axis=1).dropna()
          #  if not mean_returns_on_trigger.empty and mean_returns_on_trigger.std() > 0.0001:
           #     block_size_for_primitive = min(10, max(1, len(mean_returns_on_trigger) // 2))
            #    median_sharpe, _, _ = block_bootstrap_sharpe(mean_returns_on_trigger, block_size=block_size_for_primitive)
             #   primitive_performances[col_name] = median_sharpe
            #else:
             #   primitive_performances[col_name] = 0.0
#TOP_N_PRIMITIVES = 20
#filtered_primitive_names = [name for name, sharpe in primitive_performances.items() if sharpe > 0.01]
#top_n_primitive_names = sorted(filtered_primitive_names, key=lambda x: primitive_performances[x], reverse=True)[:TOP_N_PRIMITIVES]
#primitive_names = top_n_primitive_names # This updates the list used for combinations
#print(f"Pre-selected {len(primitive_names)} primitive signals for combination generation (from {len(signals)} total).")

# All primitives, in the order they were inserted into `signals`.
primitive_names = list(signals.keys())


# --- Setup generation ---
# Enumerate every k-combination of primitive signals, AND the member signals
# together and keep the combination when it fires on enough rows.
all_candidate_setups = []
setup_id_counter = 1
print(f'\nGenerating setups with lengths {SETUP_LENGTHS_TO_EXPLORE} and applying initial support filter (min_support={MIN_INITIAL_SUPPORT_FILTER}) …')
for k in SETUP_LENGTHS_TO_EXPLORE:
    if k > len(primitive_names):
        print(f"Warning: Cannot generate setups of length {k} as only {len(primitive_names)} primitives are available. Skipping.")
        continue
    for conds_tuple in itertools.combinations(primitive_names, k):
        conds_list = list(conds_tuple)
        member_signals = [signals[name] for name in conds_list if name in signals]
        if not member_signals:
            continue
        try:
            combined_mask = functools.reduce(lambda left, right: left & right, member_signals)
            current_support = combined_mask.sum()
        except KeyError:
            # Treated as "never fires"; the support filter below rejects it.
            current_support = 0
        if current_support < MIN_INITIAL_SUPPORT_FILTER:
            continue
        all_candidate_setups.append({
            'id': f'S{setup_id_counter:04d}',
            'conds': conds_list,
            'setup_length': k,
            'setup_type': 'concurrent',
            'support': current_support,
        })
        setup_id_counter += 1
setups = all_candidate_setups
print(f'Generated and filtered {len(setups)} candidate setups after initial support filter.')


# --- Parallel Setup Evaluation Function (defined ONCE) ---
def evaluate_one_setup(setup):
    """Evaluate a single setup; designed to be called from joblib workers.

    Reads the module-level ``signals``, ``returns``, ``raw`` and
    ``price_cols_for_returns``.

    Args:
        setup: Dict with at least ``'id'`` and ``'conds'`` (list of primitive
            signal names that must all be true).

    Returns:
        ``(summary_row, trigger_records)`` where ``summary_row`` is a dict of
        setup-level metrics and ``trigger_records`` is a list with one dict
        per (trigger date, tradable ticker). ``(None, None)`` when no
        condition is known or the setup fires on fewer than
        ``MIN_INITIAL_SUPPORT_FILTER`` rows.

    Notes:
        * Each summary metric uses a different forward horizon:
          ``accuracy_3d`` (3 rows), ``avg_return_5d`` (5), ``sharpe_10d``
          (10, block bootstrap) and ``hit_rate_21d`` (21).
        * The simulated option exit price is looked up
          ``OPTION_SIM_HORIZON_DAYS`` *calendar* days after the trigger
          (nearest row within 5 days), while ``estimate_atm_premium`` scales
          time by ``T_days / 252``.
        * With ``entry_direction == 'mixed'`` the option PnL is always NaN.
    """
    sid = setup['id']
    conds = setup['conds']

    valid_signals = [signals[c] for c in conds if c in signals]
    if not valid_signals:
        return None, None
    mask = functools.reduce(lambda a, b: a & b, valid_signals)
    dates = mask[mask].index
    support = len(dates)
    if support < MIN_INITIAL_SUPPORT_FILTER:
        return None, None

    # Net directional vote and most frequent signal category of the conditions
    direction_score = 0
    signal_type_counts = {}
    for cond in conds:
        direction_score += get_primitive_direction(cond)
        s_type = get_primitive_signal_type(cond)
        signal_type_counts[s_type] = signal_type_counts.get(s_type, 0) + 1
    if direction_score > 0:
        entry_direction = 'long'
    elif direction_score < 0:
        entry_direction = 'short'
    else:
        entry_direction = 'mixed'
    dominant_signal_type = (
        max(signal_type_counts, key=signal_type_counts.get)
        if signal_type_counts else 'unknown'
    )

    perf = {
        'setup_id': sid,
        'feature_conditions': '+'.join(conds),
        'support': support,
        'entry_direction': entry_direction,
        'dominant_signal_type': dominant_signal_type,
        'first_trigger_date': dates.min() if not dates.empty else pd.NaT,
        'last_trigger_date': dates.max() if not dates.empty else pd.NaT,
    }

    for h, label in zip([3, 5, 10, 21], ['accuracy_3d', 'avg_return_5d', 'sharpe_10d', 'hit_rate_21d']):
        r = returns[h].reindex(dates)
        if r.empty:
            perf[label] = 0.0
            continue
        # Equal-weight average across tradable tickers on each trigger date
        mean = r.mean(axis=1).dropna()
        if mean.empty:
            perf[label] = 0.0
            continue
        if 'accuracy' in label:
            perf[label] = (mean > 0).mean()
        elif 'avg' in label:
            perf[label] = mean.mean()
        elif 'sharpe' in label:
            bootstrap_block_size = min(h if h > 1 else 2, max(1, len(mean) // 2))
            if mean.std() > 0.0001 and len(mean) >= 2:
                median_sharpe, _, _ = block_bootstrap_sharpe(mean, block_size=bootstrap_block_size)
                perf[label] = median_sharpe
            else:
                perf[label] = 0.0
        elif 'hit' in label:
            perf[label] = (r > 0).stack().mean()

    # Everything that depends only on the ticker is resolved once here rather
    # than once per (date, ticker) pair inside the trigger loop.
    horizons = [1, 3, 5, 10, 21]
    ivol_patterns = [
        f'{OPTION_SIM_HORIZON_DAYS}_Day_Call_Implied_Volatility_CALL_IMP_VOL_{OPTION_SIM_HORIZON_DAYS}D',
        '30_Day_Call_Implied_Volatility_CALL_IMP_VOL_30D',
        '60_Day_Call_Implied_Volatility_CALL_IMP_VOL_60D',
    ]
    per_ticker = []
    for tk_col_full_name in price_cols_for_returns:
        tk_symbol_for_log = tk_col_full_name.split('_PX_LAST')[0].split('_Last_Price')[0]
        # First available implied-vol column, preferring the simulation horizon
        ivol_col_for_sim = None
        for pattern in ivol_patterns:
            ivol_col_for_sim = first_col_containing(tk_symbol_for_log, pattern)
            if ivol_col_for_sim:
                break
        px_series = raw[tk_col_full_name]
        ivol_series = raw[ivol_col_for_sim] if ivol_col_for_sim else None
        ret_series_by_h = {
            h: returns[h][tk_col_full_name] if tk_col_full_name in returns[h].columns else None
            for h in horizons
        }
        per_ticker.append((tk_symbol_for_log, px_series, ivol_series, ret_series_by_h))

    lookup_tolerance = pd.Timedelta(days=5)
    trigger_records_for_this_setup = []
    for d in dates:
        for tk_symbol_for_log, px_series, ivol_series, ret_series_by_h in per_ticker:
            current_px = px_series.get(d, np.nan)
            ivol_at_entry = ivol_series.get(d, np.nan) if ivol_series is not None else np.nan
            option_pnl_10d = np.nan
            if not pd.isna(current_px) and not pd.isna(ivol_at_entry) and current_px > 0 and ivol_at_entry > 0:
                future_date_for_sim = d + pd.Timedelta(days=OPTION_SIM_HORIZON_DAYS)
                future_px_series = px_series.reindex([future_date_for_sim], method='nearest', tolerance=lookup_tolerance)
                future_px_for_sim = future_px_series.iloc[0] if not future_px_series.empty and not pd.isna(future_px_series.iloc[0]) else np.nan
                if not pd.isna(future_px_for_sim):
                    option_pnl_10d = simulate_option_pnl(current_px, future_px_for_sim, ivol_at_entry, OPTION_SIM_HORIZON_DAYS, entry_direction)
            ret_vals = {
                h: series.get(d, np.nan) if series is not None else np.nan
                for h, series in ret_series_by_h.items()
            }
            trigger_records_for_this_setup.append({'date':d,'ticker':tk_symbol_for_log,'setup_id':sid,'matched':1,
                                    'return_1d':ret_vals[1],'return_3d':ret_vals[3],'return_5d':ret_vals[5],
                                    'return_10d':ret_vals[10],'return_21d':ret_vals[21],
                                    'option_pnl_10d': option_pnl_10d})
    return perf, trigger_records_for_this_setup


# --- Parallel processing with joblib (Executed ONCE) ---
summary_rows=[]
trigger_records=[]
print(f"Starting parallel evaluation of {len(setups)} setups using all CPU cores...")
results = Parallel(n_jobs=-1)(delayed(evaluate_one_setup)(setup) for setup in setups)
# evaluate_one_setup returns (None, None) for rejected setups; skip those.
for perf, records in results:
    if perf is not None:
        summary_rows.append(perf)
    if records is not None:
        trigger_records.extend(records)


# --- Final Output Generation ---
summary_df=pd.DataFrame(summary_rows)

# --- Basic Setup Lifecycle Tracking Derived Metrics ---
if not summary_df.empty:
    summary_df['first_trigger_date'] = pd.to_datetime(summary_df['first_trigger_date'])
    summary_df['last_trigger_date'] = pd.to_datetime(summary_df['last_trigger_date'])
    summary_df['setup_duration_days'] = (summary_df['last_trigger_date'] - summary_df['first_trigger_date']).dt.days
    # A zero-day duration would divide by zero; it is mapped to NaN instead.
    summary_df['avg_trigger_frequency_per_day'] = summary_df['support'] / summary_df['setup_duration_days'].replace(0, np.nan)
    summary_df.replace([np.inf, -np.inf], np.nan, inplace=True)
    # Top 20 by bootstrap Sharpe; a Sharpe of exactly 0.0 marks "not computable".
    top = summary_df[summary_df['sharpe_10d'] != 0.0].sort_values('sharpe_10d',ascending=False).head(20)
else:
    print("Summary DataFrame is empty, skipping lifecycle metrics calculation.")
    # An empty summary has no 'sharpe_10d' column, so indexing it would raise
    # KeyError; export an empty ranking instead.
    top = summary_df.head(0)

summary_df.to_csv('setup_results_summary.csv',index=False)
trigger_df=pd.DataFrame(trigger_records)
trigger_df.to_csv('setup_trigger_log.csv',index=False)
top.to_json('top_setups.json',orient='records',indent=2)

print('\nDiscovery complete with engineered features')
print(top.head())

In [None]:
# Feature Engineering + Discovery Engine with custom features
import pandas as pd
import numpy as np
import random
import warnings
import functools
import os
from scipy.stats import linregress
import itertools

# Import joblib for parallel processing
from joblib import Parallel, delayed

warnings.filterwarnings('ignore')

# --- DEFINITIONS AND CONFIGURATION ---
# Define the explicit list of tradable tickers (full sheet names from your All_Tickers copy.xlsx)
# Forward returns and the per-ticker '<ticker>_ret1' columns are only built for these names.
TRADABLE_TICKERS = [
    'QQQ US Equity', 'SPY US Equity', 'XLK US Equity', 'XLF US Equity',
    'XLE US Equity', 'ARKK US Equity', 'VIX Index', 'GLD US Equity',
    'NBIS US Equity', 'LLY US Equity', 'TSLA US Equity', 'AAPL US Equity',
    'NVDA US Equity'
]

# Define the file paths
# Both workbooks are read sheet by sheet and merged on their 'Date' column.
MAIN_DATA_FILE = 'All_Tickers copy.xlsx'
MACRO_DATA_FILE = 'Macro_tickers_no_nan_cols.xlsx'

# Setup Generation Configuration
# Using iterative search for these lengths (1-feature to 4-feature setups)
SETUP_LENGTHS_TO_EXPLORE = [1, 2, 3, 4]
MIN_INITIAL_SUPPORT_FILTER = 10 # Minimum number of trigger days for a setup to be considered

# Crucial: Controls how many "best" setups from one length are passed to the next iteration.
# LOWER VALUE = FASTER RUNTIME (narrower search).
# HIGHER VALUE = SLOWER RUNTIME (broader search).
N_BEST_TO_PROPAGATE = 20 # Your current test value. Consider 500-2000 for broader search later.

# Option Simulation Configuration
OPTION_SIM_HORIZON_DAYS = 10 # Days to expiration for simulated options
# Used as the default for estimate_atm_premium's r_annual parameter.
RISK_FREE_RATE = 0.01 # Annual risk-free rate for option premium estimation

# --- END DEFINITIONS AND CONFIGURATION ---


print('Loading raw workbooks …')
# Placeholder only; the loading step below assigns the merged DataFrame.
raw = None # Initialize raw to None

# --- Custom Data Loading Function to handle sheet names as prefixes ---
def load_and_merge_excel(file_path, existing_df=None):
    """Load every sheet of an Excel workbook and outer-merge them on 'Date'.

    Each sheet's columns are renamed to ``"<sheet name>_<column>"`` (the
    'Date' column keeps its name) so that columns from different sheets do not
    collide after the merge.

    Args:
        file_path: Path of the workbook to read.
        existing_df: Optional DataFrame to merge the sheets into. It is copied
            first, so the caller's object is never modified.

    Returns:
        The merged DataFrame. If the file is missing or any other error occurs
        while reading, a message is printed and ``existing_df`` is returned
        unchanged (``None`` when no DataFrame was supplied).
    """
    try:
        # The context manager guarantees the workbook handle is closed even
        # when reading or merging one of the sheets fails.
        with pd.ExcelFile(file_path) as xls:
            current_df = existing_df.copy() if existing_df is not None else None

            for sh_name in xls.sheet_names:
                df = pd.read_excel(xls, sheet_name=sh_name)
                # Prepend sheet name to all columns except 'Date'
                df.columns = [f"{sh_name}_{col}" if col != 'Date' else col for col in df.columns]

                if current_df is None:
                    current_df = df
                else:
                    current_df = current_df.merge(df, on='Date', how='outer')
        return current_df
    except FileNotFoundError:
        print(f"Error: '{file_path}' not found. Please ensure the file is in the correct directory.")
        return existing_df
    except Exception as e:
        # Deliberate best-effort: a broken workbook must not abort the run,
        # the caller simply continues with whatever it already had.
        print(f"An unexpected error occurred during Excel loading of '{file_path}': {e}")
        return existing_df

# Load main data
raw = load_and_merge_excel(MAIN_DATA_FILE)

# Load macro data if main data was loaded successfully
if raw is not None and not raw.empty:
    raw = load_and_merge_excel(MACRO_DATA_FILE, existing_df=raw)
else:
    print("Main data could not be loaded, skipping macro data loading.")
    raw = pd.DataFrame({'Date': pd.to_datetime([])})

# Final cleaning and sorting
if raw is not None and not raw.empty:
    # The 'Date' check comes first: sorting by a missing 'Date' column would
    # raise KeyError before the warning branch could ever run.
    if 'Date' in raw.columns:
        # Convert before sorting so the order is chronological even when the
        # workbook delivered dates as strings.
        raw['Date'] = pd.to_datetime(raw['Date'])
        raw = raw.sort_values('Date').reset_index(drop=True)
        # DataFrame.ffill() replaces the deprecated fillna(method='ffill').
        raw = raw.ffill()
        # Outer merges can leave several rows per date; keep the last one.
        raw = raw.drop_duplicates(subset=['Date'], keep='last')
        raw = raw.set_index('Date')
        raw.sort_index(inplace=True)
    else:
        print("Warning: 'Date' column not found to set as index. Some lookups might be less efficient or prone to error.")
        raw = raw.ffill()

else:
    print("No data loaded. Raw DataFrame is empty.")
    raw = pd.DataFrame({'Date': pd.to_datetime([])})

print('Raw shape:', raw.shape)
print('Example columns after loading:')
print(raw.columns[:5].tolist())


# --- Dynamic Ticker Identification Refined ---
# The text before the first underscore of every column name is treated as a
# candidate ticker; prefixes that are known field/macro labels are removed.
all_column_prefixes = sorted({col_name.split('_')[0] for col_name in raw.columns if '_' in col_name})
COMMON_FEATURE_PREFIXES = [
    'Last', 'Open', 'High', 'Low', 'VWAP', 'Volume', 'IVOL', 'Implied', 'Total',
    '30', '10', '60', 'Hist.', '1st', 'Put', 'Dates', 'CHG', 'FFA', 'INJCJC',
    'NFP', 'JOBS', 'CPI', 'CTII10', 'LF94TRUU', 'SPX', 'USSW10', 'MLCX3CRT',
    'FARBAST', 'BSPGCPUS', 'SPCSUSA', 'SPCS20SM', 'CONSSENT',
]
actual_ticker_prefixes = [
    prefix for prefix in all_column_prefixes
    if prefix not in COMMON_FEATURE_PREFIXES
]
# Union of the configured tradable names and the prefixes discovered above.
all_tickers = sorted(set(TRADABLE_TICKERS) | set(actual_ticker_prefixes))

print(f'\nIdentified all relevant prefixes/tickers for feature engineering: {all_tickers}')
print(f'Actual tradable tickers for returns: {TRADABLE_TICKERS}')


# --- Revised Helper functions (using raw.index as source) ---
# Removed @memory.cache
def first_col_containing(ticker_full_name, substr=''):
    """Return the first column of ``raw`` belonging to a ticker and field.

    For ``substr == 'PX_LAST'`` the exact names
    ``"<ticker>_Last_Price_PX_LAST"`` and ``"<ticker>_PX_LAST"`` are tried
    first, in that order. Otherwise (or when neither exists) the columns are
    scanned in order and the first one that starts with ``ticker_full_name``
    and contains ``substr`` is returned.

    Returns:
        The matching column name, or ``None`` when nothing matches.
    """
    columns = raw.columns

    if substr == 'PX_LAST':
        preferred_names = (
            f"{ticker_full_name}_Last_Price_PX_LAST",
            f"{ticker_full_name}_PX_LAST",
        )
        for exact_name in preferred_names:
            if exact_name in columns:
                return exact_name

    return next(
        (name for name in columns
         if name.startswith(ticker_full_name) and substr in name),
        None,
    )

def safe_series(col_name):
    """Return ``raw[col_name]``, or an all-NaN float Series when it is unavailable.

    The fallback is indexed like ``raw`` so it can be combined with real
    columns; it is used when ``col_name`` is falsy (e.g. ``None``) or is not a
    column of ``raw``.
    """
    if not col_name or col_name not in raw.columns:
        return pd.Series(index=raw.index, dtype=float)
    return raw[col_name]

# --- Helper function for Block Bootstrap Sharpe ---
# Adjusted num_iterations for faster testing (DIAGNOSTIC VALUE)
def block_bootstrap_sharpe(returns_series, block_size, num_iterations=10, annualize=True, trading_days_per_year=252):
    """Estimate a Sharpe ratio and its spread with a block bootstrap.

    The NaN-free returns are cut into consecutive blocks of ``block_size``
    observations (the last block may be shorter). Each iteration draws blocks
    with replacement via ``np.random.choice``, concatenates them, trims the
    result to the original length and computes mean / std, optionally scaled
    by ``sqrt(trading_days_per_year)``. A resample whose std is not above 1e-9
    contributes a Sharpe of 0.0.

    Returns:
        ``(median, lower, upper)`` where lower/upper are the sorted bootstrap
        values at positions ``int(0.05 * num_iterations)`` and
        ``int(0.95 * num_iterations)``. ``(0.0, 0.0, 0.0)`` is returned when
        there are fewer observations than ``block_size`` or fewer than two,
        or when nothing could be sampled.
    """
    no_estimate = (0.0, 0.0, 0.0)

    clean = returns_series.dropna()
    n_obs = len(clean)
    if n_obs < block_size or n_obs < 2:
        return no_estimate

    chunks = (clean.iloc[lo:lo + block_size] for lo in range(0, n_obs, block_size))
    blocks = [chunk for chunk in chunks if not chunk.empty]
    if not blocks:
        return no_estimate

    # Enough blocks to cover the original sample length before trimming.
    n_blocks_to_sample = int(np.ceil(n_obs / block_size))

    sharpes = []
    for _ in range(num_iterations):
        picks = np.random.choice(len(blocks), n_blocks_to_sample, replace=True)
        resampled = pd.concat([blocks[j] for j in picks]).iloc[:n_obs]

        sigma = resampled.std()
        if sigma > 1e-9:
            ratio = resampled.mean() / sigma
            if annualize:
                ratio = ratio * np.sqrt(trading_days_per_year)
            sharpes.append(ratio)
        else:
            sharpes.append(0.0)

    if not sharpes:
        return no_estimate

    ordered = sorted(sharpes)
    lower_ci = ordered[int(0.05 * num_iterations)]
    upper_ci = ordered[int(0.95 * num_iterations)]
    return np.median(ordered), lower_ci, upper_ci


# --- Option Payoff Simulation Helpers ---
def estimate_atm_premium(S, ivol, T_days, r_annual=RISK_FREE_RATE, option_type='call'):
    """Approximate an at-the-money option premium as ``0.4 * S * ivol * sqrt(T)``.

    ``T`` is ``T_days / 252`` years. The result is floored at 0.001, which is
    also returned when the spot, the volatility or the horizon is not
    positive. ``r_annual`` and ``option_type`` are accepted but do not enter
    the calculation.
    """
    min_premium = 0.001
    if any(value <= 0 for value in (S, ivol, T_days)):
        return min_premium

    years_to_expiry = T_days / 252.0
    approx_premium = 0.4 * S * ivol * np.sqrt(years_to_expiry)
    return max(approx_premium, min_premium)

def simulate_option_pnl(current_price, future_price, ivol_at_entry, horizon_days, entry_direction):
    """Return the PnL of buying one ATM option struck at ``current_price``.

    A 'long' entry buys a call (payoff ``max(future - strike, 0)``), a 'short'
    entry buys a put (payoff ``max(strike - future, 0)``); the premium from
    ``estimate_atm_premium`` is subtracted from the payoff. NaN is returned
    when a price or the volatility is missing, or for any other direction.
    """
    if any(pd.isna(value) for value in (current_price, future_price, ivol_at_entry)):
        return np.nan
    if entry_direction not in ('long', 'short'):
        return np.nan

    strike = current_price
    if entry_direction == 'long':
        contract, intrinsic = 'call', future_price - strike
    else:
        contract, intrinsic = 'put', strike - future_price

    premium = estimate_atm_premium(current_price, ivol_at_entry, horizon_days, option_type=contract)
    return max(intrinsic, 0) - premium


# --- Fractional Differencing Helper ---
def frac_diff(series, d=0.5, window='full'):
    """Fractionally difference ``series`` with order ``d``.

    The weights follow the recursion ``w_0 = 1``,
    ``w_k = -w_{k-1} * (d - k + 1) / k``. The value at position ``i`` is the
    dot product of the most recent weights with the observations from
    ``start`` to ``i``, where ``start`` is 0 for ``window='full'`` and
    ``max(0, i - window + 1)`` for an integer window.

    Args:
        series: A pandas Series, or anything ``pd.Series`` can wrap.
        d: Differencing order.
        window: ``'full'`` or the integer number of observations to use.

    Returns:
        A float Series on the input index with NaN results dropped.
    """
    if not isinstance(series, pd.Series):
        series = pd.Series(series)
    n_obs = series.shape[0]

    coeffs = [1.]
    for lag in range(1, n_obs):
        coeffs.append(-coeffs[-1] * (d - lag + 1) / lag)
    # Reversed so that the weight of the newest observation sits at the end.
    weights = np.array(coeffs[::-1])

    output = pd.Series(index=series.index, dtype=float)
    for pos in range(n_obs):
        first = 0 if window == 'full' else max(0, pos - window + 1)
        span = pos - first + 1
        history = series.iloc[first:pos + 1]
        tail_weights = weights[-span:]
        # Positions where the slices disagree in length are left as NaN.
        if len(history) == len(tail_weights):
            output.iloc[pos] = np.dot(tail_weights, history)
    return output.dropna()


# --- Feature Engineering ---
print('\nEngineering custom features …')
# All engineered columns are collected in `feat`, indexed like the raw data.
feat = pd.DataFrame(index=raw.index)

# 1. VOLATILITY-BASED FEATURES
print("  - Volatility-based features")
for ticker in TRADABLE_TICKERS:
    ivol_10d_col = first_col_containing(ticker, '10_Day_Call_Implied_Volatility_CALL_IMP_VOL_10D')
    # NOTE(review): ivol_30d_col is looked up but not used in this section.
    ivol_30d_col = first_col_containing(ticker, '30_Day_Call_Implied_Volatility_CALL_IMP_VOL_30D')
    ivol_60d_col = first_col_containing(ticker, '60_Day_Call_Implied_Volatility_CALL_IMP_VOL_60D')
    # Term-structure slope: 60-day minus 10-day call implied volatility.
    if ivol_60d_col and ivol_10d_col:
        feat[f'{ticker}_IVOL_Term_Structure_Slope'] = safe_series(ivol_60d_col) - safe_series(ivol_10d_col)
    call_40d_ivol_col = first_col_containing(ticker, '1st_Month_Call_Imp_Vol_40_Delta_LIVE_1M_CALL_IMP_VOL_40DELTA_DFLT')
    put_50d_ivol_col = first_col_containing(ticker, '1st_Month_Put_Imp_Vol_50_Delta_LIVE_1M_PUT_IMP_VOL_50DELTA_DFLT')
    # Skew proxy: 50-delta put IV minus 40-delta call IV.
    if call_40d_ivol_col and put_50d_ivol_col:
        feat[f'{ticker}_IVOL_Skew_Approx'] = safe_series(put_50d_ivol_col) - safe_series(call_40d_ivol_col)
    specific_ivol_suffixes = ['IVOL_SIGMA', 'IVOL_DELTA', 'IVOL_MONEYNESS', 'CALL_IMP_VOL_30D', 'CALL_IMP_VOL_10D', 'CALL_IMP_VOL_60D', 'PUT_IMP_VOL_30D', 'PUT_IMP_VOL_10D', 'PUT_IMP_VOL_60D']
    for ivol_suffix in specific_ivol_suffixes:
        col_name = first_col_containing(ticker, ivol_suffix)
        if col_name:
            # NOTE(review): this is the same 60D-minus-10D difference as
            # IVOL_Term_Structure_Slope above, stored under a second name
            # whenever the matched column is the 60-day call IV column.
            if '60_Day_Call_Implied_Volatility_CALL_IMP_VOL_60D' in col_name and \
               first_col_containing(ticker, '10_Day_Call_Implied_Volatility_CALL_IMP_VOL_10D'):
                feat[f'{ticker}_IVOL_Call_Slope'] = safe_series(col_name) - \
                                                    safe_series(first_col_containing(ticker, '10_Day_Call_Implied_Volatility_CALL_IMP_VOL_10D'))
            # Shock flag: 1 when the row-over-row change is more than two
            # rolling (30-row) standard deviations above its rolling mean.
            diff = safe_series(col_name).diff()
            z = (diff - diff.rolling(30).mean()) / diff.rolling(30).std()
            feat[col_name+'_shock'] = (z > 2).astype(int)
            # Ratio of the IV field to the first column matching 'VOLUME';
            # all-NaN when the ticker has no such column.
            vol_col_name = first_col_containing(ticker, 'VOLUME')
            if vol_col_name:
                feat[col_name+'_div'] = safe_series(col_name) / safe_series(vol_col_name)
            else:
                feat[col_name+'_div'] = np.nan

# 2. DERIV FLOW & SENTIMENT
print("  - Deriv Flow & Sentiment features")
# First pass: smoothed put/call ratio, 3-row open-interest change and a
# 30-row z-score of volume, each only when the source column exists.
for ticker in all_tickers:
    pc_col = first_col_containing(ticker, 'PUT_CALL_VOLUME_RATIO_CUR_DAY')
    if pc_col:
        put_call_ratio = safe_series(pc_col)
        feat[pc_col+'_ema5'] = put_call_ratio.ewm(span=5, adjust=False).mean()

    open_int_col = first_col_containing(ticker, 'OPEN_INT_TOTAL_CALL')
    if open_int_col:
        feat[open_int_col+'_chg3'] = safe_series(open_int_col).pct_change(3)

    volm_col = first_col_containing(ticker, 'Volume_-_Realtime_VOLUME')
    if volm_col:
        volume = safe_series(volm_col)
        volume_window = volume.rolling(30)
        feat[volm_col+'_z'] = (volume - volume_window.mean()) / volume_window.std()

# Second pass (kept separate so the column order of `feat` is unchanged):
# flag rows where call open interest and 10-day call IV both rose.
for ticker in all_tickers:
    open_int_t_col = first_col_containing(ticker, 'OPEN_INT_TOTAL_CALL')
    ivol_10d_col = first_col_containing(ticker, '10_Day_Call_Implied_Volatility_CALL_IMP_VOL_10D')
    if open_int_t_col and ivol_10d_col:
        open_interest_up = safe_series(open_int_t_col).pct_change() > 0
        implied_vol_up = safe_series(ivol_10d_col).pct_change() > 0
        feat[f'{ticker}_smart_money_flag'] = (open_interest_up & implied_vol_up).astype(int)

# 3. CROSS-ASSET CORRELATIONS
print("  - Cross-Asset Correlations")
correlation_universe = sorted(list(set(TRADABLE_TICKERS + [t for t in all_tickers if t in ['DXY Curncy', 'USGG10YR Index', 'SPX Index', 'CO1 Comdty', 'USGG2YR Index']])))
# Every unordered pair of tradable tickers, in list order, followed by a few
# hand-picked pairs against macro series.
dynamic_pairs = list(itertools.combinations(TRADABLE_TICKERS, 2))
dynamic_pairs.extend([
    ('SPY US Equity', 'VIX Index'),
    ('SPY US Equity', 'USGG10YR Index'),
    ('SPY US Equity', 'DXY Curncy'),
    ('SPY US Equity', 'CO1 Comdty'),
    ('SPY US Equity', 'USGG2YR Index'),
    ('VIX Index', 'USGG10YR Index'),
    ('VIX Index', 'DXY Curncy'),
    ('VIX Index', 'CO1 Comdty'),
])
# De-duplicate while keeping first-seen order. A plain set() would reorder the
# pairs differently from run to run (string hashing is randomized), which
# changes the column order of `feat` and therefore the order in which
# primitive signals are later enumerated.
dynamic_pairs = list(dict.fromkeys(dynamic_pairs))
for t1,t2 in dynamic_pairs:
    p1=first_col_containing(t1,'PX_LAST'); p2=first_col_containing(t2,'PX_LAST')
    if p1 and p2:
        s1=safe_series(p1); s2=safe_series(p2)
        if not s1.empty and not s2.empty:
            # Correlations are computed only over rows where both prices exist.
            aligned_data = pd.DataFrame({'s1': s1, 's2': s2}).dropna()
            if len(aligned_data) > 60:
                c20=aligned_data['s1'].rolling(20).corr(aligned_data['s2'])
                c60=aligned_data['s1'].rolling(60).corr(aligned_data['s2'])
                feat[f'{t1}_{t2}_c20']=c20
                feat[f'{t1}_{t2}_c60']=c60
                # z-score of the 20-row correlation against its own 60-row history
                feat[f'{t1}_{t2}_cZ']=(c20-c20.rolling(60).mean())/c20.rolling(60).std()
                feat[f'{t1}_{t2}_cDelta']=c20-c60
                ret1 = s1.pct_change().dropna()
                ret2 = s2.pct_change().dropna()
                aligned_returns = pd.DataFrame({'ret1': ret1, 'ret2': ret2}).dropna()
                if not aligned_returns.empty and aligned_returns['ret2'].var() != 0:
                    # Rolling beta of t1 on t2: cov(ret1, ret2) / var(ret2) over 60 rows.
                    rolling_beta = aligned_returns['ret1'].rolling(window=60).cov(aligned_returns['ret2']) / \
                                   aligned_returns['ret2'].rolling(window=60).var()
                    feat[f'{t1}_{t2}_rolling_beta'] = rolling_beta

# 4. MACRO TRIGGERS
print("  - Macro Trigger features")
# Price columns of the series used below; each is None when no matching
# column exists, in which case safe_series() yields an all-NaN Series.
dxy_px=first_col_containing('DXY Curncy','PX_LAST')
ust10_px=first_col_containing('USGG10YR Index','PX_LAST')
spy_px=first_col_containing('SPY US Equity','PX_LAST')
vix_px=first_col_containing('VIX Index','PX_LAST')
# MPI: sum of the 3-row cumulative percentage changes of DXY and the 10Y
# series. The trailing shift(0) is a no-op.
feat['MPI']=(safe_series(dxy_px).pct_change().rolling(3).sum()+safe_series(ust10_px).pct_change().rolling(3).sum()).shift(0)
feat['VIX_gt20']=(safe_series(vix_px)>20).astype(int)
feat['DXY_rising']=(safe_series(dxy_px).pct_change()>0).astype(int)
feat['SPY_below_MA20']=(safe_series(spy_px)<safe_series(spy_px).rolling(20).mean()).astype(int)
# 1 only when all three flags above are set on the same row.
feat['fear_overdrive']=((feat['VIX_gt20']==1)&(feat['DXY_rising']==1)&(feat['SPY_below_MA20']==1)).astype(int)
xlk_px=first_col_containing('XLK US Equity','PX_LAST'); xle_px=first_col_containing('XLE US Equity','PX_LAST')
# Difference between the 5-row percentage changes of XLK and XLE.
feat['sector_rotation']=safe_series(xlk_px).pct_change(5)-safe_series(xle_px).pct_change(5)

# --- NEW MACRO FEATURES ---
ust2_px = first_col_containing('USGG2YR Index', 'PX_LAST')
if ust10_px and ust2_px:
    feat['UST10Y_2Y_Spread'] = safe_series(ust10_px) - safe_series(ust2_px)
    # NOTE(review): pct_change of a spread is unbounded when the spread is at
    # or near zero — confirm this is intended.
    feat['UST10Y_2Y_Spread_chg'] = feat['UST10Y_2Y_Spread'].pct_change()
cpi_yoy_px = first_col_containing('CPI YOY Index', 'PX_LAST')
cpi_chng_px = first_col_containing('CPI CHNG Index', 'PX_LAST')
if cpi_yoy_px:
    feat['CPI_YOY_mom3'] = safe_series(cpi_yoy_px).pct_change(3)
    # z-score against a 12-row rolling window
    feat['CPI_YOY_z'] = (safe_series(cpi_yoy_px) - safe_series(cpi_yoy_px).rolling(12).mean()) / safe_series(cpi_yoy_px).rolling(12).std()
if cpi_chng_px:
    feat['CPI_CHNG_mom3'] = safe_series(cpi_chng_px).pct_change(3)
injcjc_px = first_col_containing('INJCJC Index', 'PX_LAST')
nfp_tch_px = first_col_containing('NFP TCH Index', 'PX_LAST')
jobs_us_equity_px = first_col_containing('JOBS US Equity', 'PX_LAST')
if injcjc_px:
    # 1 when the row-over-row change exceeds twice its 20-row rolling std.
    feat['INJCJC_shock'] = (safe_series(injcjc_px).diff() > safe_series(injcjc_px).diff().rolling(20).std() * 2).astype(int)
if nfp_tch_px:
    feat['NFP_TCH_mom3'] = safe_series(nfp_tch_px).pct_change(3)
if jobs_us_equity_px:
    feat['JOBS_US_Equity_mom3'] = safe_series(jobs_us_equity_px).pct_change(3)
ffa_comdty_px = first_col_containing('FFA Comdty', 'PX_LAST')
ctii10_govt_px = first_col_containing('CTII10 Govt', 'PX_LAST')
ussw10_curncy_px = first_col_containing('USSW10 Curncy', 'PX_LAST')
mlcx3crt_index_px = first_col_containing('MLCX3CRT Index', 'PX_LAST')
farbast_index_px = first_col_containing('FARBAST Index', 'PX_LAST')
bspgcpus_index_px = first_col_containing('BSPGCPUS Index', 'PX_LAST')
spcsusa_index_px = first_col_containing('SPCSUSA Index', 'PX_LAST')
spcs20sm_index_px = first_col_containing('SPCS20SM Index', 'PX_LAST')
conssent_index_px = first_col_containing('CONSSENT Index', 'PX_LAST')
lf94truu_index_vol30d = first_col_containing('LF94TRUU Index', 'VOLATILITY_30D')
# FFA_Spread is all-NaN when the 2Y column is missing, because safe_series
# then returns an all-NaN Series.
if ffa_comdty_px: feat['FFA_Spread'] = safe_series(ffa_comdty_px) - safe_series(ust2_px)
# One-row percentage change of each remaining macro series that is present.
if ctii10_govt_px: feat['CTII10_mom'] = safe_series(ctii10_govt_px).pct_change()
if ussw10_curncy_px: feat['USSW10_chg'] = safe_series(ussw10_curncy_px).pct_change()
if mlcx3crt_index_px: feat['MLCX3CRT_chg'] = safe_series(mlcx3crt_index_px).pct_change()
if farbast_index_px: feat['FARBAST_mom'] = safe_series(farbast_index_px).pct_change()
if bspgcpus_index_px: feat['BSPGCPUS_mom'] = safe_series(bspgcpus_index_px).pct_change()
if spcsusa_index_px: feat['SPCSUSA_mom'] = safe_series(spcsusa_index_px).pct_change()
if spcs20sm_index_px: feat['SPCS20SM_mom'] = safe_series(spcs20sm_index_px).pct_change()
if conssent_index_px: feat['CONSSENT_mom'] = safe_series(conssent_index_px).pct_change()
# Ratio of the 30D volatility field to its own 60-row rolling mean.
if lf94truu_index_vol30d: feat['LF94TRUU_Vol_Signal'] = safe_series(lf94truu_index_vol30d) / safe_series(lf94truu_index_vol30d).rolling(60).mean()


# 5. MOMENTUM / VOL FRACTALS
print("  - Momentum/Vol Fractals")
for tk in all_tickers:
    px_col=first_col_containing(tk,'PX_LAST')
    if px_col:
        px=safe_series(px_col)

        # 5-row momentum scaled by the 20-row volatility of one-row returns.
        mom5=px.pct_change(5)
        vol20=px.pct_change().rolling(20).std()
        feat[tk+'_mom5_vol20']=mom5/vol20

        # Position of the price inside a 20-row, +/- 2 std band.
        px_window20=px.rolling(20)
        ma=px_window20.mean()
        std=px_window20.std()
        lower_band=ma-2*std
        feat[tk+'_pctB']=(px-lower_band)/(4*std)

        # Fractional differencing only for series longer than 100 rows.
        if len(px) > 100:
            try:
                feat[f'{tk}_frac_diff_0_5'] = frac_diff(px, d=0.5, window=100)
            except Exception as e:
                print(f"Warning: Could not compute fractional differencing for {tk}: {e}")
                feat[f'{tk}_frac_diff_0_5'] = np.nan


# 6. Shift +1 day and merge basic returns
# Every engineered feature is lagged by one row before it is used.
feat=feat.shift(1)
panel = feat.copy()

# Prepare returns for evaluation based *only* on TRADABLE_TICKERS
price_cols_for_returns = []
for ticker_full_name in TRADABLE_TICKERS:
    px_col_name = first_col_containing(ticker_full_name, 'PX_LAST')
    if not px_col_name:
        print(f"Warning: PX_LAST column not found for tradable ticker '{ticker_full_name}'. It will be excluded from return calculations.")
        continue
    price_cols_for_returns.append(px_col_name)

prices=raw[price_cols_for_returns].copy()
# Forward returns per horizon h: the h-row percentage change, shifted back by
# h rows so each row holds the return realised over the following h rows.
returns={
    h: prices.pct_change(h).shift(-h)
    for h in [1,3,5,10,21]
}


# Merge tradable returns into panel
# Each '<ticker>_ret1' column is the one-row return lagged by one row.
for tk_full_name in TRADABLE_TICKERS:
    px_col_name = first_col_containing(tk_full_name, 'PX_LAST')
    if not px_col_name or px_col_name not in raw.columns:
        continue
    ret_series = raw[px_col_name].pct_change().shift(1)
    ret_series.name = f'{tk_full_name}_ret1'
    panel = pd.concat([panel, ret_series], axis=1)

print('Engineered panel shape:',panel.shape)


# --- Primitive Signal Generation ---
print('\nBuilding primitive signals from engineered panel …')
# Maps a primitive signal name (panel column plus a threshold suffix) to its
# boolean Series; filled by the loop that follows the helper functions.
signals={}
# Helper functions for inferring signal direction and type (defined once globally)
def get_primitive_direction(primitive_name):
    """Infer the directional bias of a primitive signal from its name.

    Bullish markers are checked first, so a name containing both kinds of
    marker is classified as bullish.

    Args:
        primitive_name: Name of the primitive signal, e.g. ``"X_z<-1.5"``.

    Returns:
        ``1`` for a long bias, ``-1`` for a short bias, ``0`` for neutral
        (including shock, slope, spread and vol-signal primitives).
    """
    bullish_markers = ('>80', 'z>1.5', 'ma5>ma20', 'rising')
    # 'z<-1.5' matches the suffix the signal builder actually generates; the
    # previous literal 'z<-1.name' could never match, so low z-score signals
    # were silently treated as neutral.
    bearish_markers = ('<20', 'z<-1.5', 'ma5<ma20', 'below_MA')

    if any(marker in primitive_name for marker in bullish_markers):
        return 1
    if any(marker in primitive_name for marker in bearish_markers):
        return -1
    return 0

def get_primitive_signal_type(primitive_name):
    """Classify a primitive signal into a broad category based on its name.

    The categories are tried in a fixed priority order and the first one with
    a marker substring found in ``primitive_name`` wins; ``'other'`` is
    returned when no marker matches.
    """
    category_markers = (
        ('volatility', ('IVOL', 'VIX', 'vol', '_shock')),
        ('momentum', ('mom', 'pctB', '_chg', '_ret', 'rising')),
        ('correlation', ('_c20', '_c60', '_cZ', '_cDelta', '_beta')),
        ('macro', ('DXY', 'USGG', 'MPI', 'fear_overdrive', 'CPI', 'INJCJC',
                   'NFP', 'JOBS', 'FFA', 'CTII10', 'USSW10', 'MLCX3CRT',
                   'FARBAST', 'BSPGCPUS', 'SPCSUSA', 'SPCS20SM', 'CONSSENT',
                   'LF94TRUU')),
        ('sentiment', ('PUT_CALL_VOLUME_RATIO', 'smart_money_flag', 'Short_Interest_Ratio')),
        ('volume', ('VOLUME',)),
        ('open_interest', ('OPEN_INT',)),
        ('fractional_differencing', ('frac_diff',)),
    )
    for category, markers in category_markers:
        if any(marker in primitive_name for marker in markers):
            return category
    return 'other'

# Five boolean primitives are derived from every numeric, non-constant panel
# column: top/bottom 20% by percentile rank, 60-row z-score above 1.5 / below
# -1.5, and 5-row mean above 20-row mean.
for col in panel.columns.drop('Date',errors='ignore'):
    s=panel[col]
    if not pd.api.types.is_numeric_dtype(s):
        continue

    s_clean = s.replace([np.inf, -np.inf], np.nan).dropna()
    if s_clean.empty or s_clean.std() == 0:
        continue

    rank=s_clean.rank(pct=True)
    signals[col+'>80']=rank>0.8
    signals[col+'<20']=rank<0.2

    # z-score is left as NaN wherever the rolling std is missing or ~0.
    rolling_std_60 = s_clean.rolling(60).std()
    valid_std_mask = rolling_std_60 > 1e-9
    z = ((s_clean - s_clean.rolling(60).mean()) / rolling_std_60).where(valid_std_mask)
    signals[col+'_z>1.5']=z>1.5
    signals[col+'_z<-1.5']=z<-1.5

    ma5=s_clean.rolling(5).mean()
    ma20=s_clean.rolling(20).mean()
    signals[col+'_ma5>ma20']=ma5>ma20
print('Total primitive signals:',len(signals))


# --- START OF NEW ADDITION: ITERATIVE/GREEDY SETUP GENERATION (Replaces old Primitive Pre-filtering and Setup Generation blocks) ---
# Setups are grown one condition at a time: the best N setups of length k-1
# are each extended with every primitive that comes later in the primitive
# list, the extensions are evaluated, and the best N of those are carried on.
print(f"\nStarting iterative/greedy setup generation for lengths {SETUP_LENGTHS_TO_EXPLORE}…")

# Define the full list of primitive names to draw from
primitive_names_full_list = list(signals.keys())
# name -> position lookup, so finding where a condition sits in the list is
# O(1) instead of a list scan per setup.
primitive_position = {name: pos for pos, name in enumerate(primitive_names_full_list)}

# N_BEST_TO_PROPAGATE (how many setups are carried to the next length) comes
# from the configuration section; it is deliberately not redefined here so the
# configured value cannot be silently overridden.

# Store the final summary rows and trigger records from all valid setups
final_summary_rows = []
final_trigger_records = []

# Initialize best_setups_by_length with a placeholder for length 0
best_setups_by_length = {0: [{"feature_conditions": [], "setup_id": "S0000_Empty"}]} # Align keys
# Track already evaluated combinations to avoid redundant work
seen_combinations = set()

# Iterative loop to build setups of increasing length
for current_k in SETUP_LENGTHS_TO_EXPLORE:
    print(f"  Generating and evaluating {current_k}-feature setups...")
    current_length_candidate_setups = [] # Candidates generated in this iteration before filtering

    prev_length_best_setups = best_setups_by_length.get(current_k - 1, [])

    if not prev_length_best_setups and current_k > 1:
        print(f"    No promising setups from length {current_k - 1} to build upon. Skipping length {current_k}.")
        break

    for prev_setup_data in prev_length_best_setups:
        prev_conds = list(prev_setup_data["feature_conditions"]) # Align key

        # Only primitives positioned after ALL previous conditions are added,
        # so each combination is generated exactly once. The conditions are
        # stored alphabetically sorted, so the last stored element is not
        # necessarily the one furthest down the primitive list; the maximum
        # position has to be used instead.
        prev_positions = [primitive_position[c] for c in prev_conds if c in primitive_position]
        start_idx = max(prev_positions) + 1 if prev_positions else 0

        # The mask of the previous conditions does not depend on the new
        # primitive, so it is combined once per parent setup.
        prev_signals = [signals[c] for c in prev_conds if c in signals]
        prev_mask = functools.reduce(lambda a, b: a & b, prev_signals) if prev_signals else None

        # Iterate through the remaining primitive names to find new conditions to add
        for i in range(start_idx, len(primitive_names_full_list)):
            new_primitive_name = primitive_names_full_list[i]

            new_conds = sorted(prev_conds + [new_primitive_name])

            new_conds_frozenset = frozenset(new_conds)
            if new_conds_frozenset in seen_combinations:
                continue

            seen_combinations.add(new_conds_frozenset)

            new_signal = signals[new_primitive_name]
            current_mask = new_signal if prev_mask is None else prev_mask & new_signal
            current_support = current_mask.sum()

            if current_support >= MIN_INITIAL_SUPPORT_FILTER:
                temp_setup = {
                    # Temporary ID. It carries the setup length because the
                    # running number restarts at every length; without it the
                    # same ID would label different setups in the trigger log.
                    'id': f'TEMP_K{current_k}_S' + str(len(current_length_candidate_setups) + 1).zfill(4),
                    'conds': new_conds,
                    'setup_length': current_k,
                    'setup_type': 'concurrent',
                    'support': current_support
                }
                current_length_candidate_setups.append(temp_setup)

    print(f"    Evaluating {len(current_length_candidate_setups)} candidate setups for length {current_k}...")

    # --- Parallelize the evaluation of current_length_candidate_setups ---
    # Adding verbose=10 to see joblib's progress
    # NOTE(review): evaluate_one_setup is called with a single argument here;
    # confirm that this matches the signature of the definition in scope when
    # this cell runs.
    evaluated_results_for_current_k = Parallel(n_jobs=-1, verbose=10)(delayed(evaluate_one_setup)(s) for s in current_length_candidate_setups)

    current_summary_rows = []
    current_trigger_records_batch = []

    for perf, records in evaluated_results_for_current_k:
        if perf is not None:
            current_summary_rows.append(perf)
            if records is not None:
                current_trigger_records_batch.extend(records)

    final_trigger_records.extend(current_trigger_records_batch)

    # Select the best setups for propagation to the next length
    if current_summary_rows:
        current_length_summary_df = pd.DataFrame(current_summary_rows)
        # Sort by Sharpe and select top N to carry forward
        best_for_next_length_df = current_length_summary_df.sort_values('sharpe_10d', ascending=False).head(N_BEST_TO_PROPAGATE)

        # The summary rows hold the conditions as one '+'-joined string, but
        # the next iteration needs them as a list. Recover the list from the
        # candidate the row was computed from; splitting the string is only a
        # fallback for rows whose ID is not among the candidates.
        conds_by_temp_id = {cand['id']: cand['conds'] for cand in current_length_candidate_setups}
        best_setups_by_length[current_k] = [
            {
                'setup_id': row_id,
                'feature_conditions': conds_by_temp_id.get(row_id, str(joined_conds).split('+')),
            }
            for row_id, joined_conds in zip(best_for_next_length_df['setup_id'],
                                            best_for_next_length_df['feature_conditions'])
        ]

        final_summary_rows.extend(current_summary_rows)

        print(f"    Selected {len(best_setups_by_length[current_k])} best setups to propagate to length {current_k + 1}.")
    else:
        best_setups_by_length[current_k] = []
        print(f"    No valid setups found for length {current_k}. Stopping iteration.")
        break

# --- End Iterative Generation Loop ---

# Assign final, consecutive IDs to all collected setups
setup_id_counter = 1
for s in final_summary_rows:
    s['id'] = 'S' + str(setup_id_counter).zfill(4)
    setup_id_counter += 1

# `setups` variable is now defined from the consolidated list
setups = final_summary_rows

print(f'\nDiscovery complete via iterative search. Total {len(setups)} candidate setups evaluated across all lengths.')


# --- Parallel Setup Evaluation Function (defined ONCE globally) ---
# The shared data sets are received as explicit arguments instead of being read
# from module globals.
def evaluate_one_setup(setup, raw_data, returns_data, signals_data, price_cols_for_returns_data):
    """
    Evaluate one candidate setup and build its per-trigger log rows.

    Args:
        setup: Mapping with an 'id' entry and a 'conds' entry (the sequence of
            condition names that must all be true for the setup to trigger).
        raw_data: Date-indexed DataFrame holding the price columns named in
            `price_cols_for_returns_data` and any implied-volatility columns
            resolved through `first_col_containing`.
        returns_data: Mapping from horizon (1, 3, 5, 10, 21) to a DataFrame of
            returns with one column per price column.
        signals_data: Mapping (or DataFrame) from condition name to a boolean
            Series; the Series are AND-ed together to find trigger dates.
        price_cols_for_returns_data: Iterable of price column names to log.

    Returns:
        `(perf, trigger_records)` where `perf` is the summary dict for the
        setup and `trigger_records` is a list with one dict per
        (trigger date, price column). `(None, None)` is returned when none of
        the conditions has a signal or when the number of trigger dates is
        below `MIN_INITIAL_SUPPORT_FILTER`.
    """
    sid = setup['id']
    conds = setup['conds']

    # Conditions that have no signal series are ignored; with none left there
    # is nothing to evaluate.
    valid_signals = [signals_data[c] for c in conds if c in signals_data]
    if not valid_signals:
        return None, None

    # A trigger date is one where every available condition is true.
    mask = functools.reduce(lambda a, b: a & b, valid_signals)
    dates = mask[mask].index
    support = len(dates)

    if support < MIN_INITIAL_SUPPORT_FILTER:
        return None, None

    # Net direction is the sum of the per-condition directions; the dominant
    # signal type is the most frequent one among the conditions.
    direction_score = 0
    signal_type_counts = {}
    for cond in conds:
        direction_score += get_primitive_direction(cond)
        s_type = get_primitive_signal_type(cond)
        signal_type_counts[s_type] = signal_type_counts.get(s_type, 0) + 1

    if direction_score > 0:
        entry_direction = 'long'
    elif direction_score < 0:
        entry_direction = 'short'
    else:
        entry_direction = 'mixed'

    dominant_signal_type = 'unknown'
    if signal_type_counts:
        dominant_signal_type = max(signal_type_counts, key=signal_type_counts.get)

    first_trigger_date = dates.min() if not dates.empty else pd.NaT
    last_trigger_date = dates.max() if not dates.empty else pd.NaT

    perf = {'setup_id': sid, 'feature_conditions': '+'.join(conds), 'support': support,
            'entry_direction': entry_direction,
            'dominant_signal_type': dominant_signal_type,
            'first_trigger_date': first_trigger_date,
            'last_trigger_date': last_trigger_date
            }

    # One summary metric per horizon; the label decides which statistic is used.
    for h, label in zip([3, 5, 10, 21], ['accuracy_3d', 'avg_return_5d', 'sharpe_10d', 'hit_rate_21d']):
        r = returns_data[h].reindex(dates)
        if r.empty:
            perf[label] = 0.0
            continue
        # Cross-sectional mean return per trigger date (dates with no data dropped).
        mean = r.mean(axis=1).dropna()
        if mean.empty:
            perf[label] = 0.0
            continue
        if 'accuracy' in label:
            perf[label] = (mean > 0).mean()
        elif 'avg' in label:
            perf[label] = mean.mean()
        elif 'sharpe' in label:
            bootstrap_block_size = min(h if h > 1 else 2, max(1, len(mean) // 2))
            if mean.std() > 0.0001 and len(mean) >= 2:
                median_sharpe, _, _ = block_bootstrap_sharpe(mean, block_size=bootstrap_block_size, num_iterations=10)
                perf[label] = median_sharpe
            else:
                perf[label] = 0.0
        elif 'hit' in label:
            # Count only observed returns: `r > 0` is False for NaN, so
            # averaging the boolean frame directly would score every missing
            # return as a miss and bias the hit rate downward.
            observed_count = int(r.notna().to_numpy().sum())
            positive_count = int((r > 0).to_numpy().sum())
            perf[label] = positive_count / observed_count if observed_count else 0.0

    # Resolve everything that depends only on the ticker once, up front, so
    # the per-date loop below does plain lookups.
    return_horizons = (1, 3, 5, 10, 21)
    # Implied-volatility column name fragments, tried in order of preference.
    ivol_name_candidates = (
        f'{OPTION_SIM_HORIZON_DAYS}_Day_Call_Implied_Volatility_CALL_IMP_VOL_{OPTION_SIM_HORIZON_DAYS}D',
        '30_Day_Call_Implied_Volatility_CALL_IMP_VOL_30D',
        '60_Day_Call_Implied_Volatility_CALL_IMP_VOL_60D',
    )
    ticker_plan = []
    columns_needed = {}  # dict used as an insertion-ordered set of column names
    for price_col in price_cols_for_returns_data:
        symbol = price_col.split('_PX_LAST')[0].split('_Last_Price')[0]

        ivol_col = None
        for candidate in ivol_name_candidates:
            ivol_col = first_col_containing(symbol, candidate)
            if ivol_col:
                break

        columns_needed[price_col] = None
        if ivol_col:
            columns_needed[ivol_col] = None

        # Return series per horizon for this ticker; None where the column is absent.
        per_horizon_returns = {
            h: (returns_data[h][price_col] if price_col in returns_data[h].columns else None)
            for h in return_horizons
        }
        ticker_plan.append((price_col, symbol, ivol_col, per_horizon_returns))

    # Select the needed columns first, then align to the trigger dates and to
    # the option-exit dates (trigger date + horizon in calendar days, matched
    # to the nearest available row within 5 days).
    needed_raw = raw_data[list(columns_needed)]
    entry_snapshot = needed_raw.reindex(dates)
    future_target_dates = dates + pd.Timedelta(days=OPTION_SIM_HORIZON_DAYS)
    exit_snapshot = needed_raw.reindex(future_target_dates, method='nearest', tolerance=pd.Timedelta(days=5))

    trigger_records_for_this_setup = []
    for d, target_date in zip(dates, future_target_dates):
        entry_row = entry_snapshot.loc[d]
        exit_row = exit_snapshot.loc[target_date]

        for price_col, symbol, ivol_col, per_horizon_returns in ticker_plan:
            current_px = entry_row.get(price_col, np.nan)
            ivol_at_entry = entry_row.get(ivol_col, np.nan) if ivol_col else np.nan

            # The option is only simulated when entry price, entry IV and exit
            # price are all available and the first two are positive.
            option_pnl_10d = np.nan
            if not pd.isna(current_px) and not pd.isna(ivol_at_entry) and current_px > 0 and ivol_at_entry > 0:
                future_px_for_sim = exit_row.get(price_col, np.nan)
                if not pd.isna(future_px_for_sim):
                    option_pnl_10d = simulate_option_pnl(current_px, future_px_for_sim, ivol_at_entry, OPTION_SIM_HORIZON_DAYS, entry_direction)

            ret_vals = {
                h: (series.get(d, np.nan) if series is not None else np.nan)
                for h, series in per_horizon_returns.items()
            }

            trigger_records_for_this_setup.append({'date': d, 'ticker': symbol, 'setup_id': sid, 'matched': 1,
                                                   'return_1d': ret_vals[1], 'return_3d': ret_vals[3], 'return_5d': ret_vals[5],
                                                   'return_10d': ret_vals[10], 'return_21d': ret_vals[21],
                                                   'option_pnl_10d': option_pnl_10d})
    return perf, trigger_records_for_this_setup


# --- Final Output Generation (using the collected `final_summary_rows` and `final_trigger_records`) ---
summary_df = pd.DataFrame(setups)  # one row per evaluated setup

# --- Basic Setup Lifecycle Tracking Derived Metrics ---
if not summary_df.empty:
    summary_df['first_trigger_date'] = pd.to_datetime(summary_df['first_trigger_date'])
    summary_df['last_trigger_date'] = pd.to_datetime(summary_df['last_trigger_date'])
    summary_df['setup_duration_days'] = (summary_df['last_trigger_date'] - summary_df['first_trigger_date']).dt.days
    # A zero-day duration is mapped to NaN so the frequency is NaN rather than inf.
    summary_df['avg_trigger_frequency_per_day'] = summary_df['support'] / summary_df['setup_duration_days'].replace(0, np.nan)
    summary_df.replace([np.inf, -np.inf], np.nan, inplace=True)
else:
    print("Summary DataFrame is empty, skipping lifecycle metrics calculation.")

# When no setup survived, the frame has no columns at all, so filtering on
# 'sharpe_10d' would raise KeyError. Fall back to an empty selection so the
# (empty) output files are still written.
if 'sharpe_10d' in summary_df.columns:
    # Top 20 by 10-day Sharpe, ignoring setups whose Sharpe is exactly 0.0.
    top = summary_df[summary_df['sharpe_10d'] != 0.0].sort_values('sharpe_10d', ascending=False).head(20)
else:
    top = summary_df.head(0)

summary_df.to_csv('setup_results_summary.csv', index=False)
trigger_df = pd.DataFrame(final_trigger_records)
trigger_df.to_csv('setup_trigger_log.csv', index=False)
top.to_json('top_setups.json', orient='records', indent=2)

print('\nDiscovery complete with engineered features')
print(top.head())