In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from joblib import Parallel, delayed
import multiprocessing
from scipy.stats import wilcoxon

# PyPortfolioOpt
from pypfopt.efficient_frontier import EfficientFrontier
from pypfopt import risk_models
from pypfopt.black_litterman import BlackLittermanModel

# For the SJM (Sparse Jump Model)
from jumpmodels.sparse_jump import SparseJumpModel
from jumpmodels.preprocess import StandardScalerPD, DataClipperStd

ASSETS = ["Value", "Growth", "LowVol", "Size", "Momentum", "Quality"]
N_ASSETS = len(ASSETS)

# --- 1. Data simulation (2-state only) ---
def simulate_2state_data(num_days, seed=None):
    """
    Simulates a returns DataFrame with shape (num_days, N_ASSETS),
    plus a separate matrix of hidden states. Returns (DataFrame, states).
    """
    np_rng = np.random.default_rng(seed)
    transmat = np.array([
        [0.9976, 0.0024],
        [0.0232, 0.9768]
    ])
    mu_dict  = {0: 0.0006,   1: -0.000881}
    sig_dict = {0: 0.00757,  1: 0.0163}
    base_corr = np.full((N_ASSETS, N_ASSETS), 0.185)
    np.fill_diagonal(base_corr, 1.0)

    # Edge check
    if num_days < 1:
        raise ValueError("num_days must be >= 1.")

    # Simulate each asset's hidden states
    all_states = np.zeros((num_days, N_ASSETS), dtype=int)
    for i in range(N_ASSETS):
        s = np.zeros(num_days, dtype=int)
        s[0] = np_rng.integers(2)
        for t in range(1, num_days):
            s[t] = np_rng.choice(2, p=transmat[s[t - 1]])
        all_states[:, i] = s

    # Generate returns
    rets = np.zeros((num_days, N_ASSETS))
    for t in range(num_days):
        mu_vec  = np.zeros(N_ASSETS)
        sig_vec = np.zeros(N_ASSETS)
        for i in range(N_ASSETS):
            curr_state = all_states[t, i]
            mu_vec[i]  = mu_dict[curr_state]
            sig_vec[i] = sig_dict[curr_state]
        cov_t = np.outer(sig_vec, sig_vec) * base_corr
        rets[t] = np_rng.multivariate_normal(mean=mu_vec, cov=cov_t)

    return pd.DataFrame(rets, columns=ASSETS), all_states


# --- 2. SJM functions ---
def compute_sjm_features(factor_ser: pd.Series) -> pd.DataFrame:
    """
    Create SJM features (downside dev, differences of dev, etc.).
    """
    def ewm_downside_dev(returns: pd.Series, halflife: float) -> pd.Series:
        negative_returns = returns.clip(upper=0)
        neg_sq = negative_returns ** 2
        ewm_mean = neg_sq.ewm(halflife=halflife, adjust=False).mean()
        return np.sqrt(ewm_mean)

    dd20 = ewm_downside_dev(factor_ser, halflife=20)
    dd60 = ewm_downside_dev(factor_ser, halflife=60)
    dd120 = ewm_downside_dev(factor_ser, halflife=120)

    feats = {
        "DD_hl20": dd20,
        "DD20_minus_DD60": dd20 - dd60,
        "DD60_minus_DD120": dd60 - dd120,
        "Return_hl120": factor_ser.ewm(halflife=120, adjust=False).mean(),
    }
    df_feats = pd.DataFrame(feats).replace([np.inf, -np.inf], np.nan).fillna(0.0)
    return df_feats

def train_sjm_single_asset(series, n_components=2, max_feats=4, lam=50, random_state=42):
    """
    Trains a SparseJumpModel on one asset's returns and returns
    the fitted model plus the transformations.
    """
    # Quick check to ensure enough data
    if len(series) < 2:
        raise ValueError("Not enough data points to train the SJM on a single asset.")

    feats_df = compute_sjm_features(series)
    clipper = DataClipperStd(mul=3.0)
    scaler  = StandardScalerPD()
    X_clipped = clipper.fit_transform(feats_df)
    X_scaled  = scaler.fit_transform(X_clipped)
    X_arr = X_scaled.values

    sjm = SparseJumpModel(
        n_components=n_components,
        max_feats=max_feats,
        jump_penalty=lam,
        cont=False,
        max_iter=20,
        random_state=random_state
    )
    sjm.fit(X_arr)
    return sjm, clipper, scaler

def get_regime_means_stds_single_asset(asset_series, regime_assignments):
    """
    Computes the mean and std for each regime in 'regime_assignments'.
    """
    unique_states = np.unique(regime_assignments)
    regime_means = {}
    regime_stds  = {}
    for s in unique_states:
        data_in_s = asset_series[regime_assignments == s]
        if len(data_in_s) > 0:
            regime_means[s] = data_in_s.mean()
            regime_stds[s]  = data_in_s.std()
        else:
            # Fallback if no points in that regime
            regime_means[s] = asset_series.mean()
            regime_stds[s]  = asset_series.std()
    return regime_means, regime_stds


# --- 3. Black-Litterman helper functions ---
def build_equal_unconditional_prior(df_train):
    """
    Returns a simple unconditional prior vector and covariance matrix
    based on the single-state simulation assumptions.
    """
    SIM_MEAN_1STATE = 0.000461
    SIM_SIG_1STATE  = 0.008388
    TRUE_CORR       = 0.185

    n_assets = df_train.shape[1]
    uniform_corr = np.full((n_assets, n_assets), TRUE_CORR)
    np.fill_diagonal(uniform_corr, 1.0)
    cov_flat = (SIM_SIG_1STATE**2) * uniform_corr

    assets = df_train.columns
    pi_series = pd.Series(np.full(n_assets, SIM_MEAN_1STATE), index=assets)
    cov_df = pd.DataFrame(cov_flat, index=assets, columns=assets)
    return pi_series, cov_df

def get_rolling_cov(full_returns, current_index, halflife=126):
    """
    Returns an exponentially weighted covariance matrix up to row 'current_index'.
    """
    sub_df = full_returns.iloc[:current_index]
    if len(sub_df) < 2:
        # Fallback to identity if too little data
        n_ = sub_df.shape[1]
        return np.eye(n_)
    span_equiv = (2 / (1 - np.exp(-np.log(2) / halflife))) - 1
    cov_est = risk_models.exp_cov(sub_df, span=span_equiv, returns_data=True)
    return cov_est

def regime_based_bl_backtest_flatprior(
    df_train,
    df_test,
    states_test,
    init_state,
    regime_means_list,
    transaction_cost=0.0007,
    risk_free_rate=0.02/252,
    bl_tau=0.05,
    halflife=126
):
    """
    A BL backtest that rebalances only when regime changes.
    """
    T_test = len(df_test)
    assets = df_test.columns
    n_assets = len(assets)
    combined_df = pd.concat([df_train, df_test], axis=0).reset_index(drop=True)

    # Build an unconditional prior (not used heavily except for structure)
    flat_pi, _ = build_equal_unconditional_prior(df_train)

    portfolio_vals = np.zeros(T_test)
    if T_test == 0:
        # If no test data, just return empty arrays
        return portfolio_vals, np.zeros((0, n_assets)), []

    portfolio_vals[0] = 1.0
    weight_history = np.zeros((T_test, n_assets))

    # Start with equal weights
    w_prev = np.ones(n_assets) / n_assets
    weight_history[0] = w_prev

    fallback_records = []

    for t in range(1, T_test):
        # Growth from previous day
        ret_t_minus_1 = df_test.iloc[t - 1].values
        gross_growth = portfolio_vals[t - 1] * (1.0 + np.dot(w_prev, ret_t_minus_1))

        # Check regime change
        if t == 1:
            current_states = init_state
            do_rebalance = True
        else:
            current_states = states_test[t - 1]
            do_rebalance = np.any(states_test[t - 1] != states_test[t - 2])

        if do_rebalance:
            # Build absolute views from regime_means
            view_vector = np.zeros(n_assets)
            for i in range(n_assets):
                view_vector[i] = regime_means_list[i].get(current_states[i], 0.0)

            # Rolling cov up to (train + t)
            global_index = len(df_train) + t
            cov_t = get_rolling_cov(combined_df, global_index, halflife=halflife)

            # Build the BL model
            bl = BlackLittermanModel(
                cov_matrix=cov_t,
                pi="equal",
                absolute_views=dict(zip(assets, view_vector)),
                tau=bl_tau,
                risk_aversion=2.5
            )
            # Posterior returns
            bl_rets = bl.bl_returns()

            # Solve the EF with max_sharpe
            ef = EfficientFrontier(bl_rets, cov_t, weight_bounds=(0, 1), solver="SCS")
            try:
                w_dict = ef.max_sharpe(risk_free_rate=risk_free_rate)
            except ValueError as e:
                # Fallback to equal-weight if solver fails
                fallback_records.append({
                    "time": t,
                    "predicted_regime": current_states.copy(),
                    "absolute_views": dict(zip(assets, view_vector)),
                    "posterior_views": bl_rets.copy() if isinstance(bl_rets, np.ndarray) else bl_rets,
                })
                w_dict = {a: 1.0 / n_assets for a in assets}
            w_array = np.array([w_dict[a] for a in assets])
        else:
            w_array = w_prev.copy()

        # Trading cost
        traded_fraction = np.sum(np.abs(w_array - w_prev))
        cost = gross_growth * traded_fraction * transaction_cost

        # New portfolio value
        portfolio_vals[t] = gross_growth - cost
        weight_history[t] = w_array
        w_prev = w_array

    return portfolio_vals, weight_history, fallback_records


# --- 4. Backtesting & performance ---
def backtest_portfolio(returns, weights, transaction_cost=0.0007):
    """
    Simple hold & rebal once at t=0. Deduct cost on initial notional only.
    """
    T = len(returns)
    if T == 0:
        return np.array([])

    portfolio_vals = np.zeros(T)
    # Initial cost
    cost_init = np.sum(np.abs(weights)) * transaction_cost
    portfolio_vals[0] = 1.0 - cost_init

    for t in range(T - 1):
        ret_t = returns.iloc[t].values
        portfolio_vals[t + 1] = portfolio_vals[t] * (1.0 + np.dot(weights, ret_t))

    return portfolio_vals

def compute_performance_metrics(portfolio_vals, weight_history=None, annual_factor=250):
    """
    Calculates standard portfolio metrics (Sharpe, Sortino, etc.).
    """
    pv = np.asarray(portfolio_vals)
    if len(pv) < 2:
        # Not enough data points to compute real metrics
        return {
            "Annualized Return": np.nan,
            "Cumulative Return": np.nan,
            "Volatility": np.nan,
            "Downside Deviation": np.nan,
            "Max Drawdown": np.nan,
            "Sharpe Ratio": np.nan,
            "Sortino Ratio": np.nan,
            "Calmar Ratio": np.nan,
            "Turnover Rate": np.nan,
        }

    rets = np.diff(pv) / pv[:-1]
    ann_ret = rets.mean() * annual_factor
    cum_ret = pv[-1] / pv[0] - 1
    ann_vol = rets.std() * np.sqrt(annual_factor)

    negative_rets = rets[rets < 0]
    ddev = (negative_rets.std() * np.sqrt(annual_factor)) if len(negative_rets) > 0 else 0.0

    max_dd = (pv / np.maximum.accumulate(pv) - 1).min()
    sharpe = ann_ret / (ann_vol + 1e-12)
    sortino = ann_ret / ddev if ddev > 1e-12 else np.nan
    calmar  = ann_ret / abs(max_dd) if max_dd < 0 else np.nan

    # Average turnover
    if weight_history is not None and len(weight_history) > 1:
        turnovers = []
        for t in range(1, len(weight_history)):
            turnovers.append(np.sum(np.abs(weight_history[t] - weight_history[t-1])))
        avg_turnover = np.mean(turnovers)
    else:
        avg_turnover = 0.0

    return {
        "Annualized Return": ann_ret,
        "Cumulative Return": cum_ret,
        "Volatility": ann_vol,
        "Downside Deviation": ddev,
        "Max Drawdown": max_dd,
        "Sharpe Ratio": sharpe,
        "Sortino Ratio": sortino,
        "Calmar Ratio": calmar,
        "Turnover Rate": avg_turnover,
    }

def equal_weight_allocation(n_assets):
    """ Returns an equal-weight allocation vector. """
    return np.ones(n_assets) / n_assets


# --- 5. Allocation simulation using only EW and SJM-BL ---
def run_allocation_simulation(
    df,
    lam_sjm=50,
    risk_free_rate=0.02/252,
    transaction_cost=0.0007,
    bl_tau=0.05
):
    """
    Splits df 80/20, trains SJM, predicts states, then runs two strategies:
    1) Equal-weight
    2) SJM-BL
    Returns performance dict and fallback events.
    """
    # 1) Check the columns
    missing_cols = [col for col in ASSETS if col not in df.columns]
    if missing_cols:
        raise ValueError(f"DataFrame is missing required columns: {missing_cols}")

    # 2) Split check
    split_idx = int(len(df) * 0.8)
    if split_idx < 1 or (len(df) - split_idx) < 1:
        raise ValueError("Not enough data in df for an 80/20 train-test split.")

    df_train = df.iloc[:split_idx]
    df_test  = df.iloc[split_idx:]
    T_test = len(df_test)

    # Train SJM for each asset
    sjm_models = []
    sjm_clippers = []
    sjm_scalers = []
    sjm_states_train = np.zeros((split_idx, N_ASSETS), dtype=int)
    for i, asset in enumerate(ASSETS):
        series_train = df_train[asset]
        sjm_mod, sjm_clip, sjm_scale = train_sjm_single_asset(
            series_train,
            n_components=2,
            max_feats=4,
            lam=lam_sjm
        )
        feats_train = compute_sjm_features(series_train)
        feats_train = feats_train.replace([np.inf, -np.inf], np.nan).fillna(0.0)

        X_train_clip = sjm_clip.transform(feats_train)
        X_train_scl  = sjm_scale.transform(X_train_clip)
        st_sjm = sjm_mod.predict(X_train_scl)

        sjm_models.append(sjm_mod)
        sjm_clippers.append(sjm_clip)
        sjm_scalers.append(sjm_scale)
        sjm_states_train[:, i] = st_sjm

    # Compute regime means from training data
    sjm_regime_means = []
    for i in range(N_ASSETS):
        asset_train = df_train.iloc[:, i]
        m_sjm, _ = get_regime_means_stds_single_asset(asset_train, sjm_states_train[:, i])
        sjm_regime_means.append(m_sjm)

    # Predict SJM states for the test period (using an expanding window approach)
    sjm_states_test = np.zeros((T_test, N_ASSETS), dtype=int)
    full_series = pd.concat([df_train, df_test], axis=0).reset_index(drop=True)
    for i, asset in enumerate(ASSETS):
        asset_full = full_series[asset]
        feats_full = compute_sjm_features(asset_full)
        feats_full = feats_full.replace([np.inf, -np.inf], np.nan).fillna(0.0)

        X_full_clip = sjm_clippers[i].transform(feats_full)
        X_full_scl  = sjm_scalers[i].transform(X_full_clip)

        for t in range(T_test):
            end_idx = split_idx + t + 1  # end index in the full data
            partial_X = X_full_scl[:end_idx]
            partial_states = sjm_models[i].predict(partial_X)
            # Use the last predicted state for time t without negative indexing
            last_index = len(partial_states) - 1
            sjm_states_test[t, i] = partial_states[last_index]

    # 1) Equal Weight strategy
    w_ew = equal_weight_allocation(N_ASSETS)
    pv_ew = backtest_portfolio(df_test, w_ew, transaction_cost=transaction_cost)
    w_hist_ew = np.tile(w_ew, (T_test, 1))

    # 2) SJM-BL strategy
    pv_sjmbl, w_sjmbl, fallback_sjmbl = regime_based_bl_backtest_flatprior(
        df_train,
        df_test,
        sjm_states_test,
        sjm_states_train[split_idx - 1, :],  # use the last training row explicitly
        sjm_regime_means,
        transaction_cost=transaction_cost,
        risk_free_rate=risk_free_rate,
        bl_tau=bl_tau,
        halflife=126
    )

    perf = {
        "EW": compute_performance_metrics(pv_ew, w_hist_ew),
        "SJM-BL": compute_performance_metrics(pv_sjmbl, w_sjmbl)
    }
    fallback_events = {"SJM-BL": fallback_sjmbl}
    return perf, fallback_events



# --- 6. Monte Carlo simulation for a given lambda ---
def single_simulation_run(run_id, T_sim=1000, lam_sjm=50, risk_free_rate=0.02/252,
                          transaction_cost=0.0007, bl_tau=0.05):
    """
    Creates a 2-state dataset, runs the two strategies (EW & SJM-BL),
    returns the performance results.
    """
    seed = run_id * 1000 + 100  # reproducibility
    df_sim, _ = simulate_2state_data(T_sim, seed=seed)
    return run_allocation_simulation(df_sim, lam_sjm, risk_free_rate, transaction_cost, bl_tau)

def run_monte_carlo_for_lambda(n_runs=8, T_sim=1000, lam_sjm=50, risk_free_rate=0.02/252,
                               transaction_cost=0.0007, bl_tau=0.05):
    """
    Runs multiple simulations in parallel for a given lambda,
    aggregates results.
    """
    # Use up to 6 cores
    results = Parallel(n_jobs=6)(
        delayed(single_simulation_run)(
            i+1, T_sim, lam_sjm, risk_free_rate, transaction_cost, bl_tau
        )
        for i in range(n_runs)
    )

    strategies = ["EW", "SJM-BL"]
    metric_keys = [
        "Annualized Return", "Cumulative Return", "Volatility",
        "Downside Deviation", "Max Drawdown", "Sharpe Ratio",
        "Sortino Ratio", "Calmar Ratio", "Turnover Rate"
    ]
    all_metrics = {
        st: {m: [] for m in metric_keys} for st in strategies
    }

    # Collect performance metrics
    for perf, _ in results:
        for st in strategies:
            for mkey in metric_keys:
                all_metrics[st][mkey].append(perf[st][mkey])

    return all_metrics, results

if __name__ == "__main__":
    lambda_grid = [10, 25, 50, 75, 100, 125]  # Grid of lambda values
    T_sim = 5000
    n_simulations = 8

    summary_results = {}
    for lam in lambda_grid:
        print(f"\nRunning simulations for lambda = {lam}...")
        metrics, _ = run_monte_carlo_for_lambda(
            n_runs=n_simulations,
            T_sim=T_sim,
            lam_sjm=lam
        )
        summary_results[lam] = metrics

        # Print summary stats
        for strategy, mdict in metrics.items():
            print(f"\nStrategy: {strategy} (lambda = {lam})")
            for mkey, values in mdict.items():
                mean_val = np.nanmean(values)
                std_val  = np.nanstd(values)
                print(f"{mkey}: Mean = {mean_val:.4f}, Std = {std_val:.4f}")



Running simulations for lambda = 10...

Strategy: EW (lambda = 10)
Annualized Return: Mean = 0.1228, Std = 0.0338
Cumulative Return: Mean = 0.6276, Std = 0.2258
Volatility: Mean = 0.0792, Std = 0.0048
Downside Deviation: Mean = 0.0470, Std = 0.0034
Max Drawdown: Mean = -0.0946, Std = 0.0306
Sharpe Ratio: Mean = 1.5541, Std = 0.4304
Sortino Ratio: Mean = 2.6308, Std = 0.7540
Calmar Ratio: Mean = 1.5287, Std = 0.7944
Turnover Rate: Mean = 0.0000, Std = 0.0000

Strategy: SJM-BL (lambda = 10)
Annualized Return: Mean = 0.1349, Std = 0.0333
Cumulative Return: Mean = 0.7069, Std = 0.2261
Volatility: Mean = 0.0808, Std = 0.0041
Downside Deviation: Mean = 0.0484, Std = 0.0032
Max Drawdown: Mean = -0.0908, Std = 0.0301
Sharpe Ratio: Mean = 1.6643, Std = 0.3753
Sortino Ratio: Mean = 2.7832, Std = 0.6583
Calmar Ratio: Mean = 1.6924, Std = 0.7198
Turnover Rate: Mean = 0.0041, Std = 0.0008

Running simulations for lambda = 25...

Strategy: EW (lambda = 25)
Annualized Return: Mean = 0.1228, Std = 0.