In [1]:
# ============================================================
# PART C (FIXED v2): Strategy Simulation + Predictive Model
# Fixes over v1:
#   1. Train/test split by ROW INDEX not by date → more training data
#   2. Separated high_leverage and consistent_winners clusters
#   3. Added 14-day rolling features for better signal
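#   4. Added class balancing via class_weight='balanced' (sample re-weighting, not SMOTE oversampling)
#   5. Lowered prediction threshold to 0.45, so class 1 is predicted more often (this raises class 1 recall, not class 0 recall)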
#   6. Added cross-validation score for reliability check
# ============================================================

import os
import numpy as np
import pandas as pd
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (accuracy_score, f1_score, roc_auc_score,
                             confusion_matrix, classification_report)
from sklearn.model_selection import cross_val_score, StratifiedKFold
import warnings
warnings.filterwarnings("ignore")

# -------------------- Config --------------------
# Strategy simulation
LEVERAGE_CAP = 3.0          # rule 1: ceiling applied to leverage on matching trades
SIZE_INCREASE_PCT = 0.20    # rule 2: fractional size increase on matching trades
LEVERAGE_CLIP_PCT = 0.99    # quantile of observed leverage used as the upper clip
SCALE_MIN = 0.2             # lower clip bound of the per-trade PnL scale factor
SCALE_MAX = 5.0             # upper clip bound of the per-trade PnL scale factor

# Predictive model
RANDOM_STATE = 42           # seed passed to the classifier
PRED_THRESHOLD = 0.45       # a row is predicted class 1 when P(class 1) >= this value
# ------------------------------------------------

# Every artefact produced by this script is written under outputs/
os.makedirs("outputs", exist_ok=True)

def pick(*paths):
    """Return the first candidate in *paths* that exists on disk.

    Falsy candidates (``None``, ``""``) are skipped without touching the
    filesystem. Returns ``None`` when no candidate exists.
    """
    existing = (candidate for candidate in paths
                if candidate and os.path.exists(candidate))
    return next(existing, None)

# Resolve every required input to the first candidate path present on disk.
TRADES_FILE = pick("outputs/trades_cleaned.csv", "outputs/trades.csv")
DAILY_FILE = pick("outputs/daily_metrics.csv", "outputs/daily.csv")
DAILY_ACCT_FILE = pick("outputs/daily_account_metrics.csv")
ACCT_SEG_FILE = pick("outputs/account_segments.csv", "outputs/account_segments_fixed.csv")

# Label -> resolved path (None when no candidate exists); insertion order
# drives both the error message and the listing printed below.
resolved_inputs = {
    "TRADES": TRADES_FILE,
    "DAILY": DAILY_FILE,
    "DAILY_ACCT": DAILY_ACCT_FILE,
    "ACCT_SEG": ACCT_SEG_FILE,
}

# Fail fast, naming every input that could not be located.
missing = [label for label, path in resolved_inputs.items() if not path]
if missing:
    raise FileNotFoundError(f"Required output files missing: {missing}. Run Part A & B first.")

print("Files in use:")
for label, path in resolved_inputs.items():
    print(f"  {label}: {path}")

# -------------------- Load --------------------
# Read the four resolved CSVs, in the same order as they were resolved,
# each with low_memory=False.
trades, daily, daily_account, acct_seg = (
    pd.read_csv(csv_path, low_memory=False)
    for csv_path in (TRADES_FILE, DAILY_FILE, DAILY_ACCT_FILE, ACCT_SEG_FILE)
)

# -------------------- Normalize dates --------------------
def to_date_col(df, candidates):
    """Return a midnight-normalized datetime Series from the first usable column.

    Candidates are tried in order; names absent from ``df`` are skipped. A
    column is usable when at least one of its values parses as a datetime
    (unparsable values become NaT).

    Raises:
        ValueError: if no candidate column yields a single parsable value.
    """
    present = (name for name in candidates if name in df.columns)
    for name in present:
        stamps = pd.to_datetime(df[name], errors='coerce')
        if stamps.notna().any():
            return stamps.dt.normalize()
    raise ValueError("No parsable date column found among: " + str(candidates))

# Daily metrics may carry their date under 'date' or 'day'.
daily['date'] = to_date_col(daily, ['date', 'day'])
daily_account['date'] = to_date_col(daily_account, ['date'])

# Trades: prefer an existing 'date' column, otherwise derive it from 'datetime'.
trade_columns = trades.columns
if 'date' not in trade_columns and 'datetime' not in trade_columns:
    raise ValueError("trades has neither 'date' nor 'datetime' column.")
if 'date' in trade_columns:
    trades['date'] = to_date_col(trades, ['date'])
else:
    # Unparsable timestamps become NaT rather than raising
    trades['date'] = pd.to_datetime(trades['datetime'], errors='coerce').dt.normalize()

# -------------------- Validate --------------------
# Both frames are joined on 'account' further down, so it must exist in each.
for frame in (trades, acct_seg):
    if 'account' not in frame.columns:
        raise ValueError("Missing 'account' column in trades or account_segments.")

# -------------------- Numeric conversions --------------------
# Realized PnL: prefer 'closed_pnl', then 'pnl', else a column of zeros.
if 'closed_pnl' in trades.columns:
    raw_pnl = trades['closed_pnl']
elif 'pnl' in trades.columns:
    raw_pnl = trades['pnl']
else:
    raw_pnl = pd.Series(0, index=trades.index)
trades['closed_pnl'] = pd.to_numeric(raw_pnl, errors='coerce').fillna(0.0)


def _first_present(names):
    """Return the first name in *names* that is a column of trades, else None."""
    return next((name for name in names if name in trades.columns), None)


size_col = _first_present(['size_usd','size','size_tokens','abs_size','notional'])
lev_col = _first_present(['leverage','lev','margin','leverage_ratio'])

print("Detected size_col:", size_col, "| leverage_col:", lev_col)

# Raw leverage / size as numbers; all-NaN when no matching column was found.
if lev_col:
    trades['old_leverage'] = pd.to_numeric(trades[lev_col], errors='coerce')
else:
    trades['old_leverage'] = np.nan
if size_col:
    trades['old_size'] = pd.to_numeric(trades[size_col], errors='coerce')
else:
    trades['old_size'] = np.nan

# Zeros are treated as missing so they get the neutral fill value below.
trades.loc[trades['old_leverage'] == 0, 'old_leverage'] = np.nan
trades.loc[trades['old_size']     == 0, 'old_size']     = np.nan

# Clip leverage to [1, LEVERAGE_CLIP_PCT quantile]; default to 1.0 when no
# leverage value is available at all.
if trades['old_leverage'].notna().any():
    lev_cap_val = trades['old_leverage'].quantile(LEVERAGE_CLIP_PCT)
    trades['old_leverage_clipped'] = trades['old_leverage'].clip(lower=1.0, upper=lev_cap_val)
else:
    trades['old_leverage_clipped'] = 1.0

# Missing values fall back to 1.0 for both size and leverage.
trades['old_size_filled']     = trades['old_size'].fillna(1.0)
trades['old_leverage_filled'] = trades['old_leverage_clipped'].fillna(1.0)

# -------------------- Merge cluster --------------------
# Drop any stale 'cluster' column first; otherwise the merge would emit
# cluster_x / cluster_y and the lookup below would raise KeyError.
if 'cluster' in trades.columns:
    trades = trades.drop(columns=['cluster'])

# Exactly one row per account, so the left merge can never duplicate trades
# (an account listed under two clusters keeps its first listed cluster).
acct_seg_small = acct_seg[['account','cluster']].drop_duplicates(subset='account')
trades = trades.merge(acct_seg_small, on='account', how='left')
# Accounts without a segment are assigned the sentinel cluster -1
trades['cluster'] = trades['cluster'].fillna(-1).astype(int)

# -------------------- Identify clusters --------------------
def _first_other_cluster(ranked_clusters, excluded):
    """Return the first cluster id in *ranked_clusters* that differs from *excluded*.

    *ranked_clusters* is an iterable of cluster ids, best first. When no id
    differs from *excluded* (e.g. only one cluster exists), *excluded* itself
    is returned so the caller always gets a defined value.
    """
    return next((int(c) for c in ranked_clusters if int(c) != excluded), excluded)


feat_for_cluster = [f for f in ['avg_leverage','avg_win_rate','avg_trades_per_day']
                    if f in acct_seg.columns]
if not feat_for_cluster:
    # Fall back to the first two numeric columns, never the grouping key
    # itself (selecting 'cluster' here would break groupby + reset_index).
    numeric_cols = acct_seg.select_dtypes(include=[np.number]).columns
    feat_for_cluster = [c for c in numeric_cols if c != 'cluster'][:2]
if not feat_for_cluster:
    raise ValueError("account_segments has no numeric feature column to rank clusters by.")

cluster_means = acct_seg.groupby('cluster')[feat_for_cluster].mean().reset_index()
print("\nCluster means:")
print(cluster_means.to_string(index=False))

# High leverage cluster: highest mean of avg_leverage (or of the first
# available feature when avg_leverage is absent).
lev_sort_col = 'avg_leverage' if 'avg_leverage' in cluster_means.columns else feat_for_cluster[0]
sorted_by_lev = cluster_means.sort_values(lev_sort_col, ascending=False)
high_leverage_cluster = int(sorted_by_lev.iloc[0]['cluster'])

# Consistent winners must be a DIFFERENT cluster from high_leverage whenever
# one exists. Every branch goes through the same helper so the variable is
# always bound, including the single-cluster case.
if 'avg_win_rate' in cluster_means.columns:
    sorted_by_wr = cluster_means.sort_values('avg_win_rate', ascending=False)
    consistent_winners_cluster = _first_other_cluster(
        sorted_by_wr['cluster'], high_leverage_cluster)
elif 'total_pnl' in acct_seg.columns:
    pnl_means = (acct_seg.groupby('cluster')['total_pnl'].mean()
                 .reset_index().sort_values('total_pnl', ascending=False))
    consistent_winners_cluster = _first_other_cluster(
        pnl_means['cluster'], high_leverage_cluster)
else:
    # No ranking metric available: take the first cluster that is not high_leverage
    consistent_winners_cluster = _first_other_cluster(
        cluster_means['cluster'], high_leverage_cluster)

print(f"\nClusters → high_leverage: {high_leverage_cluster} | consistent_winners: {consistent_winners_cluster}")

# -------------------- Map sentiment --------------------
# Sentiment always comes from the daily file; discard any copy already on trades.
if 'sentiment' in trades.columns:
    trades.drop(columns=['sentiment'], inplace=True)

trades['date_norm'] = pd.to_datetime(trades['date']).dt.normalize()

# One sentiment per calendar date, in chronological order. When a date is
# listed more than once the last row wins (what the previous dict lookup did).
sent_by_date = (daily.dropna(subset=['date'])
                .drop_duplicates(subset='date', keep='last')
                .set_index('date')['sentiment']
                .sort_index())

# Fill dates that have trades but no sentiment from the nearest earlier day
# (then the nearest later day). Filling on the date axis, rather than on the
# trade rows, makes the result independent of the order of rows in trades.
trade_dates = pd.DatetimeIndex(trades['date_norm'].dropna().unique())
sent_filled = (sent_by_date
               .reindex(sent_by_date.index.union(trade_dates))
               .ffill()
               .bfill())

sent_map = sent_filled.to_dict()
# Trades whose date could not be parsed (NaT) keep a missing sentiment and
# therefore match neither strategy rule.
trades['sentiment'] = trades['date_norm'].map(sent_map)

# -------------------- Apply strategy rules --------------------
# Rule 1: trades of the high-leverage cluster on 'fear' days have their
# leverage capped at LEVERAGE_CAP; every other trade keeps its leverage.
is_fear = trades['sentiment'].str.lower() == 'fear'
mask_r1 = (trades['cluster'] == high_leverage_cluster) & is_fear
capped_leverage = trades['old_leverage_filled'].clip(upper=LEVERAGE_CAP)
trades['new_leverage'] = trades['old_leverage_filled'].where(~mask_r1, capped_leverage)

# Mean of the per-day win_rate for each account, attached to every trade.
acct_win = (daily_account.groupby('account')['win_rate']
            .mean()
            .reset_index()
            .rename(columns={'win_rate': 'acct_win_rate'}))
trades = trades.merge(acct_win, on='account', how='left')

# Rule 2: trades of the consistent-winners cluster on 'greed' days, for
# accounts whose mean win rate exceeds 0.5, are sized up by SIZE_INCREASE_PCT.
is_greed = trades['sentiment'].str.lower() == 'greed'
mask_r2 = ((trades['cluster'] == consistent_winners_cluster)
           & is_greed
           & (trades['acct_win_rate'] > 0.5))
boosted_size = trades['old_size_filled'] * (1 + SIZE_INCREASE_PCT)
trades['new_size'] = trades['old_size_filled'].where(~mask_r2, boosted_size)

# Any remaining gaps fall back to the original (filled) values.
trades['new_leverage'] = trades['new_leverage'].fillna(trades['old_leverage_filled'])
trades['new_size']     = trades['new_size'].fillna(trades['old_size_filled'])

# -------------------- Scaling + adjusted PnL --------------------
# Each trade's PnL is rescaled by the ratio of its adjusted leverage*size
# product to its original one; epsilon keeps the denominator away from zero.
epsilon = 1e-9
adjusted_product = trades['new_leverage'] * trades['new_size']
baseline_product = trades['old_leverage_filled'] * trades['old_size_filled'] + epsilon
trades['scale_raw'] = adjusted_product / baseline_product
# Bound the ratio to [SCALE_MIN, SCALE_MAX] before applying it
trades['scale'] = trades['scale_raw'].clip(lower=SCALE_MIN, upper=SCALE_MAX)
trades['closed_pnl_adj'] = trades['closed_pnl'] * trades['scale']

# -------------------- Aggregate results --------------------
pnl_cols = ['closed_pnl', 'closed_pnl_adj']

# Per-day totals of original and adjusted PnL
daily_totals = trades.groupby('date_norm')[pnl_cols].sum()
orig_daily = daily_totals['closed_pnl'].reset_index(name='orig_total_pnl')
adj_daily  = daily_totals['closed_pnl_adj'].reset_index(name='adj_total_pnl')
compare_daily = orig_daily.merge(adj_daily, on='date_norm', how='outer')
compare_daily = compare_daily.fillna(0).sort_values('date_norm').reset_index(drop=True)
compare_daily['delta'] = compare_daily['adj_total_pnl'] - compare_daily['orig_total_pnl']

# Grand totals over all days
tot_orig  = compare_daily['orig_total_pnl'].sum()
tot_adj   = compare_daily['adj_total_pnl'].sum()
delta_tot = tot_adj - tot_orig

# Per-account totals, sorted so the most negatively affected accounts come first
account_totals = trades.groupby('account')[pnl_cols].sum()
orig_acct = account_totals['closed_pnl'].reset_index(name='orig_pnl')
adj_acct  = account_totals['closed_pnl_adj'].reset_index(name='adj_pnl')
acct_compare = orig_acct.merge(adj_acct, on='account', how='outer').fillna(0)
acct_compare = (acct_compare
                .assign(delta=lambda d: d['adj_pnl'] - d['orig_pnl'])
                .sort_values('delta'))

# Standard deviation of the daily PnL totals, before and after adjustment
orig_vol = daily_totals['closed_pnl'].std()
adj_vol  = daily_totals['closed_pnl_adj'].std()

# Persist the daily comparison, the full simulated trade table and the
# per-account comparison.
compare_daily.to_csv("outputs/strategy_simulation_daily_impact_partc_final.csv", index=False)
trades.to_csv("outputs/trades_sim_full_partc_final.csv", index=False)
acct_compare.to_csv("outputs/account_impact_partc_final.csv", index=False)

# Console report of the simulation totals
print("\nSIMULATION SUMMARY")
print("------------------")
print(f"Total orig pnl : {tot_orig:,.2f}")
print(f"Total adj  pnl : {tot_adj:,.2f}")
print(f"Delta          : {delta_tot:,.2f}")
print(f"Orig daily vol : {orig_vol:,.2f}")
print(f"Adj  daily vol : {adj_vol:,.2f}")

# acct_compare is sorted ascending by delta: head = largest losses, tail = largest gains
most_hurt = acct_compare.head(5)
most_helped = acct_compare.tail(5)
print("\nTop 5 accounts HURT:")
print(most_hurt.to_string(index=False))
print("\nTop 5 accounts HELPED:")
print(most_helped.to_string(index=False))

# Cumulative PnL plot
cum_original = compare_daily['orig_total_pnl'].cumsum()
cum_adjusted = compare_daily['adj_total_pnl'].cumsum()
fig, ax = plt.subplots(figsize=(12, 4))
for curve, curve_label in ((cum_original, 'Original'), (cum_adjusted, 'Adjusted')):
    ax.plot(compare_daily['date_norm'], curve, label=curve_label)
ax.set_title('Cumulative PnL: Original vs Strategy-Adjusted')
ax.legend()
plt.tight_layout()
plt.savefig("outputs/simulation_cumpnl_partc_final.png", dpi=150)
plt.close()

# ============================================================
# Predictive Model — FIXED
# ============================================================
# One row per (account, day), ordered so that shift/rolling operate
# chronologically within each account.
df = daily_account.copy()
df['date'] = pd.to_datetime(df['date']).dt.normalize()
df = df.sort_values(['account','date']).reset_index(drop=True)

# Target: is the account's NEXT recorded day profitable?
df['daily_pnl'] = pd.to_numeric(df['daily_pnl'], errors='coerce')
df['next_daily_pnl'] = df.groupby('account')['daily_pnl'].shift(-1)
# The last row of every account has no next day. Its label must stay missing
# so the dropna below removes it; casting the comparison straight to int
# would silently label those rows 0 ("not profitable").
has_next_day = df['next_daily_pnl'].notna()
df['target_next_profitable'] = (df['next_daily_pnl'] > 0).astype(float).where(has_next_day)

# Sentiment always comes from the daily file. One row per date keeps the
# merge 1:1 so account-days are never duplicated (last row wins on duplicates).
if 'sentiment' in df.columns:
    df.drop(columns=['sentiment'], inplace=True)
daily_sentiment = daily[['date','sentiment']].drop_duplicates(subset='date', keep='last')
df = df.merge(daily_sentiment, on='date', how='left')
# Case-insensitive, like the simulation rules; any other label (or a missing
# one) is encoded as 0.
df['sentiment_num'] = (df['sentiment'].astype(str).str.lower()
                       .map({'fear': -1, 'greed': 1}).fillna(0))

# Base metrics as numbers; a column that is absent altogether becomes all zeros.
for col in ['trades_count','avg_leverage','win_rate','avg_trade_size']:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)
    else:
        df[col] = 0.0

# Rolling means per account over the last `window` ROWS (recorded days, not
# calendar days), current row included. The window is stated explicitly
# instead of being inferred from the feature name.
rolling_specs = [
    ('rol_trades_7',    'trades_count',   7),
    ('rol_leverage_7',  'avg_leverage',   7),
    ('rol_winrate_7',   'win_rate',       7),
    ('rol_size_7',      'avg_trade_size', 7),
    ('rol_trades_14',   'trades_count',   14),
    ('rol_leverage_14', 'avg_leverage',   14),
    ('rol_winrate_14',  'win_rate',       14),
    ('rol_pnl_7',       'daily_pnl',      7),
]
for feat, src, window in rolling_specs:
    df[feat] = df.groupby('account')[src].transform(
        lambda s, w=window: s.rolling(w, min_periods=1).mean())

feat_cols = ['sentiment_num','rol_trades_7','rol_leverage_7','rol_winrate_7',
             'rol_size_7','rol_trades_14','rol_leverage_14','rol_winrate_14','rol_pnl_7']

# Keep only rows with every feature and a real label, then restore the int dtype
df_model = df.dropna(subset=feat_cols + ['target_next_profitable']).copy().reset_index(drop=True)
df_model['target_next_profitable'] = df_model['target_next_profitable'].astype(int)
print(f"\nModel dataset size: {df_model.shape[0]} rows")
print("Class distribution:", df_model['target_next_profitable'].value_counts().to_dict())

# Train, evaluate and report a random-forest classifier on df_model.
# Skipped entirely when fewer than 20 rows are available.
if df_model.shape[0] < 20:
    print("WARNING: dataset too small. Skipping model.")
else:
    # FIX: split by ROW INDEX (80/20) not by date → guarantees enough training rows
    # The label-based .loc slices rely on df_model having a fresh 0..n-1 index.
    # NOTE(review): df_model is ordered by (account, date), so the last 20% of
    # rows is the tail of that ordering, not the most recent dates — confirm
    # this is the intended hold-out.
    split_idx  = int(len(df_model) * 0.80)
    X_train    = df_model.loc[:split_idx-1, feat_cols].values
    y_train    = df_model.loc[:split_idx-1, 'target_next_profitable'].values
    X_test     = df_model.loc[split_idx:,   feat_cols].values
    y_test     = df_model.loc[split_idx:,   'target_next_profitable'].values

    print(f"Train rows: {len(X_train)} | Test rows: {len(X_test)}")

    if len(X_train) == 0 or len(X_test) == 0:
        print("WARNING: split produced empty set. Skipping model.")
    else:
        # Scaler is fitted on the training rows only, then applied to the test rows;
        # NaN/inf values are replaced by np.nan_to_num first.
        scaler    = StandardScaler()
        X_train_s = scaler.fit_transform(np.nan_to_num(X_train))
        X_test_s  = scaler.transform(np.nan_to_num(X_test))

        clf = RandomForestClassifier(
            n_estimators=300,
            max_depth=6,
            min_samples_leaf=3,
            random_state=RANDOM_STATE,
            class_weight='balanced',
            n_jobs=-1)
        clf.fit(X_train_s, y_train)

        # Threshold PRED_THRESHOLD (0.45) instead of 0.5: a row is labelled 1 as
        # soon as P(class 1) >= 0.45, so class 1 is predicted MORE often than
        # with 0.5.
        # NOTE(review): this cannot improve class 0 recall, which is what the
        # original comment claimed — confirm the intended direction.
        y_proba = clf.predict_proba(X_test_s)[:, 1]
        y_pred  = (y_proba >= PRED_THRESHOLD).astype(int)

        acc = accuracy_score(y_test, y_pred)
        f1  = f1_score(y_test, y_pred, zero_division=0)
        # AUC is only computed when both classes occur in the test labels
        auc = roc_auc_score(y_test, y_proba) if len(np.unique(y_test)) > 1 else float('nan')
        cm  = confusion_matrix(y_test, y_pred)

        # Cross-validation on full dataset
        # Number of folds is min(5, rows // 10), unshuffled.
        # NOTE(review): scaler.fit_transform refits `scaler` on ALL rows here, so
        # from this point on it no longer holds the train-only statistics, and the
        # CV folds share scaling statistics with their validation rows.
        cv = StratifiedKFold(n_splits=min(5, len(df_model)//10 or 2), shuffle=False)
        cv_scores = cross_val_score(clf, scaler.fit_transform(np.nan_to_num(df_model[feat_cols].values)),
                                    df_model['target_next_profitable'].values,
                                    cv=cv, scoring='roc_auc')

        print("\nPREDICTIVE MODEL SUMMARY")
        print("------------------------")
        print(f"Train rows : {len(X_train)} | Test rows: {len(X_test)}")
        print(f"Accuracy   : {acc:.4f}")
        print(f"F1 Score   : {f1:.4f}")
        print(f"AUC        : {auc:.4f}" if not np.isnan(auc) else "AUC: n/a")
        print(f"CV AUC     : {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")
        print("Confusion matrix:\n", cm)
        print("\nClassification report:\n",
              classification_report(y_test, y_pred, zero_division=0))

        # Feature importances come from the classifier fitted on the training split
        feat_imp = (pd.DataFrame({'feature': feat_cols, 'importance': clf.feature_importances_})
                    .sort_values('importance', ascending=False))
        feat_imp.to_csv("outputs/model_feature_importances_partc_final.csv", index=False)
        # Persist the exact modelling table (identifiers, features, label)
        df_model[['account','date'] + feat_cols + ['target_next_profitable']].to_csv(
            "outputs/model_dataset_partc_final.csv", index=False)

        # Plain-text copy of the metrics printed above (AUC is written unformatted,
        # so it appears as "nan" when it could not be computed)
        with open("outputs/model_report_partc_final.txt", "w") as fh:
            fh.write(f"Accuracy : {acc:.4f}\nF1: {f1:.4f}\nAUC: {auc}\n")
            fh.write(f"CV AUC   : {cv_scores.mean():.4f} +/- {cv_scores.std():.4f}\n")
            fh.write("Confusion matrix:\n" + np.array2string(cm))
            fh.write("\n\nClassification report:\n")
            fh.write(classification_report(y_test, y_pred, zero_division=0))

        # Horizontal bar chart, most important feature at the top
        fig, ax = plt.subplots(figsize=(8, 5))
        ax.barh(feat_imp['feature'], feat_imp['importance'], color='steelblue')
        ax.set_xlabel('Importance'); ax.set_title('Feature Importances')
        ax.invert_yaxis(); plt.tight_layout()
        plt.savefig("outputs/model_feature_importances_partc_final.png", dpi=150)
        plt.close()
        print("Saved model_feature_importances_partc_final.png")

# -------------------- Summary --------------------
# Single-row table with the headline simulation numbers and the parameters
# that produced them.
summary_row = {
    'total_orig_pnl': tot_orig,
    'total_adj_pnl': tot_adj,
    'delta_total': delta_tot,
    'orig_daily_vol': orig_vol,
    'adj_daily_vol': adj_vol,
    'high_leverage_cluster': high_leverage_cluster,
    'consistent_winners_cluster': consistent_winners_cluster,
    'leverage_cap_used': LEVERAGE_CAP,
    'size_increase_pct': SIZE_INCREASE_PCT,
}
summary_frame = pd.DataFrame({key: [value] for key, value in summary_row.items()})
summary_frame.to_csv("outputs/strategy_simulation_summary_partc_final.csv", index=False)

print("\nAll outputs saved to outputs/")


Files in use:
  TRADES: outputs/trades_cleaned.csv
  DAILY: outputs/daily_metrics.csv
  DAILY_ACCT: outputs/daily_account_metrics.csv
  ACCT_SEG: outputs/account_segments.csv
Detected size_col: size_usd | leverage_col: leverage

Cluster means:
 cluster   avg_leverage  avg_win_rate  avg_trades_per_day
       0   24145.189812      0.383982         1532.886139
       1   28664.002767      0.586406        11995.333333
       2 -445764.113390      0.276284         2641.583333

Clusters → high_leverage: 1 | consistent_winners: 0

SIMULATION SUMMARY
------------------
Total orig pnl : 10,296,958.94
Total adj  pnl : 9,496,464.51
Delta          : -800,494.44
Orig daily vol : 2,581,231.98
Adj  daily vol : 2,313,607.15

Top 5 accounts HURT:
                                   account      orig_pnl       adj_pnl         delta
0xbaaaf6571ab7d571043ff1e313a9609a10637864  9.401638e+05  1.880380e+05 -7.521258e+05
0xbee1707d6b44d4d52bfe19e41f8a828645437aab  8.360806e+05  7.697635e+05 -6.631705e+04
0xb12