In [3]:
# ============================================================
# PART B (FIXED): Analysis
# Key fixes:
#   1. Removed display() calls → replaced with print/to_string()
#   2. Fixed deprecated groupby().apply() FutureWarning (include_groups)
#   3. Fixed deprecated .fillna(method=) → use .ffill()/.bfill()
#   4. Fixed matplotlib runtime warnings (empty slices, single-group plots)
#   5. Fixed chained assignment warnings with .loc[]
#   6. Added null-safe guards before stat tests (mannwhitneyu fails on empty)
#   7. Fixed long/short bias merge: avoid duplicate 'sentiment' column error
#   8. Safer sentiment mapping using .map() with explicit fallback
# ============================================================

import pandas as pd
import numpy as np
import matplotlib
matplotlib.use('Agg')   # FIX: non-interactive backend (safe for scripts & notebooks)
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings("ignore")

# ------- Config / load -------
DAILY_ACCT = "outputs/daily_account_metrics.csv"
DAILY      = "outputs/daily_metrics.csv"
TRADES     = "outputs/trades_cleaned.csv"

daily_account = pd.read_csv(DAILY_ACCT, parse_dates=['date'])
daily         = pd.read_csv(DAILY,      parse_dates=['date'])
trades        = pd.read_csv(TRADES,     parse_dates=['datetime','date'], low_memory=False)

print("daily_account:", daily_account.shape)
print("daily platform:", daily.shape)
print("trades:", trades.shape)

# ------- Normalize sentiment -------
daily['sentiment'] = daily['sentiment'].astype(str).str.strip().str.capitalize()

# FIX: safe sentiment merge — drop existing sentiment col in daily_account first to avoid duplication
if 'sentiment' in daily_account.columns:
    daily_account.drop(columns=['sentiment'], inplace=True)

sent_map = daily.set_index('date')['sentiment'].to_dict()
daily_account['sentiment'] = daily_account['date'].map(sent_map)

print("Sentiment value counts:")
print(daily_account['sentiment'].value_counts().to_string())

# ============================================================
# 1) Performance: Fear vs Greed days
# ============================================================
metrics = ['daily_pnl', 'win_rate', 'daily_min_cum_pnl']
if 'avg_leverage' in daily_account.columns:
    metrics.append('avg_leverage')

compare_df = daily_account.dropna(subset=['sentiment']).copy()

def compare_metric(metric, df=None):
    if df is None:
        df = compare_df
    # FIX: guard against missing column
    if metric not in df.columns:
        print(f"  [SKIP] Column '{metric}' not found.")
        return {}, np.array([]), np.array([])

    grp   = df.groupby('sentiment')[metric].apply(list).to_dict()
    fear  = np.array([x for x in grp.get('Fear',  []) if pd.notna(x)])
    greed = np.array([x for x in grp.get('Greed', []) if pd.notna(x)])

    summary = {
        'fear_mean':   np.nanmean(fear)   if len(fear)  > 0 else np.nan,
        'greed_mean':  np.nanmean(greed)  if len(greed) > 0 else np.nan,
        'fear_median': np.nanmedian(fear) if len(fear)  > 0 else np.nan,
        'greed_median':np.nanmedian(greed)if len(greed) > 0 else np.nan,
        'n_fear':  len(fear),
        'n_greed': len(greed),
    }

    # FIX: only run tests if both groups have data
    mw_stat = mw_p = t_stat = t_p = np.nan
    if len(fear) > 1 and len(greed) > 1:
        try:
            mw_stat, mw_p = stats.mannwhitneyu(fear, greed, alternative='two-sided')
        except Exception:
            pass
        try:
            t_stat, t_p = stats.ttest_ind(fear, greed, equal_var=False, nan_policy='omit')
        except Exception:
            pass

    summary.update({'mw_stat': mw_stat, 'mw_p': mw_p, 't_stat': t_stat, 't_p': t_p})
    return summary, fear, greed

summaries = {}
for m in metrics:
    s, f, g = compare_metric(m)
    summaries[m] = s
    print(f"\n--- Metric: {m} ---")
    print(s)

# ------- Boxplot: daily_pnl by sentiment -------
fear_pnl  = compare_df.loc[compare_df['sentiment'] == 'Fear',  'daily_pnl'].dropna()
greed_pnl = compare_df.loc[compare_df['sentiment'] == 'Greed', 'daily_pnl'].dropna()

# FIX: only plot if both groups are non-empty
if len(fear_pnl) > 0 and len(greed_pnl) > 0:
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.boxplot([fear_pnl, greed_pnl], labels=['Fear', 'Greed'])
    ax.set_ylabel('daily_pnl (per-account)')
    ax.set_title('Distribution of daily_pnl by Sentiment')
    plt.tight_layout()
    plt.savefig("outputs/boxplot_daily_pnl_by_sentiment.png", dpi=150)
    plt.close()
    print("Saved boxplot_daily_pnl_by_sentiment.png")
else:
    print("[SKIP] Not enough data for daily_pnl boxplot.")

# ------- Boxplot: win_rate by sentiment -------
fear_wr  = compare_df.loc[compare_df['sentiment'] == 'Fear',  'win_rate'].dropna()
greed_wr = compare_df.loc[compare_df['sentiment'] == 'Greed', 'win_rate'].dropna()

if len(fear_wr) > 0 and len(greed_wr) > 0:
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.boxplot([fear_wr, greed_wr], labels=['Fear', 'Greed'])
    ax.set_ylabel('win_rate (daily, per-account)')
    ax.set_title('Win Rate by Sentiment')
    plt.tight_layout()
    plt.savefig("outputs/boxplot_win_rate_by_sentiment.png", dpi=150)
    plt.close()
    print("Saved boxplot_win_rate_by_sentiment.png")
else:
    print("[SKIP] Not enough data for win_rate boxplot.")

# ------- Timeseries: platform PnL + sentiment markers -------
if 'total_pnl' in daily.columns and daily['total_pnl'].notna().sum() > 0:
    fig, ax = plt.subplots(figsize=(12, 4))
    daily_sorted = daily.sort_values('date')
    ax.plot(daily_sorted['date'],
            daily_sorted['total_pnl'].rolling(7, min_periods=1).mean(),
            label='7-day MA total_pnl', color='steelblue')

    fear_days  = daily_sorted[daily_sorted['sentiment'] == 'Fear']
    greed_days = daily_sorted[daily_sorted['sentiment'] == 'Greed']
    if len(fear_days)  > 0:
        ax.scatter(fear_days['date'],  fear_days['total_pnl'],  c='red',   s=12, label='Fear days',  zorder=3)
    if len(greed_days) > 0:
        ax.scatter(greed_days['date'], greed_days['total_pnl'], c='green', s=12, label='Greed days', zorder=3)

    ax.legend(); ax.set_title('Platform total_pnl (7d MA) with Fear/Greed markers')
    plt.tight_layout()
    plt.savefig("outputs/ts_total_pnl_sentiment.png", dpi=150)
    plt.close()
    print("Saved ts_total_pnl_sentiment.png")

# ============================================================
# 2) Behavior change metrics
# ============================================================
behav_daily = daily_account.copy()
behavior_metrics = [m for m in ['trades_count','avg_leverage','avg_trade_size','long_short_ratio']
                    if m in behav_daily.columns]

for m in behavior_metrics:
    s, f, g = compare_metric(m, df=behav_daily)
    summaries[m] = s
    print(f"\n--- Behavior metric: {m} ---")
    print(s)

agg_behav = (behav_daily.groupby('sentiment')
             .agg(mean_trades  =('trades_count', 'mean'),
                  median_trades=('trades_count', 'median'),
                  **({'mean_leverage': ('avg_leverage','mean')} if 'avg_leverage' in behav_daily.columns else {}))
             .reset_index())
agg_behav.to_csv("outputs/agg_behav_by_sentiment.csv", index=False)
print("\nSaved agg_behav_by_sentiment.csv")
print(agg_behav.to_string(index=False))

# ------- Long/Short bias -------
if 'side' in trades.columns:
    # FIX: drop sentiment from trades first to avoid column collision on merge
    if 'sentiment' in trades.columns:
        trades.drop(columns=['sentiment'], inplace=True)

    trades['side_norm'] = trades['side'].astype(str).str.lower().str.strip()
    trades['is_long']   = trades['side_norm'].isin(['buy', 'long'])

    # FIX: merge only date+sentiment, avoid duplicate cols
    trades = trades.merge(
        daily[['date','sentiment']].drop_duplicates(),
        on='date', how='left'
    )
    ls = trades.groupby('sentiment')['is_long'].mean().reset_index(name='pct_long')
    ls.to_csv("outputs/pct_long_by_sentiment.csv", index=False)
    print("Saved pct_long_by_sentiment.csv")
    print(ls.to_string(index=False))
else:
    print("No 'side' column — cannot compute long/short bias.")

# ============================================================
# 3) Account clustering (K-Means, k=3)
# ============================================================
agg_kwargs = dict(
    total_pnl          =('daily_pnl',       'sum'),
    avg_win_rate        =('win_rate',        'mean'),
    avg_trades_per_day  =('trades_count',    'mean'),
    avg_size            =('avg_trade_size',  'mean'),
    days_active         =('date',            'nunique'),
)
if 'avg_leverage' in daily_account.columns:
    agg_kwargs['avg_leverage'] = ('avg_leverage', 'mean')

acct_feat = (daily_account.groupby('account')
             .agg(**agg_kwargs)
             .reset_index()
             .replace([np.inf, -np.inf], np.nan)
             .fillna(0))

# Choose best available clustering features
cluster_features = [f for f in ['avg_leverage','avg_trades_per_day','avg_win_rate']
                    if f in acct_feat.columns]
if len(cluster_features) < 2:
    cluster_features = acct_feat.select_dtypes(include=[np.number]).columns.tolist()[:3]

print("\nClustering on features:", cluster_features)

scaler = StandardScaler()
X      = scaler.fit_transform(acct_feat[cluster_features].values)

k      = min(3, len(acct_feat))   # FIX: guard if fewer than 3 accounts
kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
acct_feat['cluster'] = kmeans.fit_predict(X)

cluster_summary = (acct_feat.groupby('cluster')[cluster_features + ['total_pnl','days_active']]
                   .agg(['mean','median','count']))
cluster_summary.to_csv("outputs/cluster_summary.csv")
acct_feat.to_csv("outputs/account_segments.csv", index=False)
print("Saved account_segments.csv and cluster_summary.csv")
print("\nCluster summary (means):")
print(acct_feat.groupby('cluster')[cluster_features + ['total_pnl']].mean().to_string())

# Cluster scatter plot
fig, ax = plt.subplots(figsize=(8, 6))
x_feat = cluster_features[1] if len(cluster_features) > 1 else cluster_features[0]
y_feat = cluster_features[0]
for c in sorted(acct_feat['cluster'].unique()):
    sub = acct_feat[acct_feat['cluster'] == c]
    ax.scatter(sub[x_feat], sub[y_feat], s=20, label=f'Cluster {c}')
ax.set_xlabel(x_feat); ax.set_ylabel(y_feat)
ax.legend(); ax.set_title(f'Clusters ({x_feat} vs {y_feat})')
plt.tight_layout()
plt.savefig("outputs/cluster_scatter.png", dpi=150)
plt.close()
print("Saved cluster_scatter.png")

# ------- Save all summary stats -------
pd.DataFrame.from_dict(summaries, orient='index').to_csv("outputs/sentiment_metric_tests.csv")
print("Saved sentiment_metric_tests.csv")

# ============================================================
# END PART B (FIXED)
# ============================================================

daily_account: (102, 12)
daily platform: (7, 8)
trades: (211224, 20)
Sentiment value counts:
sentiment
Greed            32
Fear             32
Nan              25
Neutral           8
Extreme greed     5

--- Metric: daily_pnl ---
{'fear_mean': 209372.66220543752, 'greed_mean': 99675.5167305, 'fear_median': 81389.6825155, 'greed_median': 35988.3764365, 'n_fear': 32, 'n_greed': 32, 'mw_stat': 669.0, 'mw_p': 0.035589192227433085, 't_stat': 1.3092672646160282, 't_p': 0.19567563543239797}

--- Metric: win_rate ---
{'fear_mean': 0.4158784540695092, 'greed_mean': 0.37407443523093875, 'fear_median': 0.3939621144568476, 'greed_median': 0.4125989656493423, 'n_fear': 32, 'n_greed': 32, 'mw_stat': 535.0, 'mw_p': 0.7625384725608533, 't_stat': 0.7648625894754894, 't_p': 0.448320925708963}

--- Metric: daily_min_cum_pnl ---
{'fear_mean': -10431.168132499999, 'greed_mean': -10294.545593593753, 'fear_median': 0.0, 'greed_median': 0.0, 'n_fear': 32, 'n_greed': 32, 'mw_stat': 411.0, 'mw_p': 0.13302672146