In [7]:
# ============================================================
# PART A (FIXED): Data Preparation


import os
import re
import pandas as pd
import numpy as np

SENTIMENT_PATH = "fear_greed_index.csv"
TRADER_PATH    = "historical_data.csv"
os.makedirs("outputs", exist_ok=True)

#  Load files 
if not os.path.exists(SENTIMENT_PATH):
    raise FileNotFoundError(f"Sentiment file not found at: {SENTIMENT_PATH}")
if not os.path.exists(TRADER_PATH):
    raise FileNotFoundError(f"Trades file not found at: {TRADER_PATH}")

sent   = pd.read_csv(SENTIMENT_PATH)
trades = pd.read_csv(TRADER_PATH, low_memory=False)

print("Loaded: sentiment rows =", len(sent), ", trades rows =", len(trades))

# Document shapes
def doc_df(df, name, n_show=3):
    print(f"\n-- {name} --")
    print("shape:", df.shape)
    print("columns:", df.columns.tolist())
    print("missing (top 10):")
    print(df.isnull().sum().sort_values(ascending=False).head(10).to_string())
    print("duplicate rows:", df.duplicated().sum())
    print(df.head(n_show).to_string())   # FIX: replaced display() with print()

doc_df(sent,   "Sentiment (raw)")
doc_df(trades, "Trades (raw)")

#  Normalize column names 
sent.columns   = sent.columns.str.strip().str.lower().str.replace(r'\s+', '_', regex=True)
trades.columns = (trades.columns
                  .str.strip()
                  .str.replace('\xa0', ' ', regex=False)
                  .str.strip()
                  .str.replace(r'\s+', '_', regex=True)
                  .str.lower())

print("\nNormalized sentiment cols:", list(sent.columns))
print("Normalized trades cols:",    list(trades.columns))

#  Detect sentiment columns 
sent_date_col  = next((c for c in sent.columns if "date"  in c), None)
sent_label_col = next((c for c in sent.columns
                       if any(k in c for k in ["class","sent","classification"])), None)
if sent_date_col is None or sent_label_col is None:
    raise ValueError("Couldn't detect sentiment date/class columns. Found: " + ", ".join(sent.columns))

sent['date']      = pd.to_datetime(sent[sent_date_col], errors='coerce').dt.date
sent['sentiment'] = sent[sent_label_col].astype(str).str.strip().str.capitalize()
sent = sent[['date', 'sentiment']].dropna().drop_duplicates().reset_index(drop=True)
print("\nSentiment normalized: rows =", len(sent),
      "| date range:", sent['date'].min(), "->", sent['date'].max())

#  Detect trade columns 
def pick(cols):
    for c in cols:
        if c in trades.columns:
            return c
    return None

acct_col        = pick(['account','acct','user','client'])
timestamp_col   = pick(['timestamp','timestamp_ist','timestamp_utc','time','ts','datetime'])
closed_pnl_col  = next((c for c in trades.columns if 'closed' in c and 'pnl' in c), None) \
                  or pick(['closed_pnl','realized_pnl','pnl','profit'])
size_usd_col    = pick(['size_usd','sizeusd','size_usd.'])
size_tokens_col = pick(['size_tokens','size_token'])
size_col_any    = size_usd_col or size_tokens_col or pick(['size','qty','quantity'])
side_col        = pick(['side','direction','trade_side'])
leverage_col    = pick(['leverage','lev','leverage_ratio','start_position','margin'])

print("\nDetected trade columns:")
for k, v in [("account", acct_col), ("timestamp", timestamp_col),
              ("closed_pnl", closed_pnl_col), ("size_usd", size_usd_col),
              ("size_tokens", size_tokens_col), ("any_size", size_col_any),
              ("side", side_col), ("leverage", leverage_col)]:
    print(f"  {k}: {v}")

required_missing = [r for r in [acct_col, timestamp_col, closed_pnl_col] if r is None]
if required_missing:
    raise ValueError(f"Missing required columns: {required_missing}. Available: {list(trades.columns)}")

#  Convert numeric fields safely 
trades[closed_pnl_col] = pd.to_numeric(trades[closed_pnl_col], errors='coerce')
if size_col_any:
    trades[size_col_any] = pd.to_numeric(trades[size_col_any], errors='coerce')
if leverage_col and leverage_col in trades.columns:
    trades[leverage_col] = pd.to_numeric(trades[leverage_col], errors='coerce')

#  Robust timestamp parsing 
def try_parse_epoch_unit(series):
    """Try units ns/us/ms/s; return (parsed_series, unit) or (None, None)."""
    snum = pd.to_numeric(series, errors='coerce').dropna()
    if len(snum) == 0:
        return None, None
    sample = snum.sample(min(len(snum), 200), random_state=1)
    best_unit, best_score = None, -1
    for u in ['ns', 'us', 'ms', 's']:
        try:
            #  cast to plain int64 numpy array to avoid overflow with Int64 nullable
            dt_sample = pd.to_datetime(sample.values.astype('int64'), unit=u, errors='coerce')
            score = pd.Series(dt_sample).dt.year.between(2009, 2035).sum()
            if score > best_score:
                best_unit, best_score = u, score
        except Exception:
            continue
    if best_score <= 0:
        return None, None
    try:
        parsed = pd.to_datetime(
            pd.to_numeric(series, errors='coerce').values.astype('float64').astype('int64'),
            unit=best_unit, errors='coerce'
        )
        return pd.Series(parsed, index=series.index), best_unit
    except Exception:
        return None, None

def clean_timestamp_strings(s):
    s = s.astype(str).str.strip()
    s = s.str.replace(r'\s+[A-Z]{2,4}$', '', regex=True)  # strip trailing TZ abbreviations
    s = s.str.replace(',', ' ', regex=False)
    return s

# Primary: numeric epoch detection
parsed_dt, unit_used = try_parse_epoch_unit(trades[timestamp_col])
if parsed_dt is None or parsed_dt.isna().all():
    cleaned   = clean_timestamp_strings(trades[timestamp_col])
    parsed_dt = pd.to_datetime(cleaned, errors='coerce', utc=False)
    method    = "string_clean_parse"
else:
    method = f"epoch_unit_{unit_used}"

# Fallback: try alternate timestamp columns
if parsed_dt.isna().sum() >= len(parsed_dt) * 0.99:
    for alt in ['timestamp_ist', 'timestamp_utc', 'time', 'datetime']:
        if alt in trades.columns and alt != timestamp_col:
            parsed_alt, alt_unit = try_parse_epoch_unit(trades[alt])
            if parsed_alt is not None and parsed_alt.notna().sum() > 0:
                parsed_dt, method = parsed_alt, f"alt_epoch_{alt}_{alt_unit}"
                break
            parsed_alt = pd.to_datetime(
                clean_timestamp_strings(trades[alt]), errors='coerce', utc=False)
            if parsed_alt.notna().sum() > 0:
                parsed_dt, method = parsed_alt, f"alt_string_{alt}"
                break

trades['datetime'] = parsed_dt
print(f"\nTimestamp parsing method: {method}")
print("Parsed non-null:", trades['datetime'].notna().sum(), "of", len(trades))
if trades['datetime'].notna().sum() > 0:
    print("Sample datetimes:", trades['datetime'].head(5).tolist())
    print("Range:", trades['datetime'].min(), "->", trades['datetime'].max())


if trades['datetime'].notna().sum() > 0:
    n_1970   = (trades['datetime'].dt.year == 1970).sum()
    frac_1970 = n_1970 / max(1, trades['datetime'].notna().sum())
    if frac_1970 > 0.4:
        print(f"  [WARN] {n_1970} rows are year 1970 ({frac_1970:.1%}); re-parsing with best numeric unit...")
        snum = pd.to_numeric(trades[timestamp_col], errors='coerce').dropna()
        if len(snum) > 0:
            best_unit, best_valid = None, -1
            for u in ['ns', 'us', 'ms', 's']:
                try:
                    dt_try = pd.to_datetime(snum.values.astype('int64'), unit=u, errors='coerce')
                    valid  = pd.Series(dt_try).dt.year.between(2009, 2035).sum()
                    if valid > best_valid:
                        best_unit, best_valid = u, valid
                except Exception:
                    continue
            if best_unit:
                trades['datetime'] = pd.to_datetime(
                    pd.to_numeric(trades[timestamp_col], errors='coerce')
                      .values.astype('float64').astype('int64'),
                    unit=best_unit, errors='coerce'
                )
                print("  Reparsed unit:", best_unit, "| valid count:", best_valid)
                print("  New range:", trades['datetime'].min(), "->", trades['datetime'].max())

trades['date']  = trades['datetime'].dt.date
before_drop     = len(trades)
trades          = trades.dropna(subset=['date']).reset_index(drop=True)
print(f"Dropped {before_drop - len(trades)} rows (invalid datetime). Remaining: {len(trades)}")
print("Trades date range:", trades['date'].min(), "->", trades['date'].max())

# Rename to canonical column names
rename_map = {}
if acct_col        and acct_col        != 'account':     rename_map[acct_col]        = 'account'
if closed_pnl_col  and closed_pnl_col  != 'closed_pnl':  rename_map[closed_pnl_col]  = 'closed_pnl'
if size_usd_col    and size_usd_col    != 'size_usd':    rename_map[size_usd_col]    = 'size_usd'
if size_tokens_col and size_tokens_col != 'size_tokens': rename_map[size_tokens_col] = 'size_tokens'
if size_col_any    and size_col_any not in ['size_usd','size_tokens','size']:
    rename_map[size_col_any] = 'size'
if side_col        and side_col        != 'side':        rename_map[side_col]        = 'side'
if leverage_col    and leverage_col in trades.columns and leverage_col != 'leverage':
    rename_map[leverage_col] = 'leverage'

trades.rename(columns=rename_map, inplace=True)
canonical = [c for c in ['account','closed_pnl','size_usd','size_tokens','size',
                          'datetime','date','side','leverage'] if c in trades.columns]
print("\nCanonical columns present:", canonical)

trades['is_win'] = trades['closed_pnl'] > 0
if   'size_usd'    in trades.columns: trades['abs_size'] = trades['size_usd'].abs()
elif 'size_tokens' in trades.columns: trades['abs_size'] = trades['size_tokens'].abs()
elif 'size'        in trades.columns: trades['abs_size'] = trades['size'].abs()
else:                                  trades['abs_size'] = np.nan

#  Daily per-account aggregation
agg_dict = {
    'daily_pnl':       ('closed_pnl', 'sum'),
    'trades_count':    ('closed_pnl', 'count'),
    'win_count':       ('is_win',     'sum'),
    'avg_trade_size':  ('abs_size',   'mean'),
}
if 'leverage' in trades.columns:
    agg_dict['avg_leverage'] = ('leverage', 'mean')

daily_account = trades.groupby(['date', 'account']).agg(**agg_dict).reset_index()

# Long / short counts
if 'side' in trades.columns:
    def count_side(s, positives):
        return s.astype(str).str.lower().isin(positives).sum()

    longs  = (trades.groupby(['date','account'])['side']
              .apply(lambda s: count_side(s, {'buy','long'}))    # FIX: no include_groups needed
              .reset_index(name='long_count'))
    shorts = (trades.groupby(['date','account'])['side']
              .apply(lambda s: count_side(s, {'sell','short'}))
              .reset_index(name='short_count'))
    daily_account = daily_account.merge(longs,  on=['date','account'], how='left')
    daily_account = daily_account.merge(shorts, on=['date','account'], how='left')
else:
    daily_account['long_count']  = np.nan
    daily_account['short_count'] = np.nan

daily_account['win_rate']         = daily_account['win_count']  / daily_account['trades_count']
daily_account['long_short_ratio'] = daily_account['long_count'] / (daily_account['short_count'] + 1e-9)

# Intra-day drawdown proxy
def day_drawdown(sub):
    sub  = sub.sort_values('datetime')
    csum = sub['closed_pnl'].cumsum()
    return csum.min() if not csum.empty else np.nan

# FIX: use include_groups=False to suppress FutureWarning
dd = (trades.groupby(['date','account'])
      .apply(day_drawdown, include_groups=False)     # pandas ≥ 2.2 compat
      .reset_index(name='daily_min_cum_pnl'))
daily_account = daily_account.merge(dd, on=['date','account'], how='left')

# ------- Platform / day aggregation -------
platform_agg = {
    'total_pnl':      ('daily_pnl',        'sum'),
    'avg_win_rate':   ('win_rate',          'mean'),
    'avg_trade_size': ('avg_trade_size',    'mean'),
    'total_trades':   ('trades_count',      'sum'),
    'avg_long_short': ('long_short_ratio',  'mean'),
}
if 'avg_leverage' in daily_account.columns:
    platform_agg['avg_leverage'] = ('avg_leverage', 'mean')

daily = daily_account.groupby('date').agg(**platform_agg).reset_index()
daily = daily.merge(sent, on='date', how='left')

# ------- Print Part A Summary -------
print("\n--- Part A Summary ---")
print("Trades:          rows,cols:", trades.shape)
print("Daily_account:   rows,cols:", daily_account.shape)
print("Daily (platform):rows,cols:", daily.shape)

print("\nMissing values (trades) top 10:")
print(trades.isnull().sum().sort_values(ascending=False).head(10).to_string())

print("\nMissing values (daily_account) top 10:")
print(daily_account.isnull().sum().sort_values(ascending=False).head(10).to_string())

print("\nDuplicates: trades =",        trades.duplicated().sum())
print("Duplicates: daily_account =",  daily_account.duplicated().sum())

if 'leverage' in trades.columns:
    print("\nLeverage summary (trades):")
    print(trades['leverage'].describe().to_string())

acct_summary = (daily_account.groupby('account')
                .agg(total_pnl=('daily_pnl','sum'),
                     total_trades=('trades_count','sum'),
                     avg_win_rate=('win_rate','mean'))
                .reset_index())
print("\nTop 5 accounts by total_pnl:")
print(acct_summary.sort_values('total_pnl', ascending=False).head(5).to_string(index=False))

# ------- Save outputs -------
daily_account.to_csv("outputs/daily_account_metrics.csv", index=False)
daily.to_csv("outputs/daily_metrics.csv",         index=False)
trades.to_csv("outputs/trades_cleaned.csv",        index=False)

print("\nSaved to outputs/: daily_account_metrics.csv, daily_metrics.csv, trades_cleaned.csv")
# END PART A (


Loaded: sentiment rows = 2644 , trades rows = 211224

-- Sentiment (raw) --
shape: (2644, 4)
columns: ['timestamp', 'value', 'classification', 'date']
missing (top 10):
timestamp         0
value             0
classification    0
date              0
duplicate rows: 0
    timestamp  value classification        date
0  1517463000     30           Fear  2018-02-01
1  1517549400     15   Extreme Fear  2018-02-02
2  1517635800     40           Fear  2018-02-03

-- Trades (raw) --
shape: (211224, 16)
columns: ['Account', 'Coin', 'Execution Price', 'Size Tokens', 'Size USD', 'Side', 'Timestamp IST', 'Start Position', 'Direction', 'Closed PnL', 'Transaction Hash', 'Order ID', 'Crossed', 'Fee', 'Trade ID', 'Timestamp']
missing (top 10):
Account            0
Coin               0
Execution Price    0
Size Tokens        0
Size USD           0
Side               0
Timestamp IST      0
Start Position     0
Direction          0
Closed PnL         0
duplicate rows: 0
                                   

In [3]:
# ============================================================
# PART B  Analysis

import pandas as pd
import numpy as np
import matplotlib
matplotlib.use('Agg')   # FIX: non-interactive backend (safe for scripts & notebooks)
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings("ignore")

# ------- Config / load -------
DAILY_ACCT = "outputs/daily_account_metrics.csv"
DAILY      = "outputs/daily_metrics.csv"
TRADES     = "outputs/trades_cleaned.csv"

daily_account = pd.read_csv(DAILY_ACCT, parse_dates=['date'])
daily         = pd.read_csv(DAILY,      parse_dates=['date'])
trades        = pd.read_csv(TRADES,     parse_dates=['datetime','date'], low_memory=False)

print("daily_account:", daily_account.shape)
print("daily platform:", daily.shape)
print("trades:", trades.shape)

# ------- Normalize sentiment -------
daily['sentiment'] = daily['sentiment'].astype(str).str.strip().str.capitalize()

# FIX: safe sentiment merge — drop existing sentiment col in daily_account first to avoid duplication
if 'sentiment' in daily_account.columns:
    daily_account.drop(columns=['sentiment'], inplace=True)

sent_map = daily.set_index('date')['sentiment'].to_dict()
daily_account['sentiment'] = daily_account['date'].map(sent_map)

print("Sentiment value counts:")
print(daily_account['sentiment'].value_counts().to_string())

# ============================================================
# 1) Performance: Fear vs Greed days
# ============================================================
metrics = ['daily_pnl', 'win_rate', 'daily_min_cum_pnl']
if 'avg_leverage' in daily_account.columns:
    metrics.append('avg_leverage')

compare_df = daily_account.dropna(subset=['sentiment']).copy()

def compare_metric(metric, df=None):
    if df is None:
        df = compare_df
    # FIX: guard against missing column
    if metric not in df.columns:
        print(f"  [SKIP] Column '{metric}' not found.")
        return {}, np.array([]), np.array([])

    grp   = df.groupby('sentiment')[metric].apply(list).to_dict()
    fear  = np.array([x for x in grp.get('Fear',  []) if pd.notna(x)])
    greed = np.array([x for x in grp.get('Greed', []) if pd.notna(x)])

    summary = {
        'fear_mean':   np.nanmean(fear)   if len(fear)  > 0 else np.nan,
        'greed_mean':  np.nanmean(greed)  if len(greed) > 0 else np.nan,
        'fear_median': np.nanmedian(fear) if len(fear)  > 0 else np.nan,
        'greed_median':np.nanmedian(greed)if len(greed) > 0 else np.nan,
        'n_fear':  len(fear),
        'n_greed': len(greed),
    }

    # FIX: only run tests if both groups have data
    mw_stat = mw_p = t_stat = t_p = np.nan
    if len(fear) > 1 and len(greed) > 1:
        try:
            mw_stat, mw_p = stats.mannwhitneyu(fear, greed, alternative='two-sided')
        except Exception:
            pass
        try:
            t_stat, t_p = stats.ttest_ind(fear, greed, equal_var=False, nan_policy='omit')
        except Exception:
            pass

    summary.update({'mw_stat': mw_stat, 'mw_p': mw_p, 't_stat': t_stat, 't_p': t_p})
    return summary, fear, greed

summaries = {}
for m in metrics:
    s, f, g = compare_metric(m)
    summaries[m] = s
    print(f"\n--- Metric: {m} ---")
    print(s)

# ------- Boxplot: daily_pnl by sentiment -------
fear_pnl  = compare_df.loc[compare_df['sentiment'] == 'Fear',  'daily_pnl'].dropna()
greed_pnl = compare_df.loc[compare_df['sentiment'] == 'Greed', 'daily_pnl'].dropna()

# FIX: only plot if both groups are non-empty
if len(fear_pnl) > 0 and len(greed_pnl) > 0:
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.boxplot([fear_pnl, greed_pnl], labels=['Fear', 'Greed'])
    ax.set_ylabel('daily_pnl (per-account)')
    ax.set_title('Distribution of daily_pnl by Sentiment')
    plt.tight_layout()
    plt.savefig("outputs/boxplot_daily_pnl_by_sentiment.png", dpi=150)
    plt.close()
    print("Saved boxplot_daily_pnl_by_sentiment.png")
else:
    print("[SKIP] Not enough data for daily_pnl boxplot.")

# ------- Boxplot: win_rate by sentiment -------
fear_wr  = compare_df.loc[compare_df['sentiment'] == 'Fear',  'win_rate'].dropna()
greed_wr = compare_df.loc[compare_df['sentiment'] == 'Greed', 'win_rate'].dropna()

if len(fear_wr) > 0 and len(greed_wr) > 0:
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.boxplot([fear_wr, greed_wr], labels=['Fear', 'Greed'])
    ax.set_ylabel('win_rate (daily, per-account)')
    ax.set_title('Win Rate by Sentiment')
    plt.tight_layout()
    plt.savefig("outputs/boxplot_win_rate_by_sentiment.png", dpi=150)
    plt.close()
    print("Saved boxplot_win_rate_by_sentiment.png")
else:
    print("[SKIP] Not enough data for win_rate boxplot.")

# ------- Timeseries: platform PnL + sentiment markers -------
if 'total_pnl' in daily.columns and daily['total_pnl'].notna().sum() > 0:
    fig, ax = plt.subplots(figsize=(12, 4))
    daily_sorted = daily.sort_values('date')
    ax.plot(daily_sorted['date'],
            daily_sorted['total_pnl'].rolling(7, min_periods=1).mean(),
            label='7-day MA total_pnl', color='steelblue')

    fear_days  = daily_sorted[daily_sorted['sentiment'] == 'Fear']
    greed_days = daily_sorted[daily_sorted['sentiment'] == 'Greed']
    if len(fear_days)  > 0:
        ax.scatter(fear_days['date'],  fear_days['total_pnl'],  c='red',   s=12, label='Fear days',  zorder=3)
    if len(greed_days) > 0:
        ax.scatter(greed_days['date'], greed_days['total_pnl'], c='green', s=12, label='Greed days', zorder=3)

    ax.legend(); ax.set_title('Platform total_pnl (7d MA) with Fear/Greed markers')
    plt.tight_layout()
    plt.savefig("outputs/ts_total_pnl_sentiment.png", dpi=150)
    plt.close()
    print("Saved ts_total_pnl_sentiment.png")

# ============================================================
# 2) Behavior change metrics
# ============================================================
behav_daily = daily_account.copy()
behavior_metrics = [m for m in ['trades_count','avg_leverage','avg_trade_size','long_short_ratio']
                    if m in behav_daily.columns]

for m in behavior_metrics:
    s, f, g = compare_metric(m, df=behav_daily)
    summaries[m] = s
    print(f"\n--- Behavior metric: {m} ---")
    print(s)

agg_behav = (behav_daily.groupby('sentiment')
             .agg(mean_trades  =('trades_count', 'mean'),
                  median_trades=('trades_count', 'median'),
                  **({'mean_leverage': ('avg_leverage','mean')} if 'avg_leverage' in behav_daily.columns else {}))
             .reset_index())
agg_behav.to_csv("outputs/agg_behav_by_sentiment.csv", index=False)
print("\nSaved agg_behav_by_sentiment.csv")
print(agg_behav.to_string(index=False))

# ------- Long/Short bias -------
if 'side' in trades.columns:
    # FIX: drop sentiment from trades first to avoid column collision on merge
    if 'sentiment' in trades.columns:
        trades.drop(columns=['sentiment'], inplace=True)

    trades['side_norm'] = trades['side'].astype(str).str.lower().str.strip()
    trades['is_long']   = trades['side_norm'].isin(['buy', 'long'])

    # FIX: merge only date+sentiment, avoid duplicate cols
    trades = trades.merge(
        daily[['date','sentiment']].drop_duplicates(),
        on='date', how='left'
    )
    ls = trades.groupby('sentiment')['is_long'].mean().reset_index(name='pct_long')
    ls.to_csv("outputs/pct_long_by_sentiment.csv", index=False)
    print("Saved pct_long_by_sentiment.csv")
    print(ls.to_string(index=False))
else:
    print("No 'side' column — cannot compute long/short bias.")

# ============================================================
# 3) Account clustering (K-Means, k=3)
# ============================================================
agg_kwargs = dict(
    total_pnl          =('daily_pnl',       'sum'),
    avg_win_rate        =('win_rate',        'mean'),
    avg_trades_per_day  =('trades_count',    'mean'),
    avg_size            =('avg_trade_size',  'mean'),
    days_active         =('date',            'nunique'),
)
if 'avg_leverage' in daily_account.columns:
    agg_kwargs['avg_leverage'] = ('avg_leverage', 'mean')

acct_feat = (daily_account.groupby('account')
             .agg(**agg_kwargs)
             .reset_index()
             .replace([np.inf, -np.inf], np.nan)
             .fillna(0))

# Choose best available clustering features
cluster_features = [f for f in ['avg_leverage','avg_trades_per_day','avg_win_rate']
                    if f in acct_feat.columns]
if len(cluster_features) < 2:
    cluster_features = acct_feat.select_dtypes(include=[np.number]).columns.tolist()[:3]

print("\nClustering on features:", cluster_features)

scaler = StandardScaler()
X      = scaler.fit_transform(acct_feat[cluster_features].values)

k      = min(3, len(acct_feat))   # FIX: guard if fewer than 3 accounts
kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
acct_feat['cluster'] = kmeans.fit_predict(X)

cluster_summary = (acct_feat.groupby('cluster')[cluster_features + ['total_pnl','days_active']]
                   .agg(['mean','median','count']))
cluster_summary.to_csv("outputs/cluster_summary.csv")
acct_feat.to_csv("outputs/account_segments.csv", index=False)
print("Saved account_segments.csv and cluster_summary.csv")
print("\nCluster summary (means):")
print(acct_feat.groupby('cluster')[cluster_features + ['total_pnl']].mean().to_string())

# Cluster scatter plot
fig, ax = plt.subplots(figsize=(8, 6))
x_feat = cluster_features[1] if len(cluster_features) > 1 else cluster_features[0]
y_feat = cluster_features[0]
for c in sorted(acct_feat['cluster'].unique()):
    sub = acct_feat[acct_feat['cluster'] == c]
    ax.scatter(sub[x_feat], sub[y_feat], s=20, label=f'Cluster {c}')
ax.set_xlabel(x_feat); ax.set_ylabel(y_feat)
ax.legend(); ax.set_title(f'Clusters ({x_feat} vs {y_feat})')
plt.tight_layout()
plt.savefig("outputs/cluster_scatter.png", dpi=150)
plt.close()
print("Saved cluster_scatter.png")

# ------- Save all summary stats -------
pd.DataFrame.from_dict(summaries, orient='index').to_csv("outputs/sentiment_metric_tests.csv")
print("Saved sentiment_metric_tests.csv")

print("\n✅ PART B COMPLETE — Share your Part C code when ready.")
# END PART B (FIXED)


daily_account: (102, 12)
daily platform: (7, 8)
trades: (211224, 20)
Sentiment value counts:
sentiment
Greed            32
Fear             32
Nan              25
Neutral           8
Extreme greed     5

--- Metric: daily_pnl ---
{'fear_mean': 209372.66220543752, 'greed_mean': 99675.5167305, 'fear_median': 81389.6825155, 'greed_median': 35988.3764365, 'n_fear': 32, 'n_greed': 32, 'mw_stat': 669.0, 'mw_p': 0.035589192227433085, 't_stat': 1.3092672646160282, 't_p': 0.19567563543239797}

--- Metric: win_rate ---
{'fear_mean': 0.4158784540695092, 'greed_mean': 0.37407443523093875, 'fear_median': 0.3939621144568476, 'greed_median': 0.4125989656493423, 'n_fear': 32, 'n_greed': 32, 'mw_stat': 535.0, 'mw_p': 0.7625384725608533, 't_stat': 0.7648625894754894, 't_p': 0.448320925708963}

--- Metric: daily_min_cum_pnl ---
{'fear_mean': -10431.168132499999, 'greed_mean': -10294.545593593753, 'fear_median': 0.0, 'greed_median': 0.0, 'n_fear': 32, 'n_greed': 32, 'mw_stat': 411.0, 'mw_p': 0.13302672146

In [9]:
# ============================================================
# PART C (FIXED v2): Strategy Simulation + Predictive Model
# Fixes over v1:
#   1. Train/test split by ROW INDEX not by date → more training data
#   2. Separated high_leverage and consistent_winners clusters
#   3. Added 14-day rolling features for better signal
#   4. Added SMOTE-style class balancing via class_weight
#   5. Lowered prediction threshold (0.45) to improve class 0 recall
#   6. Added cross-validation score for reliability check
# ============================================================

import os
import numpy as np
import pandas as pd
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (accuracy_score, f1_score, roc_auc_score,
                             confusion_matrix, classification_report)
from sklearn.model_selection import cross_val_score, StratifiedKFold
import warnings
warnings.filterwarnings("ignore")

# -------------------- Config --------------------
LEVERAGE_CAP      = 3.0
SIZE_INCREASE_PCT = 0.20
LEVERAGE_CLIP_PCT = 0.99
SCALE_MIN, SCALE_MAX = 0.2, 5.0
RANDOM_STATE      = 42
PRED_THRESHOLD    = 0.45   # lower threshold to improve class 0 detection
# ------------------------------------------------

os.makedirs("outputs", exist_ok=True)

def pick(*paths):
    for p in paths:
        if p and os.path.exists(p):
            return p
    return None

TRADES_FILE     = pick("outputs/trades_cleaned.csv",       "outputs/trades.csv")
DAILY_FILE      = pick("outputs/daily_metrics.csv",        "outputs/daily.csv")
DAILY_ACCT_FILE = pick("outputs/daily_account_metrics.csv")
ACCT_SEG_FILE   = pick("outputs/account_segments.csv",     "outputs/account_segments_fixed.csv")

missing = [n for n, f in [("TRADES", TRADES_FILE), ("DAILY", DAILY_FILE),
                            ("DAILY_ACCT", DAILY_ACCT_FILE), ("ACCT_SEG", ACCT_SEG_FILE)] if not f]
if missing:
    raise FileNotFoundError(f"Required output files missing: {missing}. Run Part A & B first.")

print("Files in use:")
for label, path in [("TRADES", TRADES_FILE), ("DAILY", DAILY_FILE),
                     ("DAILY_ACCT", DAILY_ACCT_FILE), ("ACCT_SEG", ACCT_SEG_FILE)]:
    print(f"  {label}: {path}")

# -------------------- Load --------------------
trades        = pd.read_csv(TRADES_FILE,     low_memory=False)
daily         = pd.read_csv(DAILY_FILE,      low_memory=False)
daily_account = pd.read_csv(DAILY_ACCT_FILE, low_memory=False)
acct_seg      = pd.read_csv(ACCT_SEG_FILE,   low_memory=False)

# -------------------- Normalize dates --------------------
def to_date_col(df, candidates):
    for c in candidates:
        if c in df.columns:
            parsed = pd.to_datetime(df[c], errors='coerce')
            if parsed.notna().sum() > 0:
                return parsed.dt.normalize()
    raise ValueError("No parsable date column found among: " + str(candidates))

daily['date']         = to_date_col(daily,         ['date', 'day'])
daily_account['date'] = to_date_col(daily_account, ['date'])

if 'date' in trades.columns:
    trades['date'] = to_date_col(trades, ['date'])
elif 'datetime' in trades.columns:
    trades['date'] = pd.to_datetime(trades['datetime'], errors='coerce').dt.normalize()
else:
    raise ValueError("trades has neither 'date' nor 'datetime' column.")

# -------------------- Validate --------------------
if 'account' not in trades.columns or 'account' not in acct_seg.columns:
    raise ValueError("Missing 'account' column in trades or account_segments.")

# -------------------- Numeric conversions --------------------
trades['closed_pnl'] = pd.to_numeric(
    trades.get('closed_pnl', trades.get('pnl', pd.Series(0, index=trades.index))),
    errors='coerce').fillna(0.0)

size_col = next((c for c in ['size_usd','size','size_tokens','abs_size','notional']
                 if c in trades.columns), None)
lev_col  = next((c for c in ['leverage','lev','margin','leverage_ratio']
                 if c in trades.columns), None)

print("Detected size_col:", size_col, "| leverage_col:", lev_col)

trades['old_leverage'] = pd.to_numeric(trades[lev_col],  errors='coerce') if lev_col  else np.nan
trades['old_size']     = pd.to_numeric(trades[size_col], errors='coerce') if size_col else np.nan

trades.loc[trades['old_leverage'] == 0, 'old_leverage'] = np.nan
trades.loc[trades['old_size']     == 0, 'old_size']     = np.nan

if trades['old_leverage'].notna().sum() > 0:
    lev_cap_val = trades['old_leverage'].quantile(LEVERAGE_CLIP_PCT)
    trades['old_leverage_clipped'] = trades['old_leverage'].clip(lower=1.0, upper=lev_cap_val)
else:
    trades['old_leverage_clipped'] = 1.0

trades['old_size_filled']     = trades['old_size'].fillna(1.0)
trades['old_leverage_filled'] = trades['old_leverage_clipped'].fillna(1.0)

# -------------------- Merge cluster --------------------
acct_seg_small = acct_seg[['account','cluster']].drop_duplicates()
trades = trades.merge(acct_seg_small, on='account', how='left')
trades['cluster'] = trades['cluster'].fillna(-1).astype(int)

# -------------------- Identify clusters --------------------
feat_for_cluster = [f for f in ['avg_leverage','avg_win_rate','avg_trades_per_day']
                    if f in acct_seg.columns]
if not feat_for_cluster:
    feat_for_cluster = acct_seg.select_dtypes(include=[np.number]).columns[:2].tolist()

cluster_means = acct_seg.groupby('cluster')[feat_for_cluster].mean().reset_index()
print("\nCluster means:")
print(cluster_means.to_string(index=False))

# High leverage cluster
lev_sort_col = 'avg_leverage' if 'avg_leverage' in cluster_means.columns else feat_for_cluster[0]
sorted_by_lev = cluster_means.sort_values(lev_sort_col, ascending=False)
high_leverage_cluster = int(sorted_by_lev.iloc[0]['cluster'])

# FIX: consistent winners must be a DIFFERENT cluster from high_leverage
if 'avg_win_rate' in cluster_means.columns:
    sorted_by_wr = cluster_means.sort_values('avg_win_rate', ascending=False)
    for _, row in sorted_by_wr.iterrows():
        if int(row['cluster']) != high_leverage_cluster:
            consistent_winners_cluster = int(row['cluster'])
            break
    else:
        consistent_winners_cluster = int(sorted_by_wr.iloc[0]['cluster'])
elif 'total_pnl' in acct_seg.columns:
    pnl_means = acct_seg.groupby('cluster')['total_pnl'].mean().reset_index().sort_values('total_pnl', ascending=False)
    for _, row in pnl_means.iterrows():
        if int(row['cluster']) != high_leverage_cluster:
            consistent_winners_cluster = int(row['cluster'])
            break
else:
    all_clusters = cluster_means['cluster'].tolist()
    consistent_winners_cluster = next((c for c in all_clusters if c != high_leverage_cluster),
                                       high_leverage_cluster)

print(f"\nClusters → high_leverage: {high_leverage_cluster} | consistent_winners: {consistent_winners_cluster}")

# -------------------- Map sentiment --------------------
if 'sentiment' in trades.columns:
    trades.drop(columns=['sentiment'], inplace=True)

sent_map = daily.set_index('date')['sentiment'].to_dict()
trades['date_norm'] = pd.to_datetime(trades['date']).dt.normalize()
trades['sentiment'] = trades['date_norm'].map(sent_map).ffill().bfill()

# -------------------- Apply strategy rules --------------------
mask_r1 = ((trades['cluster'] == high_leverage_cluster) &
            (trades['sentiment'].str.lower() == 'fear'))
trades['new_leverage'] = trades['old_leverage_filled'].copy()
trades.loc[mask_r1, 'new_leverage'] = np.minimum(
    trades.loc[mask_r1, 'old_leverage_filled'], LEVERAGE_CAP)

acct_win = (daily_account.groupby('account')['win_rate']
            .mean().reset_index().rename(columns={'win_rate': 'acct_win_rate'}))
trades = trades.merge(acct_win, on='account', how='left')

mask_r2 = ((trades['cluster'] == consistent_winners_cluster) &
            (trades['sentiment'].str.lower() == 'greed') &
            (trades['acct_win_rate'] > 0.5))
trades['new_size'] = trades['old_size_filled'].copy()
trades.loc[mask_r2, 'new_size'] = trades.loc[mask_r2, 'old_size_filled'] * (1 + SIZE_INCREASE_PCT)

trades['new_leverage'] = trades['new_leverage'].fillna(trades['old_leverage_filled'])
trades['new_size']     = trades['new_size'].fillna(trades['old_size_filled'])

# -------------------- Scaling + adjusted PnL --------------------
epsilon = 1e-9
trades['scale_raw']      = ((trades['new_leverage'] * trades['new_size']) /
                             (trades['old_leverage_filled'] * trades['old_size_filled'] + epsilon))
trades['scale']          = trades['scale_raw'].clip(lower=SCALE_MIN, upper=SCALE_MAX)
trades['closed_pnl_adj'] = trades['closed_pnl'] * trades['scale']

# -------------------- Aggregate results --------------------
orig_daily = trades.groupby('date_norm')['closed_pnl'].sum().reset_index(name='orig_total_pnl')
adj_daily  = trades.groupby('date_norm')['closed_pnl_adj'].sum().reset_index(name='adj_total_pnl')
compare_daily = (orig_daily.merge(adj_daily, on='date_norm', how='outer')
                 .fillna(0).sort_values('date_norm').reset_index(drop=True))
compare_daily['delta'] = compare_daily['adj_total_pnl'] - compare_daily['orig_total_pnl']

tot_orig  = compare_daily['orig_total_pnl'].sum()
tot_adj   = compare_daily['adj_total_pnl'].sum()
delta_tot = tot_adj - tot_orig

orig_acct = trades.groupby('account')['closed_pnl'].sum().reset_index(name='orig_pnl')
adj_acct  = trades.groupby('account')['closed_pnl_adj'].sum().reset_index(name='adj_pnl')
acct_compare = (orig_acct.merge(adj_acct, on='account', how='outer')
                .fillna(0).assign(delta=lambda d: d['adj_pnl'] - d['orig_pnl'])
                .sort_values('delta'))

orig_vol = trades.groupby('date_norm')['closed_pnl'].sum().std()
adj_vol  = trades.groupby('date_norm')['closed_pnl_adj'].sum().std()

compare_daily.to_csv("outputs/strategy_simulation_daily_impact_partc_final.csv", index=False)
trades.to_csv("outputs/trades_sim_full_partc_final.csv", index=False)
acct_compare.to_csv("outputs/account_impact_partc_final.csv", index=False)

print("\nSIMULATION SUMMARY")
print("------------------")
print(f"Total orig pnl : {tot_orig:,.2f}")
print(f"Total adj  pnl : {tot_adj:,.2f}")
print(f"Delta          : {delta_tot:,.2f}")
print(f"Orig daily vol : {orig_vol:,.2f}")
print(f"Adj  daily vol : {adj_vol:,.2f}")
print("\nTop 5 accounts HURT:")
print(acct_compare.head(5).to_string(index=False))
print("\nTop 5 accounts HELPED:")
print(acct_compare.tail(5).to_string(index=False))

# Cumulative PnL plot
fig, ax = plt.subplots(figsize=(12, 4))
ax.plot(compare_daily['date_norm'], compare_daily['orig_total_pnl'].cumsum(), label='Original')
ax.plot(compare_daily['date_norm'], compare_daily['adj_total_pnl'].cumsum(),  label='Adjusted')
ax.set_title('Cumulative PnL: Original vs Strategy-Adjusted')
ax.legend(); plt.tight_layout()
plt.savefig("outputs/simulation_cumpnl_partc_final.png", dpi=150)
plt.close()

# ============================================================
# Predictive Model — FIXED
# ============================================================
df = daily_account.copy()
df['date'] = pd.to_datetime(df['date']).dt.normalize()
df = df.sort_values(['account','date']).reset_index(drop=True)

df['next_daily_pnl']         = df.groupby('account')['daily_pnl'].shift(-1)
df['target_next_profitable'] = (df['next_daily_pnl'] > 0).astype(int)

if 'sentiment' in df.columns:
    df.drop(columns=['sentiment'], inplace=True)
df = df.merge(daily[['date','sentiment']].drop_duplicates(), on='date', how='left')
df['sentiment_num'] = df['sentiment'].map({'Fear': -1, 'Greed': 1}).fillna(0)

for col in ['trades_count','avg_leverage','win_rate','avg_trade_size']:
    df[col] = pd.to_numeric(df.get(col, 0), errors='coerce').fillna(0)

# FIX: added 14-day rolling features for more signal
for feat, src in [('rol_trades_7','trades_count'),   ('rol_leverage_7','avg_leverage'),
                   ('rol_winrate_7','win_rate'),       ('rol_size_7','avg_trade_size'),
                   ('rol_trades_14','trades_count'),   ('rol_leverage_14','avg_leverage'),
                   ('rol_winrate_14','win_rate'),      ('rol_pnl_7','daily_pnl')]:
    window = 14 if '14' in feat else 7
    df[feat] = df.groupby('account')[src].transform(
        lambda s: s.rolling(window, min_periods=1).mean())

feat_cols = ['sentiment_num','rol_trades_7','rol_leverage_7','rol_winrate_7',
             'rol_size_7','rol_trades_14','rol_leverage_14','rol_winrate_14','rol_pnl_7']

df_model = df.dropna(subset=feat_cols + ['target_next_profitable']).copy().reset_index(drop=True)
print(f"\nModel dataset size: {df_model.shape[0]} rows")
print("Class distribution:", df_model['target_next_profitable'].value_counts().to_dict())

if df_model.shape[0] < 20:
    print("WARNING: dataset too small. Skipping model.")
else:
    # FIX: split by ROW INDEX (80/20) not by date → guarantees enough training rows
    split_idx  = int(len(df_model) * 0.80)
    X_train    = df_model.loc[:split_idx-1, feat_cols].values
    y_train    = df_model.loc[:split_idx-1, 'target_next_profitable'].values
    X_test     = df_model.loc[split_idx:,   feat_cols].values
    y_test     = df_model.loc[split_idx:,   'target_next_profitable'].values

    print(f"Train rows: {len(X_train)} | Test rows: {len(X_test)}")

    if len(X_train) == 0 or len(X_test) == 0:
        print("WARNING: split produced empty set. Skipping model.")
    else:
        scaler    = StandardScaler()
        X_train_s = scaler.fit_transform(np.nan_to_num(X_train))
        X_test_s  = scaler.transform(np.nan_to_num(X_test))

        clf = RandomForestClassifier(
            n_estimators=300,
            max_depth=6,
            min_samples_leaf=3,
            random_state=RANDOM_STATE,
            class_weight='balanced',
            n_jobs=-1)
        clf.fit(X_train_s, y_train)

        # FIX: use threshold 0.45 instead of 0.5 to improve class 0 recall
        y_proba = clf.predict_proba(X_test_s)[:, 1]
        y_pred  = (y_proba >= PRED_THRESHOLD).astype(int)

        acc = accuracy_score(y_test, y_pred)
        f1  = f1_score(y_test, y_pred, zero_division=0)
        auc = roc_auc_score(y_test, y_proba) if len(np.unique(y_test)) > 1 else float('nan')
        cm  = confusion_matrix(y_test, y_pred)

        # Cross-validation on full dataset
        cv = StratifiedKFold(n_splits=min(5, len(df_model)//10 or 2), shuffle=False)
        cv_scores = cross_val_score(clf, scaler.fit_transform(np.nan_to_num(df_model[feat_cols].values)),
                                    df_model['target_next_profitable'].values,
                                    cv=cv, scoring='roc_auc')

        print("\nPREDICTIVE MODEL SUMMARY")
        print("------------------------")
        print(f"Train rows : {len(X_train)} | Test rows: {len(X_test)}")
        print(f"Accuracy   : {acc:.4f}")
        print(f"F1 Score   : {f1:.4f}")
        print(f"AUC        : {auc:.4f}" if not np.isnan(auc) else "AUC: n/a")
        print(f"CV AUC     : {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")
        print("Confusion matrix:\n", cm)
        print("\nClassification report:\n",
              classification_report(y_test, y_pred, zero_division=0))

        feat_imp = (pd.DataFrame({'feature': feat_cols, 'importance': clf.feature_importances_})
                    .sort_values('importance', ascending=False))
        feat_imp.to_csv("outputs/model_feature_importances_partc_final.csv", index=False)
        df_model[['account','date'] + feat_cols + ['target_next_profitable']].to_csv(
            "outputs/model_dataset_partc_final.csv", index=False)

        with open("outputs/model_report_partc_final.txt", "w") as fh:
            fh.write(f"Accuracy : {acc:.4f}\nF1: {f1:.4f}\nAUC: {auc}\n")
            fh.write(f"CV AUC   : {cv_scores.mean():.4f} +/- {cv_scores.std():.4f}\n")
            fh.write("Confusion matrix:\n" + np.array2string(cm))
            fh.write("\n\nClassification report:\n")
            fh.write(classification_report(y_test, y_pred, zero_division=0))

        fig, ax = plt.subplots(figsize=(8, 5))
        ax.barh(feat_imp['feature'], feat_imp['importance'], color='steelblue')
        ax.set_xlabel('Importance'); ax.set_title('Feature Importances')
        ax.invert_yaxis(); plt.tight_layout()
        plt.savefig("outputs/model_feature_importances_partc_final.png", dpi=150)
        plt.close()
        print("Saved model_feature_importances_partc_final.png")

# -------------------- Summary --------------------
pd.DataFrame({
    'total_orig_pnl': [tot_orig], 'total_adj_pnl': [tot_adj],
    'delta_total': [delta_tot], 'orig_daily_vol': [orig_vol], 'adj_daily_vol': [adj_vol],
    'high_leverage_cluster': [high_leverage_cluster],
    'consistent_winners_cluster': [consistent_winners_cluster],
    'leverage_cap_used': [LEVERAGE_CAP], 'size_increase_pct': [SIZE_INCREASE_PCT],
}).to_csv("outputs/strategy_simulation_summary_partc_final.csv", index=False)

print("\nAll outputs saved to outputs/")


Files in use:
  TRADES: outputs/trades_cleaned.csv
  DAILY: outputs/daily_metrics.csv
  DAILY_ACCT: outputs/daily_account_metrics.csv
  ACCT_SEG: outputs/account_segments.csv
Detected size_col: size_usd | leverage_col: leverage

Cluster means:
 cluster   avg_leverage  avg_win_rate  avg_trades_per_day
       0   24145.189812      0.383982         1532.886139
       1   28664.002767      0.586406        11995.333333
       2 -445764.113390      0.276284         2641.583333

Clusters → high_leverage: 1 | consistent_winners: 0

SIMULATION SUMMARY
------------------
Total orig pnl : 10,296,958.94
Total adj  pnl : 9,496,464.51
Delta          : -800,494.44
Orig daily vol : 2,581,231.98
Adj  daily vol : 2,313,607.15

Top 5 accounts HURT:
                                   account      orig_pnl       adj_pnl         delta
0xbaaaf6571ab7d571043ff1e313a9609a10637864  9.401638e+05  1.880380e+05 -7.521258e+05
0xbee1707d6b44d4d52bfe19e41f8a828645437aab  8.360806e+05  7.697635e+05 -6.631705e+04
0xb12