In [1]:
import pandas as pd
import numpy as np
import os
import sys
import talib
import joblib
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.utils import resample
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import TimeSeriesSplit, RandomizedSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
import warnings
warnings.filterwarnings('ignore')

In [2]:
from binance.client import Client
sys.path.append(os.path.abspath(".."))  # root /PycharmProjects/MMAT
from config.load_env import load_keys

keys = load_keys()
#print("Loaded keys:", keys)
client = Client(keys['api_key'], keys['secret_key'])

In [3]:
# ================== Enhanced Feature Engineering ==================
class AdvancedFeatureEngineer(BaseEstimator, TransformerMixin):
    def __init__(self, lookback_periods=[5, 10, 20, 50]):
        self.lookback_periods = lookback_periods

    def fit(self, X, y=None):
        return self

    def transform(self, df):
        # Ensure index is datetime
        if not isinstance(df.index, pd.DatetimeIndex):
            df.index = pd.to_datetime(df.index)

        # Basic price features
        df['price_change'] = df['close'].pct_change()
        df['high_low_ratio'] = (df['high'] - df['low']) / df['close']
        df['close_open_ratio'] = (df['close'] - df['open']) / df['open']

        # Multi-timeframe features (e.g., 15min, 30min, 1h)
        for period in [15, 30, 60]:
            df[f'ema_{period}'] = talib.EMA(df['close'], timeperiod=period)
            df[f'sma_{period}'] = talib.SMA(df['close'], timeperiod=period)
            df[f'ma_cross_{period}'] = np.where(df[f'ema_{period}'] > df[f'sma_{period}'], 1, -1)

        # Volatility features
        df['atr'] = talib.ATR(df['high'], df['low'], df['close'], timeperiod=14)
        df['natr'] = talib.NATR(df['high'], df['low'], df['close'], timeperiod=14)
        df['volatility'] = df['close'].rolling(20).std() / df['close'].rolling(20).mean()

        # Volume features
        df['volume_ma'] = df['volume'].rolling(20).mean()
        df['volume_ratio'] = df['volume'] / df['volume_ma']
        df['obv'] = talib.OBV(df['close'], df['volume'])

        # Momentum features
        df['rsi'] = talib.RSI(df['close'], timeperiod=14)
        df['macd'], df['macd_signal'], _ = talib.MACD(df['close'], fastperiod=12, slowperiod=26, signalperiod=9)
        df['macd_hist'] = df['macd'] - df['macd_signal']
        df['stoch_k'], df['stoch_d'] = talib.STOCH(
            df['high'], df['low'], df['close'], fastk_period=14, slowk_period=3, slowd_period=3
        )

        # Advanced indicators
        df['adx'] = talib.ADX(df['high'], df['low'], df['close'], timeperiod=14)
        df['cci'] = talib.CCI(df['high'], df['low'], df['close'], timeperiod=20)
        df['mfi'] = talib.MFI(df['high'], df['low'], df['close'], df['volume'], timeperiod=14)

        # Price pattern features
        df['inside_bar'] = ((df['high'] < df['high'].shift(1)) & (df['low'] > df['low'].shift(1))).astype(int)
        df['outside_bar'] = ((df['high'] > df['high'].shift(1)) & (df['low'] < df['low'].shift(1))).astype(int)

        # Time features
        df['hour'] = df.index.hour
        df['day_of_week'] = df.index.dayofweek
        df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)

        # Lag features
        for lag in [1, 2, 3, 5]:
            df[f'return_lag{lag}'] = df['price_change'].shift(lag)
            df[f'volume_ratio_lag{lag}'] = df['volume_ratio'].shift(lag)

        # Target encoding features (omitted here to avoid lookahead bias in real applications)

        # Remove rows with missing values
        df = df.dropna()

        return df

# ================== Meta-Learning Model Architecture ==================
# class MetaModel:
#     def __init__(self):
#         # Define base learners
#         self.base_models = [
#             ('xgb_base', XGBClassifier(
#                 n_estimators=300,
#                 max_depth=5,
#                 learning_rate=0.05,
#                 subsample=0.8,
#                 colsample_bytree=0.8,
#                 gamma=0.1,
#                 reg_alpha=0.1,
#                 reg_lambda=0.1,
#                 objective='multi:softprob',
#                 num_class=3,
#                 use_label_encoder=False,
#                 eval_metric='mlogloss'
#             )),
#             ('rf', RandomForestClassifier(
#                 n_estimators=200,
#                 max_depth=7,
#                 class_weight='balanced',
#                 random_state=42
#             )),
#             ('gbm', GradientBoostingClassifier(
#                 n_estimators=150,
#                 learning_rate=0.05,
#                 max_depth=4,
#                 random_state=42
#             ))
#         ]
#
#         # Meta-level learner (stacking)
#         self.meta_model = XGBClassifier(
#             n_estimators=100,
#             max_depth=3,
#             learning_rate=0.1,
#             objective='multi:softprob',
#             num_class=3,
#             use_label_encoder=False,
#             eval_metric='mlogloss'
#         )
#         self.scaler = StandardScaler()
#
#     def fit(self, X, y):
#         sample_weights = compute_sample_weight(class_weight='balanced', y=y)
#         base_preds = []
#
#         # Train base models and collect predictions
#         for name, model in self.base_models:
#             model.fit(X, y, sample_weight=sample_weights)
#             preds = model.predict_proba(X)
#             base_preds.append(preds)
#
#         # Combine predictions as meta-features
#         meta_X = np.hstack(base_preds)
#         meta_X_scaled = self.scaler.fit_transform(meta_X)
#
#         # Train meta-model
#         self.meta_model.fit(meta_X_scaled, y, sample_weight=sample_weights)
#         self.base_models = [(name, model) for name, model in self.base_models]
#
#     def predict_proba(self, X):
#         base_preds = [model.predict_proba(X) for _, model in self.base_models]
#         meta_X = np.hstack(base_preds)
#         meta_X_scaled = self.scaler.transform(meta_X)
#         return self.meta_model.predict_proba(meta_X_scaled)
#
#     def predict(self, X):
#         proba = self.predict_proba(X)
#         # Penalize neutral class, encourage directional signals
#         adjusted = proba * np.array([1.2, 0.9, 1.2])  # [down, neutral, up]
#         return np.argmax(adjusted, axis=1)
#
# # ================== Class Balancing ==================
# def balance_classes(df, label_col='label'):
#     from sklearn.utils import resample
#
#     # Split by class
#     df_neg = df[df[label_col] == -1]
#     df_zero = df[df[label_col] == 0]
#     df_pos = df[df[label_col] == 1]
#
#     # Upsample all to the size of the largest class
#     max_len = max(len(df_neg), len(df_zero), len(df_pos))
#     df_neg_up = resample(df_neg, replace=True, n_samples=max_len, random_state=42)
#     df_zero_up = resample(df_zero, replace=True, n_samples=max_len, random_state=42)
#     df_pos_up = resample(df_pos, replace=True, n_samples=max_len, random_state=42)
#
#     # Combine and shuffle
#     df_balanced = pd.concat([df_neg_up, df_zero_up, df_pos_up]).sample(frac=1, random_state=42)
#     return df_balanced


In [None]:
import os, webbrowser
import plotly.graph_objects as go
from plotly.subplots import make_subplots

def plot_pattern_results(df, patterns, symbol,
                         max_points=2000, buffer=50,
                         open_browser=True):
    # 1) slice out the window you care about
    start = max(len(df) - max_points - buffer, 0)
    dfp = df.iloc[start:].copy()

    # 2) ensure output dir
    out = '../plots/'
    os.makedirs(out, exist_ok=True)

    # 3) each pattern → one chart
    for name in patterns:
        sig = f"Signal_{name}"
        if sig not in dfp.columns:
            print(f"no column {sig}, skipping")
            continue

        up   = dfp[dfp[sig] == 1]
        down = dfp[dfp[sig] == -1]

        fig = make_subplots(
            rows=2, cols=1, shared_xaxes=True,
            row_heights=[0.7,0.3], vertical_spacing=0.05,
            subplot_titles=['Price + Signals','RSI']
        )

        # candles
        fig.add_trace(go.Candlestick(
            x=dfp.index, open=dfp['open'], high=dfp['high'],
            low=dfp['low'], close=dfp['close'], name='Candles'
        ), row=1, col=1)

        # MAs
        fig.add_trace(go.Scatter(
            x=dfp.index, y=dfp['MA20'], mode='lines', name='MA20'
        ), row=1, col=1)
        fig.add_trace(go.Scatter(
            x=dfp.index, y=dfp['MA50'], mode='lines', name='MA50'
        ), row=1, col=1)

        # bullish
        if not up.empty:
            fig.add_trace(go.Scatter(
                x=up.index, y=up['close'] * 1.005,
                mode='markers', name='Bullish',
                marker=dict(symbol='triangle-up', color='green', size=10)
            ), row=1, col=1)

        # bearish
        if not down.empty:
            fig.add_trace(go.Scatter(
                x=down.index, y=down['close'] * 0.995,
                mode='markers', name='Bearish',
                marker=dict(symbol='triangle-down', color='red', size=10)
            ), row=1, col=1)

        # RSI subplot
        fig.add_trace(go.Scatter(
            x=dfp.index, y=dfp['RSI'], mode='lines', name='RSI'
        ), row=2, col=1)
        fig.add_hline(y=50, line_dash='dash', line_color='gray', row=2, col=1)

        fig.update_layout(
            title=f"{symbol} — {name} Signals",
            xaxis_rangeslider_visible=False,
            template='plotly_white',
            height=600
        )

        fn = os.path.join(out, f"{name}_signals.html")
        fig.write_html(fn)
        print(f"Saved {fn}")
        if open_browser:
            webbrowser.open('file://' + os.path.abspath(fn))

        # clear the figure before next
        fig.data = []


In [6]:
# ================== signal_prediction_pipeline_updated.py ==================
# Integrated quant signal prediction pipeline including:
# 1. Quantile-based labeling (calculate_target_quantile)
# 2. Class balancing (SMOTE)
# 3. Block Bootstrap for White’s Reality Check (block_bootstrap_pval)
# 4. Weighted performance evaluation (weighted_signal_evaluation)
# 5. Candlestick pattern features (integrate_candlestick_features)

import pandas as pd
import numpy as np
import os
import sys
import talib
import joblib
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE

# === Quantile-based labeling ===
def calculate_target_quantile(df, future_bars=3, lower_q=0.2, upper_q=0.8):
    future_return = df['close'].shift(-future_bars) / df['close'] - 1
    lower = future_return.quantile(lower_q)
    upper = future_return.quantile(upper_q)
    conds = [
        future_return <= lower,
        (future_return > lower) & (future_return < upper),
        future_return >= upper
    ]
    labels = [-1, 0, 1]
    direction = np.select(conds, labels, default=0)
    magnitude = future_return.abs()
    return pd.DataFrame({'direction': direction, 'magnitude': magnitude}, index=df.index)

# === Simple oversampling (alternative to SMOTE) ===
def simple_oversample(X, y):
    df_train = X.copy()
    df_train['direction'] = y.values
    majority = df_train[df_train['direction'] == 0]
    minority = df_train[df_train['direction'] != 0]
    minority_upsampled = resample(
        minority,
        replace=True,
        n_samples=len(majority),
        random_state=42
    )
    df_bal = pd.concat([majority, minority_upsampled])
    y_bal = df_bal['direction']
    X_bal = df_bal.drop(columns='direction')
    return X_bal, y_bal

# === Candlestick pattern features ===
def calculate_patterns(df):
    funcs = {
        'Hammer': talib.CDLHAMMER,
        'InvertedHammer': talib.CDLINVERTEDHAMMER,
        'BullishEngulfing': lambda o,h,l,c: np.where(talib.CDLENGULFING(o,h,l,c)==100,100,0),
        'BearishEngulfing': lambda o,h,l,c: np.where(talib.CDLENGULFING(o,h,l,c)==-100,-100,0),
        'PiercingLine': talib.CDLPIERCING,
        'DarkCloudCover': talib.CDLDARKCLOUDCOVER,
        'MorningStar': talib.CDLMORNINGSTAR,
        'EveningStar': talib.CDLEVENINGSTAR,
        'ThreeWhiteSoldiers': talib.CDL3WHITESOLDIERS,
        'ThreeBlackCrows': talib.CDL3BLACKCROWS,
        # Add more patterns as needed
    }
    for name, fn in funcs.items():
        df[name] = fn(df['open'].values, df['high'].values, df['low'].values, df['close'].values)
    return df

def aggregate_candlestick_signals(df):
    bullish = ['Hammer','InvertedHammer','BullishEngulfing','PiercingLine','MorningStar','ThreeWhiteSoldiers']
    bearish = ['BearishEngulfing','DarkCloudCover','EveningStar','ThreeBlackCrows']
    df['bullish_score'] = df[bullish].eq(100).sum(axis=1)
    df['bearish_score'] = df[bearish].eq(-100).sum(axis=1)
    df['candlestick_score'] = df['bullish_score'] - df['bearish_score']
    return df

def integrate_candlestick_features(df):
    df = calculate_patterns(df)
    df = aggregate_candlestick_signals(df)
    return df.dropna()

# === Feature engineering ===
class AdvancedFeatureEngineer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None): return self
    def transform(self, df):
        df = df.copy()
        if not isinstance(df.index, pd.DatetimeIndex):
            df.index = pd.to_datetime(df.index)
        df['price_change'] = df['close'].pct_change()
        df['high_low_ratio'] = (df['high'] - df['low']) / df['close']
        df['close_open_ratio'] = (df['close'] - df['open']) / df['open']
        df['rsi'] = talib.RSI(df['close'])
        df['macd'], df['macd_signal'], _ = talib.MACD(df['close'])
        df['volume_ma'] = df['volume'].rolling(20).mean()
        df['volume_ratio'] = df['volume'] / df['volume_ma']
        df['price_volume_corr'] = df['close'].rolling(20).corr(df['volume'])
        df['rsi_divergence'] = df['close'] - df['rsi']
        return df.dropna()

# === Enhanced MetaModel with PCA + ElasticNet ===
class EnhancedMetaModel:
    def __init__(self, n_pca=20, l1_ratio=0.5):
        self.pca = PCA(n_components=n_pca)
        self.base_models = [
            ('xgb', XGBClassifier(
                n_estimators=100,
                reg_alpha=1.0,
                reg_lambda=1.0,
                use_label_encoder=False,
                eval_metric='mlogloss'
            )),
            ('rf', RandomForestClassifier(
                n_estimators=100,
                max_depth=8,
                class_weight='balanced'
            )),
            ('gbm', GradientBoostingClassifier(
                n_estimators=100,
                learning_rate=0.1
            ))
        ]
        self.meta_model = Pipeline([
            ('scaler', StandardScaler()),
            ('pca', PCA(n_components=n_pca)),
            ('clf', LogisticRegression(
                penalty='elasticnet',
                solver='saga',
                l1_ratio=l1_ratio,
                C=1.0,
                max_iter=1000,
                class_weight='balanced'
            ))
        ])
        self.scaler = StandardScaler()
        self.encoder = LabelEncoder()

    def fit(self, X, y):
        X_pca = self.pca.fit_transform(self.scaler.fit_transform(X))
        y_enc = self.encoder.fit_transform(y)
        meta_X = []
        for name, model in self.base_models:
            model.fit(X_pca, y_enc)
            meta_X.append(model.predict_proba(X_pca))
        meta_X = np.hstack(meta_X)
        self.meta_model.fit(X, y_enc)
        return self

    def predict(self, X):
        X_pca = self.pca.transform(self.scaler.transform(X))
        meta_X = np.hstack([m.predict_proba(X_pca) for _, m in self.base_models])
        preds = self.meta_model.predict(X)
        return self.encoder.inverse_transform(preds)

# === Block Bootstrap p-value for White Reality Check ===
def block_bootstrap_pval(returns, B=1000, block_len=5, seed=42):
    np.random.seed(seed)
    n = len(returns)
    indices = np.arange(n)
    boot_means = []
    for _ in range(B):
        sample_idx = []
        nb = int(np.ceil(n / block_len))
        for _ in range(nb):
            start = np.random.randint(0, n - block_len + 1)
            sample_idx.extend(indices[start: start + block_len])
        sample_idx = sample_idx[:n]
        boot_means.append(returns[sample_idx].mean())
    d_bar = returns.mean()
    return np.mean([d_bar <= m for m in boot_means])

# === Weighted signal evaluation ===
def weighted_signal_evaluation(pred_list, y_list, sharpe_list):
    weights = np.clip(sharpe_list, 0, None)
    weights = weights / weights.sum() if weights.sum() > 0 else np.ones_like(weights) / len(weights)
    combined = sum(w * (np.sign(y) * 0.001 * np.sign(p))
                   for w, p, y in zip(weights, pred_list, y_list))
    sharpe = combined.mean() / combined.std() * np.sqrt(252*24*4) if combined.std() else 0
    acc = accuracy_score(np.concatenate(y_list), np.concatenate(pred_list))
    return {'accuracy': acc, 'sharpe': sharpe}

# === Main execution pipeline ===
def run_pipeline(csv_path):
    df = pd.read_csv(csv_path)
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df.set_index('timestamp', inplace=True)
    df = df.resample('15min').agg({'open':'first','high':'max','low':'min','close':'last','volume':'sum'}).dropna()

    # 1. Generate target labels
    target_df = calculate_target_quantile(df, future_bars=3, lower_q=0.2, upper_q=0.8)
    df['direction'] = target_df['direction']
    df['magnitude'] = target_df['magnitude']
    df = df.dropna(subset=['direction'])
    print("Label distribution:\n", df['direction'].value_counts(normalize=True))

    # 2. Feature engineering
    df_feat = AdvancedFeatureEngineer().fit_transform(df)
    df_feat = integrate_candlestick_features(df_feat)

    X = df_feat.drop(columns=['open','high','low','close','volume','direction','magnitude'])
    y = df_feat['direction']

    results, pred_list, y_list, sharpe_list = [], [], [], []
    train_len = 5000
    test_len  = 1000
    step      = 1000

    n = len(X)
    for start in range(0, n - train_len - test_len + 1, step):
        train_idx = range(start, start + train_len)
        test_idx  = range(start + train_len, start + train_len + test_len)

        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        X_test,  y_test  = X.iloc[test_idx],  y.iloc[test_idx]

        if y_train.nunique() < 2:
            print(f"Window {start}-{start+train_len+test_len} skipped: only {y_train.unique()}")
            continue

        X_res, y_res = SMOTE().fit_resample(X_train, y_train)
        model = EnhancedMetaModel().fit(X_res, y_res)
        preds = model.predict(X_test)

        ret = df['close'].iloc[test_idx].pct_change().shift(-1) * np.sign(preds)
        sharpe = ret.mean()/ret.std()*np.sqrt(252*24*4) if ret.std() else 0
        sharpe_list.append(sharpe)

        print(f"\n======= Window {start}-{start+train_len-1} → Test {start+train_len}-{start+train_len+test_len-1} =======")
        print(classification_report(y_test, preds, digits=3))
        print(f"Sharpe: {sharpe:.2f} | Acc: {accuracy_score(y_test, preds):.4f}")

        results.append({'accuracy': accuracy_score(y_test, preds), 'sharpe': sharpe})
        pred_list.append(preds)
        y_list.append(y_test.values)

    if not results:
        print("❌ No valid folds")
        return

    print("\n======= Summary =======")
    for i, r in enumerate(results, 1):
        print(f"Fold {i}: Acc={r['accuracy']:.4f}, Sharpe={r['sharpe']:.2f}")

    all_ret = np.concatenate([np.sign(y)*0.001*np.sign(p) for p, y in zip(pred_list, y_list)])
    print("White Reality Check p-value:", block_bootstrap_pval(all_ret))

    w = weighted_signal_evaluation(pred_list, y_list, np.array(sharpe_list))
    print(f"Weighted Acc={w['accuracy']:.4f}, Sharpe={w['sharpe']:.2f}")

    joblib.dump(EnhancedMetaModel().fit(X, y), "final_model.pkl")
    print("Saved final_model.pkl")

    df_feat = AdvancedFeatureEngineer().fit_transform(df)
    df_feat = integrate_candlestick_features(df_feat)
    df_plot = df_feat.copy()
    df_plot['MA20'] = df_plot['close'].rolling(20).mean()
    df_plot['MA50'] = df_plot['close'].rolling(50).mean()
    df_plot['RSI']  = talib.RSI(df_plot['close'])

    patterns = ['BearishEngulfing','ThreeWhiteSoldiers','InvertedHammer']
    for pat in patterns:
        df_plot[f"Signal_{pat}"] = np.where(
            df_plot[pat]==100,  1,
            np.where(df_plot[pat]==-100, -1, 0)
        )

    plot_pattern_results(
        df=df_plot,
        patterns=patterns,
        symbol='BTCUSDT',
        max_points=2000,
        buffer=50,
        open_browser=True
    )


In [7]:
csv_path = "../../data/BTCUSDT_1min_2024-05-01_to_2025-05-01.csv"
run_pipeline(csv_path)

Label distribution:
 direction
 0    0.600023
-1    0.199989
 1    0.199989
Name: proportion, dtype: float64

              precision    recall  f1-score   support

          -1      0.378     0.115     0.176       148
           0      0.813     0.769     0.790       731
           1      0.212     0.463     0.291       121

    accuracy                          0.635      1000
   macro avg      0.468     0.449     0.419      1000
weighted avg      0.676     0.635     0.639      1000

Sharpe: -3.05 | Acc: 0.6350

              precision    recall  f1-score   support

          -1      0.284     0.216     0.246       259
           0      0.608     0.396     0.479       513
           1      0.284     0.583     0.382       228

    accuracy                          0.392      1000
   macro avg      0.392     0.398     0.369      1000
weighted avg      0.450     0.392     0.397      1000

Sharpe: 6.72 | Acc: 0.3920

              precision    recall  f1-score   support

          -1    