In [1]:
from binance.client import Client
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import time
from ta.trend import MACD
from ta.momentum import RSIIndicator, StochasticOscillator
from ta.volatility import BollingerBands
from ta.trend import SMAIndicator
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import (accuracy_score, precision_score, 
                           recall_score, f1_score, confusion_matrix,
                           precision_recall_curve)
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')

# ======================
# PARAMETERS
# ======================
THRESHOLD_PCT = np.arange(0.1, 3, 0.1)  # Percentage decrease thresholds to evaluate
SYMBOL = "BTCUSDT"
START_DATE = "1 Jan 2012"
TEST_SIZE = 0.4  # Holdout set size

# ======================
# DATA FETCHING
# ======================
client = Client(api_key='your_api_key', api_secret='your_api_secret')

def fetch_daily_data(symbol, start_str, end_date=None):
    """Fetch raw daily candlestick data from Binance"""
    if end_date is None:
        end_date = datetime.now()
    else:
        end_date = pd.to_datetime(end_date)
    
    start_ts = int(pd.to_datetime(start_str).timestamp() * 1000)
    all_data = []
    
    while True:
        candles = client.get_klines(
            symbol=symbol,
            interval=Client.KLINE_INTERVAL_1DAY,
            startTime=start_ts,
            limit=1000
        )
        
        if not candles:
            break
            
        df = pd.DataFrame(candles, columns=[
            'timestamp', 'open', 'high', 'low', 'close', 'volume',
            'close_time', 'quote_asset_volume', 'trades',
            'taker_buy_base', 'taker_buy_quote', 'ignore'
        ])
        
        df['timestamp'] = pd.to_datetime(df['timestamp'], unit='ms')
        df.set_index('timestamp', inplace=True)
        df = df[['open', 'high', 'low', 'close', 'volume']].astype(float)
        all_data.append(df)
        
        # Get the last timestamp in the dataframe
        last_timestamp = df.index[-1]
        
        # Check if we've reached the end date
        if last_timestamp >= end_date:
            break
            
        # Set the new start time to the next day after the last candle
        start_ts = int((last_timestamp + timedelta(days=1)).timestamp() * 1000)
        
        # Rate limiting
        time.sleep(0.1)
    
    full_df = pd.concat(all_data).drop_duplicates()
    # Ensure we don't exceed the end_date
    return full_df[full_df.index <= end_date]

# ======================
# FEATURE ENGINEERING
# ======================
def calculate_daily_indicators(daily_df):
    """Calculate all technical indicators on daily data"""
    df = daily_df.copy()
    
    # Ensure no lookahead bias by using shifted close prices
    df['close_shifted'] = df['close'].shift(1)
    df['high_shifted'] = df['high'].shift(1)
    df['low_shifted'] = df['low'].shift(1)
    
    # Momentum Indicators
    df['rsi'] = RSIIndicator(df['close_shifted'], window=14).rsi()
    stoch = StochasticOscillator(
        high=df['high_shifted'],
        low=df['low_shifted'],
        close=df['close_shifted'],
        window=14,
        smooth_window=3
    )
    df['stoch_k'] = stoch.stoch()
    df['stoch_d'] = stoch.stoch_signal()
    
    # Trend Indicators
    macd = MACD(df['close_shifted'], window_slow=26, window_fast=12, window_sign=9)
    df['macd'] = macd.macd()
    df['macd_signal'] = macd.macd_signal()
    df['macd_diff'] = macd.macd_diff()
    
    # Moving Averages
    for window in [9, 20, 50, 100, 200]:
        df[f'ma_{window}'] = SMAIndicator(df['close_shifted'], window=window).sma_indicator()
    
    # Volatility Indicators
    bb = BollingerBands(df['close_shifted'], window=20, window_dev=2)
    df['bb_upper'] = bb.bollinger_hband()
    df['bb_lower'] = bb.bollinger_lband()
    df['bb_width'] = bb.bollinger_wband()
    
    # Lag Features
    for feature in ['rsi', 'stoch_k', 'macd', 'ma_9', 'bb_width']:
        for lag in [1, 3, 5, 7, 14]:  # 1d, 3d, 5d, 1w, 2w lags
            df[f'{feature}_lag{lag}'] = df[feature].shift(lag)
    
    # Market Context Features
    df['daily_return'] = df['close'].pct_change()
    df['volatility_7d'] = df['daily_return'].rolling(7).std()
    df['volume_ma_7d'] = df['volume'].rolling(7).mean()
    
    return df.dropna()

def create_daily_dataset(daily_df, threshold_pct):
    """Create the final dataset with target variable"""
    df = daily_df.copy()
    
    # MODIFIED: Create target for price DECREASE - did price fall below threshold?
    df['target'] = (df['low'] <= df['open'] * (1 - threshold_pct/100)).astype(int)
    
    # Add market open context
    df['overnight_return'] = (df['open'] - df['close'].shift(1)) / df['close'].shift(1)
    df['intraday_range'] = (df['high'] - df['low']) / df['open']
    
    return df.dropna()

# ======================
# MODEL TRAINING & EVALUATION
# ======================
def prepare_features(daily_df):
    """Select final features and split data"""
    # Exclude raw prices and forward-looking data
    exclude = ['open', 'high', 'low', 'close', 'volume', 'target']
    features = [col for col in daily_df.columns if col not in exclude]
    
    X = daily_df[features]
    y = daily_df['target']
    
    # Time-based split
    tscv = TimeSeriesSplit(n_splits=5)
    for train_idx, test_idx in tscv.split(X):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    
    return X_train, X_test, y_train, y_test

def evaluate_model(model, X_train, y_train, X_test, y_test):
    """Enhanced evaluation with threshold optimization"""
    # Get predictions
    y_proba_train = model.predict_proba(X_train)[:, 1]
    y_proba_test = model.predict_proba(X_test)[:, 1]
    
    # Find optimal threshold on TRAIN set
    precision, recall, thresholds = precision_recall_curve(y_train, y_proba_train)
    f1_scores = 2 * (precision[:-1] * recall[:-1]) / (precision[:-1] + recall[:-1] + 1e-8)
    optimal_idx = np.argmax(f1_scores)
    optimal_threshold = thresholds[optimal_idx]
    
    # Evaluate both sets at optimal threshold
    y_pred_train = (y_proba_train >= optimal_threshold).astype(int)
    y_pred_test = (y_proba_test >= optimal_threshold).astype(int)
    
    # Calculate metrics
    def get_metrics(y_true, y_pred):
        accuracy = accuracy_score(y_true, y_pred)
        precision = precision_score(y_true, y_pred)
        recall = recall_score(y_true, y_pred)
        f1 = f1_score(y_true, y_pred)
        cm = confusion_matrix(y_true, y_pred)
        tn, fp, fn, tp = cm.ravel()
        return accuracy, precision, recall, f1, tn, fp, fn, tp
    
    train_metrics = get_metrics(y_train, y_pred_train)
    test_metrics = get_metrics(y_test, y_pred_test)
    
    # Print comparison
    print(f"\nOptimal Threshold: {optimal_threshold:.4f}")
    print("\n{:<15} {:<10} {:<10} {:<10} {:<10}".format(
        'Set', 'Accuracy', 'Precision', 'Recall', 'F1'))
    print("{:<15} {:<10.4f} {:<10.4f} {:<10.4f} {:<10.4f}".format(
        'Train', train_metrics[0], train_metrics[1], train_metrics[2], train_metrics[3]))
    print("{:<15} {:<10.4f} {:<10.4f} {:<10.4f} {:<10.4f}".format(
        'Test', test_metrics[0], test_metrics[1], test_metrics[2], test_metrics[3]))
    
    print("\nTrain Confusion Matrix:")
    print(f"[[TN: {train_metrics[4]} FP: {train_metrics[5]}]")
    print(f" [FN: {train_metrics[6]} TP: {train_metrics[7]}]]")
    
    print("\nTest Confusion Matrix:")
    print(f"[[TN: {test_metrics[4]} FP: {test_metrics[5]}]")
    print(f" [FN: {test_metrics[6]} TP: {test_metrics[7]}]]")
    
    # Calculate overfitting gap
    overfitting_gap = train_metrics[3] - test_metrics[3]
    print(f"\nOverfitting Gap (F1): {overfitting_gap:.4f}")
    
    if overfitting_gap > 0.1:
        print("\nWarning: Potential overfitting (F1 gap > 0.1)")
    elif overfitting_gap > 0.05:
        print("\nNote: Moderate overfitting (F1 gap > 0.05)")
    else:
        print("\nNo significant overfitting detected")
    
    return y_proba_test, optimal_threshold

# ======================
# MODIFIED MAIN EXECUTION FOR MULTIPLE THRESHOLDS
# ======================
def evaluate_multiple_thresholds(daily_with_indicators, threshold_list):
    """Evaluate model performance across multiple threshold percentages with consistent test set"""
    results = []
    
    # Create base dataset (without target)
    daily_data_full = create_daily_dataset(daily_with_indicators, threshold_pct=1.0)  # Dummy threshold
    X_full = daily_data_full[[c for c in daily_data_full.columns if c not in ['open','high','low','close','volume','target']]]
    
    # Single time-based split for all thresholds
    tscv = TimeSeriesSplit(n_splits=3)
    for train_idx, test_idx in tscv.split(X_full):
        X_train, X_test = X_full.iloc[train_idx], X_full.iloc[test_idx]
        y_train_all, y_test_all = daily_data_full.iloc[train_idx], daily_data_full.iloc[test_idx]
    
    # Get consistent latest observation across all thresholds
    last_data = X_test.iloc[[-1]]
    last_timestamp = last_data.index[0]
    last_open = daily_data_full.loc[last_timestamp, 'open']
    last_low = daily_data_full.loc[last_timestamp, 'low']
    
    for threshold_pct in threshold_list:
        
        # Create targets for price DECREASE using consistent splits
        y_train = (y_train_all['low'] <= y_train_all['open']*(1 - threshold_pct/100)).astype(int)
        y_test = (y_test_all['low'] <= y_test_all['open']*(1 - threshold_pct/100)).astype(int)
        
        # Train model
        model = xgb.XGBClassifier(
            objective='binary:logistic',
            max_depth=4,
            learning_rate=0.03,
            n_estimators=150,
            scale_pos_weight=len(y_train[y_train==0])/len(y_train[y_train==1]),
            early_stopping_rounds=20
        )
        model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)], verbose=False)
        
        # Get predictions
        y_proba = model.predict_proba(X_test)[:,1]
        
        # Optimal threshold selection
        precision, recall, thresholds = precision_recall_curve(y_test, y_proba)
        f1_scores = 2 * (precision[:-1] * recall[:-1]) / (precision[:-1] + recall[:-1] + 1e-8)
        optimal_threshold = thresholds[np.argmax(f1_scores)]
        
        # Latest prediction
        last_proba = model.predict_proba(last_data)[0,1]
        last_pred = int(last_proba >= optimal_threshold)
        actual_move = int(last_low <= last_open*(1 - threshold_pct/100))
        
        # Test set metrics
        y_pred = (y_proba >= optimal_threshold).astype(int)
        tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
        
        # Get train predictions and metrics
        y_proba_train = model.predict_proba(X_train)[:, 1]
        y_pred_train = (y_proba_train >= optimal_threshold).astype(int)
        tn_train, fp_train, fn_train, tp_train = confusion_matrix(y_train, y_pred_train).ravel()

        results.append({
            'Threshold (%)': threshold_pct,
            'Optimal_Threshold': optimal_threshold,
            
            # Test metrics
            'Test_Accuracy': accuracy_score(y_test, y_pred),
            'Test_Precision': precision_score(y_test, y_pred),
            'Test_Recall': recall_score(y_test, y_pred),
            'Test_F1': f1_score(y_test, y_pred),
            'Test_Sensitivity': tp/(tp+fn),
            'Test_Specificity': tn/(tn+fp),
            
            # Train metrics
            'Train_Accuracy': accuracy_score(y_train, y_pred_train),
            'Train_Precision': precision_score(y_train, y_pred_train),
            'Train_Recall': recall_score(y_train, y_pred_train),
            'Train_F1': f1_score(y_train, y_pred_train),
            'Train_Sensitivity': tp_train/(tp_train+fn_train),
            'Train_Specificity': tn_train/(tn_train+fp_train),
            
            # Overfitting gaps
            'Accuracy_Gap': accuracy_score(y_train, y_pred_train) - accuracy_score(y_test, y_pred),
            'F1_Gap': f1_score(y_train, y_pred_train) - f1_score(y_test, y_pred),
            
            # Prediction info
            'Latest_Proba': last_proba,
            'Latest_Pred': last_pred,
            'Actual_Move': actual_move,
            'Open_Price': last_open,
            'Target_Price': last_open * (1 - threshold_pct/100),  # Price DECREASE target
            'Timestamp': last_timestamp.strftime('%Y-%m-%d'),
            'Current_Time_UTC+3': (datetime.utcnow() + timedelta(hours=3)).strftime('%Y-%m-%d %H:%M:%S')
        })
    
    return pd.DataFrame(results)

# ======================
# MAIN EXECUTION
# ======================
if __name__ == "__main__":
    print("Fetching daily data...")
    daily_data = fetch_daily_data(SYMBOL, START_DATE)
    
    # Print last available datetime in dataset
    last_data_datetime = daily_data.index[-1]
    print(f"\nLast available data in dataset: {last_data_datetime.strftime('%Y-%m-%d %H:%M:%S')}")
    
    print("\nCalculating indicators...")
    daily_with_indicators = calculate_daily_indicators(daily_data)
    
    print("\nEvaluating multiple thresholds...")
    results_df = evaluate_multiple_thresholds(daily_with_indicators, THRESHOLD_PCT)
    
    # Get prediction date (last date in test set)
    prediction_date = pd.to_datetime(results_df['Timestamp'].iloc[0])
    print(f"\nPrediction date: {prediction_date.strftime('%Y-%m-%d')}")
    print(f"Current time (UTC+3): {results_df['Current_Time_UTC+3'].iloc[0]}")
    
    # Print summary of all thresholds
    print("\n" + "="*120)
    print("SUMMARY OF ALL THRESHOLDS (PRICE DECREASE PREDICTION)".center(120))
    print("="*120)
    print(results_df.round(2).to_string(index=False, justify='center'))
    print("="*120)

Fetching daily data...

Last available data in dataset: 2025-05-18 00:00:00

Calculating indicators...

Evaluating multiple thresholds...

Prediction date: 2025-05-18
Current time (UTC+3): 2025-05-18 17:25:14

                                 SUMMARY OF ALL THRESHOLDS (PRICE DECREASE PREDICTION)                                  
 Threshold (%)  Optimal_Threshold  Test_Accuracy  Test_Precision  Test_Recall  Test_F1  Test_Sensitivity  Test_Specificity  Train_Accuracy  Train_Precision  Train_Recall  Train_F1  Train_Sensitivity  Train_Specificity  Accuracy_Gap  F1_Gap  Latest_Proba  Latest_Pred  Actual_Move  Open_Price  Target_Price Timestamp   Current_Time_UTC+3
     0.1              0.20             0.96            0.96          1.00       0.98          1.00              0.04             0.99            0.99            0.99        0.99          0.99               0.73             0.02       0.01       0.50          1            0        103126.65    103023.52   2025-05-18 2025-05-18 17:2