In [None]:
import datetime
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import TimeSeriesSplit, ParameterGrid
import os
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

import matplotlib.pyplot as plt
from binance_historical_data import BinanceDataDumper

from joblib import Parallel, delayed
from sklearn.model_selection import ParameterGrid
from scipy.stats import entropy



# Download data functions

In [None]:

def download_data_from_binance(symbol, temporalidad, start_year, end_year):
    """Download spot klines for one symbol, one calendar month per request.

    Args:
        symbol: Ticker passed to the dumper (as a one-element list).
        temporalidad: Value forwarded as the dumper's ``data_frequency``.
        start_year: First year to download; the walk starts on January 1st.
        end_year: Last year to download; the walk stops after December 31st.

    Files are written under the ``data`` directory by ``BinanceDataDumper``.
    """
    dumper = BinanceDataDumper(
        path_dir_where_to_dump="data",
        asset_class="spot",
        data_type="klines",
        data_frequency=temporalidad
    )

    one_day = datetime.timedelta(days=1)
    month_start = datetime.date(start_year, 1, 1)
    last_day = datetime.date(end_year, 12, 31)

    while not month_start > last_day:
        # Jumping 32 days from the 1st always lands in the following month;
        # snapping back to day 1 gives the first day of that month.
        following_month = (month_start.replace(day=1) + datetime.timedelta(days=32)).replace(day=1)
        month_end = following_month - one_day
        print(f"Bajando {month_start} a {month_end}")
        dumper.dump_data(
            tickers=[symbol],
            date_start=month_start,
            date_end=month_end
        )
        month_start = following_month


In [None]:
def parse_binance_files(folder_path, lookback, future_candles_to_predict, use_time_filter=False, expected_timeframe_seconds=None):
    """Load Binance kline CSVs from a folder and build the modelling frame.

    Args:
        folder_path: Directory scanned (non-recursively) for ``*.csv`` files.
        lookback: Number of previous candles whose direction is exposed as
            ``prev_1`` ... ``prev_<lookback>`` features.
        future_candles_to_predict: Horizon, in candles, used to build
            ``future_open`` / ``future_close`` and the binary ``target``.
        use_time_filter: When True, keep only candles between 14:30 and 21:00.
        expected_timeframe_seconds: When given, a file is kept only if the most
            common spacing between consecutive candles equals this value.

    Returns:
        ``(frame, feature_names)``. ``frame`` contains only rows where every
        feature is available and the future candle exists; ``feature_names``
        is the list of model input columns (``target`` excluded). When no file
        could be loaded the result is ``(empty DataFrame, [])``.

    Raises:
        ValueError: if the time filter removes every row.
    """
    column_names = [
        'open_time', 'open', 'high', 'low', 'close', 'volume',
        'close_time', 'quote_asset_volume', 'number_of_trades',
        'taker_buy_volume_base', 'taker_buy_volume_quote', 'ignore'
    ]

    float_cols = ['open', 'high', 'low', 'close', 'volume',
                'quote_asset_volume', 'taker_buy_volume_base', 'taker_buy_volume_quote']

    dfs = []
    # Sorted so the load order (and the discard messages) is reproducible.
    files = sorted(os.path.join(folder_path, f) for f in os.listdir(folder_path) if f.endswith(".csv"))

    for f in files:
        raw = pd.read_csv(f, names=column_names)
        parsed = None
        # Try each epoch unit on the untouched raw columns; the first unit that
        # parses and (optionally) matches the expected candle spacing wins.
        for unit in ('ms', 'us'):
            try:
                open_dt = pd.to_datetime(raw['open_time'], unit=unit)
                close_dt = pd.to_datetime(raw['close_time'], unit=unit)
                delta = open_dt.diff().dt.total_seconds().dropna()
                if delta.empty:
                    # Fewer than two candles: the spacing cannot be checked.
                    continue
                mode_delta = delta.mode().iloc[0]
            except Exception:
                # Best effort: a unit that cannot be parsed is simply skipped.
                continue
            if expected_timeframe_seconds is None or mode_delta == expected_timeframe_seconds:
                parsed = raw.assign(date=open_dt, close_time=close_dt)
                break

        if parsed is None:
            print(f"Archivo descartado por timeframe: {f}")
            continue

        parsed[float_cols] = parsed[float_cols].astype(float)
        parsed = parsed.drop(columns='ignore')
        dfs.append(parsed)

    if not dfs:
        print("No se cargaron archivos.")
        return pd.DataFrame(), []

    # Order by the parsed timestamp rather than the raw integer: files may have
    # been parsed with different units, so raw values are not comparable.
    df = pd.concat(dfs).sort_values('date', kind='stable').reset_index(drop=True)
    df['hora'] = df['date'].dt.time

    df = df.set_index('date')
    if use_time_filter:
        df = df.between_time("14:30", "21:00")
        if df.empty:
            raise ValueError("No hay datos luego del filtro horario")
    df = df.reset_index()

    # --- Candle descriptors -------------------------------------------------
    df['ema20'] = df['close'].ewm(span=20, adjust=False).mean()
    df['tendency'] = np.where(df['close'] > df['ema20'], 1, -1)
    df['prev_tendency'] = df['tendency'].shift(1)
    df['type'] = np.where(df['close'] > df['open'], 'up', 'dw')
    df['type_encoded'] = df['type'].map({'dw': 0, 'up': 1})
    df['size'] = (df['close'] - df['open']).abs()
    df['whole_size'] = df['high'] - df['low']

    # Tercile buckets of the body size. Bin codes are requested instead of
    # fixed labels because, when duplicate edges are dropped (many candles of
    # identical size), fewer than three bins remain and passing three labels
    # would raise. With fewer bins only the first labels are used.
    size_labels = ['small', 'medium', 'large']
    size_codes = pd.qcut(df['size'], q=3, labels=False, duplicates='drop')
    df['size_class'] = pd.Categorical.from_codes(
        size_codes.fillna(-1).astype(int).to_numpy(),
        categories=size_labels,
        ordered=True,
    )
    df['size_encoded'] = df['size_class'].map({'small': 0, 'medium': 1, 'large': 2})
    df['ema_htf'] = df['close'].ewm(span=200, adjust=False).mean()
    df['tendency_htf'] = np.where(df['close'] > df['ema_htf'], 1, -1)

    roll = df['volume'].rolling(20)
    # A zero rolling std becomes NaN so the row is dropped instead of dividing by zero.
    df['volume_norm'] = (df['volume'] - roll.mean()) / roll.std().replace(0, np.nan)

    # --- Labels -------------------------------------------------------------
    df['future_open'] = df['open'].shift(-future_candles_to_predict)
    df['future_close'] = df['close'].shift(-future_candles_to_predict)
    df['future_volume'] = df['volume'].shift(-future_candles_to_predict + 1).rolling(window=future_candles_to_predict).sum()
    df['target'] = np.where(df['future_close'] > df['future_open'], 1, 0)

    # --- Engulfing pattern on the two previous candles ----------------------
    o1 = df['open'].shift(1)
    c1 = df['close'].shift(1)
    o2 = df['open'].shift(2)
    c2 = df['close'].shift(2)
    engulf_up = (c1 > o1) & (c2 < o2) & (o1 < c2) & (c1 > o2)
    engulf_dw = (c1 < o1) & (c2 > o2) & (o1 > c2) & (c1 < o2)
    df['engulf'] = 0
    df.loc[engulf_up, 'engulf'] = 1
    df.loc[engulf_dw, 'engulf'] = -1

    shifts = [df['type_encoded'].shift(i).rename(f'prev_{i}') for i in range(1, lookback + 1)]
    df_shifted = pd.concat([df] + shifts, axis=1)
    df_shifted['prev_volume'] = df_shifted['volume_norm'].shift(1)

    cols_prev = [f'prev_{i}' for i in range(1, lookback + 1)] + ['prev_volume']
    cols_current = ['volume', 'tendency', 'tendency_htf', 'size', 'whole_size', 'type_encoded', 'volume_norm', 'quote_asset_volume', 'number_of_trades', 'taker_buy_volume_base', 'taker_buy_volume_quote']
    cols = cols_prev + cols_current + ['target']

    # `target` is built with np.where and is therefore never NaN: the trailing
    # rows that have no future candle would otherwise be kept and labelled 0.
    # Requiring the future prices removes those unlabeled rows.
    required = cols + ['future_open', 'future_close']

    return df_shifted.dropna(subset=required), cols[:-1]


# Ejecución con incertidumbre

In [None]:
def analizar_impacto_de_thresholds(df_prep, 
                                features,
                                param_rf_default=None,
                                threshold_range=np.arange(0.50, 0.705, 0.02),
                                n_splits=5,
                                optimize_by="profit"):
    """Sweep certainty thresholds, keeping the best grid candidate for each one.

    For every threshold, each parameter combination is evaluated with
    ``evaluar_modelo_cv`` and the one with the highest score under
    ``optimize_by`` is kept. Profit factor, net profit, trade count and the
    equity curve of that winner are then plotted against the threshold.

    NOTE(review): a later cell defines another function with this same name;
    whichever cell runs last is the one that stays bound.

    Args:
        df_prep: Prepared frame passed to ``evaluar_modelo_cv``.
        features: Feature column names.
        param_rf_default: Mapping of RandomForest parameter names to either a
            single value or a sequence of candidate values. Defaults to
            ``n_estimators=100, max_depth=5``.
        threshold_range: Certainty thresholds to evaluate.
        n_splits: Accepted for interface compatibility; it is not forwarded,
            so ``evaluar_modelo_cv`` runs with its own fold count.
        optimize_by: ``"profit"``, ``"pf"`` (profit factor) or ``"dd"``
            (profit divided by max drawdown).

    Raises:
        ValueError: if ``optimize_by`` is not one of the supported names.

    Side effects:
        Sets the module-level ``use_uncertainty`` flag to True, which is how
        ``evaluar_modelo_cv`` is told to filter trades by certainty.
    """
    global use_uncertainty
    use_uncertainty = True

    score_position = {"profit": 0, "pf": 1, "dd": 2}
    if optimize_by not in score_position:
        raise ValueError(
            f"optimize_by must be one of {sorted(score_position)}, got {optimize_by!r}"
        )
    rank_by = score_position[optimize_by]

    if param_rf_default is None:
        param_rf_default = {"n_estimators": [100], "max_depth": [5]}
    # ParameterGrid needs a sequence per parameter; wrap bare values.
    grid = {
        name: list(values) if isinstance(values, (list, tuple, np.ndarray)) else [values]
        for name, values in param_rf_default.items()
    }

    profit_factors = []
    profits = []
    n_trades = []
    thresholds = []
    equities = []

    for thresh in threshold_range:
        best = None
        for param_set in ParameterGrid(grid):
            equity, pf, _metrics, _feat_imp = evaluar_modelo_cv(
                df_prep, features, param_set, threshold=thresh
            )
            profit = equity[-1] - equity[0]
            # max_drawdown never returns 0, so the ratio is always defined.
            scores = (profit, pf, profit / max_drawdown(equity))
            if best is None or scores[rank_by] > best[0]:
                best = (scores[rank_by], equity, pf)

        if best is None:
            # Empty grid: nothing was evaluated for this threshold.
            continue

        _, equity, pf = best
        thresholds.append(thresh)
        profit_factors.append(pf)
        profits.append(equity[-1] - equity[0])
        n_trades.append(len(equity) - 1)
        equities.append(equity)

    if not thresholds:
        return

    plot_uncertainty_metrics(thresholds, profit_factors, profits, n_trades)
    plot_subplots_equity(equities, thresholds, profit_factors)


# Grid search

In [None]:

def grid_search_sobre_modelo_parallel(df,
                                    features,
                                    grid_params,
                                    vis_plot):
    """Evaluate every parameter combination in parallel and pick three winners.

    Each combination is scored three ways: final equity ("profit"), profit
    factor, and final equity divided by max drawdown. The best combination
    under each criterion is reported.

    Args:
        df: Prepared frame passed to ``evaluar_modelo_cv``.
        features: Feature column names.
        grid_params: Parameter grid expanded with ``ParameterGrid``.
        vis_plot: When truthy, plot the three winning equity curves and their
            feature importances.

    Returns:
        ``(best_equities, best_params, best_score, best_metrics, best_feat_imp)``,
        each a 3-tuple ordered as (profit winner, profit-factor winner,
        drawdown-ratio winner). Every entry of ``best_score`` is itself the
        full ``(profit, profit_factor, drawdown_ratio)`` triple of that winner.
    """

    def run_single_param(param_set):
        # One cross-validated run -> (scores, params, equity, metrics, importances, slope)
        equity, pf, metrics, feat_imp = evaluar_modelo_cv(df,
                                                        features,
                                                        param_set)
        final_equity = equity[-1]
        drawdown = max_drawdown(equity)
        if drawdown > 0:
            drawdown_ratio = final_equity / drawdown
        else:
            drawdown_ratio = -np.inf
        triple = (final_equity, pf, drawdown_ratio)
        return triple, param_set, equity, metrics, feat_imp, extract_slope(equity)

    jobs = (delayed(run_single_param)(candidate) for candidate in ParameterGrid(grid_params))
    results = Parallel(n_jobs=-1)(jobs)

    # One winner per criterion, in the order profit / profit factor / drawdown ratio.
    winners = [
        max(results, key=lambda row, criterion=criterion: row[0][criterion])
        for criterion in range(3)
    ]

    # Transpose the three winner rows into per-field 3-tuples.
    best_score, best_params, best_equities, best_metrics, best_feat_imp, _slopes = (
        tuple(column) for column in zip(*winners)
    )

    if vis_plot:
        equity_plot(*best_equities)
        plot_feature_importances(features, best_feat_imp)

    return best_equities, best_params, best_score, best_metrics, best_feat_imp


# Evaluación del modelo con cross validation

In [None]:

def _resolve_use_uncertainty(explicit):
    """Pick the certainty-filter flag: explicit argument, else the notebook global, else False."""
    if explicit is not None:
        return bool(explicit)
    try:
        # Read by name so the module-level setting keeps working for callers
        # that configure the run through the `use_uncertainty` global.
        return bool(use_uncertainty)
    except NameError:
        return False


def evaluar_modelo_cv(df_model_base,
                    features,
                    param_rf,
                    threshold=0.5,
                    use_uncertainty=None,
                    threshold_entropy=None,
                    future=None,
                    n_splits=5,
                    vis_plot=False):
    """Walk-forward evaluation of a RandomForest with a simple trading rule.

    The frame is split with ``TimeSeriesSplit``; on every fold a forest is
    fitted on the training part and its predictions on the test part are
    turned into long (pred == 1) or short (pred == 0) trades whose return is
    measured between ``future_open`` and ``future_close``.

    Args:
        df_model_base: Frame containing ``features``, ``target``,
            ``future_open`` and ``future_close``.
        features: Feature column names.
        param_rf: Keyword arguments for ``RandomForestClassifier``.
            ``random_state`` defaults to 0 unless supplied here.
        threshold: Minimum predicted-class probability required to trade when
            the certainty filter is active.
        use_uncertainty: True/False to force the certainty filter on/off. When
            None, the module-level ``use_uncertainty`` variable is used if it
            exists, otherwise the filter is off.
        threshold_entropy: Optional upper bound on the prediction entropy;
            applied together with ``threshold`` when the filter is active.
        future: Accepted for call compatibility and not used here; the horizon
            is already encoded in the ``future_*`` and ``target`` columns.
        n_splits: Number of ``TimeSeriesSplit`` folds.
        vis_plot: When truthy, show the histogram of prediction certainties.

    Returns:
        ``(equity, profit_factor, metrics, feature_importances)`` where
        ``equity`` is the cumulative-return list starting at 0,
        ``profit_factor`` is gross gains over gross losses (0 when there are
        no losing trades), ``metrics`` holds fold-averaged accuracy, precision,
        recall and f1, and ``feature_importances`` comes from the last fold.
    """
    apply_filter = _resolve_use_uncertainty(use_uncertainty)

    X = df_model_base[features]
    y = df_model_base['target']
    tscv = TimeSeriesSplit(n_splits=n_splits)
    equity = [0]

    accuracies, precisions, recalls, f1s = [], [], [], []
    feature_importances = None

    all_certainties = []
    all_entropies = []

    # Caller-supplied random_state wins; otherwise runs are seeded with 0.
    rf_kwargs = {"random_state": 0, **param_rf}

    for train_idx, test_idx in tscv.split(X):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        clf = RandomForestClassifier(**rf_kwargs)
        clf.fit(X_train, y_train)

        # Overwritten on every fold, so the last (largest) training window wins.
        feature_importances = clf.feature_importances_

        probs = clf.predict_proba(X_test)
        # Columns of `probs` follow clf.classes_; translate the winning column
        # into its label so a fold trained on a single class is still correct.
        preds = clf.classes_[np.argmax(probs, axis=1)]
        certainties = np.max(probs, axis=1)
        entropies = entropy(probs.T)

        all_certainties.extend(certainties)
        all_entropies.extend(entropies)

        fold = df_model_base.iloc[test_idx].copy()
        fold['pred'] = preds
        fold['certainty'] = certainties
        fold['entropy'] = entropies
        fold['decision'] = 'no_trade'

        if apply_filter:
            confident = fold['certainty'] > threshold
            if threshold_entropy is not None:
                confident &= fold['entropy'] <= threshold_entropy

            fold.loc[confident & (fold['pred'] == 1), 'decision'] = 'buy'
            fold.loc[confident & (fold['pred'] == 0), 'decision'] = 'sell'
            # Explicit copy: the slice is extended with a new column below.
            fold = fold.loc[confident].copy()
        else:
            fold['decision'] = np.where(fold['pred'] == 1, 'buy', 'sell')

        # Long when predicting an up candle, short otherwise.
        fold['ret'] = np.where(
            fold['pred'] == 1,
            (fold['future_close'] - fold['future_open']) / fold['future_open'],
            (fold['future_open'] - fold['future_close']) / fold['future_open']
        )
        fold = fold.dropna(subset=['ret'])

        for trade_return in fold['ret']:
            equity.append(equity[-1] + trade_return)

        # Classification metrics use every test row, traded or not.
        accuracies.append(accuracy_score(y_test, preds))
        precisions.append(precision_score(y_test, preds, zero_division=0))
        recalls.append(recall_score(y_test, preds, zero_division=0))
        f1s.append(f1_score(y_test, preds, zero_division=0))

    all_rets = np.diff(equity)
    gains = sum(r for r in all_rets if r > 0)
    losses = abs(sum(r for r in all_rets if r < 0))
    profit_factor = gains / losses if losses > 0 else 0

    metrics = {
        'accuracy': np.mean(accuracies),
        'precision': np.mean(precisions),
        'recall': np.mean(recalls),
        'f1_score': np.mean(f1s),
    }

    if vis_plot:
        hist_certainties(all_certainties)

    return equity, profit_factor, metrics, feature_importances


In [None]:
def hist_certainties(all_certainties, all_entropies=None):
    """Plot the distribution of prediction certainties (and, optionally, entropies).

    Args:
        all_certainties: Sequence of max-class probabilities.
        all_entropies: Optional sequence of prediction entropies. When given,
            a second histogram is drawn next to the first; when omitted only
            one panel is created, so no empty axes are left in the figure.
    """
    n_panels = 1 if all_entropies is None else 2
    # squeeze=False keeps `axs` two-dimensional regardless of the panel count.
    fig, axs = plt.subplots(1, n_panels, figsize=(7 * n_panels, 5), squeeze=False)
    row = axs[0]

    row[0].hist(all_certainties, bins=50, color='blue', alpha=0.7)
    row[0].set_title("Distribución de Certeza (max prob)")
    row[0].set_xlabel("Certeza")
    row[0].set_ylabel("Frecuencia")
    row[0].grid(True)

    if all_entropies is not None:
        row[1].hist(all_entropies, bins=50, color='orange', alpha=0.7)
        row[1].set_title("Distribución de Entropía")
        row[1].set_xlabel("Entropía")
        row[1].set_ylabel("Frecuencia")
        row[1].grid(True)

    plt.tight_layout()
    plt.show()

In [None]:
def max_drawdown(equity):
    """Return the largest peak-to-trough drop of an equity curve.

    Args:
        equity: Any iterable of numbers (list, array, Series, generator).

    Returns:
        The maximum drop from a running peak. When the curve never falls
        (including empty or single-point input) a tiny positive floor of
        ``1e-9`` is returned instead of 0, so callers can divide by the result.
    """
    floor = 1e-9
    peak = None
    worst_drop = 0
    # Plain iteration instead of equity[0]: works for empty input and for
    # containers that are not positionally indexable.
    for value in equity:
        if peak is None or value > peak:
            peak = value
        drop = peak - value
        if drop > worst_drop:
            worst_drop = drop
    return worst_drop if worst_drop > 0 else floor


# Uncertainty functions

In [None]:
def analizar_impacto_de_thresholds(df_prep, 
                                features,
                                param_rf_default=None,
                                threshold_range=np.arange(0.50, 0.705, 0.02),
                                n_splits=5):
    """Evaluate one RandomForest configuration across certainty thresholds.

    For every threshold the model is cross-validated with the certainty filter
    enabled, and profit factor, net profit, trade count and equity curve are
    collected and plotted.

    NOTE(review): an earlier cell defines a function with this same name; this
    definition replaces it when the notebook is run top to bottom.

    Args:
        df_prep: Prepared frame passed to ``evaluar_modelo_cv``.
        features: Feature column names.
        param_rf_default: RandomForest keyword arguments. Defaults to
            ``n_estimators=100, max_depth=5``.
        threshold_range: Certainty thresholds to evaluate.
        n_splits: Accepted for interface compatibility; it is not forwarded,
            so ``evaluar_modelo_cv`` runs with its own fold count.

    Side effects:
        Sets the module-level ``use_uncertainty`` flag to True, which is how
        ``evaluar_modelo_cv`` is told to filter trades by certainty.
    """
    global use_uncertainty
    use_uncertainty = True

    if param_rf_default is None:
        param_rf_default = {
            "n_estimators": 100,
            "max_depth": 5
        }

    profit_factors = []
    profits = []
    n_trades = []
    thresholds = []
    equities = []

    for thresh in threshold_range:
        # Only arguments that evaluar_modelo_cv declares are passed; the
        # certainty filter is switched on through the flag set above.
        equity, pf, metrics, _ = evaluar_modelo_cv(
            df_prep,
            features,
            param_rf_default,
            threshold=thresh
        )

        thresholds.append(thresh)
        profit_factors.append(pf)
        profits.append(equity[-1] - equity[0])
        n_trades.append(len(equity) - 1)
        equities.append(equity)

    if not thresholds:
        # Nothing to plot for an empty threshold range.
        return

    # Metric plots
    plot_uncertainty_metrics(thresholds,
                                profit_factors,
                                profits,
                                n_trades,
                                )

    # Equity plots
    plot_subplots_equity(equities, 
                            thresholds,
                            profit_factors)
    


# Visualization functions

In [None]:
def extract_slope(equity, exp=None):
    """Return the slope of the least-squares line through an equity curve.

    Args:
        equity: Sequence of equity values, indexed implicitly by 0, 1, 2, ...
        exp: Unused; kept so existing calls keep working.

    Returns:
        The fitted slope as a float, or ``0.0`` when the curve has fewer than
        two points (e.g. no trade was taken), where a line fit is undefined.
    """
    values = np.asarray(equity, dtype=float)
    if values.size < 2:
        return 0.0

    positions = np.arange(values.size, dtype=float)
    return float(np.polyfit(positions, values, 1)[0])


In [None]:
def plot_conditional_equity(df_equity, exp=None):
    """Plot an equity curve only when it passes three quality checks.

    The curve is drawn when its fitted slope is positive, more than half of
    its points are above zero, and the final capital exceeds twice the
    absolute value of its minimum. Otherwise nothing is plotted.

    Args:
        df_equity: Frame with ``capital`` and ``entry_time`` columns.
        exp: Optional label appended to the plot title.
    """
    capital = df_equity["capital"]
    steps = np.arange(capital.size, dtype=float)
    trend = float(np.polyfit(steps, capital, 1)[0])
    share_above_zero = (capital > 0).mean()
    closing_capital = float(capital.iloc[-1])

    worth_plotting = (
        trend > 0
        and share_above_zero > 0.5
        and closing_capital > 2 * abs(min(capital))
    )
    if not worth_plotting:
        return

    title_text = "Curva de Equity"
    if exp:
        title_text += f" para la expansión {exp}"

    plt.figure(figsize=(16, 8))
    plt.plot(df_equity["entry_time"], capital, label="Equity")
    plt.xlabel("Fecha")
    plt.ylabel("Capital")
    plt.title(title_text)
    plt.grid(True)
    plt.legend()
    plt.show()


In [None]:
def plot_feature_importance(features, importances, metric):
    """Draw a horizontal bar chart of feature importances, smallest at the bottom.

    Args:
        features: Feature names, aligned with ``importances``.
        importances: Importance per feature; any array-like (list, tuple,
            ndarray) is accepted.
        metric: Label appended to the plot title.
    """
    # Coerce first: fancy indexing below would fail on a plain list.
    values = np.asarray(importances)
    order = np.argsort(values)

    plt.figure(figsize=(8, len(features) * 0.3 + 1))
    plt.barh(range(len(features)), values[order], align='center')
    plt.yticks(range(len(features)), [features[i] for i in order])
    plt.xlabel("Importance")
    plt.title(f"Feature importances: {metric}")
    plt.grid(True, axis='x')
    plt.tight_layout()
    plt.show()
    

def plot_feature_importances(features, importances):
    """Compare three sets of feature importances in one grouped bar chart.

    Args:
        features: Feature names, aligned with every importance vector.
        importances: Three importance vectors, in the order profit,
            profit factor, profit/drawdown. Features are ordered by the first.
    """
    labelled_series = [
        ('profit', importances[0]),
        ('profit factor', importances[1]),
        ('profit drawdown', importances[2]),
    ]

    # Sort order comes from the first criterion; switch the index here to
    # order by another one.
    order = np.argsort(importances[0])
    ordered_names = [features[position] for position in order]

    bar_width = 0.25
    base_positions = np.arange(len(features))

    plt.figure(figsize=(10, len(features) * 0.3 + 2))

    for slot, (label, series) in enumerate(labelled_series):
        plt.barh(base_positions + slot * bar_width,
                 np.array(series)[order],
                 bar_width,
                 label=label)

    plt.yticks(base_positions + bar_width, ordered_names)
    plt.xlabel("Importance")
    plt.title("Comparación de Importancias por Métrica")
    plt.legend()
    plt.grid(True, axis='x')
    plt.tight_layout()
    plt.show()

def equity_plot(equity_profit, equity_pf, equity_dd):
    """Overlay the three winning equity curves on a single figure.

    Args:
        equity_profit: Curve of the best run by profit.
        equity_pf: Curve of the best run by profit factor.
        equity_dd: Curve of the best run by profit/drawdown.
    """
    curves = (
        (equity_profit, 'profit'),
        (equity_pf, 'profit factor'),
        (equity_dd, 'profit drawdown'),
    )

    plt.figure(figsize=(12, 6))
    for curve, label in curves:
        plt.plot(curve, label=label)

    plt.title("Equity Curves Comparadas")
    plt.xlabel("Trades")
    plt.ylabel("Capital")
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()


def equity_single_plot(equity, metric):
    """Draw one equity curve on a new figure; the figure is not shown here.

    Args:
        equity: Sequence of equity values, one per trade.
        metric: Label appended to the plot title.
    """
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.plot(equity, label='Equity')
    ax.set_xlabel('Trade Number')
    ax.set_ylabel('Equity')
    ax.set_title(f"Equity plot: {metric}")
    ax.grid(True)
    ax.legend()
    fig.tight_layout()


In [None]:
def graficar_equities_por_threshold(df_prep, features,
                                    param_rf_default=None,
                                    future=None,
                                    threshold_range=np.arange(0.50, 0.605, 0.01),
                                    n_splits=5):
    """Plot one equity curve per certainty threshold, stacked vertically.

    Args:
        df_prep: Prepared frame passed to ``evaluar_modelo_cv``.
        features: Feature column names.
        param_rf_default: RandomForest keyword arguments. Defaults to
            ``n_estimators=100, max_depth=5``.
        future: Accepted for interface compatibility; not forwarded.
        threshold_range: Certainty thresholds to evaluate.
        n_splits: Accepted for interface compatibility; it is not forwarded,
            so ``evaluar_modelo_cv`` runs with its own fold count.

    Side effects:
        Sets the module-level ``use_uncertainty`` flag to True, which is how
        ``evaluar_modelo_cv`` is told to filter trades by certainty.
    """
    global use_uncertainty
    use_uncertainty = True

    if param_rf_default is None:
        param_rf_default = {
            "n_estimators": 100,
            "max_depth": 5
        }

    equities = []
    thresholds = []

    for thresh in threshold_range:
        # Only arguments that evaluar_modelo_cv declares are passed.
        equity, _, _, _ = evaluar_modelo_cv(
            df_prep,
            features,
            param_rf_default,
            threshold=thresh
        )
        equities.append(equity)
        thresholds.append(thresh)

    n = len(thresholds)
    if n == 0:
        # An empty threshold range leaves nothing to draw.
        return

    # squeeze=False keeps `axs` an array even for a single threshold, where
    # plt.subplots would otherwise return a bare Axes that cannot be indexed.
    fig, axs = plt.subplots(n, 1, figsize=(10, 2.5 * n), sharex=False, squeeze=False)
    axs = axs.ravel()

    for ax, eq, t in zip(axs, equities, thresholds):
        ax.plot(eq)
        ax.set_title(f"Equity - Threshold={t:.2f}")
        ax.set_ylabel("Equity")
        ax.grid(True)

    axs[-1].set_xlabel("Trade #")
    plt.tight_layout()
    plt.show()


In [None]:
    def plot_uncertainty_metrics(thresholds,
                                profit_factors,
                                profits,
                                n_trades,
                                ):
        """Plot profit factor, net profit and trade count against the threshold.

        Args:
            thresholds: Certainty thresholds (shared x axis).
            profit_factors: Profit factor per threshold.
            profits: Net profit per threshold.
            n_trades: Number of trades per threshold.
        """
        # (series, extra line style, y label, title) for each stacked panel
        panels = [
            (profit_factors, {}, "Profit Factor", "Profit Factor vs Threshold"),
            (profits, {'color': 'green'}, "Profit Net", "Profit Neto vs Threshold"),
            (n_trades, {'color': 'orange'}, "Cantidad de Trades", "Cantidad de Trades vs Threshold"),
        ]

        fig, axs = plt.subplots(3, 1, figsize=(10, 12), sharex=True)

        for ax, (series, line_style, y_label, title) in zip(axs, panels):
            ax.plot(thresholds, series, marker='o', **line_style)
            ax.set_ylabel(y_label)
            ax.set_title(title)
            ax.grid(True)

        # Only the bottom panel carries the shared x label.
        axs[-1].set_xlabel("Threshold de Certeza")

        plt.tight_layout()
        plt.show()

    # Gráficos de equity
    def plot_subplots_equity(equities, 
                            thresholds,
                            profit_factors):
        """Draw one equity curve per threshold in vertically stacked panels.

        Args:
            equities: Equity curves, one per threshold.
            thresholds: Threshold used for each curve (shown in the title).
            profit_factors: Profit factor of each curve (shown in the title).
        """
        n_curves = len(equities)
        # squeeze=False always yields an array of axes, including for one curve.
        fig, grid = plt.subplots(n_curves, 1, figsize=(12, 3 * n_curves), sharex=False, squeeze=False)
        panels = list(grid.ravel())

        for panel, eq, th, pf in zip(panels, equities, thresholds, profit_factors):
            panel.plot(eq)
            panel.set_title(f"Equity Curve - Threshold = {th:.2f} | PF = {pf:.2f}")
            panel.set_ylabel("Equity")
            panel.grid(True)

        panels[-1].set_xlabel("Trade #")
        plt.tight_layout()
        plt.show()



# Legacy functions

In [None]:

def correr_todo():
    """Legacy driver: grid-search every (future, past) window pair and save the best.

    Reads its configuration from module-level names (``future_windows``,
    ``past_windows``, ``folder_path``, ``use_time_filter``, ``grid_params``,
    ``use_uncertainty``, ``scoring``, ``threshold``). For each pair it prepares
    the data, runs a grid search, tracks the best result per ``past`` and
    overall, plots the best curve of each ``past`` and pickles the overall
    winner to a file in the working directory.

    NOTE(review): the keyword arguments passed to the grid-search functions
    (``use_uncertainty``, ``future``, ``vis``, ``optimize_by``) and the way
    ``score`` is compared and formatted as a single number do not match the
    ``grid_search_sobre_modelo_parallel`` defined in this notebook, and
    ``grid_search_sobre_modelo`` is not defined in the visible cells. Confirm
    which versions this driver targets before relying on it.
    """
    # Imported locally: the notebook's import cell does not bring in pickle.
    import pickle

    best_overall_score = -np.inf
    best_overall_params = None
    best_overall_equity = None
    best_overall_future = None
    best_overall_past = None
    # Initialised so the summary below never hits an unbound name when no
    # combination improves on the starting score.
    best_overall_feature_imp = None
    features_of_the_overall_best = None

    # Best equity / score / future found for each past window
    best_equity_per_past = {}
    best_score_per_past = {}
    best_future_per_past = {}

    for future in future_windows:
        plt.figure(figsize=(12, 6))
        best_score_future = -np.inf
        best_equities_future = []
        labels_future = []
        print(f"--- Future: {future} ---")

        for past in past_windows:
            print(f"Evaluando past={past}")
            df_prep, features = parse_binance_files(folder_path, past, future, use_time_filter)

            is_last = (past == past_windows[-1]) and (future == future_windows[-1])

            # The search and the bookkeeping below run once per past window.
            # Previously they sat outside this loop (and the bookkeeping inside
            # the `else`), so only the last window was searched and the final
            # combination was never recorded.
            if is_last:
                equity, params, score, metrics, feat_imp = grid_search_sobre_modelo(
                    grid_params,
                    df_prep,
                    features,
                    use_uncertainty,
                    future=future,
                    vis=True,
                    vis_plot=True,
                    optimize_by=scoring
                )
            else:
                equity, params, score, metrics, feat_imp = grid_search_sobre_modelo_parallel(
                    grid_params,
                    df_prep,
                    features,
                    use_uncertainty,
                    future=future,
                    vis=False,
                    vis_plot=False,
                    optimize_by=scoring
                )

            best_equities_future.append(equity)
            labels_future.append(f"past={past}, score={score:.2f}")

            if score > best_score_future:
                best_score_future = score
                best_equity_future = equity
                best_params_future = params
                best_past_future = past
                best_feature_imp = feat_imp

            if (past not in best_score_per_past) or (score > best_score_per_past[past]):
                best_score_per_past[past] = score
                best_equity_per_past[past] = equity
                best_future_per_past[past] = future

            if score > best_overall_score:
                best_overall_score = score
                best_overall_params = params
                best_overall_equity = equity
                best_overall_future = future
                best_overall_past = past
                best_overall_feature_imp = feat_imp
                features_of_the_overall_best = features

        # Disabled per-future plot, kept for reference:
        # for eq, label in zip(best_equities_future, labels_future):
        #     plt.plot(eq, label=label)
        # plt.title(f"Equity curves - Future={future}")
        # plt.xlabel("Trade Number")
        # plt.ylabel("Equity")
        # plt.legend()
        # plt.grid(True)
        # plt.show()

    print()
    print("Mejores equity curves: ")
    plt.figure(figsize=(12, 6))
    for past, equity in best_equity_per_past.items():
        future = best_future_per_past[past]
        plt.plot(equity, label=f"Best past={past}, future={future}")
    plt.title("Mejores Equity Curves por cada Past (superpuestas)")
    plt.xlabel("Trade Number")
    plt.ylabel("Equity")
    plt.legend()
    plt.grid(True)
    plt.show()

    best_results = {
        'score': best_overall_score,
        'params': best_overall_params,
        'equity': best_overall_equity,
        'future': best_overall_future,
        'past': best_overall_past,
        'feature_importance': best_overall_feature_imp,
        'features': features_of_the_overall_best
    }

    # NOTE(review): `past` and `future` here are whatever the plotting loop
    # above left behind (the last past window and its best future), not
    # necessarily the overall winner stored inside the file — confirm intent.
    filename = f"best_results_past{past}_future{future}_timefilter{use_time_filter}_uncertainty{use_uncertainty}_tresh{threshold}_scoring{scoring}.pkl"
    with open(filename, 'wb') as f:
        pickle.dump(best_results, f)

    print(f"Mejor combinación general: future={best_overall_future}, past={best_overall_past}")
    print(f"Mejores parámetros: {best_overall_params}")
