In [24]:
import numpy as np
import pandas as pd
import random
from scipy.signal import argrelextrema
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from hmmlearn import hmm
import talib
import re
import os
from sklearn.svm import SVC
from sklearn.ensemble import StackingClassifier
import xgboost
from xgboost import XGBClassifier
from sklearn.svm import SVC
import datetime
import glob


#updated to handle multiple assets' OHLC data as training input
#from pathlib import Path



In [29]:
# ------------ CONFIGURABLE PARAMETERS ------------
# Everything a reader might tune lives in this cell. The window constants are
# interpolated into derived column names (e.g. f'gain_{UPDOWN_PERIOD}') by
# MarketRegimeAnalyzer.prepare_model_data, so changing one also changes the
# names of the feature columns that get generated.
N_MODELS = 3                                       # Number of MC-sim ensemble models
# NOTE(review): TRAIN_PATH is not referenced by any code in this section, and
# both paths below repeat the first entries of train_files / new_files by hand;
# keep them in sync or derive one from the other.
TRAIN_PATH = '20250202-20170908_BTC-USDT_1D.csv'
NEW_DATA_PATH = '20250127-20170907_XRP-USDT_1H.csv'
DEFAULT_LOOKBACK_WINDOW = 20                        # Used for classic technical/feature rolling
DEFAULT_VOL_WINDOW = 20                             # Volatility rolling window
DEFAULT_CHOP_WINDOW = 20                            # Choppiness rolling window
UPDOWN_PERIOD = 14                                  # For up/down period calculations
UPDOWN_BIN_OFFSET = 1                               # For upbin result
RSI_PERIOD = 14                                     # For custom RSI/gain/loss
MFI_WINDOW = 14                                     # Money Flow Index window
AVG_GAIN_WINDOW = 14                                # For avg gain/loss
LAG_FEATURES = 3                                    # Number of lags to compute for each feature
MA1 = 5                                             # Short moving avg window
MA2 = 10                                            # Long moving avg window

# Training CSVs, one per asset. analyze_multiple_files pairs these with
# new_files positionally using zip().
# NOTE(review): this list has two entries while new_files has one, so zip()
# stops after the first pair and the XLM file is never used for training.
train_files = [
    '20250202-20170908_BTC-USDT_1D.csv',
    '20250202-20170908_XLM-USDT_1D.csv',
    #'ANOTHER_ASSET2.csv',
    # Add any number of asset CSVs you want here
]


# Unseen-data CSVs that the trained ensembles are applied to; entry i is
# scored with the models trained on train_files[i].
new_files = [
    '20250127-20170907_XRP-USDT_1H.csv',
    # Add more new files (ensure matched list lengths)
]


In [26]:
# ------------ FEATURE FUNCTIONS -------------
def get_up_or_down(df, period):
    """Add per-bar gain and loss columns computed from consecutive closes.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a numeric ``close`` column. Modified in place.
    period : int
        Only used to name the output columns (``gain_<period>`` and
        ``loss_<period>``); the gain/loss itself is always bar-over-bar.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``gain_<period>`` holding
        ``max(close[i] - close[i-1], 0)`` and ``loss_<period>`` holding
        ``max(close[i-1] - close[i], 0)``. The first row is 0.0 in both.
    """
    change = df['close'].diff()
    gains = change.clip(lower=0)
    losses = (-change).clip(lower=0)
    if len(df) > 0:
        # The first bar has no predecessor; report 0.0 rather than NaN.
        gains.iloc[0] = 0.0
        losses.iloc[0] = 0.0
    # Assign as plain arrays so values land by position. Writing through
    # index labels would hit several rows at once when the index has
    # duplicates (e.g. frames concatenated without ignore_index=True).
    df['gain_{}'.format(period)] = gains.to_numpy(dtype=float)
    df['loss_{}'.format(period)] = losses.to_numpy(dtype=float)
    return df

def get_up_or_down_bin(df, offset):
    """Add a direction flag comparing each close with the close ``offset`` bars earlier.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a numeric ``close`` column. Modified in place.
    offset : int
        Non-negative number of bars to look back.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with an integer ``updown_<offset>`` column:
        1 if the close rose, -1 if it fell, 0 if unchanged or if there is no
        bar ``offset`` steps back (the first ``offset`` rows).
    """
    change = df['close'].diff(offset)
    # sign() maps rises/falls to +1/-1; rows without a reference bar are NaN
    # and are reported as 0, i.e. "no move".
    direction = np.sign(change).fillna(0).astype('int64')
    # Positional assignment keeps this correct for non-unique indexes.
    df['updown_{}'.format(offset)] = direction.to_numpy()
    return df

def get_average_gains(df, period):
    """Add simple moving averages of the gain/loss columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must already contain ``gain_<period>`` and ``loss_<period>`` (as
        produced by ``get_up_or_down``). Modified in place.
    period : int
        Window length in bars; must be at least 1.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with ``ag_<period>`` / ``al_<period>`` holding
        the mean gain / loss over rows ``i-period+1 .. i``. The first
        ``period`` rows are 0.0.

    Raises
    ------
    ValueError
        If ``period`` is smaller than 1.
    """
    if period < 1:
        raise ValueError("period must be a positive integer, got {!r}".format(period))

    for source, target in (('gain', 'ag'), ('loss', 'al')):
        # min_periods=1 averages whatever non-NaN values the window holds,
        # so a missing value does not blank the whole window.
        averaged = (
            df['{}_{}'.format(source, period)]
            .rolling(period, min_periods=1)
            .mean()
            .to_numpy(dtype=float, copy=True)
        )
        # Warm-up: averages start at row `period`, because row 0 of the
        # gain/loss series is only a placeholder (no previous close).
        averaged[:period] = 0.0
        # Positional assignment keeps this correct for non-unique indexes.
        df['{}_{}'.format(target, period)] = averaged
    return df

def get_relative_strength(df, period):
    """Add relative strength (RS) and RSI columns based on simple averages.

    Calls ``get_up_or_down`` and ``get_average_gains`` first, so ``df`` only
    needs a ``close`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Price frame with a numeric ``close`` column. Modified in place.
    period : int
        Averaging window in bars.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with ``rs_<period>`` (average gain / average
        loss) and ``rsi_<period>`` (``100 - 100 / (1 + rs)``). Where the
        average loss is exactly 0, RS is 0 and RSI is 100. The first
        ``period`` rows are 0.0 in both columns.
    """
    df = get_up_or_down(df, period)
    df = get_average_gains(df, period)

    avg_gain = df['ag_{}'.format(period)].to_numpy(dtype=float)
    avg_loss = df['al_{}'.format(period)].to_numpy(dtype=float)
    has_loss = avg_loss != 0

    # np.where evaluates both branches, so silence the divide-by-zero noise
    # from rows that end up taking the "no losses" branch anyway.
    with np.errstate(divide='ignore', invalid='ignore'):
        rs = np.where(has_loss, avg_gain / avg_loss, 0.0)
        rsi = np.where(has_loss, 100 - (100 / (1 + rs)), 100.0)

    # Warm-up rows carry no averages yet; keep them at 0.0.
    warmup = max(period, 0)
    rs[:warmup] = 0.0
    rsi[:warmup] = 0.0

    # Positional assignment keeps this correct for non-unique indexes.
    df['rs_{}'.format(period)] = rs
    df['rsi_{}'.format(period)] = rsi
    return df

def money_flow_index(df, window):
    """Add a Money Flow Index column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain numeric ``high``, ``low``, ``close`` and ``volume``
        columns. Modified in place. May be empty.
    window : int
        Number of bars summed for the positive / negative money flow.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with ``mfi_<window>`` added. The first
        ``window - 1`` rows are NaN because the rolling sums are incomplete.
    """
    typical_price = (df['high'] + df['low'] + df['close']) / 3
    raw_flow = typical_price * df['volume']
    previous_price = typical_price.shift(1)

    # A bar's flow counts as positive when the typical price rose and as
    # negative when it fell; unchanged bars (and the first bar, which has no
    # predecessor) contribute 0 to both sides.
    positive_flow = raw_flow.where(typical_price > previous_price, 0.0)
    negative_flow = raw_flow.where(typical_price < previous_price, 0.0)

    positive_sum = positive_flow.rolling(window).sum()
    negative_sum = negative_flow.rolling(window).sum()

    # The small epsilon avoids division by zero when no bar in the window fell.
    money_ratio = positive_sum / (negative_sum + 1e-10)
    df['mfi_{}'.format(window)] = 100 - 100 / (1 + money_ratio)
    return df

def get_ci(df, lookback):
    """Add a Choppiness Index column.

    Computes ``100 * log10(sum(TR, lookback) / (HH - LL)) / log10(lookback)``
    where TR is the true range, HH the highest high and LL the lowest low over
    the last ``lookback`` bars.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain numeric ``high``, ``low`` and ``close`` columns.
        Modified in place.
    lookback : int
        Window length in bars. Should be at least 2, since ``log10(1)`` is 0
        and would make the final division meaningless.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with ``ci_<lookback>`` added. Rows with an
        incomplete window, or with a zero high-low range, are NaN.
    """
    previous_close = df['close'].shift(1)
    true_range = pd.concat(
        [
            (df['high'] - df['low']).abs(),
            (df['high'] - previous_close).abs(),
            (df['low'] - previous_close).abs(),
        ],
        axis=1,
    ).max(axis=1)

    highest_high = df['high'].rolling(lookback).max()
    lowest_low = df['low'].rolling(lookback).min()
    price_range = highest_high - lowest_low
    # A zero range would turn the ratio into +/-inf, which breaks downstream
    # scaling; mark those rows as missing so a later dropna() removes them.
    price_range = price_range.where(price_range != 0)

    true_range_sum = true_range.rolling(lookback).sum()
    df['ci_{}'.format(lookback)] = (
        100 * np.log10(true_range_sum / price_range) / np.log10(lookback)
    )
    return df

In [38]:
# ----------------------------
# MARKET REGIME ANALYZER CLASS
# ----------------------------
class MarketRegimeAnalyzer:
    """Label market regimes on OHLCV data and train classifiers to predict them.

    Typical use is ``MarketRegimeAnalyzer(df, ...).run_analysis()``, which
    derives indicators, labels each bar with one of ``state_labels``, trains
    several classifiers plus a stacking ensemble, and fits an HMM on the
    label sequence.

    Parameters
    ----------
    ohlc_df : pandas.DataFrame
        Price data with open/high/low/close/volume columns (a few common
        spellings are accepted, see ``normalize_column_names``). Copied.
    lookback_window : int
        Window used for the rolling statistics and the talib indicators.
    volatility_threshold : float
        Bars with ``norm_volatility`` below this and no trend label become
        'Steady'.
    chop_threshold : float
        Bars with ``chop`` above this, ``adx`` below 25 and no other label
        become 'Choppy'.

    Raises
    ------
    ValueError
        If a required column cannot be found after normalisation.
    """

    def __init__(self, ohlc_df, lookback_window=20, volatility_threshold=0.5, chop_threshold=0.5):
        self.df = self.normalize_column_names(ohlc_df.copy())
        self.validate_input_columns()
        self.lookback_window = lookback_window
        self.volatility_threshold = volatility_threshold
        self.chop_threshold = chop_threshold
        # Integer class id -> human-readable regime name.
        self.state_labels = {
            0: 'Rising',
            1: 'Falling',
            2: 'Steady',
            3: 'Choppy',
            4: 'No Label'
        }

    def normalize_column_names(self, df):
        """Rename OHLCV columns to 'open', 'high', 'low', 'close', 'volume'.

        Column names are compared after lower-casing and stripping every
        character that is not ``a-z0-9``. For each standard name the first
        matching variant (in the order listed below) wins; all other columns
        are left untouched.
        """
        column_mapping = {}
        for col in df.columns:
            # str() so that non-string labels (e.g. integers) do not crash strip().
            normalized = str(col).strip().lower()
            normalized = re.sub(r'[^a-z0-9]', '', normalized)
            column_mapping[col] = normalized
        required_columns = {
            'open': ['open', 'op', 'o'],
            'high': ['high', 'hi', 'h'],
            'low': ['low', 'lo', 'l'],
            'close': ['close', 'cl', 'c', 'last'],
            'volume': ['volume', 'vol', 'v', 'qty']
        }
        final_mapping = {}
        available_columns = set(column_mapping.values())
        for standard_name, variants in required_columns.items():
            for variant in variants:
                if variant in available_columns:
                    # Take the first original column that normalises to this variant.
                    for orig_col, normalized_col in column_mapping.items():
                        if normalized_col == variant:
                            final_mapping[standard_name] = orig_col
                            break
                    break
        df = df.rename(columns={v: k for k, v in final_mapping.items()})
        return df

    def validate_input_columns(self):
        """Ensure the five OHLCV columns exist, are numeric, and have no NaNs.

        Non-numeric values are coerced to NaN and the affected rows dropped.

        Raises
        ------
        ValueError
            If any required column is missing from ``self.df``.
        """
        required_columns = ['open', 'high', 'low', 'close', 'volume']
        missing = [col for col in required_columns if col not in self.df.columns]
        if missing:
            raise ValueError(f"Missing required columns after normalization: {missing}")
        for col in required_columns:
            self.df[col] = pd.to_numeric(self.df[col], errors='coerce')
        self.df = self.df.dropna(subset=required_columns)

    def preprocess_data(self):
        """Add the base indicator columns to ``self.df`` and drop incomplete rows."""
        window = self.lookback_window
        self.df['returns'] = self.df['close'].pct_change()
        # Volatility here is the rolling std of the close price itself.
        self.df['volatility'] = self.df['close'].rolling(window=window).std()
        self.df['avg_volatility'] = self.df['volatility'].rolling(window=window).mean()
        self.df['norm_volatility'] = self.df['volatility'] / self.df['avg_volatility']
        self.df['sma'] = talib.SMA(self.df['close'], timeperiod=window)
        self.df['ema'] = talib.EMA(self.df['close'], timeperiod=window)
        self.df['adx'] = talib.ADX(self.df['high'], self.df['low'], self.df['close'], timeperiod=window)
        self.df['rsi'] = talib.RSI(self.df['close'], timeperiod=window)
        self.df['volume_sma'] = talib.SMA(self.df['volume'], timeperiod=window)
        self.df['volume_change'] = self.df['volume'] / self.df['volume_sma']
        # Centered windows look two bars into the future; these two flags are
        # excluded from the model features in prepare_model_data.
        self.df['local_min'] = self.df['close'] == self.df['close'].rolling(window=5, center=True).min()
        self.df['local_max'] = self.df['close'] == self.df['close'].rolling(window=5, center=True).max()
        self.df['atr'] = talib.ATR(self.df['high'], self.df['low'], self.df['close'], timeperiod=window)
        # Choppiness index, built from the rolling sum of ATR.
        # NOTE(review): the result is scaled by 100, whereas chop_threshold
        # defaults to 0.5 — confirm both are meant to be on the same scale.
        self.df['chop'] = 100 * np.log10(
            self.df['atr'].rolling(window=window).sum() /
            (self.df['high'].rolling(window=window).max() -
             self.df['low'].rolling(window=window).min())
        ) / np.log10(window)
        self.df = self.df.dropna()

    def label_states(self):
        """Assign a regime id to every row of ``self.df`` in a new 'state' column.

        Swings between local extrema of the close (``argrelextrema`` with
        ``order=5``) are labelled Rising (0) or Falling (1); remaining rows
        become Steady (2) or Choppy (3) based on the thresholds, and anything
        left over stays No Label (4). Labels depend on future bars, so they
        are only available in hindsight.
        """
        closes = self.df['close'].values
        last_pos = len(self.df) - 1
        # Work on a positional array: label-based slices (.loc[a:b]) are
        # fragile when the index is not unique and sorted.
        state = np.full(len(self.df), 4, dtype=np.int64)  # Default to 'No Label'
        min_indices = argrelextrema(closes, np.less_equal, order=5)[0]
        max_indices = argrelextrema(closes, np.greater_equal, order=5)[0]

        # Rising: from a local minimum to the next local maximum, provided the
        # close never dips below the starting level on the way.
        for start_idx in min_indices[:-1]:
            end_candidates = max_indices[max_indices > start_idx]
            end_idx = end_candidates[0] if len(end_candidates) > 0 else last_pos
            if (closes[start_idx:end_idx + 1] >= closes[start_idx]).all():
                state[start_idx:end_idx + 1] = 0

        # Falling: mirror image; applied second, so it wins on overlapping bars.
        for start_idx in max_indices[:-1]:
            end_candidates = min_indices[min_indices > start_idx]
            end_idx = end_candidates[0] if len(end_candidates) > 0 else last_pos
            if (closes[start_idx:end_idx + 1] <= closes[start_idx]).all():
                state[start_idx:end_idx + 1] = 1

        self.df['state'] = state
        steady_mask = (self.df['norm_volatility'] < self.volatility_threshold) & (self.df['state'] == 4)
        self.df.loc[steady_mask, 'state'] = 2
        chop_mask = (self.df['chop'] > self.chop_threshold) & (self.df['adx'] < 25) & (self.df['state'] == 4)
        self.df.loc[chop_mask, 'state'] = 3

    def prepare_model_data(self):
        """Build lag/moving-average features, then split and scale the data.

        Sets ``X``, ``y``, ``model_features``, the chronological 80/20
        train/test split and ``scaler`` (fitted on the training part only).
        """
        # Expanded feature list (using parameter variables). Entries that are
        # not present in self.df are skipped silently.
        feature_cols = [
            'returns','volatility','norm_volatility','sma','ema','adx','rsi','volume_change','chop',
            f'gain_{UPDOWN_PERIOD}',f'loss_{UPDOWN_PERIOD}',f'updown_{UPDOWN_BIN_OFFSET}',
            f'ag_{AVG_GAIN_WINDOW}',f'al_{AVG_GAIN_WINDOW}',f'rs_{RSI_PERIOD}',f'rsi_{RSI_PERIOD}',
            f'mfi_{MFI_WINDOW}',f'ci_{self.lookback_window}'
        ]
        # Generate lags and smoothers
        for col in feature_cols:
            if col in self.df.columns:
                for lag in range(1, LAG_FEATURES + 1):
                    self.df[f'{col}_lag{lag}'] = self.df[col].shift(lag)
                self.df[f'{col}_ma{MA1}'] = self.df[col].rolling(MA1).mean()
                self.df[f'{col}_ma{MA2}'] = self.df[col].rolling(MA2).mean()
        self.df = self.df.dropna()
        self.df_ml = self.df[self.df['state'] != 4].copy()
        numeric_cols = self.df_ml.select_dtypes(include=[np.number]).columns
        exclude_cols = ['state', 'local_min', 'local_max']
        model_features = [col for col in numeric_cols if col not in exclude_cols]
        # NOTE(review): X and y are taken from self.df, not from the filtered
        # self.df_ml, so 'No Label' rows are part of training and evaluation.
        # Confirm whether that is intended before switching to df_ml.
        self.X = self.df[model_features]
        self.y = self.df['state']
        self.model_features = model_features

        # shuffle=False keeps the split chronological (test = most recent 20%).
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X, self.y, test_size=0.2, shuffle=False)
        self.scaler = StandardScaler()
        self.X_train_scaled = self.scaler.fit_transform(self.X_train)
        self.X_test_scaled = self.scaler.transform(self.X_test)

    def _classification_reports(self, y_pred):
        """Return ``(text_report, dict_report)`` for ``y_pred`` against ``self.y_test``.

        All known labels are always passed explicitly so the report shape is
        stable even when a class is absent from the test split.
        """
        common = dict(
            labels=list(self.state_labels.keys()),
            target_names=list(self.state_labels.values()),
            zero_division=0,
        )
        report_str = classification_report(self.y_test, y_pred, **common)
        report_dict = classification_report(self.y_test, y_pred, output_dict=True, **common)
        return report_str, report_dict

    def train_models(self):
        """Train logistic regression, random forest, neural network, XGBoost, and SVM models.

        Each model is evaluated on the held-out split and its report printed;
        the stacking ensemble is trained (and reported) as a side effect.

        Returns
        -------
        dict
            ``{"models": {...}, "metrics": {...}}`` keyed by 'lr', 'rf', 'nn',
            'xgb' and 'svm'; each metrics entry holds the classification
            report as text and as a dict.
        """
        label_keys = list(self.state_labels.keys())
        label_names = list(self.state_labels.values())

        # Logistic Regression (scaled inputs)
        self.lr_model = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000)
        self.lr_model.fit(self.X_train_scaled, self.y_train)
        lr_report_str, lr_report_dict = self._classification_reports(
            self.lr_model.predict(self.X_test_scaled))
        print("Logistic Regression Performance:")
        print(lr_report_str)

        # Random Forest (raw inputs; trees do not need scaling)
        self.rf_model = RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42)
        self.rf_model.fit(self.X_train, self.y_train)
        rf_report_str, rf_report_dict = self._classification_reports(
            self.rf_model.predict(self.X_test))
        print("\nRandom Forest Performance:")
        print(rf_report_str)

        # Neural Network (scaled inputs)
        self.nn_model = MLPClassifier(hidden_layer_sizes=(64, 32), activation='relu',
                                      solver='adam', max_iter=1000, random_state=42)
        self.nn_model.fit(self.X_train_scaled, self.y_train)
        nn_report_str, nn_report_dict = self._classification_reports(
            self.nn_model.predict(self.X_test_scaled))
        print("\nNeural Network Performance:")
        print(nn_report_str)

        # XGBoost/GBM (raw inputs)
        self.xgb_model = XGBClassifier(
            objective='multi:softmax',
            num_class=len(self.state_labels),
            eval_metric='mlogloss',
            use_label_encoder=False,
            verbosity=0,
            random_state=42
        )
        unique_classes = np.array(label_keys)
        present_classes = np.unique(self.y_train)
        missing_classes = np.setdiff1d(unique_classes, present_classes)

        if missing_classes.size > 0:
            # Pad the training set with one dummy row (a copy of the last
            # training row) per class that is absent from y_train, so the
            # model is fitted with every class id present.
            dummy_rows = [self.X_train.iloc[[-1]].copy() for _ in missing_classes]
            X_train_extended = pd.concat([self.X_train] + dummy_rows, ignore_index=True)
            y_train_extended = pd.concat([self.y_train, pd.Series(missing_classes)], ignore_index=True)
            self.xgb_model.fit(X_train_extended, y_train_extended)
        else:
            self.xgb_model.fit(self.X_train, self.y_train)

        xgb_report_str, xgb_report_dict = self._classification_reports(
            self.xgb_model.predict(self.X_test))
        print("\nXGBoost Performance:")
        print(xgb_report_str)

        # SVM (on scaled data, RBF kernel recommended for non-linearities)
        self.svm_model = SVC(kernel='rbf', probability=True, random_state=42)
        self.svm_model.fit(self.X_train_scaled, self.y_train)
        svm_report_str, svm_report_dict = self._classification_reports(
            self.svm_model.predict(self.X_test_scaled))
        print("\nSVM (RBF) Performance:")
        print(svm_report_str)

        self.train_stacking_ensemble(label_keys, label_names)

        # Return all models and metrics
        return {
            "models": {
                "lr": self.lr_model,
                "rf": self.rf_model,
                "nn": self.nn_model,
                "xgb": self.xgb_model,
                "svm": self.svm_model,
            },
            "metrics": {
                "lr": {
                    "classification_report_str": lr_report_str,
                    "classification_report_dict": lr_report_dict,
                },
                "rf": {
                    "classification_report_str": rf_report_str,
                    "classification_report_dict": rf_report_dict,
                },
                "nn": {
                    "classification_report_str": nn_report_str,
                    "classification_report_dict": nn_report_dict,
                },
                "xgb": {
                    "classification_report_str": xgb_report_str,
                    "classification_report_dict": xgb_report_dict,
                },
                "svm": {
                    "classification_report_str": svm_report_str,
                    "classification_report_dict": svm_report_dict,
                },
            }
        }

    def train_stacking_ensemble(self, label_keys, label_names):
        """Fit a stacking classifier on the scaled training data and print its report.

        The base learners are fresh instances configured like the individual
        models in ``train_models``; a multinomial logistic regression combines
        their cross-validated predictions. The fitted model is stored as
        ``self.stacking_model``.

        NOTE(review): unlike ``train_models``, no dummy rows are added for
        classes missing from ``y_train`` before the XGBoost base learner is
        fitted — confirm this is safe for the data used.
        """
        base_estimators = [
            ('rf', RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42)),
            ('lr', LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000)),
            ('mlp', MLPClassifier(hidden_layer_sizes=(64, 32), activation='relu', max_iter=1000, random_state=42)),
            ('xgb', XGBClassifier(objective='multi:softmax', num_class=len(self.state_labels),
                                  eval_metric='mlogloss', use_label_encoder=False, verbosity=0, random_state=42)),
            ('svm', SVC(kernel='rbf', probability=True, random_state=42))
        ]
        meta_learner = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000)
        self.stacking_model = StackingClassifier(
            estimators=base_estimators,
            final_estimator=meta_learner,
            cv=5,
            passthrough=False,
            n_jobs=-1
        )
        self.stacking_model.fit(self.X_train_scaled, self.y_train)
        stacking_pred = self.stacking_model.predict(self.X_test_scaled)
        print("\nSTACKING ENSEMBLE PERFORMANCE:")
        # Always pass both labels and target_names so the report does not
        # fail when a class is absent from the test split.
        print(classification_report(
            self.y_test, stacking_pred, labels=label_keys, target_names=label_names, zero_division=0
        ))

    def analyze_state_transitions(self):
        """Fit a 5-state categorical HMM on the label sequence ``self.y``.

        Stores the model as ``self.hmm_model`` and its transition matrix as
        ``self.transition_matrix``.

        NOTE(review): the HMM's hidden states are learned, so row/column i of
        the matrix is not guaranteed to correspond to regime id i — verify
        before presenting it with the regime names.
        """
        self.hmm_model = hmm.CategoricalHMM(n_components=5, n_iter=100)
        self.hmm_model.fit(self.y.values.reshape(-1, 1))
        self.transition_matrix = self.hmm_model.transmat_

    def run_analysis(self):
        """Run the full pipeline: features, labels, model data, training, HMM."""
        self.preprocess_data()
        self.label_states()
        self.prepare_model_data()
        self.train_models()
        self.analyze_state_transitions()

    @staticmethod
    def analyze_multiple_files(train_paths, new_data_paths, n_models=N_MODELS):
        """Train on each training CSV and score the CSV at the same position.

        Parameters
        ----------
        train_paths, new_data_paths : sequence of str
            CSV paths, paired positionally. Each CSV needs a ``timestamp``
            column. If the lengths differ, only the first
            ``min(len(train_paths), len(new_data_paths))`` pairs are
            processed and a warning is printed.
        n_models : int
            Number of Monte Carlo models trained per training file.

        Returns
        -------
        list of dict
            One dict per processed pair with the keys 'train_path',
            'new_data_path', 'models', 'transition_matrices', 'params' and
            'ensemble_preds'.
        """
        train_paths = list(train_paths)
        new_data_paths = list(new_data_paths)
        if len(train_paths) != len(new_data_paths):
            # zip() would otherwise drop the unmatched files without a trace.
            print(f"WARNING: {len(train_paths)} training file(s) but {len(new_data_paths)} "
                  f"new-data file(s); only the first "
                  f"{min(len(train_paths), len(new_data_paths))} pair(s) will be processed.")

        results = []
        for train_path, new_path in zip(train_paths, new_data_paths):
            print(f"\n--- Processing TRAIN: {train_path}, NEW: {new_path} ---")
            train_df = pd.read_csv(train_path, parse_dates=["timestamp"])
            models, trans_matrices, params = train_models_montecarlo(train_df, n_models=n_models)
            ensemble_preds = ensemble_predict(models, params, new_path)
            results.append({
                'train_path': train_path,
                'new_data_path': new_path,
                'models': models,
                'transition_matrices': trans_matrices,
                'params': params,
                'ensemble_preds': ensemble_preds
            })
        return results

# --------------------------
# MONTE CARLO TRAINING SETUP
# --------------------------
def random_params():
    """Draw one random hyper-parameter combination for a regime analyzer.

    Returns
    -------
    tuple
        ``(lookback_window, volatility_threshold, chop_threshold)`` where the
        window is picked from a fixed list and both thresholds are uniform
        draws from [0.3, 0.7] rounded to two decimals.
    """
    window_choices = [10, 15, 20, 25, 30]
    window = random.choice(window_choices)
    # Draw order matters for reproducibility under a fixed seed:
    # volatility threshold first, then chop threshold.
    vol_cutoff, chop_cutoff = (round(random.uniform(0.3, 0.7), 2) for _ in range(2))
    return window, vol_cutoff, chop_cutoff

def train_models_montecarlo(train_df, n_models=N_MODELS, seed=None):
    """Train several analyzers, each with randomly sampled hyper-parameters.

    Parameters
    ----------
    train_df : pandas.DataFrame
        OHLCV training data; each analyzer receives its own copy.
    n_models : int
        Number of analyzers to train.
    seed : int, optional
        If given, seeds Python's global ``random`` module before sampling so
        the same hyper-parameter combinations are drawn on every run. It does
        not affect randomness inside the fitted models themselves. Left
        unseeded (the previous behaviour) when ``None``.

    Returns
    -------
    tuple
        ``(models, transition_matrices, model_params)``: the fitted
        ``MarketRegimeAnalyzer`` objects, their HMM transition matrices, and
        one dict of sampled parameters per model, all in training order.
    """
    if seed is not None:
        random.seed(seed)

    models = []
    transition_matrices = []
    model_params = []

    for i in range(n_models):
        lookback_window, volatility_threshold, chop_threshold = random_params()
        analyzer = MarketRegimeAnalyzer(
            train_df.copy(),
            lookback_window=lookback_window,
            volatility_threshold=volatility_threshold,
            chop_threshold=chop_threshold
        )
        analyzer.run_analysis()
        models.append(analyzer)
        transition_matrices.append(analyzer.transition_matrix)
        model_params.append({
            'lookback_window': lookback_window,
            'volatility_threshold': volatility_threshold,
            'chop_threshold': chop_threshold
        })
        # Printed after training, so it appears below that model's reports.
        print(f"Model {i+1}: window={lookback_window}, vol={volatility_threshold}, chop={chop_threshold}")

    return models, transition_matrices, model_params

# --------------------------
# ENSEMBLE APPLICATION BLOCK
# --------------------------
def ensemble_predict(models, model_params, new_data_path):
    """Apply every trained analyzer to an unseen CSV and collect the predictions.

    Parameters
    ----------
    models : list of MarketRegimeAnalyzer
        Analyzers that have already been through ``run_analysis``.
    model_params : list of dict
        Parameters used for each analyzer (same order as ``models``); only
        used to build the prediction column names.
    new_data_path : str
        CSV with a ``timestamp`` column plus OHLCV columns.

    Returns
    -------
    pandas.DataFrame
        The raw CSV contents plus an ``asset`` column (the file name) and one
        prediction column per model and classifier. Rows lost to indicator
        warm-up have NaN predictions.

    Raises
    ------
    KeyError
        If a feature the analyzer was trained on cannot be built from the
        new data.

    Notes
    -----
    Each analyzer's ``df`` attribute is overwritten with the featurised new
    data as a side effect.
    NOTE(review): two models that sampled identical parameters produce the
    same column names, so the later one overwrites the earlier one.
    """
    new_df_raw = pd.read_csv(new_data_path, parse_dates=["timestamp"])
    new_df_raw['asset'] = os.path.basename(new_data_path)  # Attach asset to incoming unseen data
    # Predictions are written into a copy so the raw frame stays untouched.
    result_df = new_df_raw.copy()

    base_features = ['returns', 'volatility', 'norm_volatility', 'sma', 'ema',
                     'adx', 'rsi', 'volume_change', 'chop']

    for i, analyzer in enumerate(models):
        params = model_params[i]
        param_suffix = f"_window{params['lookback_window']}_vol{params['volatility_threshold']}_chop{params['chop_threshold']}"

        analyzer.df = analyzer.normalize_column_names(new_df_raw.copy())
        analyzer.validate_input_columns()
        analyzer.preprocess_data()

        # Build lag/smoothing columns with the same constants used during
        # training, so the column names match analyzer.model_features even
        # if LAG_FEATURES / MA1 / MA2 are changed in the config cell.
        for col in base_features:
            for lag in range(1, LAG_FEATURES + 1):
                analyzer.df[f'{col}_lag{lag}'] = analyzer.df[col].shift(lag)
            analyzer.df[f'{col}_ma{MA1}'] = analyzer.df[col].rolling(MA1).mean()
            analyzer.df[f'{col}_ma{MA2}'] = analyzer.df[col].rolling(MA2).mean()
        analyzer.df = analyzer.df.dropna()

        missing = [col for col in analyzer.model_features if col not in analyzer.df.columns]
        if missing:
            raise KeyError(
                f"New data '{new_data_path}' lacks features required by model {i + 1}: {missing}"
            )

        X_new = analyzer.df[analyzer.model_features]
        X_new_scaled = analyzer.scaler.transform(X_new)
        rows = analyzer.df.index

        # Tree models were trained on raw features, the others on scaled ones.
        predictions = {
            'RF_Prediction': analyzer.rf_model.predict(X_new),
            'LR_Prediction': analyzer.lr_model.predict(X_new_scaled),
            'MLP_Prediction': analyzer.nn_model.predict(X_new_scaled),
            'XGB_Prediction': analyzer.xgb_model.predict(X_new),
            'SVM_Prediction': analyzer.svm_model.predict(X_new_scaled),
        }
        # Stacking predictions: only if stacking_model exists
        if hasattr(analyzer, 'stacking_model'):
            predictions['StackingPrediction'] = analyzer.stacking_model.predict(X_new_scaled)

        for name, values in predictions.items():
            result_df.loc[rows, f'{name}{param_suffix}'] = values

    return result_df


def markov_transition_matrix(labels):
    """
    Build the empirical first-order transition probability matrix of a label sequence.

    Rows are the "from" state and columns the "to" state, both ordered by
    first appearance in `labels`. Each row is normalised by its own total, so
    a state that never has a successor yields a row of NaN.
    """
    observed_states = pd.Series(labels).unique()
    counts = pd.DataFrame(0, index=observed_states, columns=observed_states, dtype=float)

    current_labels = labels[:-1]
    next_labels = labels[1:]
    for src, dst in zip(current_labels, next_labels):
        # Skip pairs involving a value that is not a recognised state.
        if not (src in observed_states) or not (dst in observed_states):
            continue
        counts.loc[src, dst] += 1

    row_totals = counts.sum(axis=1)
    probabilities = counts.div(row_totals, axis=0)
    return probabilities

def load_and_concat_csvs(paths, parse_dates=None):
    """Read several CSV files and stack them into one DataFrame.

    `parse_dates` is forwarded to `pd.read_csv` for every file. The stacked
    frame gets a fresh 0..n-1 index; if `parse_dates` names at least one
    column, the rows are then sorted by the first of those columns (the index
    is not reset after sorting).
    """
    frames = [pd.read_csv(path, parse_dates=parse_dates) for path in paths]
    combined = pd.concat(frames, ignore_index=True)

    # Without a date column there is nothing to order by.
    if parse_dates is None or len(parse_dates) == 0:
        return combined

    sort_column = parse_dates[0]
    return combined.sort_values(sort_column)

#multi currency featurization 
def featurize_asset(csv_path):
    """Load one asset's CSV and return its featurised, regime-labelled frame.

    Parameters
    ----------
    csv_path : str
        Path to a single-asset CSV with a ``timestamp`` column plus OHLCV
        columns.

    Returns
    -------
    pandas.DataFrame
        The analyzer's frame after indicator calculation, state labelling and
        lag/moving-average feature generation, including an ``asset`` column
        set to the CSV's file name.
    """
    df = pd.read_csv(csv_path, parse_dates=["timestamp"])
    # Tag rows with the source file name, the same way ensemble_predict does,
    # so frames from several assets can be told apart after concatenation.
    df['asset'] = os.path.basename(csv_path)

    # Features are rolling/lagged, so each asset must be processed on its own
    # uninterrupted series before any merging.
    analyzer = MarketRegimeAnalyzer(df.copy(), lookback_window=20, volatility_threshold=0.5, chop_threshold=0.5)
    analyzer.preprocess_data()
    analyzer.label_states()
    analyzer.prepare_model_data()
    # Optionally drop columns not needed after featurization/labeling.
    return analyzer.df

In [41]:
# ----------
# MAIN BLOCK
# ----------
if __name__ == "__main__":
    # Train one Monte Carlo ensemble per (train file, new-data file) pair.
    # To discover training files automatically instead, use e.g.:
    # train_files = glob.glob("./*_USDT_1D.csv")
    batch_results = MarketRegimeAnalyzer.analyze_multiple_files(train_files, new_files)
    if not batch_results:
        raise RuntimeError("No (train, new) file pairs were processed; check train_files and new_files.")

    # Show a preview of the predictions for each batch
    for result in batch_results:
        print(f"\nResults for train: {result['train_path']} | new: {result['new_data_path']}")
        print(result['ensemble_preds'].head())

    # Use the first batch for the demonstration below. These names are bound
    # here explicitly so the cell also runs on a fresh kernel.
    first_batch = batch_results[0]
    models = first_batch['models']
    model_params = first_batch['params']

    # Take the transition matrix from the first model for demonstration
    analyzer = models[0]  # e.g., "best" or first model
    print("\nHMM State Transition Matrix:")
    hmm_mat = pd.DataFrame(analyzer.transition_matrix,
                           index=list(analyzer.state_labels.values()),
                           columns=list(analyzer.state_labels.values()))
    print(hmm_mat.round(3))

    print("\nEmpirical Transition Matrix (markov_transition_matrix):")
    emp_mat = markov_transition_matrix(analyzer.y.values)
    emp_mat.index = [analyzer.state_labels.get(x, x) for x in emp_mat.index]
    emp_mat.columns = [analyzer.state_labels.get(x, x) for x in emp_mat.columns]
    print(emp_mat.round(3))

    # Apply all models to unseen data for ensemble prediction output
    final_df = ensemble_predict(models, model_params, NEW_DATA_PATH)
    print(final_df.head(30))

    # (Optional) Save results
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    outname = f"markov_labeled_result_{timestamp}.csv"
    outdir = './dir'
    # exist_ok avoids the check-then-create race and a crash on re-runs.
    os.makedirs(outdir, exist_ok=True)

    fullname = os.path.join(outdir, outname)

    final_df.to_csv(fullname, index=False)
        



--- Processing TRAIN: 20250202-20170908_BTC-USDT_1D.csv, NEW: 20250127-20170907_XRP-USDT_1H.csv ---




Logistic Regression Performance:
              precision    recall  f1-score   support

      Rising       0.64      0.22      0.33       210
     Falling       0.52      0.91      0.66       230
      Steady       0.00      0.00      0.00         7
      Choppy       0.08      0.11      0.09        19
    No Label       0.40      0.19      0.26        63

    accuracy                           0.51       529
   macro avg       0.33      0.29      0.27       529
weighted avg       0.53      0.51      0.45       529


Random Forest Performance:
              precision    recall  f1-score   support

      Rising       0.56      0.32      0.41       210
     Falling       0.50      0.89      0.64       230
      Steady       0.00      0.00      0.00         7
      Choppy       0.00      0.00      0.00        19
    No Label       0.00      0.00      0.00        63

    accuracy                           0.52       529
   macro avg       0.21      0.24      0.21       529
weighted avg    




STACKING ENSEMBLE PERFORMANCE:
              precision    recall  f1-score   support

      Rising       0.61      0.43      0.51       210
     Falling       0.52      0.86      0.65       230
      Steady       0.00      0.00      0.00         7
      Choppy       0.00      0.00      0.00        19
    No Label       0.00      0.00      0.00        63

    accuracy                           0.55       529
   macro avg       0.23      0.26      0.23       529
weighted avg       0.47      0.55      0.48       529

Model 1: window=10, vol=0.35, chop=0.61




Logistic Regression Performance:
              precision    recall  f1-score   support

      Rising       0.62      0.24      0.34       210
     Falling       0.51      0.89      0.65       230
      Steady       0.28      0.40      0.33        25
      Choppy       0.00      0.00      0.00        16
    No Label       0.40      0.08      0.14        48

    accuracy                           0.51       529
   macro avg       0.36      0.32      0.29       529
weighted avg       0.52      0.51      0.45       529


Random Forest Performance:
              precision    recall  f1-score   support

      Rising       0.62      0.39      0.48       210
     Falling       0.53      0.90      0.66       230
      Steady       1.00      0.08      0.15        25
      Choppy       0.00      0.00      0.00        16
    No Label       0.00      0.00      0.00        48

    accuracy                           0.55       529
   macro avg       0.43      0.27      0.26       529
weighted avg    




STACKING ENSEMBLE PERFORMANCE:
              precision    recall  f1-score   support

      Rising       0.59      0.40      0.48       210
     Falling       0.51      0.86      0.64       230
      Steady       0.00      0.00      0.00        25
      Choppy       0.00      0.00      0.00        16
    No Label       0.00      0.00      0.00        48

    accuracy                           0.53       529
   macro avg       0.22      0.25      0.22       529
weighted avg       0.46      0.53      0.47       529

Model 2: window=10, vol=0.67, chop=0.58




Logistic Regression Performance:
              precision    recall  f1-score   support

      Rising       0.62      0.25      0.36       207
     Falling       0.51      0.82      0.63       225
      Steady       0.00      0.00      0.00         0
      Choppy       0.09      0.14      0.11        44
    No Label       1.00      0.04      0.09        45

    accuracy                           0.47       521
   macro avg       0.44      0.25      0.23       521
weighted avg       0.56      0.47      0.43       521


Random Forest Performance:
              precision    recall  f1-score   support

      Rising       0.56      0.26      0.35       207
     Falling       0.51      0.90      0.65       225
      Steady       0.00      0.00      0.00         0
      Choppy       0.00      0.00      0.00        44
    No Label       0.38      0.22      0.28        45

    accuracy                           0.51       521
   macro avg       0.29      0.28      0.26       521
weighted avg    




STACKING ENSEMBLE PERFORMANCE:
              precision    recall  f1-score   support

      Rising       0.70      0.25      0.37       207
     Falling       0.49      0.94      0.64       225
      Steady       0.00      0.00      0.00         0
      Choppy       0.00      0.00      0.00        44
    No Label       0.09      0.02      0.04        45

    accuracy                           0.51       521
   macro avg       0.26      0.24      0.21       521
weighted avg       0.50      0.51      0.43       521

Model 3: window=30, vol=0.34, chop=0.39


  new_df_raw = pd.read_csv(new_data_path, parse_dates=["timestamp"])



Results for train: 20250202-20170908_BTC-USDT_1D.csv | new: 20250127-20170907_XRP-USDT_1H.csv
  timestamp     open     high      low    close        volume    volume_ccy   
0   59:33.5  1.71209  1.73399  1.65599  1.72140  107565.79170  183888.35090  \
1   00:43.5  1.72140  1.79689  1.71599  1.76499   30480.46446   53701.94534   
2   59:42.5  1.75780  1.80000  1.70280  1.71799   37783.23588   65624.52143   
3   00:52.5  1.71819  2.43100  1.71730  2.24980  266882.29630  558934.42530   
4   59:51.4  2.20029  2.33999  2.06000  2.22869  185693.73370  405957.90970   

    volCcyQuote                              asset   
0  183888.35090  20250127-20170907_XRP-USDT_1H.csv  \
1   53701.94534  20250127-20170907_XRP-USDT_1H.csv   
2   65624.52143  20250127-20170907_XRP-USDT_1H.csv   
3  558934.42530  20250127-20170907_XRP-USDT_1H.csv   
4  405957.90970  20250127-20170907_XRP-USDT_1H.csv   

   RF_Prediction_window10_vol0.35_chop0.61  ...   
0                                      NaN  ...  \
1  

  new_df_raw = pd.read_csv(new_data_path, parse_dates=["timestamp"])


   timestamp     open     high      low    close        volume    volume_ccy   
0    59:33.5  1.71209  1.73399  1.65599  1.72140  107565.79170  183888.35090  \
1    00:43.5  1.72140  1.79689  1.71599  1.76499   30480.46446   53701.94534   
2    59:42.5  1.75780  1.80000  1.70280  1.71799   37783.23588   65624.52143   
3    00:52.5  1.71819  2.43100  1.71730  2.24980  266882.29630  558934.42530   
4    59:51.4  2.20029  2.33999  2.06000  2.22869  185693.73370  405957.90970   
5    01:01.4  2.19989  2.22869  2.02009  2.06320  206695.76520  439140.98090   
6    00:00.4  2.06749  2.09989  1.98960  2.00999   66905.13157  136288.85800   
7    58:59.3  2.00209  2.08490  1.94019  2.02000   45139.39373   90964.41366   
8    00:09.3  2.02680  2.13870  2.02660  2.04230   62410.44127  129729.67880   
9    59:08.3  2.04780  2.08020  1.97780  2.02050   31937.20438   65058.00772   
10   00:18.3  2.01989  2.01989  1.90030  1.98889   28180.99227   55263.43977   
11   59:17.2  1.98889  1.98889  1.83899 