In [2]:
import numpy as np
import pandas as pd
import random
from scipy.signal import argrelextrema
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from hmmlearn import hmm
import talib
import re
import os
from sklearn.svm import SVC
from sklearn.ensemble import StackingClassifier
import xgboost
from xgboost import XGBClassifier
from sklearn.svm import SVC
import datetime


#from pathlib import Path



In [3]:
# ------------ CONFIGURABLE PARAMETERS ------------
N_MODELS = 3                                       # Number of MC-sim ensemble models
TRAIN_PATH = '20250202-20170908_BTC-USDT_1D.csv'
NEW_DATA_PATH = '20250127-20170907_XRP-USDT_1H.csv'
DEFAULT_LOOKBACK_WINDOW = 20                        # Used for classic technical/feature rolling
DEFAULT_VOL_WINDOW = 20                             # Volatility rolling window
DEFAULT_CHOP_WINDOW = 20                            # Choppiness rolling window
UPDOWN_PERIOD = 14                                  # For up/down period calculations
UPDOWN_BIN_OFFSET = 1                               # For upbin result
RSI_PERIOD = 14                                     # For custom RSI/gain/loss
MFI_WINDOW = 14                                     # Money Flow Index window
AVG_GAIN_WINDOW = 14                                # For avg gain/loss
LAG_FEATURES = 3                                    # Number of lags to compute for each feature
MA1 = 5                                             # Short moving avg window
MA2 = 10                                            # Long moving avg window


In [4]:
# ------------ FEATURE FUNCTIONS -------------
def get_up_or_down(df, period):
    df['gain_{}'.format(period)] = 0.0
    df['loss_{}'.format(period)] = 0.0
    for i in range(1, len(df)):
        diff = df.iloc[i]['close'] - df.iloc[i-1]['close']
        df.at[df.index[i], 'gain_{}'.format(period)] = max(diff, 0)
        df.at[df.index[i], 'loss_{}'.format(period)] = max(-diff, 0)
    return df

def get_up_or_down_bin(df, offset):
    df['updown_{}'.format(offset)] = 0
    for i in range(offset, len(df)):
        if df.iloc[i]['close'] > df.iloc[i-offset]['close']:
            df.at[df.index[i], 'updown_{}'.format(offset)] = 1
        elif df.iloc[i]['close'] < df.iloc[i-offset]['close']:
            df.at[df.index[i], 'updown_{}'.format(offset)] = -1
    return df

def get_average_gains(df, period):
    df['ag_{}'.format(period)] = 0.0
    df['al_{}'.format(period)] = 0.0
    for i in range(period, len(df)):
        up = df['gain_{}'.format(period)].iloc[i-period+1:i+1].mean()
        down = df['loss_{}'.format(period)].iloc[i-period+1:i+1].mean()
        df.at[df.index[i], 'ag_{}'.format(period)] = up
        df.at[df.index[i], 'al_{}'.format(period)] = down
    return df

def get_relative_strength(df, period):
    df = get_up_or_down(df, period)
    df = get_average_gains(df, period)
    df['rs_{}'.format(period)] = 0.0
    df['rsi_{}'.format(period)] = 0.0
    for i in range(period, len(df)):
        if df.at[df.index[i], 'al_{}'.format(period)] != 0:
            rs = df.at[df.index[i], 'ag_{}'.format(period)] / df.at[df.index[i], 'al_{}'.format(period)]
            df.at[df.index[i], 'rs_{}'.format(period)] = rs
            df.at[df.index[i], 'rsi_{}'.format(period)] = 100 - (100 / (1 + rs))
        else:
            df.at[df.index[i], 'rs_{}'.format(period)] = 0
            df.at[df.index[i], 'rsi_{}'.format(period)] = 100
    return df

def money_flow_index(df, window):
    tp = (df['high'] + df['low'] + df['close']) / 3
    rmf = tp * df['volume']
    pos_mf = []
    neg_mf = []
    for i in range(1, len(df)):
        if tp.iloc[i] > tp.iloc[i-1]:
            pos_mf.append(rmf.iloc[i])
            neg_mf.append(0)
        elif tp.iloc[i] < tp.iloc[i-1]:
            pos_mf.append(0)
            neg_mf.append(rmf.iloc[i])
        else:
            pos_mf.append(0)
            neg_mf.append(0)
    pos_mf = pd.Series([0] + pos_mf, index=df.index)
    neg_mf = pd.Series([0] + neg_mf, index=df.index)
    pos_mf_rolling = pos_mf.rolling(window).sum()
    neg_mf_rolling = neg_mf.rolling(window).sum()
    mfr = pos_mf_rolling / (neg_mf_rolling + 1e-10)
    mfi = 100 - 100 / (1 + mfr)
    df['mfi_{}'.format(window)] = mfi
    return df

def get_ci(df, lookback):
    tr1 = (df['high'] - df['low']).abs()
    tr2 = (df['high'] - df['close'].shift(1)).abs()
    tr3 = (df['low'] - df['close'].shift(1)).abs()
    tr = pd.concat([tr1, tr2, tr3], axis=1).max(axis=1)
    atr = tr.rolling(1).mean()
    highh = df['high'].rolling(lookback).max()
    lowl = df['low'].rolling(lookback).min()
    ci = 100 * np.log10((atr.rolling(lookback).sum()) / (highh - lowl)) / np.log10(lookback)
    df['ci_{}'.format(lookback)] = ci
    return df

In [20]:
# ----------------------------
# MARKET REGIME ANALYZER CLASS
# ----------------------------
class MarketRegimeAnalyzer:
    def __init__(self, ohlc_df, lookback_window=20, volatility_threshold=0.5, chop_threshold=0.5):
        self.df = self.normalize_column_names(ohlc_df.copy())
        self.validate_input_columns()
        self.lookback_window = lookback_window
        self.volatility_threshold = volatility_threshold
        self.chop_threshold = chop_threshold
        self.state_labels = {
            0: 'Rising',
            1: 'Falling',
            2: 'Steady',
            3: 'Choppy',
            4: 'No Label'
        }

    def normalize_column_names(self, df):
        column_mapping = {}
        for col in df.columns:
            normalized = col.strip().lower()
            normalized = re.sub(r'[^a-z0-9]', '', normalized)
            column_mapping[col] = normalized
        required_columns = {
            'open': ['open', 'op', 'o'],
            'high': ['high', 'hi', 'h'],
            'low': ['low', 'lo', 'l'],
            'close': ['close', 'cl', 'c', 'last'],
            'volume': ['volume', 'vol', 'v', 'qty']
        }
        final_mapping = {}
        available_columns = set(column_mapping.values())
        for standard_name, variants in required_columns.items():
            for variant in variants:
                if variant in available_columns:
                    for orig_col, normalized_col in column_mapping.items():
                        if normalized_col == variant:
                            final_mapping[standard_name] = orig_col
                            break
                    break
        df = df.rename(columns={v: k for k, v in final_mapping.items()})
        return df

    def validate_input_columns(self):
        required_columns = ['open', 'high', 'low', 'close', 'volume']
        missing = [col for col in required_columns if col not in self.df.columns]
        if missing:
            raise ValueError(f"Missing required columns after normalization: {missing}")
        for col in required_columns:
            self.df[col] = pd.to_numeric(self.df[col], errors='coerce')
        self.df = self.df.dropna(subset=required_columns)

    def preprocess_data(self):
        self.df['returns'] = self.df['close'].pct_change()
        self.df['volatility'] = self.df['close'].rolling(window=self.lookback_window).std()
        self.df['avg_volatility'] = self.df['volatility'].rolling(window=self.lookback_window).mean()
        self.df['norm_volatility'] = self.df['volatility'] / self.df['avg_volatility']
        self.df['sma'] = talib.SMA(self.df['close'], timeperiod=self.lookback_window)
        self.df['ema'] = talib.EMA(self.df['close'], timeperiod=self.lookback_window)
        self.df['adx'] = talib.ADX(self.df['high'], self.df['low'], self.df['close'], timeperiod=self.lookback_window)
        self.df['rsi'] = talib.RSI(self.df['close'], timeperiod=self.lookback_window)
        self.df['volume_sma'] = talib.SMA(self.df['volume'], timeperiod=self.lookback_window)
        self.df['volume_change'] = self.df['volume'] / self.df['volume_sma']
        self.df['local_min'] = self.df['close'] == self.df['close'].rolling(window=5, center=True).min()
        self.df['local_max'] = self.df['close'] == self.df['close'].rolling(window=5, center=True).max()
        self.df['atr'] = talib.ATR(self.df['high'], self.df['low'], self.df['close'], timeperiod=self.lookback_window)
        # Choppiness index
        self.df['chop'] = 100 * np.log10(self.df['atr'].rolling(window=self.lookback_window).sum() /
              (self.df['high'].rolling(window=self.lookback_window).max() -
               self.df['low'].rolling(window=self.lookback_window).min())) / np.log10(self.lookback_window)
        self.df = self.df.dropna()

    def label_states(self):
        self.df['state'] = 4 # Default to 'No Label'
        min_indices = argrelextrema(self.df['close'].values, np.less_equal, order=5)[0]
        max_indices = argrelextrema(self.df['close'].values, np.greater_equal, order=5)[0]
        for i in range(len(min_indices)-1):
            start_idx = min_indices[i]
            end_candidates = max_indices[max_indices > start_idx]
            end_idx = end_candidates[0] if len(end_candidates) > 0 else len(self.df)-1
            if all(self.df['close'].iloc[start_idx:end_idx+1] >= self.df['close'].iloc[start_idx]):
                self.df.loc[self.df.index[start_idx]:self.df.index[end_idx],'state'] = 0
        for i in range(len(max_indices)-1):
            start_idx = max_indices[i]
            end_candidates = min_indices[min_indices > start_idx]
            end_idx = end_candidates[0] if len(end_candidates) > 0 else len(self.df)-1
            if all(self.df['close'].iloc[start_idx:end_idx+1] <= self.df['close'].iloc[start_idx]):
                self.df.loc[self.df.index[start_idx]:self.df.index[end_idx],'state'] = 1
        steady_mask = (self.df['norm_volatility'] < self.volatility_threshold) & (self.df['state'] == 4)
        self.df.loc[steady_mask, 'state'] = 2
        chop_mask = (self.df['chop'] > self.chop_threshold) & (self.df['adx'] < 25) & (self.df['state'] == 4)
        self.df.loc[chop_mask, 'state'] = 3

    def prepare_model_data(self):
        # Expanded feature list (using parameter variables)
        feature_cols = [
            'returns','volatility','norm_volatility','sma','ema','adx','rsi','volume_change','chop',
            f'gain_{UPDOWN_PERIOD}',f'loss_{UPDOWN_PERIOD}',f'updown_{UPDOWN_BIN_OFFSET}',
            f'ag_{AVG_GAIN_WINDOW}',f'al_{AVG_GAIN_WINDOW}',f'rs_{RSI_PERIOD}',f'rsi_{RSI_PERIOD}',
            f'mfi_{MFI_WINDOW}',f'ci_{self.lookback_window}'
        ]
        # Generate lags and smoothers
        for col in feature_cols:
            if col in self.df.columns:
                for lag in range(1, LAG_FEATURES + 1):
                    self.df[f'{col}_lag{lag}'] = self.df[col].shift(lag)
                self.df[f'{col}_ma{MA1}'] = self.df[col].rolling(MA1).mean()
                self.df[f'{col}_ma{MA2}'] = self.df[col].rolling(MA2).mean()
        self.df = self.df.dropna()
        numeric_cols = self.df.select_dtypes(include=[np.number]).columns
        exclude_cols = ['state', 'local_min', 'local_max']
        model_features = [col for col in numeric_cols if col not in exclude_cols]
        self.X = self.df[model_features]
        self.y = self.df['state']
        self.model_features = model_features 

        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X, self.y, test_size=0.2, shuffle=False)
        self.scaler = StandardScaler()
        self.X_train_scaled = self.scaler.fit_transform(self.X_train)
        self.X_test_scaled = self.scaler.transform(self.X_test)

    def train_models(self):
        """Train logistic regression, random forest, neural network, XGBoost, and SVM models. Return all models and their test set metrics."""
        label_keys = list(self.state_labels.keys())
        label_names = list(self.state_labels.values())

        # Logistic Regression
        self.lr_model = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000)
        self.lr_model.fit(self.X_train_scaled, self.y_train)
        lr_pred = self.lr_model.predict(self.X_test_scaled)
        lr_report_str = classification_report(
            self.y_test, lr_pred, labels=label_keys, target_names=label_names, zero_division=0
        )
        lr_report_dict = classification_report(
            self.y_test, lr_pred, labels=label_keys, target_names=label_names, output_dict=True, zero_division=0
        )
        print("Logistic Regression Performance:")
        print(lr_report_str)

        # Random Forest
        self.rf_model = RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42)
        self.rf_model.fit(self.X_train, self.y_train)
        rf_pred = self.rf_model.predict(self.X_test)
        rf_report_str = classification_report(
            self.y_test, rf_pred, labels=label_keys, target_names=label_names, zero_division=0
        )
        rf_report_dict = classification_report(
            self.y_test, rf_pred, labels=label_keys, target_names=label_names, output_dict=True, zero_division=0
        )
        print("\nRandom Forest Performance:")
        print(rf_report_str)

        # Neural Network
        self.nn_model = MLPClassifier(hidden_layer_sizes=(64, 32), activation='relu',
                                      solver='adam', max_iter=1000, random_state=42)
        self.nn_model.fit(self.X_train_scaled, self.y_train)
        nn_pred = self.nn_model.predict(self.X_test_scaled)
        nn_report_str = classification_report(
            self.y_test, nn_pred, labels=label_keys, target_names=label_names, zero_division=0
        )
        nn_report_dict = classification_report(
            self.y_test, nn_pred, labels=label_keys, target_names=label_names, output_dict=True, zero_division=0
        )
        print("\nNeural Network Performance:")
        print(nn_report_str)

        # XGBoost/GBM
        self.xgb_model = XGBClassifier(objective='multi:softmax', num_class=len(self.state_labels),
                                       eval_metric='mlogloss', use_label_encoder=False, verbosity=0, random_state=42)
        self.xgb_model.fit(self.X_train, self.y_train)
        xgb_pred = self.xgb_model.predict(self.X_test)
        xgb_report_str = classification_report(
            self.y_test, xgb_pred, labels=label_keys, target_names=label_names, zero_division=0
        )
        xgb_report_dict = classification_report(
            self.y_test, xgb_pred, labels=label_keys, target_names=label_names, output_dict=True, zero_division=0
        )
        print("\nXGBoost Performance:")
        print(xgb_report_str)

        # SVM (on scaled data, RBF kernel recommended for non-linearities)
        self.svm_model = SVC(kernel='rbf', probability=True, random_state=42)
        self.svm_model.fit(self.X_train_scaled, self.y_train)
        svm_pred = self.svm_model.predict(self.X_test_scaled)
        svm_report_str = classification_report(
            self.y_test, svm_pred, labels=label_keys, target_names=label_names, zero_division=0
        )
        svm_report_dict = classification_report(
            self.y_test, svm_pred, labels=label_keys, target_names=label_names, output_dict=True, zero_division=0
        )
        print("\nSVM (RBF) Performance:")
        print(svm_report_str)

        self.train_stacking_ensemble(label_keys, label_names)

        # Return all models and metrics
        return {
            "models": {
                "lr": self.lr_model,
                "rf": self.rf_model,
                "nn": self.nn_model,
                "xgb": self.xgb_model,
                "svm": self.svm_model,
            },
            "metrics": {
                "lr": {
                    "classification_report_str": lr_report_str,
                    "classification_report_dict": lr_report_dict,
                },
                "rf": {
                    "classification_report_str": rf_report_str,
                    "classification_report_dict": rf_report_dict,
                },
                "nn": {
                    "classification_report_str": nn_report_str,
                    "classification_report_dict": nn_report_dict,
                },
                "xgb": {
                    "classification_report_str": xgb_report_str,
                    "classification_report_dict": xgb_report_dict,
                },
                "svm": {
                    "classification_report_str": svm_report_str,
                    "classification_report_dict": svm_report_dict,
                },
            }
        }

    def train_stacking_ensemble(self, label_keys, label_names):
        base_estimators = [
            ('rf', RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42)),
            ('lr', LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000)),
            ('mlp', MLPClassifier(hidden_layer_sizes=(64, 32), activation='relu', max_iter=1000, random_state=42)),
            ('xgb', XGBClassifier(objective='multi:softmax', num_class=len(self.state_labels),
                                  eval_metric='mlogloss', use_label_encoder=False, verbosity=0, random_state=42)),
            ('svm', SVC(kernel='rbf', probability=True, random_state=42))
        ]
        meta_learner = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000)
        self.stacking_model = StackingClassifier(
            estimators=base_estimators,
            final_estimator=meta_learner,
            cv=5,
            passthrough=False,
            n_jobs=-1
        )
        self.stacking_model.fit(self.X_train_scaled, self.y_train)
        stacking_pred = self.stacking_model.predict(self.X_test_scaled)
        print("\nSTACKING ENSEMBLE PERFORMANCE:")
        # Always specify both labels and target_names (EXPLICIT FIX for your ValueError)
        print(classification_report(
            self.y_test, stacking_pred, labels=label_keys, target_names=label_names, zero_division=0
        )) 

    def analyze_state_transitions(self):
        self.hmm_model = hmm.CategoricalHMM(n_components=5, n_iter=100)
        self.hmm_model.fit(self.y.values.reshape(-1, 1))
        self.transition_matrix = self.hmm_model.transmat_

    def run_analysis(self):
        self.preprocess_data()
        self.label_states()
        self.prepare_model_data()
        self.train_models()
        self.analyze_state_transitions()

# --------------------------
# MONTE CARLO TRAINING SETUP
# --------------------------
def random_params():
    lookback_window = random.choice([10, 15, 20, 25, 30])
    volatility_threshold = round(random.uniform(0.3, 0.7), 2)
    chop_threshold = round(random.uniform(0.3, 0.7), 2)
    return lookback_window, volatility_threshold, chop_threshold

def train_models_montecarlo(train_df, n_models=N_MODELS):
    models = []
    transition_matrices = []
    model_params = []

    for i in range(n_models):
        lookback_window, volatility_threshold, chop_threshold = random_params()
        analyzer = MarketRegimeAnalyzer(
            train_df.copy(),
            lookback_window=lookback_window,
            volatility_threshold=volatility_threshold,
            chop_threshold=chop_threshold
        )
        analyzer.run_analysis()
        models.append(analyzer)
        transition_matrices.append(analyzer.transition_matrix)
        model_params.append({
            'lookback_window': lookback_window,
            'volatility_threshold': volatility_threshold,
            'chop_threshold': chop_threshold
        })
        print(f"Model {i+1}: window={lookback_window}, vol={volatility_threshold}, chop={chop_threshold}")

    return models, transition_matrices, model_params 

# --------------------------
# ENSEMBLE APPLICATION BLOCK
# --------------------------
def ensemble_predict(models, new_data_path):
    new_df_raw = pd.read_csv(new_data_path, parse_dates=["timestamp"])
    # Store each model's predictions
    rf_preds, lr_preds, mlp_preds, xgb_preds, svm_preds = [], [], [], [], []
    stacking_predictions = []
    all_indices = []

    for analyzer in models:
        new_df = analyzer.normalize_column_names(new_df_raw.copy())
        analyzer.df = new_df
        analyzer.validate_input_columns()
        analyzer.preprocess_data()

        feature_cols = ['returns', 'volatility', 'norm_volatility', 'sma', 'ema',
                        'adx', 'rsi', 'volume_change', 'chop']
        for col in feature_cols:
            for lag in range(1, 4):
                analyzer.df[f'{col}_lag{lag}'] = analyzer.df[col].shift(lag)
        for col in feature_cols:
            analyzer.df[f'{col}_ma5'] = analyzer.df[col].rolling(5).mean()
            analyzer.df[f'{col}_ma10'] = analyzer.df[col].rolling(10).mean()
        analyzer.df = analyzer.df.dropna()

        X_new = analyzer.df[analyzer.model_features]
        X_new_scaled = analyzer.scaler.transform(X_new)

        # Each model prediction
        rf_preds.append(analyzer.rf_model.predict(X_new))
        lr_preds.append(analyzer.lr_model.predict(X_new_scaled))
        mlp_preds.append(analyzer.nn_model.predict(X_new_scaled))
        xgb_preds.append(analyzer.xgb_model.predict(X_new))
        svm_preds.append(analyzer.svm_model.predict(X_new_scaled))
        all_indices.append(analyzer.df.index)

        # stacking ensemble (just once, e.g. from the first analyzer)
        if hasattr(analyzer, 'stacking_model') and len(stacking_predictions) == 0:
            stacking_predictions = analyzer.stacking_model.predict(X_new_scaled)

    # Use only the first model (or average/vote if you wish), here taking from the first ensemble member:
    shared_indices = list(sorted(set.intersection(*[set(idx) for idx in all_indices])))
    idx_map_rf = dict(zip(all_indices[0], rf_preds[0]))
    idx_map_lr = dict(zip(all_indices[0], lr_preds[0]))
    idx_map_mlp = dict(zip(all_indices[0], mlp_preds[0]))
    idx_map_xgb = dict(zip(all_indices[0], xgb_preds[0]))
    idx_map_svm = dict(zip(all_indices[0], svm_preds[0]))

    rf_final = [idx_map_rf[i] for i in shared_indices]
    lr_final = [idx_map_lr[i] for i in shared_indices]
    mlp_final = [idx_map_mlp[i] for i in shared_indices]
    xgb_final = [idx_map_xgb[i] for i in shared_indices]
    svm_final = [idx_map_svm[i] for i in shared_indices]

    if stacking_predictions is not None and len(stacking_predictions):
        idx_map_stack = dict(zip(all_indices[0], stacking_predictions))
        stack_final = [idx_map_stack[i] for i in shared_indices]
    else:
        stack_final = None

    result_df = new_df_raw.loc[shared_indices].reset_index(drop=True)
    # Add prediction columns to output
    result_df['RF_Prediction'] = rf_final
    result_df['LR_Prediction'] = lr_final
    result_df['MLP_Prediction'] = mlp_final
    result_df['XGB_Prediction'] = xgb_final
    result_df['SVM_Prediction'] = svm_final
    if stack_final is not None:
        result_df['StackingPrediction'] = stack_final

    return result_df

def markov_transition_matrix(labels):
    """
    Empirical (count-based) transition probability matrix for a sequence of labels.
    """
    states = pd.Series(labels).unique()
    matrix = pd.DataFrame(0, index=states, columns=states, dtype=float)
    for (a, b) in zip(labels[:-1], labels[1:]):
        if (a in states) and (b in states):
            matrix.loc[a, b] += 1
    matrix = matrix.div(matrix.sum(axis=1), axis=0)
    return matrix

In [21]:
# ----------
# MAIN BLOCK
# ----------
if __name__ == "__main__":
    # Train ensemble of models
    train_df = pd.read_csv(TRAIN_PATH, parse_dates=["timestamp"])
    models, trans_matrices, params = train_models_montecarlo(train_df, n_models=N_MODELS)

    # Take the transition matrix from the first model for demonstration
    analyzer = models[0]  # e.g., "best" or first model
    print("\nHMM State Transition Matrix:")
    hmm_mat = pd.DataFrame(analyzer.transition_matrix,
                           index=list(analyzer.state_labels.values()),
                           columns=list(analyzer.state_labels.values()))
    print(hmm_mat.round(3))

    print("\nEmpirical Transition Matrix (markov_transition_matrix):")
    emp_mat = markov_transition_matrix(analyzer.y.values)
    emp_mat.index = [analyzer.state_labels.get(x, x) for x in emp_mat.index]
    emp_mat.columns = [analyzer.state_labels.get(x, x) for x in emp_mat.columns]
    print(emp_mat.round(3))    
    
    # Apply all models to unseen data for ensemble prediction output
    final_df = ensemble_predict(models, NEW_DATA_PATH)

    print(final_df.head(30))
    # (Optional) Save results
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    outname = f"markov_labeled_result_{timestamp}.csv"
    outdir = './dir'
    if not os.path.exists(outdir):
        os.mkdir(outdir)

    fullname = os.path.join(outdir, outname)    

    final_df.to_csv(fullname, index=False)  
        




Logistic Regression Performance:
              precision    recall  f1-score   support

      Rising       0.62      0.23      0.34       210
     Falling       0.51      0.91      0.66       228
      Steady       0.00      0.00      0.00         7
      Choppy       0.00      0.00      0.00        34
    No Label       0.51      0.42      0.46        48

    accuracy                           0.53       527
   macro avg       0.33      0.31      0.29       527
weighted avg       0.52      0.53      0.46       527


Random Forest Performance:
              precision    recall  f1-score   support

      Rising       0.61      0.26      0.36       210
     Falling       0.48      0.93      0.63       228
      Steady       0.00      0.00      0.00         7
      Choppy       1.00      0.03      0.06        34
    No Label       0.00      0.00      0.00        48

    accuracy                           0.50       527
   macro avg       0.42      0.24      0.21       527
weighted avg    




STACKING ENSEMBLE PERFORMANCE:
              precision    recall  f1-score   support

      Rising       0.71      0.34      0.46       210
     Falling       0.49      0.91      0.63       228
      Steady       0.00      0.00      0.00         7
      Choppy       0.00      0.00      0.00        34
    No Label       0.00      0.00      0.00        48

    accuracy                           0.53       527
   macro avg       0.24      0.25      0.22       527
weighted avg       0.49      0.53      0.46       527

Model 1: window=15, vol=0.41, chop=0.41




Logistic Regression Performance:
              precision    recall  f1-score   support

      Rising       0.61      0.22      0.33       210
     Falling       0.52      0.90      0.66       230
      Steady       0.35      0.44      0.39        16
      Choppy       0.06      0.06      0.06        16
    No Label       0.46      0.11      0.17        57

    accuracy                           0.51       529
   macro avg       0.40      0.35      0.32       529
weighted avg       0.53      0.51      0.45       529


Random Forest Performance:
              precision    recall  f1-score   support

      Rising       0.61      0.38      0.46       210
     Falling       0.52      0.89      0.65       230
      Steady       0.00      0.00      0.00        16
      Choppy       0.00      0.00      0.00        16
    No Label       0.00      0.00      0.00        57

    accuracy                           0.54       529
   macro avg       0.22      0.25      0.22       529
weighted avg    




STACKING ENSEMBLE PERFORMANCE:
              precision    recall  f1-score   support

      Rising       0.59      0.45      0.51       210
     Falling       0.51      0.81      0.63       230
      Steady       1.00      0.12      0.22        16
      Choppy       0.00      0.00      0.00        16
    No Label       0.00      0.00      0.00        57

    accuracy                           0.54       529
   macro avg       0.42      0.28      0.27       529
weighted avg       0.49      0.54      0.48       529

Model 2: window=10, vol=0.58, chop=0.68




Logistic Regression Performance:
              precision    recall  f1-score   support

      Rising       0.62      0.23      0.34       210
     Falling       0.51      0.91      0.66       228
      Steady       0.00      0.00      0.00         2
      Choppy       0.25      0.03      0.05        36
    No Label       0.51      0.39      0.44        51

    accuracy                           0.53       527
   macro avg       0.38      0.31      0.30       527
weighted avg       0.54      0.53      0.47       527


Random Forest Performance:
              precision    recall  f1-score   support

      Rising       0.58      0.25      0.35       210
     Falling       0.49      0.93      0.64       228
      Steady       0.00      0.00      0.00         2
      Choppy       1.00      0.03      0.05        36
    No Label       0.00      0.00      0.00        51

    accuracy                           0.50       527
   macro avg       0.41      0.24      0.21       527
weighted avg    




STACKING ENSEMBLE PERFORMANCE:
              precision    recall  f1-score   support

      Rising       0.66      0.34      0.45       210
     Falling       0.49      0.89      0.63       228
      Steady       0.00      0.00      0.00         2
      Choppy       0.00      0.00      0.00        36
    No Label       0.00      0.00      0.00        51

    accuracy                           0.52       527
   macro avg       0.23      0.25      0.22       527
weighted avg       0.47      0.52      0.45       527

Model 3: window=15, vol=0.33, chop=0.66

HMM State Transition Matrix:
          Rising  Falling  Steady  Choppy  No Label
Rising     0.779    0.000   0.001   0.012     0.208
Falling    0.000    0.838   0.134   0.016     0.013
Steady     0.200    0.000   0.791   0.009     0.000
Choppy     0.000    0.000   0.040   0.923     0.037
No Label   0.000    0.955   0.037   0.007     0.001

Empirical Transition Matrix (markov_transition_matrix):
          Rising  Falling  No Label  Cho

  new_df_raw = pd.read_csv(new_data_path, parse_dates=["timestamp"])


   timestamp     open     high      low    close         volume   
0    59:54.8  1.98290  1.98290  1.95999  1.96989     842.147582  \
1    01:04.8  1.95999  1.98690  1.94989  1.96490   20430.103160   
2    00:03.7  1.96189  2.02850  1.95510  2.01350   35270.421280   
3    59:02.7  2.00999  2.08000  2.00999  2.06000   34362.915080   
4    00:12.7  2.05900  2.10000  1.92999  2.06510   98407.296190   
5    59:11.6  2.06510  2.09100  2.05719  2.08429  102571.693700   
6    00:21.6  2.08000  2.08740  2.02309  2.05869  128110.300800   
7    59:20.6  2.05060  2.06099  2.04170  2.04300   79484.232720   
8    00:30.6  2.04610  2.05410  2.02009  2.04919   66181.477600   
9    59:29.5  2.03239  2.04939  2.01020  2.03549   43167.292400   
10   00:39.6  2.03549  2.10000  2.02510  2.08199   30226.535200   
11   59:38.5  2.08199  2.10000  2.05010  2.08000   44310.176800   
12   00:48.5  2.07549  2.08000  2.04459  2.05759   22563.294760   
13   59:47.5  2.05140  2.07880  2.04510  2.06199   13757.50224