In [12]:
import numpy as np
import pandas as pd
import random
from scipy.signal import argrelextrema
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from hmmlearn import hmm
import talib
import re
import os
#from pathlib import Path

# --------------------
# CONFIGURATION
# --------------------
N_MODELS = 3  # Change for desired Monte Carlo iterations
TRAIN_PATH = '20250202-20170908_BTC-USDT_1D.csv'  # Update paths as needed!
NEW_DATA_PATH = '20250127-20170907_XRP-USDT_1H.csv'


In [4]:
# ----------------------------
# MARKET REGIME ANALYZER CLASS
# ----------------------------

class MarketRegimeAnalyzer:
    def __init__(self, ohlc_df, lookback_window=20, volatility_threshold=0.5, chop_threshold=0.5):
        self.df = self.normalize_column_names(ohlc_df.copy())
        self.validate_input_columns()
        self.lookback_window = lookback_window
        self.volatility_threshold = volatility_threshold
        self.chop_threshold = chop_threshold
        self.state_labels = {
            0: 'Rising',
            1: 'Falling',
            2: 'Steady',
            3: 'Choppy',
            4: 'No Label'
        }

    def normalize_column_names(self, df):
        column_mapping = {}
        for col in df.columns:
            normalized = col.strip().lower()
            normalized = re.sub(r'[^a-z0-9]', '', normalized)
            column_mapping[col] = normalized
        required_columns = {
            'open': ['open', 'op', 'o'],
            'high': ['high', 'hi', 'h'],
            'low': ['low', 'lo', 'l'],
            'close': ['close', 'cl', 'c', 'last'],
            'volume': ['volume', 'vol', 'v', 'qty']
        }
        final_mapping = {}
        available_columns = set(column_mapping.values())
        for standard_name, variants in required_columns.items():
            for variant in variants:
                if variant in available_columns:
                    for orig_col, normalized_col in column_mapping.items():
                        if normalized_col == variant:
                            final_mapping[standard_name] = orig_col
                            break
                    break
        df = df.rename(columns={v: k for k, v in final_mapping.items()})
        return df

    def validate_input_columns(self):
        required_columns = ['open', 'high', 'low', 'close', 'volume']
        missing = [col for col in required_columns if col not in self.df.columns]
        if missing:
            raise ValueError(f"Missing required columns after normalization: {missing}")
        for col in required_columns:
            self.df[col] = pd.to_numeric(self.df[col], errors='coerce')
        self.df = self.df.dropna(subset=required_columns)

    def preprocess_data(self):
        self.df['returns'] = self.df['close'].pct_change()
        self.df['volatility'] = self.df['close'].rolling(window=self.lookback_window).std()
        self.df['avg_volatility'] = self.df['volatility'].rolling(window=self.lookback_window).mean()
        self.df['norm_volatility'] = self.df['volatility'] / self.df['avg_volatility']
        self.df['sma'] = talib.SMA(self.df['close'], timeperiod=self.lookback_window)
        self.df['ema'] = talib.EMA(self.df['close'], timeperiod=self.lookback_window)
        self.df['adx'] = talib.ADX(self.df['high'], self.df['low'], self.df['close'], timeperiod=self.lookback_window)
        self.df['rsi'] = talib.RSI(self.df['close'], timeperiod=self.lookback_window)
        self.df['volume_sma'] = talib.SMA(self.df['volume'], timeperiod=self.lookback_window)
        self.df['volume_change'] = self.df['volume'] / self.df['volume_sma']
        self.df['local_min'] = self.df['close'] == self.df['close'].rolling(window=5, center=True).min()
        self.df['local_max'] = self.df['close'] == self.df['close'].rolling(window=5, center=True).max()
        self.df['atr'] = talib.ATR(self.df['high'], self.df['low'], self.df['close'], timeperiod=self.lookback_window)
        self.df['chop'] = 100 * np.log10(self.df['atr'].rolling(window=self.lookback_window).sum() /
                                         (self.df['high'].rolling(window=self.lookback_window).max() -
                                          self.df['low'].rolling(window=self.lookback_window).min())) / np.log10(self.lookback_window)
        self.df = self.df.dropna()

    def label_states(self):
        self.df['state'] = 4  # Default to 'No Label'
        min_indices = argrelextrema(self.df['close'].values, np.less_equal, order=5)[0]
        max_indices = argrelextrema(self.df['close'].values, np.greater_equal, order=5)[0]
        for i in range(len(min_indices)-1):
            start_idx = min_indices[i]
            end_candidates = max_indices[max_indices > start_idx]
            end_idx = end_candidates[0] if len(end_candidates) > 0 else len(self.df)-1
            if all(self.df['close'].iloc[start_idx:end_idx+1] >= self.df['close'].iloc[start_idx]):
                self.df.loc[self.df.index[start_idx]:self.df.index[end_idx],'state'] = 0
        for i in range(len(max_indices)-1):
            start_idx = max_indices[i]
            end_candidates = min_indices[min_indices > start_idx]
            end_idx = end_candidates[0] if len(end_candidates) > 0 else len(self.df)-1
            if all(self.df['close'].iloc[start_idx:end_idx+1] <= self.df['close'].iloc[start_idx]):
                self.df.loc[self.df.index[start_idx]:self.df.index[end_idx],'state'] = 1
        steady_mask = (self.df['norm_volatility'] < self.volatility_threshold) & (self.df['state'] == 4)
        self.df.loc[steady_mask, 'state'] = 2
        chop_mask = (self.df['chop'] > self.chop_threshold) & (self.df['adx'] < 25) & (self.df['state'] == 4)
        self.df.loc[chop_mask, 'state'] = 3

    def prepare_model_data(self):
        feature_cols = ['returns', 'volatility', 'norm_volatility', 'sma', 'ema',
                        'adx', 'rsi', 'volume_change', 'chop']
        for col in feature_cols:
            for lag in range(1, 4):
                self.df[f'{col}_lag{lag}'] = self.df[col].shift(lag)
        for col in feature_cols:
            self.df[f'{col}_ma5'] = self.df[col].rolling(5).mean()
            self.df[f'{col}_ma10'] = self.df[col].rolling(10).mean()
        self.df = self.df.dropna()
        numeric_cols = self.df.select_dtypes(include=[np.number]).columns
        exclude_cols = ['state', 'local_min', 'local_max']
        model_features = [col for col in numeric_cols if col not in exclude_cols]
        self.X = self.df[model_features]
        self.y = self.df['state']
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X, self.y, test_size=0.2, shuffle=False)
        self.scaler = StandardScaler()
        self.X_train_scaled = self.scaler.fit_transform(self.X_train)
        self.X_test_scaled = self.scaler.transform(self.X_test)

    def train_models(self):
        self.lr_model = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000)
        self.lr_model.fit(self.X_train_scaled, self.y_train)
        self.rf_model = RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42)
        self.rf_model.fit(self.X_train, self.y_train)
        self.nn_model = MLPClassifier(hidden_layer_sizes=(64, 32), activation='relu',
                                      solver='adam', max_iter=1000, random_state=42)
        self.nn_model.fit(self.X_train_scaled, self.y_train)

    def analyze_state_transitions(self):
        self.hmm_model = hmm.CategoricalHMM(n_components=5, n_iter=100)
        self.hmm_model.fit(self.y.values.reshape(-1, 1))
        self.transition_matrix = self.hmm_model.transmat_

    def run_analysis(self):
        self.preprocess_data()
        self.label_states()
        self.prepare_model_data()
        self.train_models()
        self.analyze_state_transitions()
        
# --------------------------
# MONTE CARLO TRAINING SETUP
# --------------------------

def random_params():
    lookback_window = random.choice([10, 15, 20, 25, 30])
    volatility_threshold = round(random.uniform(0.3, 0.7), 2)
    chop_threshold = round(random.uniform(0.3, 0.7), 2)
    return lookback_window, volatility_threshold, chop_threshold

def train_models_montecarlo(train_df, n_models=N_MODELS):
    models = []
    transition_matrices = []
    model_params = []

    for i in range(n_models):
        lookback_window, volatility_threshold, chop_threshold = random_params()
        analyzer = MarketRegimeAnalyzer(
            train_df.copy(),
            lookback_window=lookback_window,
            volatility_threshold=volatility_threshold,
            chop_threshold=chop_threshold
        )
        analyzer.run_analysis()
        models.append(analyzer)
        transition_matrices.append(analyzer.transition_matrix)
        model_params.append({
            'lookback_window': lookback_window,
            'volatility_threshold': volatility_threshold,
            'chop_threshold': chop_threshold
        })
        print(f"Model {i+1}: window={lookback_window}, vol={volatility_threshold}, chop={chop_threshold}")

    return models, transition_matrices, model_params        

# --------------------------
# ENSEMBLE APPLICATION BLOCK
# --------------------------

def ensemble_predict(models, new_data_path):
    new_df_raw = pd.read_csv(new_data_path, parse_dates=["timestamp"])
    all_predictions = []
    indices_keep = None

    for i, analyzer in enumerate(models):
        new_df = analyzer.normalize_column_names(new_df_raw.copy())
        analyzer.df = new_df
        analyzer.validate_input_columns()
        analyzer.preprocess_data()

        feature_cols = ['returns', 'volatility', 'norm_volatility', 'sma', 'ema',
                        'adx', 'rsi', 'volume_change', 'chop']
        for col in feature_cols:
            for lag in range(1, 4):
                analyzer.df[f'{col}_lag{lag}'] = analyzer.df[col].shift(lag)
        for col in feature_cols:
            analyzer.df[f'{col}_ma5'] = analyzer.df[col].rolling(5).mean()
            analyzer.df[f'{col}_ma10'] = analyzer.df[col].rolling(10).mean()
        analyzer.df = analyzer.df.dropna()

        numeric_cols = analyzer.df.select_dtypes(include=['float64', 'int64']).columns
        exclude_cols = ['state', 'local_min', 'local_max']
        model_features = [col for col in numeric_cols if col not in exclude_cols]
        X_new = analyzer.df[model_features]
        X_new_scaled = analyzer.scaler.transform(X_new)
        preds = analyzer.rf_model.predict(X_new)  # Use RF by default, can switch to nn_model.predict(X_new_scaled)

        all_predictions.append(preds)
        if indices_keep is None:
            indices_keep = analyzer.df.index

    import numpy as np
    predictions_arr = np.vstack(all_predictions)

    # For each time step, calculate probabilities across models
    prediction_summary = []
    state_labels = models[0].state_labels
    for i in range(predictions_arr.shape[1]):
        state_vec = predictions_arr[:, i]
        state_counts = dict(zip(*np.unique(state_vec, return_counts=True)))
        total = np.sum(list(state_counts.values()))
        regime_probs = {state_labels[s]: state_counts.get(s, 0)/total for s in state_labels}
        prediction_summary.append(regime_probs)

    summary_df = pd.DataFrame(prediction_summary)
    result_df = new_df_raw.loc[indices_keep].reset_index(drop=True)
    final_df = pd.concat([result_df[['timestamp','open','high','low','close','volume']].reset_index(drop=True), summary_df], axis=1)

    return final_df

In [13]:
# ----------
# MAIN BLOCK
# ----------
if __name__ == "__main__":
    # Train ensemble of models
    train_df = pd.read_csv(TRAIN_PATH, parse_dates=["timestamp"])
    models, trans_matrices, params = train_models_montecarlo(train_df, n_models=N_MODELS)

    # Apply all models to unseen data for ensemble prediction output
    final_df = ensemble_predict(models, NEW_DATA_PATH)

    print(final_df.head(30))
    # (Optional) Save results
    
    outname = 'name.csv'
    outdir = './dir'
    if not os.path.exists(outdir):
        os.mkdir(outdir)

    fullname = os.path.join(outdir, outname)    

    final_df.to_csv(fullname, index=False)  
        




Model 1: window=15, vol=0.69, chop=0.45




Model 2: window=30, vol=0.67, chop=0.55




Model 3: window=25, vol=0.52, chop=0.6


  new_df_raw = pd.read_csv(new_data_path, parse_dates=["timestamp"])


ValueError: all the input array dimensions except for the concatenation axis must match exactly, but along dimension 1, the array at index 0 has size 64290 and the array at index 1 has size 64260