In [None]:
import logging
import sys
import re
import yaml
from pathlib import Path
import subprocess

# Configure logging first so the dependency bootstrap below can report progress.
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger("Combined_Model_Experiments")

# Only this bootstrap needs importlib, so it is imported next to its single use.
import importlib

# Maps the importable module name to the pip distribution name; the two differ
# for scikit-learn ("sklearn") and PyYAML ("yaml").
required_packages = {
    "catboost": "catboost",
    "lightgbm": "lightgbm",
    "xgboost": "xgboost",
    "pandas": "pandas",
    "numpy": "numpy",
    "matplotlib": "matplotlib",
    "seaborn": "seaborn",
    "sklearn": "scikit-learn",
    "IPython": "ipython",
    "yaml": "pyyaml",
}

for package_name, install_name in required_packages.items():
    try:
        importlib.import_module(package_name)
        logger.info(f"Package '{package_name}' is already installed.")
    except ImportError:
        logger.warning(f"Module '{package_name}' not found. Attempting to install '{install_name}'...")
        try:
            # Install into the interpreter that runs this kernel, not whatever
            # `pip` happens to be first on PATH.
            subprocess.check_call([sys.executable, "-m", "pip", "install", install_name])
        except Exception as e:
            logger.error(f"Failed to install '{install_name}': {e}")
            logger.error(f"Please install it manually in your environment (e.g., `pip install {install_name}`) and restart the Jupyter kernel.")
            raise
        # The import system caches directory listings; without this the package
        # that pip just wrote may still be invisible to the imports that follow.
        importlib.invalidate_caches()
        logger.info(f"Package '{install_name}' installed successfully.")

# --- Now, we can safely import everything ---
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import LabelEncoder
from IPython.display import display, Markdown

# --- Project Path Setup ---
# The parent of the current working directory is treated as the project root
# and must be on sys.path before the regime_predictor_lib imports below run.
PROJECT_ROOT_PATH = Path.cwd().parent
if str(PROJECT_ROOT_PATH) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT_PATH))

from regime_predictor_lib.utils.database_manager import DatabaseManager
from regime_predictor_lib.supervised_learning.models import CatBoostModel, XGBoostModel
from regime_predictor_lib.supervised_learning.evaluation import get_sklearn_classification_report, get_sklearn_confusion_matrix
from regime_predictor_lib.supervised_learning.results.result_saver import ResultSaver
from regime_predictor_lib.supervised_learning.results import plotting_utils
from regime_predictor_lib.supervised_learning.training.trainer import ModelTrainer

# --- Plot defaults ---
plt.style.use("seaborn-v0_8-whitegrid")
sns.set_context("notebook")
plt.rcParams.update({
    "figure.figsize": (12, 7),
    "figure.dpi": 90,
})

# --- Input and output locations, all relative to the project root ---
DB_PATH = PROJECT_ROOT_PATH.joinpath("data", "db", "volume", "quant.db")
DEFAULT_MODEL_PARAMS_PATH = PROJECT_ROOT_PATH.joinpath("config", "supervised_learning", "default_model_params.yaml")
THEMATIC_FEATURE_LISTS_DIR = PROJECT_ROOT_PATH.joinpath("data", "processed", "feature_selection", "thematic_feature_lists")
RESULTS_SUMMARY_PATH = PROJECT_ROOT_PATH.joinpath("data", "reports", "supervised_learning", "master_results_summary.csv")
BASE_REPORT_DIR = PROJECT_ROOT_PATH.joinpath("data", "reports", "supervised_learning", "combined_models")
BASE_MODEL_DIR = PROJECT_ROOT_PATH.joinpath("data", "models", "supervised", "combined_models")

# --- Shared objects used by the cells below ---
db_manager = DatabaseManager(db_path=DB_PATH)
result_saver = ResultSaver(base_report_dir=BASE_REPORT_DIR, base_model_dir=BASE_MODEL_DIR)

# Per-model default hyper-parameters; later cells index this by model type name.
with DEFAULT_MODEL_PARAMS_PATH.open('r') as f:
    default_model_params = yaml.safe_load(f)

In [None]:
import logging
import sys
import re
import yaml
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import LabelEncoder
from IPython.display import display, Markdown

# NOTE(review): this cell repeats the environment setup of the notebook's first
# cell (same paths, project imports, plot defaults and shared objects), only
# without the dependency bootstrap. Consider keeping a single setup cell.

# The parent of the current working directory is treated as the project root
# and must be on sys.path before the regime_predictor_lib imports below run.
PROJECT_ROOT_PATH = Path.cwd().parent
if str(PROJECT_ROOT_PATH) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT_PATH))

from regime_predictor_lib.utils.database_manager import DatabaseManager
from regime_predictor_lib.supervised_learning.models import CatBoostModel, XGBoostModel
from regime_predictor_lib.supervised_learning.evaluation import get_sklearn_classification_report, get_sklearn_confusion_matrix
from regime_predictor_lib.supervised_learning.results.result_saver import ResultSaver
from regime_predictor_lib.supervised_learning.results import plotting_utils
from regime_predictor_lib.supervised_learning.training.trainer import ModelTrainer

# --- Plot defaults ---
plt.style.use("seaborn-v0_8-whitegrid")
sns.set_context("notebook")
plt.rcParams.update({
    "figure.figsize": (12, 7),
    "figure.dpi": 90,
})

# logging.basicConfig does nothing when the root logger already has handlers,
# e.g. if an earlier cell configured logging in this kernel.
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger("Combined_Model_Experiments")

# --- Input and output locations, all relative to the project root ---
DB_PATH = PROJECT_ROOT_PATH.joinpath("data", "db", "volume", "quant.db")
DEFAULT_MODEL_PARAMS_PATH = PROJECT_ROOT_PATH.joinpath("config", "supervised_learning", "default_model_params.yaml")
THEMATIC_FEATURE_LISTS_DIR = PROJECT_ROOT_PATH.joinpath("data", "processed", "feature_selection", "thematic_feature_lists")
RESULTS_SUMMARY_PATH = PROJECT_ROOT_PATH.joinpath("data", "reports", "supervised_learning", "master_results_summary.csv")
BASE_REPORT_DIR = PROJECT_ROOT_PATH.joinpath("data", "reports", "supervised_learning", "combined_models")
BASE_MODEL_DIR = PROJECT_ROOT_PATH.joinpath("data", "models", "supervised", "combined_models")

# --- Shared objects used by the cells below ---
db_manager = DatabaseManager(db_path=DB_PATH)
result_saver = ResultSaver(base_report_dir=BASE_REPORT_DIR, base_model_dir=BASE_MODEL_DIR)

# Per-model default hyper-parameters; later cells index this by model type name.
with DEFAULT_MODEL_PARAMS_PATH.open('r') as f:
    default_model_params = yaml.safe_load(f)

In [None]:
logger.info("Loading and ranking thematic model results...")

# Average f1_macro_mean over every summary row that shares a theme, then order
# the themes from best to worst average.
results_df = pd.read_csv(RESULTS_SUMMARY_PATH)
theme_scores = (
    results_df
    .groupby('theme')['f1_macro_mean']
    .mean()
    .sort_values(ascending=False)
)

# Theme names, best first; later cells take prefixes of this list.
ranked_themes = theme_scores.index.tolist()

display(
    Markdown("### Ranked Thematic Models (by F1 Macro Mean)"),
    theme_scores.to_frame(),
)

In [None]:
def run_experiment_pipeline(top_n_themes: list, model_class, model_name: str, model_params: dict, include_regime_t: bool, shared_label_encoder) -> dict:
    """Train and evaluate one model on the combined selected features of several themes.

    For every theme the matching ``theme_<name>`` table is read from the
    database and restricted to the columns listed in that theme's
    ``*_selected_features.txt`` file. The per-theme frames are joined on the
    date index, cleaned, aligned with the ``regime_t_plus_6m`` target taken
    from the first theme's table, and handed to ``ModelTrainer`` for
    time-series cross-validation and a final fit.

    Args:
        top_n_themes: Theme names to combine. The first entry also selects the
            table that supplies the target (and ``regime_t`` when requested).
        model_class: Model wrapper class, instantiated as
            ``model_class(model_params=model_params)``.
        model_name: Display name used in logs, headings and the trainer config.
        model_params: Hyper-parameters forwarded to ``model_class``.
        include_regime_t: When True, add the reference table's ``regime_t``
            column as an extra feature (a warning is logged if it is missing).
        shared_label_encoder: Already-fitted encoder with ``transform`` and
            ``classes_``; used to encode the target and to name the classes.

    Returns:
        A dict with keys ``"metrics"`` (result of the trainer's
        cross-validation), ``"feature_importances"`` (from the final model) and
        ``"n_features"``. An empty dict is returned when no theme could be
        loaded or when no usable samples/features remain after cleaning.

    Raises:
        Whatever the database read of the reference table, the label encoder
        or the trainer raise; only per-theme loading errors are caught.
    """
    experiment_name = f"Top{len(top_n_themes)}_{'WithRegimeT' if include_regime_t else 'NoRegimeT'}"
    logger.info(f"--- Running pipeline for {model_name} on {experiment_name} ---")

    # Tables read successfully, keyed by table name, so the reference table
    # does not have to be fetched from the database a second time.
    loaded_tables = {}
    feature_dfs = []
    for theme_name in top_n_themes:
        try:
            theme_table_name = f"theme_{theme_name.split('theme_')[-1]}"
            df_theme = pd.read_sql_table(theme_table_name, db_manager.engine, index_col="date", parse_dates=["date"])
            loaded_tables[theme_table_name] = df_theme

            feature_list_path = THEMATIC_FEATURE_LISTS_DIR / f"{theme_table_name}_selected_features.txt"
            with open(feature_list_path, "r") as f:
                features_for_theme = [line.strip() for line in f if line.strip()]

            feature_dfs.append(df_theme[features_for_theme])
            logger.debug(f"Loaded {len(features_for_theme)} features for {theme_name}")
        except Exception as e:
            # Best effort: a theme that cannot be loaded is skipped, not fatal.
            logger.error(f"Could not load data for theme '{theme_name}': {e}")
            continue

    if not feature_dfs:
        logger.error("No feature DataFrames were loaded. Aborting pipeline.")
        return {}

    # Join all themes on the date index and keep the first occurrence of any
    # column name that appears in more than one theme.
    X = pd.concat(feature_dfs, axis=1)
    X = X.loc[:, ~X.columns.duplicated()]

    # The first theme's table supplies the target; reuse it if already loaded.
    ref_theme_table_name = f"theme_{top_n_themes[0].split('theme_')[-1]}"
    df_ref = loaded_tables.get(ref_theme_table_name)
    if df_ref is None:
        df_ref = pd.read_sql_table(ref_theme_table_name, db_manager.engine, index_col="date", parse_dates=["date"])

    y = df_ref['regime_t_plus_6m']

    if include_regime_t:
        if 'regime_t' in df_ref.columns:
            # assign() aligns on the index like item assignment, but returns a
            # new frame instead of writing into one derived via .loc.
            X = X.assign(regime_t=df_ref['regime_t'])
        else:
            logger.warning("'regime_t' not found in reference table, cannot add as feature.")

    # Restrict to dates present in both features and target, in sorted order.
    common_index = X.index.intersection(y.index).sort_values()
    X, y = X.loc[common_index], y.loc[common_index]

    # Cleaning order matters: infinities become NaN, gaps are forward-filled
    # from earlier rows, columns that are still entirely NaN are dropped, and
    # any remaining leading gaps are set to 0.
    X = (
        X.replace([np.inf, -np.inf], np.nan)
        .ffill()
        .dropna(axis=1, how='all')
        .fillna(0)
    )

    # Rows without a target label cannot be used for training or scoring.
    y = y.dropna()
    final_index = X.index.intersection(y.index)
    X, y = X.loc[final_index], y.loc[final_index]

    logger.info(f"Final dataset shape: X={X.shape}, y={y.shape}")

    if X.empty or y.empty:
        # Same contract as the "nothing loaded" case above: callers treat an
        # empty dict as "no result for this run".
        logger.error("No usable samples or features remain after alignment and cleaning. Aborting pipeline.")
        return {}

    y_encoded = pd.Series(shared_label_encoder.transform(y), index=y.index, name=y.name)

    model_wrapper = model_class(model_params=model_params)
    cv_splitter = TimeSeriesSplit(n_splits=5)

    trainer = ModelTrainer(
        model_wrapper=model_wrapper,
        cv_splitter=cv_splitter,
        scorers={},
        result_saver=result_saver,
        theme_name=experiment_name,
        model_config_name=model_name,
        label_encoder=shared_label_encoder
    )

    aggregated_metrics = trainer.run_cross_validation(X, y_encoded, use_class_weights=True)

    # Report on the out-of-fold predictions collected by the trainer.
    display(Markdown(f"### Results for {model_name} on {experiment_name}"))
    oof_df = trainer.oof_predictions
    y_true_oof = oof_df['true_label']
    y_pred_oof = oof_df['predicted_label']
    class_names = [f'Regime {c}' for c in shared_label_encoder.classes_]

    print("\n--- Classification Report (Out-of-Fold) ---")
    print(get_sklearn_classification_report(y_true_oof, y_pred_oof, target_names=class_names))

    cm = get_sklearn_confusion_matrix(y_true_oof, y_pred_oof)
    plotting_utils.plot_confusion_matrix(cm, class_names, f"Confusion Matrix - {model_name} on {experiment_name}", normalize=True)
    plt.show()

    # Fit once more on the full dataset to obtain feature importances.
    final_model = trainer.train_final_model(X, y_encoded)
    feature_importances = final_model.get_feature_importance()

    return {
        "metrics": aggregated_metrics,
        "feature_importances": feature_importances,
        "n_features": len(X.columns)
    }


In [None]:
# Results of the runs that do NOT use regime_t as a feature, stored as
# results_without_regime_t["top_<n>"][<model type>].
results_without_regime_t = {}
model_types_to_run = ["xgboost", "catboost"]

# Fit a single encoder on every non-null target label of one theme table so
# that all experiments share the same label-to-integer mapping.
y_master = (
    pd.read_sql_table(
        "theme_simple_technical_trend_and_momentum_signals",
        db_manager.engine,
        index_col="date",
        parse_dates=["date"],
    )['regime_t_plus_6m']
    .dropna()
)
shared_label_encoder = LabelEncoder()
shared_label_encoder.fit(y_master.unique())
logger.info(f"Shared LabelEncoder fitted on all potential target values. Classes: {shared_label_encoder.classes_}")

# Grow the feature set one theme at a time, starting with the two best themes.
for n_top in range(2, len(ranked_themes) + 1):
    selected_themes = ranked_themes[:n_top]
    key = f"top_{n_top}"
    results_without_regime_t[key] = {}

    for model_name in model_types_to_run:
        if model_name == 'xgboost':
            model_class = XGBoostModel
        else:
            model_class = CatBoostModel

        results_without_regime_t[key][model_name] = run_experiment_pipeline(
            top_n_themes=selected_themes,
            model_class=model_class,
            model_name=model_name.capitalize(),
            model_params=default_model_params[model_name],
            include_regime_t=False,
            shared_label_encoder=shared_label_encoder,
        )

In [None]:
# Results of the runs that DO use regime_t as an extra feature, stored as
# results_with_regime_t["top_<n>"][<model type>].
results_with_regime_t = {}

# Same sweep over the number of top-ranked themes, starting with the best two.
for n_top in range(2, len(ranked_themes) + 1):
    selected_themes = ranked_themes[:n_top]
    key = f"top_{n_top}"
    results_with_regime_t[key] = {}

    for model_name in model_types_to_run:
        if model_name == 'xgboost':
            model_class = XGBoostModel
        else:
            model_class = CatBoostModel

        results_with_regime_t[key][model_name] = run_experiment_pipeline(
            top_n_themes=selected_themes,
            model_class=model_class,
            model_name=model_name.capitalize(),
            model_params=default_model_params[model_name],
            include_regime_t=True,
            shared_label_encoder=shared_label_encoder,
        )

In [None]:
def extract_metrics(results_dict, metric_key='f1_macro_mean'):
    """Flatten nested experiment results into a tidy DataFrame of one metric.

    Args:
        results_dict: Mapping of ``"top_<n>"`` keys to a mapping of model name
            to that run's result dict. A result contributes a row only if it
            is non-empty, has a ``'metrics'`` entry, and that entry contains
            ``metric_key``.
        metric_key: Name of the metric to pull out of each ``'metrics'`` entry.

    Returns:
        DataFrame with columns ``n_themes`` (the integer after the last ``_``
        in the key), ``model`` and ``metric_value``. The columns are present
        even when no result qualifies, so downstream code can always refer to
        them by name.

    Raises:
        ValueError: If a key of ``results_dict`` does not end in an integer.
    """
    columns = ['n_themes', 'model', 'metric_value']
    data = []
    for top_n_key, model_results in results_dict.items():
        n_themes = int(top_n_key.split('_')[-1])
        for model_name, result in model_results.items():
            if result and 'metrics' in result and metric_key in result['metrics']:
                data.append({
                    'n_themes': n_themes,
                    'model': model_name,
                    'metric_value': result['metrics'][metric_key]
                })
    # Passing the columns explicitly keeps the schema stable for empty input.
    return pd.DataFrame(data, columns=columns)

metrics_no_regime_t = extract_metrics(results_without_regime_t)
metrics_with_regime_t = extract_metrics(results_with_regime_t)

# Display names for the legend; unknown model keys fall back to the raw key.
MODEL_DISPLAY_NAMES = {'xgboost': 'XGBoost', 'catboost': 'CatBoost'}

fig, ax = plt.subplots(figsize=(16, 9))

# Each line carries its own legend label, built from the data itself, so the
# legend stays correct when a model has no results or the order changes.
for metrics_df, suffix, marker, linestyle in (
    (metrics_no_regime_t, 'No regime_t', 'o', '-'),
    (metrics_with_regime_t, 'With regime_t', 'x', '--'),
):
    if metrics_df.empty:
        logger.warning(f"No metrics available for '{suffix}' experiments; skipping these lines.")
        continue
    labelled = metrics_df.assign(
        experiment=metrics_df['model'].map(
            lambda model_key: f"{MODEL_DISPLAY_NAMES.get(model_key, model_key)} ({suffix})"
        )
    )
    sns.lineplot(data=labelled, x='n_themes', y='metric_value', hue='experiment', ax=ax, marker=marker, linestyle=linestyle)

ax.set_title('Model Performance vs. Number of Combined Thematic Feature Sets', fontsize=18)
ax.set_xlabel('Number of Top-Ranked Themes Included', fontsize=14)
ax.set_ylabel('Out-of-Fold F1 Macro (Mean)', fontsize=14)
ax.set_xticks(range(2, len(ranked_themes) + 1))
ax.grid(True, which='both', linestyle='--', alpha=0.7)

# Rebuild one legend covering both experiment groups. Some seaborn versions add
# the hue variable name as a pseudo-entry; drop it so only real lines remain.
handles, labels = ax.get_legend_handles_labels()
legend_entries = [(handle, label) for handle, label in zip(handles, labels) if label != 'experiment']
if legend_entries:
    legend_handles, legend_labels = zip(*legend_entries)
    ax.legend(legend_handles, legend_labels, title='Experiment', fontsize=12)

plt.show()

In [None]:
def get_theme_from_feature_name(feature_name):
    """Return the leading lowercase/underscore prefix of a feature name.

    The prefix is the longest run of ``a-z`` and ``_`` at the start of the
    name that is immediately followed by an underscore (that underscore is not
    part of the result). ``'unknown'`` is returned when the name has no such
    prefix.
    """
    prefix_match = re.match(r'^([a-z_]+)_', feature_name)
    return prefix_match.group(1) if prefix_match else 'unknown'

# Only the run that combined every ranked theme is inspected here.
all_themes_key = f"top_{len(ranked_themes)}"

for model_name in model_types_to_run:
    # Skip models with no stored run, an empty result, or no importances.
    run_results = results_without_regime_t.get(all_themes_key, {}).get(model_name)
    if not run_results or 'feature_importances' not in run_results:
        continue

    # One row per feature, tagged with the group derived from its name prefix.
    fi_df = (
        run_results['feature_importances']
        .to_frame(name='importance')
        .reset_index()
        .rename(columns={'index': 'feature'})
    )
    fi_df['theme_group'] = fi_df['feature'].apply(get_theme_from_feature_name)

    # Total importance per group, expressed as a share of the overall total.
    theme_importance = (
        fi_df
        .groupby('theme_group')['importance']
        .sum()
        .sort_values(ascending=False)
    )
    theme_importance_pct = theme_importance / theme_importance.sum() * 100

    fig, ax = plt.subplots(figsize=(14, 8))
    sns.barplot(x=theme_importance_pct.index, y=theme_importance_pct.values, ax=ax)
    ax.set_title(f'Thematic Feature Importance Roll-up for {model_name.capitalize()} (All Features)', fontsize=16)
    ax.set_xlabel('Feature Theme Group', fontsize=12)
    ax.set_ylabel('Percentage of Total Importance (%)', fontsize=12)
    plt.xticks(rotation=45, ha='right')
    fig.tight_layout()
    plt.show()
