In [None]:

import logging
import sys
from pathlib import Path
import subprocess

# Notebook-wide logging: INFO-level records with a timestamp and level prefix.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger("HPO_Analysis_Notebook")

# The project root is taken to be the parent of the current working directory.
# Its `src` folder is pushed to the front of sys.path, but only if it is not
# already listed, so re-running this cell does not add duplicates.
PROJECT_ROOT_PATH = Path.cwd().parent
if not any(entry == str(PROJECT_ROOT_PATH / 'src') for entry in sys.path):
    sys.path.insert(0, str(PROJECT_ROOT_PATH / 'src'))

import importlib

# Maps the importable module name to the name pip installs it under
# (the two differ for PyYAML: `import yaml` vs `pip install pyyaml`).
required_packages = {
    "pandas": "pandas",
    "numpy": "numpy",
    "matplotlib": "matplotlib",
    "seaborn": "seaborn",
    "yaml": "pyyaml",
    "optuna": "optuna",
    "joblib": "joblib",
    "plotly": "plotly"
}

# Try to import each dependency; on ImportError, install it into the
# interpreter that runs this kernel (sys.executable) and carry on.
# Any installation failure is logged and re-raised so the cell stops.
for package_name, install_name in required_packages.items():
    try:
        importlib.import_module(package_name)
        logger.info(f"Package '{package_name}' is already installed.")
    except ImportError:
        logger.warning(f"Module '{package_name}' not found. Attempting to install '{install_name}'...")
        try:
            subprocess.check_call([sys.executable, "-m", "pip", "install", "--quiet", install_name])
            # The package appeared on disk after the interpreter started, so the
            # import system's finder caches must be refreshed or the `import`
            # statements that follow may still fail to see it.
            importlib.invalidate_caches()
            logger.info(f"Package '{install_name}' installed successfully.")
        except Exception as e:
            logger.error(f"Failed to install '{install_name}': {e}")
            logger.error(f"Please install it manually in your environment (e.g., `pip install {install_name}`) and restart the Jupyter kernel.")
            raise

import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import yaml
import optuna
import joblib

# Locations of the HPO artefacts, both resolved relative to the project root.
HPO_RESULTS_DIR = PROJECT_ROOT_PATH.joinpath("data", "hpo_results")
HPO_SEARCH_SPACE_PATH = PROJECT_ROOT_PATH.joinpath("config", "supervised_learning", "hpo_search_space.yaml")

# Plotting defaults shared by every figure in the notebook.
sns.set_theme(palette="viridis", style="whitegrid")
plt.rcParams.update({'figure.dpi': 100})

print("\nSetup complete. All packages are installed and paths are configured.")

In [None]:
def load_hpo_results(hpo_dir: Path, multi_token_model_types: tuple = ("lightgbm_dart",)) -> pd.DataFrame:
    """Collect the JSON parameter files found in ``hpo_dir`` into one DataFrame.

    Every ``*.json`` file directly inside ``hpo_dir`` becomes one row. The file
    stem is split on ``_``: the last two tokens are ignored, the token(s) just
    before them give ``model_type`` and everything earlier is re-joined into
    ``theme``. The keys of the JSON object are spread into the row after those
    two derived fields.

    Args:
        hpo_dir: Directory to scan (non-recursively) for ``*.json`` files.
        multi_token_model_types: Model-type names that themselves contain an
            underscore. Without this hint a stem such as
            ``<theme>_lightgbm_dart_<x>_<y>`` would be parsed as model type
            ``dart`` with ``lightgbm`` leaking into the theme.

    Returns:
        One row per successfully parsed file, in sorted file-name order. When
        the directory is missing or nothing could be parsed, an empty frame
        that still carries the ``theme`` and ``model_type`` columns, so callers
        can index ``df['model_type']`` without a KeyError.
    """
    empty_frame = pd.DataFrame(columns=["theme", "model_type"])

    if not hpo_dir.exists():
        print(f"Warning: HPO results directory not found at {hpo_dir}")
        return empty_frame

    # Longest candidates first so the most specific model-type name wins.
    candidates = sorted(
        (name.split('_') for name in multi_token_model_types),
        key=len,
        reverse=True,
    )

    all_results = []
    # glob() order depends on the filesystem; sort for reproducible row order.
    for json_file in sorted(hpo_dir.glob("*.json")):
        try:
            parts = json_file.stem.split('_')
            # Raises IndexError (handled below) for stems with fewer than three tokens.
            model_type = parts[-3]
            theme_tokens = parts[:-3]

            for tokens in candidates:
                width = len(tokens)
                # Compare against the tokens sitting right before the two-token suffix.
                if width > 1 and parts[-(width + 2):-2] == tokens:
                    model_type = "_".join(tokens)
                    theme_tokens = parts[:-(width + 2)]
                    break

            with open(json_file, 'r', encoding='utf-8') as f:
                params = json.load(f)

            # A top-level list/scalar cannot be spread into a row; report it
            # like any other unreadable file instead of aborting the whole load.
            if not isinstance(params, dict):
                raise TypeError(f"expected a JSON object, got {type(params).__name__}")

            all_results.append({
                "theme": "_".join(theme_tokens),
                "model_type": model_type,
                **params
            })
        except (IndexError, TypeError, UnicodeDecodeError, json.JSONDecodeError) as e:
            print(f"Could not process file {json_file.name}: {e}")
            continue

    if not all_results:
        return empty_frame

    return pd.DataFrame(all_results)

hpo_results_df = load_hpo_results(HPO_RESULTS_DIR)

# An empty YAML document parses to None; fall back to an empty mapping so the
# later `hpo_search_spaces.get(...)` lookups keep working.
with open(HPO_SEARCH_SPACE_PATH, 'r', encoding='utf-8') as f:
    hpo_search_spaces = yaml.safe_load(f) or {}

print(f"Loaded {len(hpo_results_df)} HPO results.")
hpo_results_df.head()

In [None]:
def plot_optimization_histories_for_model(model_type: str):
    """Plot the running-best objective value of every saved study for one model type.

    Loads each ``*_{model_type}_study.pkl`` file from ``HPO_RESULTS_DIR`` and
    draws one line per file on a shared axis. The running best is a cumulative
    minimum, so the plot assumes the objective is minimised (the y-axis is
    labelled as LogLoss).

    Only trials in state COMPLETE contribute. Failed trials have no value
    (which would break the cumulative minimum), and pruned trials do not
    represent a finished evaluation.

    Args:
        model_type: Model identifier embedded in the study file names,
            e.g. ``'xgboost'``.

    Returns:
        None. Prints a message and returns early when no study file matches.
    """
    suffix = f'_{model_type}_study'
    # Sorted so the legend order does not depend on filesystem listing order.
    study_files = sorted(HPO_RESULTS_DIR.glob(f"*{suffix}.pkl"))
    if not study_files:
        print(f"No study .pkl files found for model type '{model_type}'.")
        return

    fig, ax = plt.subplots(figsize=(14, 8))

    for study_file in study_files:
        # NOTE(security): joblib.load unpickles the file and can execute
        # arbitrary code; only load study files from a trusted source.
        study = joblib.load(study_file)
        # Strip only the trailing suffix, in case the theme itself contains it.
        theme_name = study_file.stem.rsplit(suffix, 1)[0]

        completed_trials = [
            t for t in study.trials
            if t.state == optuna.trial.TrialState.COMPLETE and t.value is not None
        ]
        if not completed_trials:
            print(f"Skipping '{study_file.name}': no completed trials with an objective value.")
            continue

        trial_numbers = [t.number for t in completed_trials]
        best_values_so_far = np.minimum.accumulate([t.value for t in completed_trials])

        ax.plot(trial_numbers, best_values_so_far, label=theme_name, alpha=0.7, lw=2)

    ax.set_title(f'Optimization History for {model_type.upper()} Models', fontsize=16)
    ax.set_xlabel('Trial Number', fontsize=12)
    ax.set_ylabel('Best Objective Value (LogLoss) So Far', fontsize=12)
    ax.legend(title='Thematic Pipeline', bbox_to_anchor=(1.02, 1), loc='upper left')
    ax.grid(True, which='both', linestyle='--')
    plt.tight_layout()
    plt.show()

# Render one optimization-history figure per model family, in this fixed order.
for model_t in ('xgboost', 'lightgbm_dart', 'catboost'):
    plot_optimization_histories_for_model(model_t)

In [None]:
def plot_hyperparameter_distributions(model_type: str):
    """Plot how the tuned hyperparameter values of one model type are distributed.

    Reads the rows of ``hpo_results_df`` whose ``model_type`` matches and the
    parameter list defined for that model in ``hpo_search_spaces``. Parameters
    whose spec has ``type == 'categorical'`` get a count plot; all others get a
    box plot with the individual values overlaid as a strip plot.

    Parameters that are listed in the search space but have no column (or only
    missing values) in the results are left out, so no blank panels are drawn.

    Args:
        model_type: Value of the ``model_type`` column to plot, which is also
            the key looked up in ``hpo_search_spaces``.

    Returns:
        None. Prints a message and returns early when there is nothing to plot.
    """
    # An empty results frame may not even have the column; treat that as "no results".
    if 'model_type' not in hpo_results_df.columns or model_type not in hpo_results_df['model_type'].unique():
        print(f"No HPO results found for model type: {model_type}")
        return

    model_df = hpo_results_df[hpo_results_df['model_type'] == model_type]

    # Tolerate a missing/empty search-space file or a model entry set to null.
    search_space = (hpo_search_spaces or {}).get(model_type) or {}
    params_to_plot = list(search_space.keys())
    if not params_to_plot:
        print(f"No hyperparameters defined in search space for {model_type}")
        return

    def _is_categorical(param):
        # A spec without a 'type' key is treated as non-categorical rather than raising.
        spec = search_space[param]
        return isinstance(spec, dict) and spec.get('type') == 'categorical'

    def _has_data(param):
        return param in model_df.columns and model_df[param].notna().any()

    categorical_params = [p for p in params_to_plot if _is_categorical(p) and _has_data(p)]
    numerical_params = [p for p in params_to_plot if not _is_categorical(p) and _has_data(p)]

    if not categorical_params and not numerical_params:
        print(f"None of the search-space hyperparameters for {model_type} appear in the HPO results.")
        return

    if numerical_params:
        num_cols = 3
        num_rows = (len(numerical_params) - 1) // num_cols + 1
        fig, axes = plt.subplots(num_rows, num_cols, figsize=(16, 4 * num_rows), squeeze=False)
        axes = axes.flatten()

        fig.suptitle(f'Distribution of Tuned Numerical Hyperparameters for {model_type.upper()}', fontsize=18, y=1.02)

        for ax, param in zip(axes, numerical_params):
            sns.boxplot(x=model_df[param], ax=ax)
            sns.stripplot(x=model_df[param], ax=ax, color='black', alpha=0.5, size=5)
            ax.set_title(f'{param}', fontsize=12)
            ax.set_xlabel('Value')

        # Remove the grid cells left over when the count is not a multiple of num_cols.
        for unused_ax in axes[len(numerical_params):]:
            fig.delaxes(unused_ax)

        plt.tight_layout(rect=[0, 0, 1, 0.96])
        plt.show()

    if categorical_params:
        num_plots = len(categorical_params)
        fig, axes = plt.subplots(1, num_plots, figsize=(6 * num_plots, 5), squeeze=False)
        axes = axes.flatten()
        fig.suptitle(f'Distribution of Tuned Categorical Hyperparameters for {model_type.upper()}', fontsize=18, y=1.02)

        for ax, param in zip(axes, categorical_params):
            sns.countplot(y=model_df[param], ax=ax, palette='crest')
            ax.set_title(f'{param}', fontsize=12)
            ax.set_xlabel('Count')
            ax.set_ylabel('')

        plt.tight_layout(rect=[0, 0, 1, 0.94])
        plt.show()

# hpo_results_df may be empty and lack a 'model_type' column (e.g. when the
# results directory is missing); skip the plots instead of raising KeyError.
if 'model_type' in hpo_results_df.columns:
    for model_t in hpo_results_df['model_type'].unique():
        plot_hyperparameter_distributions(model_t)
else:
    print("No HPO results loaded; skipping hyperparameter distribution plots.")

In [None]:
def run_advanced_hpo_analysis(model_type: str):
    """Show Optuna's importance, parallel-coordinate and slice plots for one study.

    Only one study is analysed as an example: the first
    ``*_{model_type}_study.pkl`` file in ``HPO_RESULTS_DIR`` by sorted file
    name. Each of the three plots is attempted independently; a failure in one
    is printed and does not prevent the others.

    The slice plot is drawn for the parameter that Optuna ranks as most
    important. If the ranking cannot be computed, the first entry of
    ``study.best_params`` is used instead.

    Args:
        model_type: Model identifier embedded in the study file names.

    Returns:
        None. Prints a message and returns early when no study file matches.
    """
    suffix = f'_{model_type}_study'
    # Sorted so the "example" study is the same on every machine and run.
    study_files = sorted(HPO_RESULTS_DIR.glob(f"*{suffix}.pkl"))
    if not study_files:
        print(f"Skipping advanced analysis for '{model_type}': No study .pkl files found.")
        return

    first_study_path = study_files[0]
    # NOTE(security): joblib.load unpickles the file and can execute arbitrary
    # code; only load study files from a trusted source.
    study = joblib.load(first_study_path)
    # Strip only the trailing suffix, in case the theme itself contains it.
    theme_name = first_study_path.stem.rsplit(suffix, 1)[0]

    print(f"\n--- Advanced Analysis for {model_type.upper()} (Example from theme: {theme_name}) ---")

    try:
        fig_imp = optuna.visualization.plot_param_importances(study)
        fig_imp.update_layout(title=f'Hyperparameter Importance for {model_type}')
        fig_imp.show()
    except Exception as e:
        print(f"Could not generate importance plot: {e}")

    try:
        fig_pc = optuna.visualization.plot_parallel_coordinate(study)
        fig_pc.update_layout(title=f'Parallel Coordinate Plot for {model_type}')
        fig_pc.show()
    except Exception as e:
        print(f"Could not generate parallel coordinate plot: {e}")

    try:
        param_to_slice = None
        try:
            # The returned mapping is ordered by importance, highest first.
            importances = optuna.importance.get_param_importances(study)
            param_to_slice = next(iter(importances), None)
        except Exception as e:
            print(f"Could not rank hyperparameters by importance ({e}); falling back to the first tuned parameter.")

        if param_to_slice is None:
            param_to_slice = next(iter(study.best_params), None)

        if param_to_slice is not None:
            fig_slice = optuna.visualization.plot_slice(study, params=[param_to_slice])
            fig_slice.update_layout(title=f'Slice Plot for `{param_to_slice}` in {model_type}')
            fig_slice.show()
    except Exception as e:
        print(f"Could not generate slice plot: {e}")


# hpo_results_df may be empty and lack a 'model_type' column (e.g. when the
# results directory is missing); skip the analysis instead of raising KeyError.
if 'model_type' in hpo_results_df.columns:
    for model_t in hpo_results_df['model_type'].unique():
        run_advanced_hpo_analysis(model_t)
else:
    print("No HPO results loaded; skipping advanced HPO analysis.")