# Advanced Models: LightGBM & XGBoost for Stock Price Prediction
## Notebook 05 - Advanced ML Models

### This notebook implements and evaluates advanced machine learning models 
### for multi-horizon stock price prediction, building upon baseline models.

### SETUP AND IMPORTS

In [2]:
# Standard libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from pathlib import Path
import logging
from typing import Dict, List, Tuple, Any
import time
import joblib
from datetime import datetime, timedelta

# Machine Learning
import lightgbm as lgb
import xgboost as xgb
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from scipy import stats


# Project modules - using existing foundation
import sys
sys.path.append('..')

from src.data import (
    load_config, setup_logging, load_raw_data, 
    clean_data, validate_data_quality
)
from src.features import (
    create_all_features, apply_configured_features,
    validate_features, scale_features
)
from src.models import (
    LightGBMPredictor, XGBPredictor, tune_lightgbm, tune_xgboost,
    train_models_multi_horizon, evaluate_with_walk_forward,
    save_model_records
)
from src.evaluate import (
    walk_forward_validation, compare_models, 
    statistical_significance_test, create_evaluation_report,
    calculate_rmse, calculate_mae, calculate_mape, 
    calculate_directional_accuracy, calculate_within_tolerance
)
from src.utils import (
    ensure_dir_exists, save_results_to_json, 
    plot_model_comparison, plot_feature_importance,
    timer_decorator, get_project_root
)

# Notebook-wide display and warning configuration.
# Both pandas display options are set to None (no explicit limit configured).
for _option in ('display.max_columns', 'display.width'):
    pd.set_option(_option, None)
plt.style.use('default')
warnings.filterwarnings('ignore', category=UserWarning)
sns.set_palette(palette="husl")

# Confirm the environment: one line per message, same text as separate prints.
print(
    "✅ All imports successful!",
    f"📊 LightGBM version: {lgb.__version__}",
    f"🚀 XGBoost version: {xgb.__version__}",
    sep="\n",
)



✅ All imports successful!
📊 LightGBM version: 4.6.0
🚀 XGBoost version: 2.1.4



### 1- CONFIGURATION AND SETUP


In [None]:
# Read the project configuration and start logging for this notebook
config = load_config()
logger = setup_logging()
logger.info("Starting advanced models notebook")

# Directories the notebook reads from / writes to, as declared in the config
_data_cfg, _paths_cfg = config['data'], config['paths']
DATA_DIR = Path(_data_cfg['raw_data_dir'])
PROCESSED_DIR = Path(_data_cfg['processed_data_dir'])
MODELS_DIR = Path(_paths_cfg['models_dir'])
FIGURES_DIR = Path(_paths_cfg['figures_dir'])
RESULTS_DIR = Path(_paths_cfg['results_dir'])

# Make sure every output directory exists (the raw-data directory is not
# part of this list)
for dir_path in (PROCESSED_DIR, MODELS_DIR, FIGURES_DIR, RESULTS_DIR):
    ensure_dir_exists(dir_path)

# Modelling and validation settings
_validation_cfg = config['validation']
PREDICTION_HORIZONS = config['models']['prediction_horizons']
TEST_SIZE = _validation_cfg['test_size']
N_SPLITS = _validation_cfg['n_splits']
RANDOM_STATE = config.get('random_state', 42)  # 42 when the config omits it

print(f"📁 Data directory: {DATA_DIR}")
print(f"🎯 Prediction horizons: {PREDICTION_HORIZONS}")
print("✅ Configuration loaded successfully!")


### 2- DATA LOADING AND PREPARATION


In [None]:
def load_and_prepare_data(ticker: str = 'AAPL') -> pd.DataFrame:
    """Build the modelling frame for one ticker from its raw CSV file.

    The file ``DATA_DIR / "<ticker>.csv"`` is loaded, cleaned, checked for
    quality problems and passed through the configured feature pipeline.
    A failed quality check is only logged as a warning; processing continues.

    Args:
        ticker: Symbol whose CSV file should be read.

    Returns:
        The frame returned by ``apply_configured_features``.

    Raises:
        FileNotFoundError: If the ticker's CSV file does not exist.
    """
    logger.info(f"Loading and preparing data for {ticker}")

    # Locate the raw file and fail early when it is missing
    csv_path = DATA_DIR / f"{ticker}.csv"
    if not csv_path.exists():
        raise FileNotFoundError(f"Data file not found: {csv_path}")

    raw = load_raw_data(csv_path)
    logger.info(f"Loaded {len(raw)} rows of data for {ticker}")

    # Clean, then run the (non-fatal) quality check on the cleaned frame
    cleaned = clean_data(raw)
    if not validate_data_quality(cleaned, detailed=True):
        logger.warning("Data quality issues detected")

    # Feature engineering is driven by the notebook-level config
    featured = apply_configured_features(cleaned, config)
    logger.info(f"Created features: {featured.shape}")

    # The validation report is logged only; it does not gate the return value
    logger.info(f"Feature validation: {validate_features(featured)}")

    return featured

# Load the sample dataset used by the remaining cells
print("📊 Loading sample data...")
try:
    # AAPL is the primary example ticker
    sample_data = load_and_prepare_data('AAPL')
    print(f"✅ Data loaded: {sample_data.shape}")
    print(f"📅 Date range: {sample_data.index.min()} to {sample_data.index.max()}")

    # Columns prefixed with 'target_' are prediction targets; the rest are
    # treated as features
    print("\n📋 Feature columns:")
    target_cols = [col for col in sample_data.columns if col.startswith('target_')]
    feature_cols = [col for col in sample_data.columns if not col.startswith('target_')]
    print(f"Features: {len(feature_cols)} columns")
    print(f"Targets: {target_cols}")

except Exception as e:
    # Failures are reported, not raised, so `sample_data` may be undefined
    # for later cells.
    logger.error(f"Error loading data: {e}")
    print(f"❌ Error loading data: {e}")


### 3- ADVANCED MODELS IMPLEMENTATION


In [None]:
class AdvancedModelTrainer:
    """Train and evaluate LightGBM and XGBoost predictors per horizon.

    For every trained model the instance keeps three dictionaries keyed by
    ``"<family>_<horizon>d"`` (for example ``"lightgbm_7d"``):

    * ``models`` -- the predictor object that was evaluated,
    * ``results`` -- whatever ``evaluate_with_walk_forward`` returned,
    * ``feature_importance`` -- a DataFrame of features sorted by descending
      importance, present only when the predictor exposes importances.

    NOTE(review): the validation settings ``TEST_SIZE`` / ``N_SPLITS`` and the
    horizon list ``PREDICTION_HORIZONS`` are read from notebook-level globals,
    not from ``self.config``.
    """

    def __init__(self, config: Dict[str, Any], random_state: int = 42):
        """Store the configuration and create empty result containers.

        Args:
            config: Project configuration; ``config['models'][<family>]
                ['params']`` is read when hyperparameter tuning is disabled.
            random_state: Seed forwarded to the tuning functions.
        """
        self.config = config
        self.random_state = random_state
        self.models = {}
        self.results = {}
        self.feature_importance = {}

    def prepare_model_data(self, data: pd.DataFrame, horizon: int) -> Tuple[pd.DataFrame, pd.Series]:
        """Split ``data`` into features and the target for one horizon.

        Args:
            data: Frame holding feature columns and ``target_<h>d`` columns.
            horizon: Prediction horizon in days; selects ``target_<horizon>d``.

        Returns:
            ``(X, y)`` restricted to rows whose target is not NaN.

        Raises:
            ValueError: If the target column for ``horizon`` is missing.
        """
        target_col = f'target_{horizon}d'

        if target_col not in data.columns:
            raise ValueError(f"Target column {target_col} not found in data")

        # Features are every column that is neither a target nor metadata
        feature_cols = [col for col in data.columns
                        if not col.startswith('target_')
                        and col not in ['symbol', 'sector', 'industry']]

        X = data[feature_cols].copy()
        y = data[target_col].copy()

        # Drop rows whose target is unknown
        valid_mask = ~y.isna()
        X = X.loc[valid_mask]
        y = y.loc[valid_mask]

        # Fill remaining feature gaps. ``fillna(method=...)`` is deprecated in
        # pandas 2.x, so the dedicated methods are used instead.
        # NOTE(review): the back-fill copies later values into leading gaps,
        # which is a form of look-ahead for time-ordered data -- confirm this
        # is acceptable.
        X = X.ffill().bfill()

        return X, y

    @staticmethod
    def _format_metric(value: Any) -> str:
        """Render a metric with four decimals, or ``'N/A'`` if not numeric."""
        try:
            return f"{float(value):.4f}"
        except (TypeError, ValueError):
            return 'N/A'

    def _store_feature_importance(self, model_key: str, model: Any,
                                  feature_names: Any, attr_names: Tuple[str, ...]) -> None:
        """Record a sorted importance table from the first usable attribute."""
        for attr in attr_names:
            importance = getattr(model, attr, None)
            if importance is None:
                continue
            self.feature_importance[model_key] = pd.DataFrame({
                'feature': feature_names,
                'importance': importance
            }).sort_values('importance', ascending=False)
            return

    def _train_for_horizon(self, data: pd.DataFrame, horizon: int, optimize_params: bool,
                           label: str, family: str, tuner: Any, predictor_cls: Any,
                           importance_attrs: Tuple[str, ...]) -> Dict[str, Any]:
        """Shared tune / build / evaluate / record pipeline for one model family."""
        logger.info(f"Training {label} model for {horizon}-day horizon")

        X, y = self.prepare_model_data(data, horizon)

        if optimize_params:
            # Hyperparameter search provided by the project's models module
            best_params = tuner(X, y, n_trials=50, random_state=self.random_state)
            logger.info(f"Best {label} params for {horizon}d: {best_params}")
        else:
            # Fall back to the parameters declared in the configuration
            best_params = self.config['models'][family]['params']

        # NOTE(review): the predictor is only constructed here; fitting
        # presumably happens inside evaluate_with_walk_forward -- confirm the
        # stored object ends up fitted before it is used for predictions.
        model = predictor_cls(**best_params)

        results = evaluate_with_walk_forward(
            model, X, y,
            test_size=TEST_SIZE,
            n_splits=N_SPLITS
        )

        model_key = f'{family}_{horizon}d'
        self.models[model_key] = model
        self.results[model_key] = results

        self._store_feature_importance(model_key, model, X.columns, importance_attrs)

        logger.info(f"✅ {label} {horizon}d training completed")
        return results

    @timer_decorator
    def train_lightgbm_model(self, data: pd.DataFrame, horizon: int,
                           optimize_params: bool = True) -> Dict[str, Any]:
        """Train and evaluate a LightGBM model for one horizon.

        Args:
            data: Frame with features and ``target_<horizon>d``.
            horizon: Prediction horizon in days.
            optimize_params: Tune hyperparameters when True; otherwise use
                ``config['models']['lightgbm']['params']``.

        Returns:
            The value returned by ``evaluate_with_walk_forward``; it is also
            stored under ``self.results['lightgbm_<horizon>d']``.
        """
        return self._train_for_horizon(
            data, horizon, optimize_params,
            label='LightGBM', family='lightgbm',
            tuner=tune_lightgbm, predictor_cls=LightGBMPredictor,
            # Original attribute name first, sklearn-style spelling as fallback
            importance_attrs=('feature_importance_', 'feature_importances_'),
        )

    @timer_decorator
    def train_xgboost_model(self, data: pd.DataFrame, horizon: int,
                          optimize_params: bool = True) -> Dict[str, Any]:
        """Train and evaluate an XGBoost model for one horizon.

        Args:
            data: Frame with features and ``target_<horizon>d``.
            horizon: Prediction horizon in days.
            optimize_params: Tune hyperparameters when True; otherwise use
                ``config['models']['xgboost']['params']``.

        Returns:
            The value returned by ``evaluate_with_walk_forward``; it is also
            stored under ``self.results['xgboost_<horizon>d']``.
        """
        return self._train_for_horizon(
            data, horizon, optimize_params,
            label='XGBoost', family='xgboost',
            tuner=tune_xgboost, predictor_cls=XGBPredictor,
            importance_attrs=('feature_importances_', 'feature_importance_'),
        )

    def train_all_models(self, data: pd.DataFrame, optimize: bool = True):
        """Train both model families for every horizon in PREDICTION_HORIZONS.

        Errors for one horizon are logged and printed, then training moves on
        to the next horizon.

        Args:
            data: Frame with features and all ``target_<h>d`` columns.
            optimize: Forwarded as ``optimize_params`` to the per-model methods.
        """
        logger.info("Starting comprehensive model training")

        for horizon in PREDICTION_HORIZONS:
            print(f"\n🎯 Training models for {horizon}-day horizon...")

            try:
                # A missing 'test_rmse' is shown as N/A. Applying ':.4f' to the
                # 'N/A' placeholder raised ValueError, which reported a
                # successful run as failed and skipped XGBoost for the horizon.
                lgb_results = self.train_lightgbm_model(data, horizon, optimize)
                print(f"✅ LightGBM {horizon}d - RMSE: {self._format_metric(lgb_results.get('test_rmse'))}")

                xgb_results = self.train_xgboost_model(data, horizon, optimize)
                print(f"✅ XGBoost {horizon}d - RMSE: {self._format_metric(xgb_results.get('test_rmse'))}")

            except Exception as e:
                logger.error(f"Error training models for {horizon}d: {e}")
                print(f"❌ Error training {horizon}d models: {e}")

        logger.info("Model training completed")

# Create the trainer instance shared by the remaining cells
trainer = AdvancedModelTrainer(config=config, random_state=RANDOM_STATE)
print("🚀 Advanced model trainer initialized")


### 4- MODEL TRAINING AND OPTIMIZATION


In [None]:
print(f"\n{'=' * 60}")
print("🎯 STARTING ADVANCED MODEL TRAINING")
print('=' * 60)

# Toggle for hyperparameter tuning; set to False for quick development runs
OPTIMIZE_HYPERPARAMS = True

try:
    trainer.train_all_models(sample_data, optimize=OPTIMIZE_HYPERPARAMS)
    print(f"\n✅ Training completed for {len(trainer.models)} models")

except Exception as e:
    # Report the failure and let the notebook continue
    logger.error(f"Error in model training: {e}")
    print(f"❌ Training error: {e}")


### 5- RESULTS ANALYSIS AND COMPARISON


In [None]:
def analyze_model_performance(trainer: AdvancedModelTrainer) -> pd.DataFrame:
    """Tabulate the stored evaluation metrics of every trained model.

    Each key of ``trainer.results`` is split on ``'_'`` into a model family
    and a horizon label. Metrics missing from a model's results become NaN.
    The table is printed and returned.

    Args:
        trainer: Object whose ``results`` maps ``"<family>_<horizon>"`` keys
            to metric dictionaries.

    Returns:
        One row per model, or an empty DataFrame when there are no results.
    """
    if not trainer.results:
        print("❌ No model results available")
        return pd.DataFrame()

    # Output column -> key looked up in each model's results dict
    metric_keys = {
        'RMSE': 'test_rmse',
        'MAE': 'test_mae',
        'MAPE': 'test_mape',
        'R2': 'test_r2',
        'Directional_Accuracy': 'test_directional_accuracy',
        'Hit_Rate_5pct': 'test_hit_rate',
    }

    rows = []
    for model_name, results in trainer.results.items():
        name_parts = model_name.split('_')
        row = {'Model': name_parts[0].upper(), 'Horizon': name_parts[1]}
        for column, key in metric_keys.items():
            row[column] = results.get(key, np.nan)
        rows.append(row)

    performance_df = pd.DataFrame(rows)

    # Show the summary table
    print("\n📊 MODEL PERFORMANCE SUMMARY")
    print("-" * 50)
    print(performance_df.to_string(index=False, float_format='%.4f'))

    return performance_df

# Summarise performance only when at least one model produced results
if not trainer.results:
    print("⚠️ No results to analyze - models may not have trained successfully")
else:
    performance_summary = analyze_model_performance(trainer)

### 6- FEATURE IMPORTANCE ANALYSIS

In [None]:
def analyze_feature_importance(trainer: AdvancedModelTrainer, top_n: int = 20):
    """Plot and print the most important features of every trained model.

    One horizontal bar chart per model is drawn on a two-column grid whose
    row count follows the number of models, so no model is left out. (The
    earlier fixed 2x2 grid silently dropped every model after the fourth.)
    The figure is saved to
    ``FIGURES_DIR / 'feature_importance_advanced_models.png'`` and shown;
    afterwards the ten leading features of each model are printed.

    Args:
        trainer: Object whose ``feature_importance`` maps model names to
            DataFrames with ``feature`` and ``importance`` columns, already
            ordered from most to least important.
        top_n: Number of leading features drawn per chart.
    """
    importance_by_model = trainer.feature_importance
    if not importance_by_model:
        print("❌ No feature importance data available")
        return

    print(f"\n🔍 TOP {top_n} FEATURE IMPORTANCE ANALYSIS")
    print("-" * 50)

    # Size the grid from the number of models: two columns, as many rows as
    # needed. Three or four models reproduce the former 2x2 / 16x12 layout.
    n_models = len(importance_by_model)
    n_cols = 2
    n_rows = (n_models + n_cols - 1) // n_cols  # ceiling division
    # squeeze=False keeps `axes` two-dimensional even for a single row
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(16, 6 * n_rows), squeeze=False)
    axes = axes.ravel()

    for ax, (model_name, importance_df) in zip(axes, importance_by_model.items()):
        top_features = importance_df.head(top_n)
        positions = range(len(top_features))

        bars = ax.barh(positions, top_features['importance'])
        ax.set_yticks(positions)
        ax.set_yticklabels(top_features['feature'])
        ax.set_xlabel('Importance')
        ax.set_title(f'{model_name.upper()} - Top {top_n} Features')
        # Put the first (most important) row at the top of the chart
        ax.invert_yaxis()

        # Colour the bars along the viridis gradient
        colors = plt.cm.viridis(np.linspace(0, 1, len(top_features)))
        for bar, color in zip(bars, colors):
            bar.set_color(color)

    # An odd number of models leaves one unused cell in the last row
    for unused_ax in axes[n_models:]:
        fig.delaxes(unused_ax)

    plt.tight_layout()
    plt.savefig(FIGURES_DIR / 'feature_importance_advanced_models.png',
                dpi=300, bbox_inches='tight')
    plt.show()

    # Text listing of the ten leading features per model
    for model_name, importance_df in importance_by_model.items():
        print(f"\n{model_name.upper()} - Top 10 Features:")
        print(importance_df.head(10)[['feature', 'importance']].to_string(index=False))

# Plot the 15 leading features per model when importances were recorded
if trainer.feature_importance:
    analyze_feature_importance(trainer, 15)

### 7- MODEL COMPARISON AND STATISTICAL TESTS


In [None]:
def compare_with_baseline_models(advanced_results: Dict[str, Any]) -> pd.DataFrame:
    """Print and return a per-horizon RMSE table for the advanced models.

    The table is only built when ``RESULTS_DIR / 'baseline_model_results.json'``
    exists and can be read. A horizon gets a row only if both its LightGBM
    and XGBoost results are present.

    NOTE(review): the baseline file is loaded but its contents are not used
    in the table -- only advanced-model columns are produced.

    Args:
        advanced_results: Mapping of ``"<family>_<horizon>d"`` to metric dicts.

    Returns:
        The comparison table, or an empty DataFrame when the baseline file is
        missing or any error occurs while building the table.
    """
    print("\n📈 ADVANCED VS BASELINE COMPARISON")
    print("-" * 40)

    baseline_file = RESULTS_DIR / 'baseline_model_results.json'

    # Guard clause: nothing to compare against without the baseline file
    if not baseline_file.exists():
        print("⚠️ Baseline results not found - run notebook 04 first")
        return pd.DataFrame()

    try:
        baseline_results = pd.read_json(baseline_file)
        print("✅ Loaded baseline model results")

        rows = []
        for horizon in PREDICTION_HORIZONS:
            horizon_str = f"{horizon}d"
            lgb_key = f'lightgbm_{horizon_str}'
            xgb_key = f'xgboost_{horizon_str}'

            # Both advanced models are required for a row
            if lgb_key not in advanced_results or xgb_key not in advanced_results:
                continue

            lgb_rmse = advanced_results[lgb_key].get('test_rmse', np.nan)
            xgb_rmse = advanced_results[xgb_key].get('test_rmse', np.nan)

            # The best value is undefined as soon as either RMSE is NaN
            if np.isnan(lgb_rmse) or np.isnan(xgb_rmse):
                best_rmse = np.nan
            else:
                best_rmse = min(lgb_rmse, xgb_rmse)

            rows.append({
                'Horizon': horizon_str,
                'LightGBM_RMSE': lgb_rmse,
                'XGBoost_RMSE': xgb_rmse,
                'Best_Advanced': best_rmse
            })

        comparison_df = pd.DataFrame(rows)
        print(comparison_df.to_string(index=False, float_format='%.4f'))

        return comparison_df

    except Exception as e:
        print(f"⚠️ Error loading baseline results: {e}")
        return pd.DataFrame()

# Build the comparison table when results are available
if trainer.results:
    model_comparison = compare_with_baseline_models(advanced_results=trainer.results)

### 8- PREDICTION VISUALIZATION


In [None]:
def visualize_predictions(trainer: AdvancedModelTrainer, data: pd.DataFrame, 
                        horizon: int = 7, sample_size: int = 100):
    """Plot LightGBM and XGBoost predictions against actual targets.

    Uses the last ``sample_size`` rows produced by
    ``trainer.prepare_model_data``. A 2x2 figure is drawn: scatter plots of
    predicted vs. actual on the top row, time-series overlays on the bottom
    row, one column per model. The figure is saved under ``FIGURES_DIR`` and
    shown, then RMSE and MAE for the sample are printed.

    Args:
        trainer: Trainer holding the models for ``horizon``.
        data: Frame with features and the ``target_<horizon>d`` column.
        horizon: Prediction horizon in days.
        sample_size: Number of most recent rows to plot.
    """
    model_lgb = trainer.models.get(f'lightgbm_{horizon}d')
    model_xgb = trainer.models.get(f'xgboost_{horizon}d')

    # Both models are required for the side-by-side figure
    if not model_lgb or not model_xgb:
        print(f"❌ Models not available for {horizon}d horizon")
        return

    X, y = trainer.prepare_model_data(data, horizon)
    X_sample, y_sample = X.tail(sample_size), y.tail(sample_size)

    try:
        # Predict with both models before any plotting starts
        pred_lgb = model_lgb.predict(X_sample)
        pred_xgb = model_xgb.predict(X_sample)

        # One grid column per model: (label, predictions, extra plot style)
        panels = (
            ('LightGBM', pred_lgb, {}),
            ('XGBoost', pred_xgb, {'color': 'orange'}),
        )

        fig, axes = plt.subplots(2, 2, figsize=(15, 10))
        dates = X_sample.index

        for column, (label, preds, style) in enumerate(panels):
            # Top row: predicted vs. actual with the identity line
            scatter_ax = axes[0, column]
            scatter_ax.scatter(y_sample, preds, alpha=0.6, label=label, **style)
            scatter_ax.plot([y_sample.min(), y_sample.max()],
                            [y_sample.min(), y_sample.max()], 'r--', alpha=0.8)
            scatter_ax.set_xlabel('Actual')
            scatter_ax.set_ylabel('Predicted')
            scatter_ax.set_title(f'{label} - {horizon}d Predictions vs Actual')
            scatter_ax.legend()

            # Bottom row: actual and predicted series over the sample index
            series_ax = axes[1, column]
            series_ax.plot(dates, y_sample, label='Actual', color='black', linewidth=2)
            series_ax.plot(dates, preds, label=label, alpha=0.8, **style)
            series_ax.set_title(f'{label} - {horizon}d Time Series')
            series_ax.legend()
            series_ax.tick_params(axis='x', rotation=45)

        plt.tight_layout()
        plt.savefig(FIGURES_DIR / f'advanced_models_predictions_{horizon}d.png', 
                   dpi=300, bbox_inches='tight')
        plt.show()

        # Error metrics on the plotted sample
        lgb_rmse = calculate_rmse(y_sample, pred_lgb)
        xgb_rmse = calculate_rmse(y_sample, pred_xgb)
        lgb_mae = calculate_mae(y_sample, pred_lgb)
        xgb_mae = calculate_mae(y_sample, pred_xgb)

        print(f"\n📊 PREDICTION QUALITY - {horizon}d Horizon (Last {sample_size} samples)")
        print(f"LightGBM - RMSE: {lgb_rmse:.4f}, MAE: {lgb_mae:.4f}")
        print(f"XGBoost  - RMSE: {xgb_rmse:.4f}, MAE: {xgb_mae:.4f}")

    except Exception as e:
        logger.error(f"Error in prediction visualization: {e}")
        print(f"❌ Visualization error: {e}")

# Plot the 7-day horizon using the 60 most recent samples
print("\n📊 VISUALIZING PREDICTIONS")
if trainer.models:
    visualize_predictions(trainer, sample_data, 7, sample_size=60)

### 9- MODEL PERSISTENCE


In [None]:
def save_advanced_models(trainer: AdvancedModelTrainer):
    """Persist the trainer's results, models, importances and records.

    Written in this order:

    1. ``RESULTS_DIR / 'advanced_model_results.json'`` -- ``trainer.results``
    2. ``MODELS_DIR / '<model_name>.joblib'`` -- one file per model
    3. ``RESULTS_DIR / 'feature_importance_advanced.json'`` -- only when
       importances were recorded
    4. ``RESULTS_DIR / 'advanced_model_records.json'`` -- one tracking record
       per model

    Any exception is logged and printed instead of being raised.

    Args:
        trainer: Trainer whose ``models``, ``results`` and
            ``feature_importance`` dictionaries are saved.
    """
    print("\n💾 SAVING MODELS AND RESULTS")
    print("-" * 30)

    try:
        # 1. Evaluation results
        results_file = RESULTS_DIR / 'advanced_model_results.json'
        save_results_to_json(trainer.results, results_file)
        print(f"✅ Results saved to {results_file}")

        # 2. Model objects, one joblib file each
        for model_name, model in trainer.models.items():
            joblib.dump(model, MODELS_DIR / f"{model_name}.joblib")
        models_saved = len(trainer.models)

        print(f"✅ Saved {models_saved} models to {MODELS_DIR}")

        # 3. Feature importance tables as lists of row dictionaries
        if trainer.feature_importance:
            importance_file = RESULTS_DIR / 'feature_importance_advanced.json'
            importance_dict = {
                model_name: df.to_dict('records')
                for model_name, df in trainer.feature_importance.items()
            }

            save_results_to_json(importance_dict, importance_file)
            print(f"✅ Feature importance saved to {importance_file}")

        # 4. Tracking records; family and horizon come from the model name
        model_records = []
        for model_name in trainer.models:
            name_parts = model_name.split('_')
            model_records.append({
                'model_name': model_name,
                'model_type': name_parts[0],
                'horizon': name_parts[1],
                'created_at': datetime.now().isoformat(),
                'model_file': f"{model_name}.joblib",
                'performance': trainer.results.get(model_name, {})
            })

        records_file = RESULTS_DIR / 'advanced_model_records.json'
        save_results_to_json(model_records, records_file)
        print(f"✅ Model records saved to {records_file}")

    except Exception as e:
        logger.error(f"Error saving models: {e}")
        print(f"❌ Save error: {e}")

# Persist only when there are both models and results to write
if trainer.models:
    if trainer.results:
        save_advanced_models(trainer)

### 10- SUMMARY AND NEXT STEPS


In [None]:
print("\n" + "="*60)
print("📋 ADVANCED MODELS TRAINING SUMMARY")
print("="*60)

if trainer.results:
    print(f"✅ Models trained: {len(trainer.models)}")
    print(f"✅ Horizons covered: {PREDICTION_HORIZONS}")
    print(f"✅ Results generated: {len(trainer.results)}")

    # Best performing models
    print("\n🏆 BEST PERFORMING MODELS BY HORIZON:")
    print("-" * 40)

    for horizon in PREDICTION_HORIZONS:
        horizon_str = f"{horizon}d"
        lgb_key = f'lightgbm_{horizon_str}'
        xgb_key = f'xgboost_{horizon_str}'

        lgb_rmse = trainer.results.get(lgb_key, {}).get('test_rmse', float('inf'))
        xgb_rmse = trainer.results.get(xgb_key, {}).get('test_rmse', float('inf'))

        # Only finite values can win. Without this check a NaN XGBoost RMSE
        # made `lgb_rmse < xgb_rmse` False and XGBoost was reported as best
        # with an RMSE of nan.
        lgb_valid = np.isfinite(lgb_rmse)
        xgb_valid = np.isfinite(xgb_rmse)

        # Ties still go to XGBoost, as before
        if lgb_valid and (not xgb_valid or lgb_rmse < xgb_rmse):
            best_model = "LightGBM"
            best_rmse = lgb_rmse
        elif xgb_valid:
            best_model = "XGBoost"
            best_rmse = xgb_rmse
        else:
            best_model = "N/A"
            best_rmse = "N/A"

        # Numeric values are shown with four decimals, like the other tables
        rmse_text = best_rmse if best_model == "N/A" else f"{best_rmse:.4f}"
        print(f"{horizon_str:>3} horizon: {best_model:>8} (RMSE: {rmse_text})")

else:
    print("⚠️ No models were successfully trained")

print(f"\n📁 Results saved to: {RESULTS_DIR}")
print(f"📁 Models saved to: {MODELS_DIR}")
print(f"📁 Figures saved to: {FIGURES_DIR}")

print("\n🚀 NEXT STEPS:")
print("1. Run notebook 06 for LSTM deep learning models (optional)")
print("2. Run notebook 07 for comprehensive evaluation and backtesting")
print("3. Use trained models in the Streamlit app")

print("\n✅ Advanced models notebook completed successfully!")
# datetime.now() is a wall-clock timestamp, not a duration, so it is labelled
# as the completion time.
print(f"⏰ Completed at: {datetime.now()}")

### 11- OPTIONAL: QUICK MODEL TESTING


In [None]:
def quick_model_test():
    """Print one prediction per trained model as a smoke test.

    Reads the notebook-level ``trainer`` and ``sample_data``. For each model
    the horizon is parsed from its name (``"<family>_<h>d"``), the feature
    matrix for that horizon is rebuilt and the model predicts on its last row.
    Any exception is printed instead of being raised.

    NOTE: ``prepare_model_data`` drops rows whose target is NaN, so the "last"
    row is the most recent one with a known target, not necessarily the
    newest row of ``sample_data``.
    """
    if not trainer.models:
        print("⚠️ No models available for testing")
        return

    print("\n🧪 QUICK MODEL TEST")
    print("-" * 20)

    try:
        for model_name, model in trainer.models.items():
            # "lightgbm_7d" -> 7
            horizon = int(model_name.split('_')[1].replace('d', ''))
            # Only the features are needed here; the target is discarded
            X, _ = trainer.prepare_model_data(sample_data, horizon)

            # Predict on the last available feature row
            last_features = X.tail(1)
            if not last_features.empty:
                prediction = model.predict(last_features)[0]
                print(f"{model_name:>15}: {prediction:>8.4f}")

    except Exception as e:
        print(f"❌ Test error: {e}")

# Optional smoke test; uncomment the call below to run it
# quick_model_test()

print(f"\n{'=' * 60}")
print("🎉 NOTEBOOK EXECUTION COMPLETED")
print('=' * 60)