In [None]:
# ============================================================================
# CELL 1: Import Dependencies
# ============================================================================

import numpy as np
import pandas as pd
import mysql.connector
from datetime import datetime
import logging
import pickle
import os
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import warnings
warnings.filterwarnings('ignore')

print("✓ Dependencies loaded")

In [None]:
# ============================================================================
# CELL 2: Database Configuration
# ============================================================================

# Connection settings passed to mysql.connector.connect().
# Each value can be overridden through an environment variable so that real
# credentials do not have to be committed with the notebook; the fallbacks
# are the original local-development values.
db_config = {
    'host': os.environ.get('TRADING_DB_HOST', '127.0.0.1'),
    'user': os.environ.get('TRADING_DB_USER', 'root'),
    'password': os.environ.get('TRADING_DB_PASSWORD', ''),
    'database': os.environ.get('TRADING_DB_NAME', 'trading_system'),
}

print("✓ Database configuration ready")

In [None]:
# ============================================================================
# CELL 3: Ensemble Model Class
# ============================================================================

class EnsembleModels:
    """Combine per-model predictions into ensembles and persist the results.

    Four combination strategies are offered: simple average, weighted average,
    top-K weighted average and median. Metrics for each ensemble are inserted
    into the ``model_performance`` table and the ensemble predictions are
    pickled under ``model_save_dir``.
    """

    def __init__(self, db_config, model_save_dir='models/ensemble_models'):
        """
        Args:
            db_config: Keyword arguments forwarded to ``mysql.connector.connect``.
            model_save_dir: Directory that receives the pickled ensemble results.
        """
        self.db_config = db_config
        self.model_save_dir = model_save_dir
        self.setup_logging()
        self.create_model_directory()

    def setup_logging(self):
        """Configure root logging (file + stream) and attach ``self.logger``.

        ``logging.basicConfig`` does nothing when the root logger already has
        handlers, so creating several instances does not duplicate handlers.
        """
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s',
            handlers=[
                logging.FileHandler('ensemble_models.log'),
                logging.StreamHandler()
            ]
        )
        self.logger = logging.getLogger(__name__)

    def create_model_directory(self):
        """Create ``model_save_dir`` if it does not exist yet."""
        if not os.path.exists(self.model_save_dir):
            # exist_ok guards against another process creating it in between.
            os.makedirs(self.model_save_dir, exist_ok=True)
            self.logger.info(f"Created ensemble model directory: {self.model_save_dir}")

    def connect_db(self):
        """Open a MySQL connection.

        Returns:
            The connection, or ``None`` when connecting fails (the error is logged).
        """
        try:
            conn = mysql.connector.connect(**self.db_config)
            return conn
        except mysql.connector.Error as e:
            self.logger.error(f"Database connection error: {e}")
            return None

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _stack_predictions(predictions_dict, model_names=None):
        """Return ``(names, matrix)`` with one 1-D float row per selected model.

        Converting to plain float arrays avoids two problems: integer inputs
        cannot be updated in place with float weights, and pandas objects with
        different indexes would be aligned by label instead of by position.

        Raises:
            ValueError: if no model is selected or prediction lengths differ.
        """
        names = list(predictions_dict) if model_names is None else list(model_names)
        if not names:
            raise ValueError("At least one model prediction is required")

        rows = [np.asarray(predictions_dict[name], dtype=float).ravel() for name in names]
        sizes = {name: row.size for name, row in zip(names, rows)}
        if len(set(sizes.values())) != 1:
            raise ValueError(f"Model predictions have different lengths: {sizes}")

        return names, np.vstack(rows)

    @staticmethod
    def _weighted_mean(matrix, weights):
        """Weighted mean over the rows of ``matrix``.

        Falls back to the plain mean when the weights do not sum to a positive
        finite number, so the result never comes from a division by zero.
        """
        weights = np.asarray(weights, dtype=float)
        total = weights.sum()
        if not np.isfinite(total) or total <= 0:
            return matrix.mean(axis=0)
        return weights @ matrix / total

    def _ensemble_file_path(self, symbol):
        """Path of the pickle that stores every ensemble result for ``symbol``."""
        return os.path.join(self.model_save_dir, f"{symbol}_ensemble.pkl")

    # ------------------------------------------------------------------
    # Metrics
    # ------------------------------------------------------------------

    def calculate_metrics(self, y_true, y_pred):
        """Calculate performance metrics.

        Args:
            y_true: Observed values (any array-like).
            y_pred: Predicted values (any array-like of the same length).

        Returns:
            dict with ``rmse``, ``mae``, ``r2_score`` and ``directional_accuracy``
            (percentage of samples where prediction and target agree on ``> 0``).
            All values are built-in ``float``.
        """
        # Flatten to positional float arrays so that Series with different
        # indexes, or (n, 1) shaped predictions, compare element by element.
        y_true = np.asarray(y_true, dtype=float).ravel()
        y_pred = np.asarray(y_pred, dtype=float).ravel()

        rmse = np.sqrt(mean_squared_error(y_true, y_pred))
        mae = mean_absolute_error(y_true, y_pred)

        try:
            r2 = r2_score(y_true, y_pred)
        except ValueError:
            # Sentinel kept from the original implementation.
            r2 = -999

        # Directional accuracy
        dir_acc = np.mean((y_true > 0) == (y_pred > 0)) * 100

        # Built-in floats: NumPy scalars are reportedly rejected by
        # mysql-connector when used as query parameters.
        return {
            'rmse': float(rmse),
            'mae': float(mae),
            'r2_score': float(r2),
            'directional_accuracy': float(dir_acc)
        }

    # ------------------------------------------------------------------
    # Database reads
    # ------------------------------------------------------------------

    def load_model_predictions(self, symbol):
        """Load stored performance rows of all models for a stock.

        Despite the name, this reads ``model_type``, ``rmse``, ``mae`` and
        ``directional_accuracy`` from ``model_performance``; it does not load
        prediction arrays.

        Returns:
            A list of row dicts ordered by directional accuracy (best first),
            or ``None`` when the connection/query fails or no rows exist.
        """
        conn = self.connect_db()
        if not conn:
            return None

        query = """
        SELECT 
            model_type,
            rmse,
            mae,
            directional_accuracy
        FROM model_performance
        WHERE symbol = %s
        ORDER BY directional_accuracy DESC
        """

        cursor = None
        try:
            cursor = conn.cursor(dictionary=True)
            cursor.execute(query, (symbol,))
            model_info = cursor.fetchall()
        except mysql.connector.Error as e:
            self.logger.error(f"Error loading model performance for {symbol}: {e}")
            return None
        finally:
            # Always release the cursor and connection, even if the query failed.
            if cursor is not None:
                cursor.close()
            conn.close()

        if not model_info:
            self.logger.warning(f"No model performance data found for {symbol}")
            return None

        return model_info

    # ------------------------------------------------------------------
    # Ensemble strategies
    # ------------------------------------------------------------------

    def simple_average_ensemble(self, predictions_dict):
        """
        Method 1: Simple Average Ensemble
        Average predictions from all models equally
        """
        _, matrix = self._stack_predictions(predictions_dict)
        return matrix.mean(axis=0)

    def weighted_average_ensemble(self, predictions_dict, weights_dict):
        """
        Method 2: Weighted Average Ensemble
        Weight predictions by the values in ``weights_dict``.

        Only models present in ``predictions_dict`` take part in the
        normalisation; a model without a weight entry gets weight 0. If the
        usable weights do not sum to a positive number, the plain mean is
        returned.
        """
        names, matrix = self._stack_predictions(predictions_dict)
        weights = [weights_dict.get(name, 0) for name in names]
        return self._weighted_mean(matrix, weights)

    def top_k_ensemble(self, predictions_dict, weights_dict, k=3):
        """
        Method 3: Top-K Ensemble
        Weighted average of the K highest-weighted models that have predictions.

        Raises:
            ValueError: if ``k`` is smaller than 1.
        """
        if k < 1:
            raise ValueError("k must be at least 1")

        # Rank only models we can actually use; the sort is stable, so ties
        # keep the insertion order of predictions_dict.
        ranked = sorted(
            predictions_dict,
            key=lambda name: weights_dict.get(name, 0),
            reverse=True
        )
        chosen = ranked[:k]

        _, matrix = self._stack_predictions(predictions_dict, chosen)
        weights = [weights_dict.get(name, 0) for name in chosen]
        return self._weighted_mean(matrix, weights)

    def median_ensemble(self, predictions_dict):
        """
        Method 4: Median Ensemble
        Use median of all predictions (robust to outliers)
        """
        _, matrix = self._stack_predictions(predictions_dict)
        return np.median(matrix, axis=0)

    def create_ensemble_predictions(self, symbol, enhanced_results):
        """
        Create ensemble predictions using all four methods.

        Args:
            symbol: Stock symbol (used for log messages only).
            enhanced_results: dict ``model_name -> result`` where each result
                has ``'predictions'``, ``'metrics'['directional_accuracy']``
                and (at least for the first entry) ``'y_test'``.

        Returns:
            dict ``ensemble_name -> {'predictions', 'metrics'}``, or ``None``
            when there are no results or no ``y_test``.
        """
        if not enhanced_results:
            self.logger.warning(f"No enhanced results for {symbol}")
            return None

        # Extract predictions and weights
        predictions_dict = {
            name: result['predictions'] for name, result in enhanced_results.items()
        }
        weights_dict = {
            name: result['metrics']['directional_accuracy']
            for name, result in enhanced_results.items()
        }

        # y_test is taken from the first model (same for all models)
        y_test = next(iter(enhanced_results.values())).get('y_test')
        if y_test is None:
            self.logger.error(f"No y_test data found for {symbol}")
            return None

        combined = {
            'Simple_Average': self.simple_average_ensemble(predictions_dict),
            'Weighted_Average': self.weighted_average_ensemble(predictions_dict, weights_dict),
            'Top3_Models': self.top_k_ensemble(predictions_dict, weights_dict, k=3),
            'Median': self.median_ensemble(predictions_dict),
        }

        return {
            name: {
                'predictions': pred,
                'metrics': self.calculate_metrics(y_test, pred)
            }
            for name, pred in combined.items()
        }

    # ------------------------------------------------------------------
    # Persistence
    # ------------------------------------------------------------------

    def save_ensemble_performance(self, symbol, ensemble_type, metrics, 
                                  train_samples, test_samples, model_list, has_sentiment):
        """Save ensemble performance to database.

        Args:
            symbol: Stock symbol.
            ensemble_type: Ensemble name, e.g. ``'Median'``.
            metrics: dict as returned by ``calculate_metrics``.
            train_samples: Number of training samples.
            test_samples: Number of test samples.
            model_list: Names of the member models (currently not stored).
            has_sentiment: Selects the ``Sentiment``/``NoSentiment`` label suffix.

        Returns:
            ``True`` when the row was committed, ``False`` otherwise.
        """
        conn = self.connect_db()
        if not conn:
            return False

        model_type_label = f"Ensemble_{ensemble_type}_{'Sentiment' if has_sentiment else 'NoSentiment'}"
        # Record the file that save_ensemble_data() actually writes; all
        # ensemble types of a symbol share that single pickle.
        model_path = self._ensemble_file_path(symbol)

        query = """
        INSERT INTO model_performance 
        (symbol, model_type, rmse, mae, r2_score, directional_accuracy,
         train_samples, test_samples, feature_count, model_path)
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
        """

        # Plain Python numbers: the connector reportedly cannot convert NumPy scalars.
        values = (
            symbol, model_type_label,
            float(metrics['rmse']), float(metrics['mae']), float(metrics['r2_score']),
            float(metrics['directional_accuracy']),
            int(train_samples), int(test_samples), 0, model_path
        )

        cursor = None
        try:
            cursor = conn.cursor()
            cursor.execute(query, values)
            conn.commit()
            return True
        except mysql.connector.Error as e:
            self.logger.error(f"Error saving ensemble performance: {e}")
            return False
        finally:
            if cursor is not None:
                cursor.close()
            conn.close()

    def save_ensemble_data(self, symbol, ensemble_results):
        """Save ensemble predictions to file.

        Returns:
            Path of the written pickle file.
        """
        # The directory may have been removed since __init__ ran.
        os.makedirs(self.model_save_dir, exist_ok=True)
        filepath = self._ensemble_file_path(symbol)

        with open(filepath, 'wb') as f:
            pickle.dump(ensemble_results, f)

        return filepath

    def train_all_ensembles(self, symbol, enhanced_results, train_samples, 
                           test_samples, has_sentiment):
        """Build all four ensembles for a stock and persist metrics and predictions.

        Returns:
            The dict from ``create_ensemble_predictions``, or ``None`` when no
            ensemble could be created. Database failures are logged but do not
            prevent the results from being returned or pickled.
        """
        self.logger.info(f"Creating ensemble models for {symbol}...")

        ensemble_results = self.create_ensemble_predictions(symbol, enhanced_results)

        if not ensemble_results:
            return None

        # Get model list
        model_list = list(enhanced_results.keys())

        # Save performance for each ensemble method
        for ensemble_type, result in ensemble_results.items():
            saved = self.save_ensemble_performance(
                symbol, ensemble_type, result['metrics'],
                train_samples, test_samples, model_list, has_sentiment
            )
            if not saved:
                self.logger.warning(
                    f"Performance for {symbol} / {ensemble_type} was not saved to the database"
                )

        # Save ensemble data to file
        self.save_ensemble_data(symbol, ensemble_results)

        return ensemble_results

In [None]:
# ============================================================================
# CELL 4: Initialize Ensemble Trainer
# ============================================================================

# Banner first, so it appears before any log output emitted by the constructor.
print("\n".join([
    "\n" + "=" * 70,
    "INITIALIZING ENSEMBLE MODEL TRAINER",
    "=" * 70,
]))

# Trainer instance shared by the following cells.
ensemble_trainer = EnsembleModels(db_config)

# Summary of what the trainer will produce.
print("\n".join([
    "✓ Ensemble trainer initialized",
    "✓ Will create 4 ensemble methods:",
    "  1. Simple Average - Equal weight to all models",
    "  2. Weighted Average - Weight by directional accuracy",
    "  3. Top-3 Models - Use only best 3 models",
    "  4. Median - Robust to outliers",
]))

In [None]:
# ============================================================================
# CELL 5: Load Enhanced Model Results
# ============================================================================

print("\n".join(["\n" + "=" * 70, "LOADING ENHANCED MODEL RESULTS", "=" * 70]))

# Read the per-stock data produced by the enhanced training step.
# NOTE(review): pickle.load executes code embedded in the file; only load
# files that were produced by this project.
try:
    with open('enhanced_stock_data.pkl', 'rb') as f:
        enhanced_stock_data = pickle.load(f)
    print(f"✓ Loaded data for {len(enhanced_stock_data)} stocks")
except FileNotFoundError:
    # Later cells test this dict for truthiness, so fall back to an empty one.
    print("✗ enhanced_stock_data.pkl not found!")
    enhanced_stock_data = {}
    
def load_all_model_predictions(symbol, stock_data, model_dir='enhanced_models', model_types=None):
    """Load every saved model for ``symbol`` and score it on the test split.

    Args:
        symbol: Stock symbol used in the model file names
            (``{symbol}_{model_type}_enhanced.pkl``).
        stock_data: Mapping with at least ``'X_test'`` and ``'y_test'``.
        model_dir: Directory that holds the saved models.
        model_types: Model names to look for; defaults to RandomForest,
            XGBoost, MLP, Lasso, Ridge and ARIMA.

    Returns:
        dict ``model_type -> {'predictions', 'metrics', 'y_test'}`` where
        predictions and y_test are 1-D float arrays, or ``None`` when no model
        could be loaded and scored. Models that are missing, fail to load,
        fail to predict, or return unusable predictions are skipped.
    """
    # Local import kept from the original: joblib is only needed here.
    import joblib

    if model_types is None:
        model_types = ['RandomForest', 'XGBoost', 'MLP', 'Lasso', 'Ridge', 'ARIMA']

    X_test = stock_data['X_test']
    # Positional float array: avoids index alignment between pandas objects
    # (e.g. a forecast Series whose index differs from the test index).
    y_test = np.asarray(stock_data['y_test'], dtype=float).ravel()

    if y_test.size == 0 or not np.all(np.isfinite(y_test)):
        print(f"  Warning: y_test for {symbol} is empty or contains NaN/inf")
        return None

    true_direction = y_test > 0
    predictions = {}

    for model_type in model_types:
        model_path = os.path.join(model_dir, f"{symbol}_{model_type}_enhanced.pkl")
        if not os.path.exists(model_path):
            continue

        try:
            # NOTE(review): joblib.load unpickles arbitrary objects; only load
            # model files produced by this project.
            model = joblib.load(model_path)

            # Make predictions
            if model_type == 'ARIMA':
                # ARIMA models forecast a number of steps instead of using X_test
                raw_pred = model.forecast(steps=len(y_test))
            else:
                raw_pred = model.predict(X_test)

            # Flatten so (n, 1) outputs do not broadcast against (n,) targets.
            y_pred = np.asarray(raw_pred, dtype=float).ravel()
            if y_pred.size != y_test.size:
                raise ValueError(f"expected {y_test.size} predictions, got {y_pred.size}")
            if not np.all(np.isfinite(y_pred)):
                raise ValueError("predictions contain NaN or infinite values")

            # Calculate metrics
            errors = y_test - y_pred
            rmse = float(np.sqrt(np.mean(errors ** 2)))
            mae = float(np.mean(np.abs(errors)))

            # Directional accuracy
            dir_acc = float(np.mean(true_direction == (y_pred > 0)) * 100)

            predictions[model_type] = {
                'predictions': y_pred,
                'metrics': {
                    'rmse': rmse,
                    'mae': mae,
                    'directional_accuracy': dir_acc
                },
                'y_test': y_test
            }

        except Exception as e:
            # Best effort: report and move on to the next model type.
            print(f"  Warning: Could not load {model_type} for {symbol}: {str(e)[:50]}")
            continue

    return predictions if predictions else None

print("✓ Helper function to load model predictions ready")    

In [None]:
# ============================================================================
# CELL 6: Test Ensemble on Single Stock
# ============================================================================

# Smoke test: build the ensembles for the first loaded symbol and compare
# them with the individual models.
if not enhanced_stock_data:
    print("\n✗ No stock data loaded")
else:
    test_symbol = next(iter(enhanced_stock_data))

    print("\n" + "=" * 70)
    print(f"TESTING ENSEMBLE METHODS ON {test_symbol}")
    print("=" * 70)

    # Load model predictions
    print(f"\nLoading trained models for {test_symbol}...")
    model_predictions = load_all_model_predictions(
        test_symbol, enhanced_stock_data[test_symbol]
    )

    if not model_predictions:
        print(f"✗ No model predictions available for {test_symbol}")
    else:
        print(f"✓ Loaded {len(model_predictions)} models")
        print(f"  Models: {list(model_predictions.keys())}")

        # Create ensembles
        print("\nCreating ensemble predictions...")
        ensemble_results = ensemble_trainer.create_ensemble_predictions(
            test_symbol,
            model_predictions
        )

        if not ensemble_results:
            print("✗ Failed to create ensemble predictions")
        else:
            print("\n" + "-" * 70)
            print("ENSEMBLE PERFORMANCE COMPARISON")
            print("-" * 70)
            print(f"{'Method':<20} {'RMSE (%)':<12} {'MAE (%)':<12} {'Dir Acc'}")
            print("-" * 70)

            # Individual models first, then the ensembles, in the same row format.
            for heading, group in (
                ("\nIndividual Models:", model_predictions),
                ("\nEnsemble Methods:", ensemble_results),
            ):
                print(heading)
                for model_name, data in group.items():
                    metrics = data['metrics']
                    print(f"{model_name:<20} {metrics['rmse']:>8.3f}%    "
                          f"{metrics['mae']:>8.3f}%    {metrics['directional_accuracy']:>8.1f}%")

            # Best method across individual models and ensembles
            all_methods = {**model_predictions, **ensemble_results}
            best_method = max(
                all_methods.items(),
                key=lambda item: item[1]['metrics']['directional_accuracy']
            )
            best_metrics = best_method[1]['metrics']

            print("\n" + "=" * 70)
            print(f"✓ Best method: {best_method[0]}")
            print(f"  Directional Accuracy: {best_metrics['directional_accuracy']:.2f}%")
            print(f"  RMSE: {best_metrics['rmse']:.3f}%")


In [None]:
# ============================================================================
# CELL 7: Train Ensembles for All Stocks
# ============================================================================

# Defined unconditionally: the analysis and report cells test
# `ensemble_summary`, and would raise NameError if no stock data was loaded.
all_ensemble_results = {}
ensemble_summary = []

if enhanced_stock_data:
    print("\n" + "="*70)
    print("TRAINING ENSEMBLE MODELS FOR ALL STOCKS")
    print("="*70)
    print(f"Processing {len(enhanced_stock_data)} stocks...")
    print("This will take 5-10 minutes...\n")
    
    for idx, (symbol, stock_data) in enumerate(enhanced_stock_data.items(), 1):
        print(f"\n[{idx}/{len(enhanced_stock_data)}] Creating ensembles for {symbol}...")
        print("-"*70)
        
        try:
            # Load model predictions
            model_predictions = load_all_model_predictions(symbol, stock_data)
            
            # An ensemble needs at least two member models.
            if not model_predictions or len(model_predictions) < 2:
                print(f"  ✗ Insufficient models ({len(model_predictions) if model_predictions else 0}) for ensemble")
                continue
            
            print(f"  ✓ Loaded {len(model_predictions)} models")
            
            has_sentiment = stock_data.get('has_sentiment', False)
            
            # Create ensembles (also writes metrics to the DB and a pickle per symbol)
            ensemble_results = ensemble_trainer.train_all_ensembles(
                symbol, 
                model_predictions,
                len(stock_data['X_train']),
                len(stock_data['X_test']),
                has_sentiment
            )
            
            if ensemble_results:
                all_ensemble_results[symbol] = ensemble_results
                
                # Print performance
                for ensemble_type, data in ensemble_results.items():
                    metrics = data['metrics']
                    print(f"  {ensemble_type:20s} | RMSE: {metrics['rmse']:6.2f}% | "
                          f"Dir Acc: {metrics['directional_accuracy']:5.1f}%")
                    
                    ensemble_summary.append({
                        'symbol': symbol,
                        'ensemble_type': ensemble_type,
                        'rmse': metrics['rmse'],
                        'mae': metrics['mae'],
                        'dir_acc': metrics['directional_accuracy'],
                        'has_sentiment': has_sentiment
                    })
                
                # Show best ensemble for this stock
                best_ensemble = max(ensemble_results.items(), 
                                  key=lambda x: x[1]['metrics']['directional_accuracy'])
                print(f"  → Best: {best_ensemble[0]} ({best_ensemble[1]['metrics']['directional_accuracy']:.1f}%)")
            else:
                print(f"  ✗ Failed to create ensembles")
                
        except Exception as e:
            # Keep going with the remaining stocks; the message is truncated for readability.
            print(f"  ✗ Error: {str(e)[:70]}")
            continue
    
    print("\n" + "="*70)
    print("ENSEMBLE TRAINING COMPLETE!")
    print("="*70)
    
    # Save all ensemble results
    with open('ensemble_results.pkl', 'wb') as f:
        pickle.dump(all_ensemble_results, f)
    print("\n✓ Saved ensemble results to: ensemble_results.pkl")

else:
    print("\n✗ No stock data available for ensemble training")

In [None]:
# ============================================================================
# CELL 8: Ensemble Performance Analysis
# ============================================================================

if ensemble_summary:
    ensemble_df = pd.DataFrame(ensemble_summary)
    
    print("\n" + "="*70)
    print("ENSEMBLE PERFORMANCE ANALYSIS")
    print("="*70)
    
    # Overall statistics by ensemble type
    print("\n1. AVERAGE PERFORMANCE BY ENSEMBLE TYPE")
    print("-"*70)
    
    ensemble_stats = ensemble_df.groupby('ensemble_type').agg({
        'rmse': ['mean', 'std', 'min', 'max'],
        'dir_acc': ['mean', 'std', 'min', 'max']
    }).round(2)
    
    print(ensemble_stats)
    
    # Best ensemble per stock (row with the highest directional accuracy per symbol)
    print("\n2. BEST ENSEMBLE METHOD PER STOCK")
    print("-"*70)
    
    best_per_stock = ensemble_df.loc[ensemble_df.groupby('symbol')['dir_acc'].idxmax()]
    print(f"{'Symbol':<10} {'Best Method':<20} {'Dir Acc':<10} {'RMSE'}")
    print("-"*70)
    for _, row in best_per_stock.head(10).iterrows():
        print(f"{row['symbol']:<10} {row['ensemble_type']:<20} "
              f"{row['dir_acc']:>6.1f}%    {row['rmse']:>6.2f}%")
    
    # Ensemble method ranking
    print("\n3. ENSEMBLE METHOD RANKING (by avg directional accuracy)")
    print("-"*70)
    
    method_ranking = ensemble_df.groupby('ensemble_type')['dir_acc'].mean().sort_values(ascending=False)
    for method, avg_acc in method_ranking.items():
        print(f"{method:<25} {avg_acc:>6.2f}%")
    
    # Improvement over best individual model
    print("\n4. ENSEMBLE VS INDIVIDUAL MODELS")
    print("-"*70)
    
    # Get best individual model performance per stock
    individual_query = """
    SELECT 
        symbol,
        MAX(directional_accuracy) as best_individual_acc,
        MIN(rmse) as best_individual_rmse
    FROM model_performance
    WHERE model_type NOT LIKE 'Ensemble%'
    GROUP BY symbol
    """
    
    # Get best ensemble performance per stock
    ensemble_query = """
    SELECT 
        symbol,
        MAX(directional_accuracy) as best_ensemble_acc,
        MIN(rmse) as best_ensemble_rmse
    FROM model_performance
    WHERE model_type LIKE 'Ensemble%'
    GROUP BY symbol
    """
    
    # Reset first so a value left over from an earlier run is never reported
    # when the database step fails this time.
    comparison = None
    conn = None
    try:
        conn = mysql.connector.connect(**db_config)
        individual_best = pd.read_sql(individual_query, conn)
        ensemble_best = pd.read_sql(ensemble_query, conn)
    except Exception as e:
        # Sections 1-3 only need in-memory data; a DB problem should not abort the cell.
        print(f"✗ Could not load model_performance from the database: {e}")
    else:
        # Merge and compare (positive values mean the ensemble is better)
        comparison = pd.merge(individual_best, ensemble_best, on='symbol', how='inner')
        comparison['acc_improvement'] = comparison['best_ensemble_acc'] - comparison['best_individual_acc']
        comparison['rmse_improvement'] = comparison['best_individual_rmse'] - comparison['best_ensemble_rmse']
    finally:
        # Release the connection even when one of the queries failed.
        if conn is not None:
            conn.close()
    
    if comparison is not None:
        print(f"\nStocks where ensemble IMPROVED accuracy:")
        improved = comparison[comparison['acc_improvement'] > 0].sort_values('acc_improvement', ascending=False)
        print(f"{'Symbol':<10} {'Individual':<12} {'Ensemble':<12} {'Improvement'}")
        print("-"*70)
        for _, row in improved.head(10).iterrows():
            print(f"{row['symbol']:<10} {row['best_individual_acc']:>8.2f}%    "
                  f"{row['best_ensemble_acc']:>8.2f}%    {row['acc_improvement']:>+6.2f}%")
        
        print(f"\nOverall Statistics:")
        print(f"  Stocks with improved accuracy: {len(improved)} / {len(comparison)}")
        print(f"  Average accuracy improvement: {comparison['acc_improvement'].mean():+.2f}%")
        print(f"  Average RMSE improvement: {comparison['rmse_improvement'].mean():+.3f}%")
    
else:
    print("\n✗ No ensemble summary data available")

In [None]:
# ============================================================================
# CELL 9: Save Summary Report
# ============================================================================

if ensemble_summary:
    print("\n".join(["\n" + "=" * 70, "GENERATING SUMMARY REPORT", "=" * 70]))

    # Bundle the analysis artefacts from the previous cell.
    # `comparison` is looked up by name because it may not have been created.
    report = dict(
        ensemble_df=ensemble_df,
        method_ranking=method_ranking,
        best_per_stock=best_per_stock,
        comparison=globals().get('comparison'),
        timestamp=datetime.now(),
    )

    # Full report as a pickle ...
    with open('ensemble_report.pkl', 'wb') as f:
        pickle.dump(report, f)

    # ... and the flat per-ensemble table as CSV for easy viewing.
    ensemble_df.to_csv('ensemble_performance.csv', index=False)

    print("✓ Saved ensemble report to: ensemble_report.pkl")
    print("✓ Saved performance CSV to: ensemble_performance.csv")