In [20]:
import pandas as pd
import numpy as np
import torch
import matplotlib.pyplot as plt
from transformers import AutoTokenizer, AutoModelForCausalLM
from typing import Dict, List, Any, Optional
import json
import warnings
from datetime import datetime, timedelta
import logging
from pathlib import Path
from data_loading import build_panel_dataset

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Suppress warnings
warnings.filterwarnings('ignore')

In [21]:
# Define paths relative to the notebook location
data_folder = Path('..') / '..' / 'Data' / 'Short Interest Data'
crsp_file = Path('..') / '..' / 'Data' / 'CRSP Market Data 2.csv'
ibes_file = Path('..') / '..' / 'Data' / 'IBES Recommendations.csv'
compustat_file = Path('..') / '..' / 'Data' / 'Compustat Fundamentals.csv'

# Load with your paths
panel_df = build_panel_dataset(
    data_folder=data_folder,
    crsp_file=crsp_file,
    ibes_file=ibes_file,
    compustat_file=compustat_file
)
panel_df = panel_df.set_index(['PERMNO', 'date'])

INFO:data_loading:Starting panel dataset construction...
INFO:data_loading:Found 110 CSV files to merge
INFO:data_loading:Successfully loaded shrt20230315.csv with 19,946 rows
INFO:data_loading:Successfully loaded shrt20221230.csv with 20,186 rows
INFO:data_loading:Successfully loaded shrt20220729.csv with 20,601 rows
INFO:data_loading:Successfully loaded shrt20220715.csv with 20,732 rows
INFO:data_loading:Successfully loaded shrt20210615.csv with 20,251 rows
INFO:data_loading:Successfully loaded shrt20240430.csv with 19,468 rows
INFO:data_loading:Successfully loaded shrt20221031.csv with 20,696 rows
INFO:data_loading:Successfully loaded shrt20240815.csv with 19,530 rows
INFO:data_loading:Successfully loaded shrt20220930.csv with 20,742 rows
INFO:data_loading:Successfully loaded shrt20210415.csv with 19,816 rows
INFO:data_loading:Successfully loaded shrt20241115.csv with 19,729 rows
INFO:data_loading:Successfully loaded shrt20241129.csv with 19,946 rows
INFO:data_loading:Successfully l

(np.int64(84588), Timestamp('2021-01-04 00:00:00'))

In [4]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import json

class FeatureEngineeringAgentLLM:
    def __init__(self, model_name: str):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForCausalLM.from_pretrained(model_name)
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

    def _build_prompt(self, data_analysis: Dict, target_column: str) -> str:
        prompt = f"""
You are a feature engineering expert for time series forecasting.

Data Analysis:
- Shape: {data_analysis['shape']}
- Target Column: {target_column}
- Numeric Columns: {data_analysis['numeric_columns']}
- Categorical Columns: {data_analysis['categorical_columns']}
- Time Columns: {data_analysis.get('time_columns', [])}

Task: Generate a feature engineering strategy for bi-weekly rate forecasting.

Provide your response as a JSON with these sections:
1. "lag_features": List of lag periods to create (e.g., [1, 2, 4, 8])
2. "rolling_features": Rolling window calculations (e.g., ["mean_7", "std_14"])
3. "interaction_features": Feature combinations to try
4. "time_features": Time-based features to extract
5. "reasoning": Explanation of your strategy

Response:
"""
        return prompt

    def _call_llm(self, prompt: str) -> str:
        inputs = self.tokenizer(prompt, return_tensors="pt", truncation=True, max_length=2048)
        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                max_new_tokens=300,
                temperature=0.7,
                do_sample=True,
                pad_token_id=self.tokenizer.eos_token_id
            )
        response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        strategy_text = response.split("Response:")[-1].strip()
        return strategy_text


    def _create_lag_features(self, data: pd.DataFrame, lag_list: list, numeric_cols: list) -> pd.DataFrame:
        for lag in lag_list:
            for col in numeric_cols:
                data[f"{col}_lag_{lag}"] = data.groupby(level='PERMNO')[col].shift(lag)
        return data

    def _create_rolling_features(self, data: pd.DataFrame, rolling_list: list, numeric_cols: list) -> pd.DataFrame:
        for feature in rolling_list:
            if feature.startswith("mean_"):
                window = int(feature.split("_")[1])
                for col in numeric_cols:
                    data[f"{col}_rolling_mean_{window}"] = data.groupby(level='PERMNO')[col].rolling(window=window, min_periods=1).mean().reset_index(level=0, drop=True)
            elif feature.startswith("std_"):
                window = int(feature.split("_")[1])
                for col in numeric_cols:
                    data[f"{col}_rolling_std_{window}"] = data.groupby(level='PERMNO')[col].rolling(window=window, min_periods=1).std().reset_index(level=0, drop=True)
        return data

    def _create_time_features(self, data: pd.DataFrame, time_features: list) -> pd.DataFrame:
        # Extract time from the 'date' level in MultiIndex
        if 'date' in data.index.names:
            dates = data.index.get_level_values('date')
            if "month" in time_features:
                data['month'] = dates.month
            if "quarter" in time_features:
                data['quarter'] = dates.quarter
            if "day_of_week" in time_features:
                data['day_of_week'] = dates.dayofweek
            if "is_weekend" in time_features:
                data['is_weekend'] = dates.dayofweek.isin([5, 6]).astype(int)
        return data

    def process_features(self, data: pd.DataFrame, target_column: str, data_analysis: Dict) -> Dict[str, Any]:
        prompt = self._build_prompt(data_analysis, target_column)
        
        # Replace mock with real LLM call when ready:
        # strategy_text = '''
        #     {
        #     "lag_features": [1, 2, 4],
        #     "rolling_features": ["mean_7", "std_7"],
        #     "interaction_features": [],
        #     "time_features": ["month", "quarter", "day_of_week", "is_weekend"],
        #     "reasoning": "Use recent lags, weekly rolling stats, and common time features to capture seasonality."
        #     }
        #             '''
        strategy_text = self._call_llm(prompt)
        print('LLM strategy')
        print(strategy_text)
        try:
            strategy = json.loads(strategy_text)
        except json.JSONDecodeError:
            strategy = {
                "lag_features": [1,2],
                "rolling_features": ["mean_7"],
                "interaction_features": [],
                "time_features": ["month"],
                "reasoning": "Fallback strategy."
            }

        processed_data = data.copy()
        
        # Create features as per strategy
        processed_data = self._create_lag_features(processed_data, strategy.get("lag_features", []), data_analysis['numeric_columns'])
        processed_data = self._create_rolling_features(processed_data, strategy.get("rolling_features", []), data_analysis['numeric_columns'])
        processed_data = self._create_time_features(processed_data, strategy.get("time_features", []))

        # Dropping NaN rows generated by lags and rolling features
        processed_data = processed_data.dropna(how='all').fillna(0)

        # Final feature names
        feature_names = list(processed_data.columns)

        return {
            "strategy": strategy,
            "processed_data": processed_data,
            "feature_names": feature_names,
            "engineering_log": f"Applied feature engineering strategy: {strategy.get('reasoning')}"
        }


In [None]:
# # Initialize the communicating framework
# def create_communicating_framework():
#     feature_engineer = CommunicatingFeatureEngineerLLM('microsoft/Phi-4-mini-instruct')
#     forecaster = CommunicatingForecastingAgentLLM('microsoft/Phi-4-mini-instruct')
#     orchestrator = CommunicatingOrchestrator(feature_engineer, forecaster)
#     return orchestrator

# # Run the framework
# communicating_framework = create_communicating_framework()

# # Single run with communication
# results = communicating_framework.execute_communicating_workflow(
#     panel_df[col], 'currentShortPositionQuantity'
# )

# print("Communication Log:")
# for log_entry in results['communication_log']:
#     print(f"  {log_entry['sender']} → {log_entry['receiver']}: {log_entry['type']}")

# # Iterative improvement (this is where you'll see the learning!)
# iterative_results = communicating_framework.run_iterative_improvement(
#     panel_df[col], 'currentShortPositionQuantity', iterations=3
# )


### Cleaned version

##### Communication Framework
- MessageType
- AgentMessage
- CommunicatingAgent
- SimpleMessageBus

In [4]:
import uuid
from dataclasses import dataclass
from typing import Dict, Any, List, Optional
from enum import Enum

## Communication framework

class MessageType(Enum):
    """ Enumerates the possible types of messages exchanged by agents. """

    FEATURE_REQUEST = "feature_request"
    FEATURE_RESPONSE = "feature_response" 
    PERFORMANCE_FEEDBACK = "performance_feedback"
    MODEL_REQUEST = "model_request"
    MODEL_RESPONSE = "model_response"
    IMPROVEMENT_SUGGESTION = "improvement_suggestion"


@dataclass
class AgentMessage:
    """Represents a message passed between agents in a multi-agent system.
    
    This dataclass encapsulates all necessary information for agent-to-agent
    communication, including sender/receiver identification, message content,
    and conversation tracking.
    
    Attributes:
        sender (str): Identifier of the agent sending the message
        receiver (str): Identifier of the agent receiving the message  
        message_type (MessageType): Type/category of the message
        content (Dict[str, Any]): The actual message payload as key-value pairs
        conversation_id (str): Unique identifier for the conversation thread
        timestamp (str, optional): ISO format timestamp, auto-generated if None
    """
    sender: str
    receiver: str
    message_type: MessageType
    content: Dict[str, Any]
    conversation_id: str
    timestamp: str = None
    
    def __post_init__(self): # Initialize values that depend on self or other values (here timestamp)
        if self.timestamp is None:
            self.timestamp = pd.Timestamp.now().isoformat()


class CommunicatingAgent:
    """Base class for agents that can communicate with other agents in a multi-agent system.
    
    This abstract base class provides the foundational communication infrastructure
    for agents operating within a coordinated multi-agent framework. It handles
    message routing, conversation tracking, and performance memory to enable agent-to-agent
    interactions and learning.
    
    The agent maintains a complete conversation history and performance memory
    to support learning and adaptation over time. Subclasses should override
    the process_message method to implement agent-specific message handling logic.
    
    Attributes:
        message_bus: Communication hub for routing messages between agents.
                    Set automatically when the agent is registered with the system.
        name (str): Unique identifier for this agent within the system.
                   Set automatically during registration.
        conversation_history (List[AgentMessage]): Complete record of all messages
                                                  received by this agent, ordered chronologically.
        performance_memory (List): Storage for performance feedback and successful
                                  interaction patterns to support learning and adaptation.
    """

    def __init__(self):
        self.message_bus = None          # Will be set when registered
        self.name = None                 # Will be set when registered  
        self.conversation_history = []   # All messages this agent received
        self.performance_memory = []     # Remember what worked well
    
    def receive_message(self, message: AgentMessage):
        """Handle incoming messages"""
        self.conversation_history.append(message)  # Keep record
        return self.process_message(message)       # Handle the message

    def send_message(self, receiver: str, message_type: MessageType, content: Dict[str, Any], conversation_id: str = None):
        """Send a message to another agent"""
        if conversation_id is None:
            conversation_id = str(uuid.uuid4())  # Create new conversation if needed
        
        message = AgentMessage(
            sender=self.name,
            receiver=receiver,
            message_type=message_type,
            content=content,
            conversation_id=conversation_id
        )
        return self.message_bus.send_message(message)  # Use post office to send
    
    def process_message(self, message: AgentMessage):
        # Override in subclasses - each agent handles messages differently
        pass

    
class SimpleMessageBus:
    """Central communication hub for managing message routing in a multi-agent system.
    
    The SimpleMessageBus serves as the core infrastructure component that coordinates
    communication between agents in a multi-agent framework. It maintains an agent
    registry, handles message routing, and tracks conversation history and performance
    metrics for system monitoring and optimization.
    
    This class implements a simple publish-subscribe pattern where agents register
    themselves to send and receive messages.
    
    Attributes:
        agents (Dict[str, CommunicatingAgent]): Registry mapping agent names to 
                                               their corresponding agent instances.
        message_history (List[AgentMessage]): Chronological record of all messages
                                            passed through the system for debugging purposes.
        performance_tracker (Dict): Storage for conversation performance metrics
                                   and system optimization data.
    """
    def __init__(self):
        self.agents = {}                # Registry of all agents
        self.message_history = []       # Keep track of all messages
        self.performance_tracker = {}   # Track how well conversations went
    
    def register_agent(self, name: str, agent):
        """Add an agent to the system"""
        self.agents[name] = agent
        agent.message_bus = self        # Give agent access to send messages
        agent.name = name              # Tell agent its own name
    
    def send_message(self, message: AgentMessage):
        """Deliver a message to the right agent"""
        self.message_history.append(message)    # Keep record
        if message.receiver in self.agents:
            return self.agents[message.receiver].receive_message(message)
        else:
            print(f"Warning: Agent {message.receiver} not found")
    
    def track_performance(self, conversation_id: str, metrics: Dict[str, Any]):
        """Track performance metrics for a specific conversation.
        
        Args:
            conversation_id (str): Unique identifier for the conversation
            metrics (Dict[str, Any]): Performance metrics to store
        """
        if conversation_id not in self.performance_tracker:
            self.performance_tracker[conversation_id] = []
        
        self.performance_tracker[conversation_id].append({
            'timestamp': pd.Timestamp.now().isoformat(),
            'metrics': metrics
        })
        
        print(f"Tracked performance for conversation {conversation_id[:8]}...")

##### FeatureEngineeringAgent (to be replaced by Josh's agent)

In [5]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

class MockFeatureEngineeringAgent:
    """Mock feature engineering agent for testing forecasting components"""
    
    def __init__(self, model_name: str):
        # Predefined strategies for different scenarios
        self.default_strategy = {
            "lag_features": [1, 2, 7, 14],  # 1-day, 2-day, 1-week, 2-week lags
            "rolling_features": ["mean_7", "mean_14", "std_7", "std_14"],
            "interaction_features": ["price_volume", "ret_volume"],
            "time_features": ["month", "quarter", "day_of_week", "is_weekend"],
            "reasoning": "Applied standard financial time series features: recent lags, rolling statistics, and time-based features for seasonality capture."
        }
        self.model_name = model_name # For compatibility with the real feature engineer
    
    def _create_lag_features(self, data: pd.DataFrame, lag_list: list, numeric_cols: list) -> pd.DataFrame:
        """Create lagged features grouped by PERMNO"""
        data_copy = data.copy()
        for lag in lag_list:
            for col in numeric_cols:
                if col in data_copy.columns:
                    data_copy[f"{col}_lag_{lag}"] = data_copy.groupby(level='PERMNO')[col].shift(lag)
        return data_copy
    
    def _create_rolling_features(self, data: pd.DataFrame, rolling_list: list, numeric_cols: list) -> pd.DataFrame:
        """Create rolling window features grouped by PERMNO"""
        data_copy = data.copy()
        for feature in rolling_list:
            feature_parts = feature.split("_")
            stat_type = feature_parts[0]
            window = int(feature_parts[1])
            
            for col in numeric_cols:
                if col in data_copy.columns:
                    grouped = data_copy.groupby(level='PERMNO')[col].rolling(window=window, min_periods=1)
                    if stat_type == "mean":
                        data_copy[f"{col}_rolling_mean_{window}"] = grouped.mean().reset_index(level=0, drop=True)
                    elif stat_type == "std":
                        data_copy[f"{col}_rolling_std_{window}"] = grouped.std().reset_index(level=0, drop=True)
        return data_copy
    
    def _create_interaction_features(self, data: pd.DataFrame, interaction_list: list) -> pd.DataFrame:
        """Create interaction features"""
        data_copy = data.copy()
        for interaction in interaction_list:
            if interaction == "price_volume" and 'PRC' in data_copy.columns and 'VOL' in data_copy.columns:
                data_copy['price_volume_interaction'] = data_copy['PRC'] * data_copy['VOL']
            elif interaction == "ret_volume" and 'RET' in data_copy.columns and 'VOL' in data_copy.columns:
                data_copy['ret_volume_interaction'] = data_copy['RET'] * data_copy['VOL']
        return data_copy
    
    def _create_time_features(self, data: pd.DataFrame, time_features: list) -> pd.DataFrame:
        """Create time-based features from date index"""
        data_copy = data.copy()
        if 'date' in data_copy.index.names:
            dates = data_copy.index.get_level_values('date')
            if "month" in time_features:
                data_copy['month'] = dates.month
            if "quarter" in time_features:
                data_copy['quarter'] = dates.quarter
            if "day_of_week" in time_features:
                data_copy['day_of_week'] = dates.dayofweek
            if "is_weekend" in time_features:
                data_copy['is_weekend'] = dates.dayofweek.isin([5, 6]).astype(int)
        return data_copy
    
    def _call_llm(self, prompt: str):
        """Mock LLM call"""
        return 'Text returned by LLM'

    def process_features(self, data: pd.DataFrame, target_column: str, data_analysis: Dict) -> Dict[str, Any]:
        """
        Mock feature processing that applies a predefined strategy
        
        Args:
            data: Input DataFrame with MultiIndex (PERMNO, date)
            target_column: Target variable name
            data_analysis: Data analysis dictionary (for compatibility)
            
        Returns:
            Dictionary with processed data and metadata
        """
        print("🔧 Applying mock feature engineering strategy...")
        
        # Use default strategy
        strategy = self.default_strategy.copy()
        
        # Apply feature engineering steps
        processed_data = data.copy()
        
        # 1. Lag features
        processed_data = self._create_lag_features(
            processed_data, 
            strategy["lag_features"], 
            data_analysis['numeric_columns']
        )
        
        # 2. Rolling features  
        processed_data = self._create_rolling_features(
            processed_data, 
            strategy["rolling_features"], 
            data_analysis['numeric_columns']
        )
        
        # 3. Interaction features
        processed_data = self._create_interaction_features(
            processed_data, 
            strategy["interaction_features"]
        )
        
        # 4. Time features
        processed_data = self._create_time_features(
            processed_data, 
            strategy["time_features"]
        )
        
        # Clean up data (handle NaNs from lag/rolling operations)
        initial_shape = processed_data.shape
        processed_data = processed_data.dropna(how='all').fillna(0)
        final_shape = processed_data.shape
        
        print(f"✅ Mock Feature engineering complete: {initial_shape[0]} → {final_shape[0]} samples, {initial_shape[1]} → {final_shape[1]} features")
        
        return {
            "strategy": strategy,
            "processed_data": processed_data,
            "feature_names": list(processed_data.columns),
            "engineering_log": f"Mock feature engineering applied: {len(strategy['lag_features'])} lag features, {len(strategy['rolling_features'])} rolling features, {len(strategy['time_features'])} time features. Rows dropped due to NaNs: {initial_shape[0] - final_shape[0]}"
        }

##### Communicating Feature Engineer

In [8]:
class CommunicatingFeatureEngineerLLM(CommunicatingAgent, MockFeatureEngineeringAgent):
    """Feature engineering agent that learns from performance feedback via communication.
    
    This agent combines feature engineering capabilities with multi-agent communication
    to iteratively improve feature generation strategies based on forecasting performance
    feedback. It maintains a history of feature performance and uses this context to
    generate enhanced feature engineering strategies for bi-weekly panel data forecasting.
    
    The agent inherits communication capabilities from CommunicatingAgent and feature
    engineering logic from FeatureEngineeringAgentLLM, extending both with performance-
    aware feature strategy generation.
    
    Attributes:
        feature_performance_history (List[Dict]): Historical record of performance 
                                                 feedback including metrics, features used,
                                                 and feature importance scores.
    """

    def __init__(self, model_name: str):
        CommunicatingAgent.__init__(self)
        MockFeatureEngineeringAgent.__init__(self, model_name)
        self.feature_performance_history = []
    
    def process_message(self, message: AgentMessage):
        """Process incoming messages based on message type.
        
        Routes messages to appropriate handlers based on MessageType.
        Handles PERFORMANCE_FEEDBACK and FEATURE_REQUEST message types.
        
        Args:
            message (AgentMessage): Incoming message containing type and content
            
        Returns:
            Response from the appropriate message handler, or None if unhandled
        """

        if message.message_type == MessageType.PERFORMANCE_FEEDBACK:
            return self.handle_performance_feedback(message)
        elif message.message_type == MessageType.FEATURE_REQUEST:
            return self.handle_feature_request(message)


    def handle_performance_feedback(self, message: AgentMessage):
        """Learn from forecasting performance to improve future feature engineering"""
        feedback = message.content
        
        # Store performance history
        self.feature_performance_history.append({
            'timestamp': message.timestamp,
            'metrics': feedback.get('metrics', {}),
            'features_used': feedback.get('features_used', []),
            'feature_importance': feedback.get('feature_importance', {}),
            'conversation_id': message.conversation_id
        })
        
        # Generate improvement suggestions
        improvements = self._generate_improvements_from_feedback(feedback)
        
        # Send back suggestions
        response = self.send_message(
            receiver=message.sender,
            message_type=MessageType.IMPROVEMENT_SUGGESTION,
            content=improvements,
            conversation_id=message.conversation_id
        )
        
        return response
    

    def _generate_improvements_from_feedback(self, feedback: Dict[str, Any]) -> Dict[str, Any]:
        """Generate specific improvements based on performance feedback"""
        metrics = feedback.get('metrics', {})
        feature_importance = feedback.get('feature_importance', {})
        features_used = feedback.get('features_used', [])
        
        # Simple rules for improvement (can be enhanced with LLM later)
        improvements = {
            'analysis': {},
            'suggested_changes': {},
            'reasoning': ""
        }
        
        # Analyze performance
        mae = metrics.get('MAE', float('inf'))
        r2 = metrics.get('R2', 0)
        
        improvements['analysis'] = {
            'performance_level': 'good' if r2 > 0.7 else 'moderate' if r2 > 0.3 else 'poor',
            'mae_level': 'good' if mae < 0.1 else 'moderate' if mae < 0.5 else 'poor'
        }
        
        # Feature-specific suggestions
        if feature_importance:
            top_features = sorted(feature_importance.items(), key=lambda x: x[1], reverse=True)[:5]
            low_features = sorted(feature_importance.items(), key=lambda x: x[1])[:5]
            
            improvements['suggested_changes'] = {
                'keep_feature_types': [f.split('_')[0] for f, _ in top_features],
                'reduce_feature_types': [f.split('_')[0] for f, _ in low_features],
                'increase_lags': r2 < 0.5,  # If poor performance, try more lags
                'add_interactions': mae > 0.3  # If high error, try interactions
            }
        
        improvements['reasoning'] = f"Based on R2={r2:.3f} and MAE={mae:.3f}, suggesting feature adjustments"
        
        return improvements


    def _get_recent_performance_context(self) -> Optional[Dict[str, Any]]:
        """Get context from recent performance feedback"""
        if not self.feature_performance_history:
            return None
        
        # Get most recent feedback
        recent = sorted(self.feature_performance_history, key=lambda x: x['timestamp'])[-1]
        return recent    
 

    def process_features_with_communication(self, data: pd.DataFrame, target_column: str, 
                                        data_analysis: Dict, conversation_id: str = None) -> Dict[str, Any]:
        """Generate features using LLM strategy enhanced with performance context.
        
        Creates feature engineering strategy by combining data analysis with historical
        performance feedback. Uses LLM to generate strategy and applies feature
        transformations including lags, rolling statistics, and time features.
        
        Args:
            data (pd.DataFrame): Input panel data for feature engineering
            target_column (str): Name of the target variable column
            data_analysis (Dict): Data analysis summary containing shape, entities, etc.
            conversation_id (str, optional): Conversation identifier. Auto-generated if None.
            
        Returns:
            Dict[str, Any]: Dictionary containing:
                - strategy: LLM-generated feature engineering strategy
                - processed_data: DataFrame with engineered features
                - feature_names: List of all feature column names
                - engineering_log: Description of applied strategy
                - conversation_id: Conversation identifier
                - performance_context_used: Whether historical context was available
        """

        
        if conversation_id is None:
            conversation_id = str(uuid.uuid4())
        
        # Get historical context
        recent_performance = self._get_recent_performance_context()
        
        # Build enhanced prompt with performance context
        prompt = self._build_prompt_with_context(data_analysis, target_column, recent_performance)
        
        # Get LLM strategy
        strategy_text = self._call_llm(prompt)
        
        try:
            strategy = json.loads(strategy_text)
        except json.JSONDecodeError:
            strategy = self._get_fallback_strategy(recent_performance)
        
        # Apply strategy
        processed_data = data.copy()
        processed_data = self._create_lag_features(processed_data, strategy.get("lag_features", []), data_analysis['numeric_columns'])
        processed_data = self._create_rolling_features(processed_data, strategy.get("rolling_features", []), data_analysis['numeric_columns'])
        processed_data = self._create_time_features(processed_data, strategy.get("time_features", []))
        processed_data = processed_data.dropna(how='all').fillna(0)
        
        return {
            "strategy": strategy,
            "processed_data": processed_data,
            "feature_names": list(processed_data.columns),
            "engineering_log": f"Applied strategy with performance context: {strategy.get('reasoning')}",
            "conversation_id": conversation_id,
            "performance_context_used": recent_performance is not None
        }

    
    def _build_prompt_with_context(self, data_analysis: Dict, target_column: str, 
                                 performance_context: Optional[Dict[str, Any]]) -> str:

        """Build LLM prompt incorporating performance context.
        
        Args:
            context: Performance context information
            
        Returns:
            str: Enhanced prompt string with context
        """
        
        context_section = ""
        if performance_context:
            metrics = performance_context['metrics']
            top_features = list(performance_context.get('feature_importance', {}).keys())[:5]
            
            context_section = f"""
Previous Performance Context:
- Last R2: {metrics.get('R2', 'N/A'):.3f}
- Last MAE: {metrics.get('MAE', 'N/A'):.3f}
- Top performing features: {top_features}

Use this context to improve the strategy.
"""
        
        prompt = f"""
You are a feature engineering expert for bi-weekly forecasting on panel data.

Data Analysis:
- Shape: {data_analysis['shape']}
- Target: {target_column} (bi-weekly frequency)
- Entities: {data_analysis.get('entities', 'Unknown')}
- Date range: {data_analysis.get('date_range', 'Unknown')}

{context_section}

Task: Create an improved feature engineering strategy.
Focus on bi-weekly patterns and cross-sectional variations.

Return JSON with:
1. "lag_features": [1, 2, 4, 8, 14] (include bi-weekly relevant lags)
2. "rolling_features": ["mean_7", "mean_14", "std_14"]  
3. "interaction_features": []
4. "time_features": ["month", "quarter", "day_of_week"]
5. "reasoning": Your strategy explanation

Response:
"""
        return prompt
    

    def _get_fallback_strategy(self, performance_context: Optional[Dict[str, Any]]) -> Dict[str, Any]:
        """Fallback strategy based on performance context"""
        if performance_context and performance_context['metrics'].get('R2', 0) < 0.3:
            # Poor performance - try more features
            return {
                "lag_features": [1, 2, 4, 8, 14],
                "rolling_features": ["mean_7", "mean_14", "std_7", "std_14"],
                "interaction_features": [],
                "time_features": ["month", "quarter", "day_of_week", "is_weekend"],
                "reasoning": "Expanding feature set due to poor previous performance"
            }
        else:
            # Default strategy
            return {
                "lag_features": [1, 2, 4],
                "rolling_features": ["mean_7"],
                "interaction_features": [],
                "time_features": ["month"],
                "reasoning": "Standard fallback strategy"
            }    

#### Forecasting Agent

In [None]:

from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import pandas as pd
import numpy as np
from typing import Dict, List, Tuple, Any
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import TimeSeriesSplit
import optuna
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')


class ForecastingAgentLLM:
    """XGBoost forecasting agent with Optuna hyperparameter optimization and optional LLM integration.
    
    This agent performs bi-weekly financial forecasting on panel data using XGBoost regression
    with automated hyperparameter tuning via Optuna. It includes time series cross-validation,
    feature scaling, and comprehensive performance metrics calculation.
    
    The agent can optionally integrate with Large Language Models for enhanced feature
    engineering and strategy generation when initialized with a model_name.
    
    Attributes:
        model (xgb.XGBRegressor): XGBoost regression model for forecasting
        scaler (StandardScaler): Feature scaling transformer
        best_params (Dict, optional): Optimized hyperparameters from Optuna trials
        model_name (str, optional): Hugging Face model identifier for LLM integration
        tokenizer (AutoTokenizer, optional): Tokenizer for LLM processing
        model_llm (AutoModelForCausalLM, optional): LLM model for text generation
    """


    def __init__(self, model_name: str = None):
        self.model = xgb.XGBRegressor(random_state=42, verbosity=0)
        self.scaler = StandardScaler()
        self.best_params = None
        self.model_name = model_name

        # Initialize LLM components if provided
        if model_name:
            self.tokenizer = AutoTokenizer.from_pretrained(model_name)
            self.model_llm = AutoModelForCausalLM.from_pretrained(model_name)
            if self.tokenizer.pad_token is None:
                self.tokenizer.pad_token = self.tokenizer.eos_token

    def _prepare_biweekly_data(self, data: pd.DataFrame, target_column: str) -> pd.DataFrame:
        """Extract bi-weekly target observation points while preserving daily feature data.
        
        Identifies dates where the target variable actually changed to create training
        samples at bi-weekly frequency, avoiding duplicate target values while maintaining
        the full feature set for each entity.
        
        Args:
            data (pd.DataFrame): Panel data with ('PERMNO', 'date') MultiIndex
            target_column (str): Name of bi-weekly target variable column
            
        Returns:
            pd.DataFrame: Filtered data containing only bi-weekly target change points
            
        Note:
            Handles duplicate indices and maintains entity-level data integrity.
            Prints warnings for processing errors but continues with valid entities.
        """

        # if data.index.names[0] != 'PERMNO':
        #     data_sorted = data.swaplevel().sort_index()
        # else:
        #     data_sorted = data.sort_index()

        # if not data_sorted.index.is_unique:
        #     print(f"Warning: Found {data_sorted.index.duplicated().sum()} duplicate index entries. Removing duplicates.")
        #     data_sorted = data_sorted[~data_sorted.index.duplicated(keep='last')]

        # biweekly_data = []
        # unique_permnos = data_sorted.index.get_level_values('PERMNO').unique()

        # for permno in unique_permnos:
        #     try:
        #         entity_data = data_sorted.loc[permno]

        #         if isinstance(entity_data, pd.Series):
        #             entity_data = entity_data.to_frame().T
        #             entity_data.index = [data_sorted.loc[permno].name[1]] if hasattr(data_sorted.loc[permno].name, '__getitem__') else [entity_data.index[0]]

        #         # Get dates where target actually changed (bi-weekly points)
        #         target_changes = entity_data[target_column] != entity_data[target_column].shift(1)
        #         biweekly_points = entity_data[target_changes | (entity_data.index == entity_data.index[0])]

        #         # Add PERMNO back to index
        #         biweekly_points_indexed = biweekly_points.copy()
        #         biweekly_points_indexed['PERMNO'] = permno
        #         biweekly_points_indexed = biweekly_points_indexed.set_index('PERMNO', append=True).swaplevel()

        #         biweekly_data.append(biweekly_points_indexed)

        #     except (KeyError, Exception) as e:
        #         print(f"Error processing PERMNO {permno}: {e}")
        #         continue

        # return pd.concat(biweekly_data) if biweekly_data else pd.DataFrame()


        #### CHANGED TO THIS IF WE ONLY USE BI-WEEKLY DATA FOR NOW
        df = data.sort_index(level=['date','PERMNO'])
        return df[df[target_column].notna()]
    
    def _calculate_metrics(self, y_true: np.ndarray, y_pred: np.ndarray) -> Dict[str, float]:
        """Calculate comprehensive evaluation metrics"""
        try:
            mae = mean_absolute_error(y_true, y_pred)
            rmse = np.sqrt(mean_squared_error(y_true, y_pred))

            # Avoid division by zero in MAPE calculation
            mask = y_true != 0
            if np.sum(mask) > 0:
                mape = np.mean(np.abs((y_true[mask] - y_pred[mask]) / y_true[mask])) * 100
            else:
                mape = float('inf')

            # R-squared
            ss_res = np.sum((y_true - y_pred) ** 2)
            ss_tot = np.sum((y_true - np.mean(y_true)) ** 2)
            r2 = 1 - (ss_res / ss_tot) if ss_tot != 0 else 0

            return {
                'MAE': float(mae),
                'RMSE': float(rmse), 
                'MAPE': float(mape),
                'R2': float(r2)
            }
        except Exception as e:
            print(f"Error calculating metrics: {e}")
            return {'MAE': float('inf'), 'RMSE': float('inf'), 'MAPE': float('inf'), 'R2': 0}

    def _prepare_features_and_target(self, data: pd.DataFrame, target_column: str):
        """Prepare features and target for training"""
        feature_cols = [c for c in data.columns if c != target_column]
        X_df = data[feature_cols].select_dtypes(include=[np.number]).copy().fillna(0)
        y = data[target_column].values
        mask = ~np.isnan(y)
        X = X_df.values[mask]
        y = y[mask]
        feature_names = X_df.columns.tolist()
        row_index = data.index[mask]
        return X, y, feature_names, row_index


    def _get_xgboost_param_space(self) -> Dict:
        """Optimized XGBoost hyperparameter space for financial data"""
        return {
            'n_estimators': (50, 200),      # Reasonable range for quick training
            'max_depth': (3, 6),            # Prevent overfitting
            'learning_rate': (0.05, 0.2),   # Conservative learning rates
            'subsample': (0.7, 1.0),        # Sample fraction
            'colsample_bytree': (0.7, 1.0), # Feature fraction
            'reg_alpha': (0, 0.5),          # L1 regularization
            'reg_lambda': (0, 1.0),         # L2 regularization
            'min_child_weight': (1, 5)      # Minimum samples in leaf
        }


    def _date_based_splits(index: pd.MultiIndex,
                        n_splits: int = 3,
                        purge: int = 0,
                        expanding: bool = True):
        """
        Build leakage-safe CV folds on a panel with MultiIndex ('PERMNO','date').

        index: MultiIndex with a 'date' level
        n_splits: number of validation folds
        purge: number of unique dates to skip between train end and val start
        expanding: if True, expanding train window; else rolling window of size ~1 fold
        """
        if 'date' not in index.names:
            raise ValueError("Index must contain a 'date' level")
        # Ensure stable order: sort by ['date','PERMNO']
        if list(index.names) != ['PERMNO', 'date']:
            df_tmp = pd.DataFrame(index=index).reset_index().set_index(['PERMNO','date']).sort_index()
            index = df_tmp.index
        else:
            index = index.swaplevel().sort_values()  # ('date','PERMNO')

        udates = np.array(sorted(index.get_level_values('date').unique()))
        if n_splits < 1:
            raise ValueError("n_splits must be >= 1")
        if len(udates) < (n_splits + 1):
            raise ValueError("Not enough unique dates for the requested n_splits")

        fold_size = len(udates) // (n_splits + 1)
        dates_vals = index.get_level_values('date')
        splits = []
        for i in range(n_splits):
            # Train window
            train_start_idx = 0 if expanding else i * fold_size
            train_end_idx = (i + 1) * fold_size - 1
            # Validation window (with purge gap)
            val_start_idx = (i + 1) * fold_size + purge
            val_end_idx = min((i + 2) * fold_size, len(udates) - 1)
            if val_start_idx > val_end_idx:
                break

            train_start, train_end = udates[train_start_idx], udates[train_end_idx]
            val_start, val_end = udates[val_start_idx], udates[val_end_idx]

            train_mask = (dates_vals >= train_start) & (dates_vals <= train_end)
            val_mask = (dates_vals >= val_start) & (dates_vals <= val_end)

            train_idx = np.where(train_mask)
            val_idx = np.where(val_mask)
            if len(train_idx) == 0 or len(val_idx) == 0:
                continue

            splits.append((train_idx, val_idx))
        return splits


    def optimize_and_forecast(self, data: pd.DataFrame, target_column: str, 
                             n_trials: int = 20, cv_splits: int = 3, purge: int = 0) -> Dict[str, Any]:
        """
        Main method: Optimize XGBoost hyperparameters and generate forecasts

        Args:
            data: Panel DataFrame with engineered features
            target_column: Target variable name
            n_trials: Number of Optuna trials for optimization
            cv_splits: Number of time series CV splits

        Returns:
            Dictionary with optimization results, metrics, and forecasts
        """
        print("Starting XGBoost optimization and forecasting...")

        # Prepare bi-weekly data
        biweekly_data = self._prepare_biweekly_data(data, target_column)

        if len(biweekly_data) < 20:
            return {"error": "Insufficient bi-weekly data for optimization"}

        # Prepare features and target
        X, y, feature_names, row_index = self._prepare_features_and_target(biweekly_data, target_column)

        # Build date-based splits from the aligned index
        splits = self._date_based_splits(row_index, n_splits=cv_splits, purge=purge, expanding=True)

        print(f"Training data: {len(X)} samples, {len(feature_names)} features")

        # Create Optuna study with TPE sampler for efficiency
        study = optuna.create_study(direction='maximize',
                                    sampler=optuna.samplers.TPESampler(seed=42, n_startup_trials=5))

        def objective(trial):
            params = {}
            for name, rng in self._get_xgboost_param_space().items():
                if isinstance(rng, int):
                    params[name] = trial.suggest_int(name, rng, rng[1])
                else:
                    params[name] = trial.suggest_float(name, rng, rng[1])

            scores = []
            for train_idx, val_idx in splits:
                scaler = StandardScaler()
                X_train = scaler.fit_transform(X[train_idx])
                X_val = scaler.transform(X[val_idx])
                model = xgb.XGBRegressor(random_state=42, verbosity=0, **params)
                model.fit(X_train, y[train_idx])
                y_pred = model.predict(X_val)
                scores.append(-mean_absolute_error(y[val_idx], y_pred))  # negative MAE
            return float(np.mean(scores))

        study.optimize(objective, n_trials=n_trials, show_progress_bar=False)

        # Get best parameters
        best_params = study.best_params
        print(f"Best parameters found: {best_params}")

        # Train final model with best parameters and full data
        X_scaled = self.scaler.fit_transform(X)
        self.model = xgb.XGBRegressor(random_state=42, verbosity=0, **best_params)
        self.model.fit(X_scaled, y)
        self.best_params = best_params

        # Calculate cross-validation metrics with best parameters
        tscv = TimeSeriesSplit(n_splits=cv_splits)
        cv_scores = []

        for train_idx, val_idx in tscv.split(X):
            X_train_cv, X_val_cv = X[train_idx], X[val_idx]
            y_train_cv, y_val_cv = y[train_idx], y[val_idx]

            scaler_cv = StandardScaler()
            X_train_scaled = scaler_cv.fit_transform(X_train_cv)
            X_val_scaled = scaler_cv.transform(X_val_cv)

            model_cv = xgb.XGBRegressor(random_state=42, verbosity=0, **best_params)
            model_cv.fit(X_train_scaled, y_train_cv)
            y_pred_cv = model_cv.predict(X_val_scaled)

            metrics = self._calculate_metrics(y_val_cv, y_pred_cv)
            cv_scores.append(metrics)

        # Calculate average metrics
        avg_metrics = {}
        for metric in ['MAE', 'RMSE', 'MAPE', 'R2']:
            values = [score[metric] for score in cv_scores if not np.isinf(score[metric])]
            avg_metrics[f'avg_{metric}'] = np.mean(values) if values else float('inf')
            avg_metrics[f'std_{metric}'] = np.std(values) if values else 0

        # Generate predictions for latest data (next period forecast)
        latest_data = data.groupby(level='PERMNO').tail(1)
        X_latest, _, _ = self._prepare_features_and_target(latest_data, target_column)
        X_latest_scaled = self.scaler.transform(X_latest)
        future_predictions = self.model.predict(X_latest_scaled)

        # Get feature importance
        feature_importance = dict(zip(feature_names, self.model.feature_importances_))
        top_features = dict(sorted(feature_importance.items(), key=lambda x: x[1], reverse=True)[:10])

        return {
            'optimization_results': {
                'best_params': best_params,
                'best_cv_score': study.best_value,
                'n_trials_completed': len(study.trials),
                'optimization_time': 'Fast (< 5 minutes typical)'
            },
            'cross_validation_metrics': {
                'average_metrics': avg_metrics,
                'individual_fold_scores': cv_scores,
                'num_cv_folds': cv_splits
            },
            'forecasts': {
                'next_period_predictions': future_predictions.tolist(),
                'num_entities_forecasted': len(future_predictions),
                'training_samples_used': len(biweekly_data)
            },
            'model_insights': {
                'feature_count': len(feature_names),
                'top_feature_importance': top_features,
                'model_type': 'XGBoost Regressor'
            }
        }

    def predict_latest(self, data: pd.DataFrame, target_column: str) -> np.ndarray:
        """
        Generate predictions for the latest data using trained model

        Args:
            data: Panel DataFrame with features
            target_column: Target variable name

        Returns:
            Array of predictions for latest period
        """
        if self.model is None:
            raise ValueError("Model not trained. Run optimize_and_forecast() first.")

        latest_data = data.groupby(level='PERMNO').tail(1)
        X_latest, _, _ = self._prepare_features_and_target(latest_data, target_column)
        X_latest_scaled = self.scaler.transform(X_latest)

        return self.model.predict(X_latest_scaled)

    def get_model_summary(self) -> Dict[str, Any]:
        """Get summary of the trained model"""
        if self.best_params is None:
            return {"error": "Model not trained yet"}

        return {
            'model_type': 'XGBoost Regressor',
            'best_hyperparameters': self.best_params,
            'is_trained': self.model is not None,
            'feature_scaler': 'StandardScaler applied'
        }

#### CommunicatingForecastingAgent

In [14]:
class CommunicatingForecastingAgent(CommunicatingAgent, ForecastingAgentLLM):
    """Forecasting agent that provides performance feedback to feature engineers via communication.
    
    This agent combines XGBoost forecasting capabilities with multi-agent communication
    to create a feedback loop for feature engineering optimization. After running
    forecasts with hyperparameter optimization, it automatically sends performance
    metrics and feature importance data to feature engineering agents.
    
    The agent inherits forecasting logic from ForecastingAgentLLM and communication
    capabilities from CommunicatingAgent, enabling coordinated learning in multi-agent
    quantitative finance systems.
    
    """

    def __init__(self, model_name: str = None):
        CommunicatingAgent.__init__(self)
        ForecastingAgentLLM.__init__(self, model_name)
    
    def validate_with_feedback(self, data: pd.DataFrame, target_column: str, conversation_id: str, n_trials: int=20, cv_splits: int=3) -> Dict[str, Any]:
        """Run forecasting validation and send performance feedback to feature engineers.
        
        Performs XGBoost hyperparameter optimization and cross-validation, then automatically
        sends comprehensive performance feedback to the feature engineering agent via the
        message bus. This enables continuous improvement of feature engineering strategies.
        
        Args:
            data (pd.DataFrame): Panel data with engineered features and target variable
            target_column (str): Name of the target variable column for forecasting
            conversation_id (str): Unique identifier for tracking the conversation thread
            n_trials (int): Number of Optuna hyperparameter optimization trials
            cv_splits (int): Number of time series cross-validation splits
            
        Returns:
            Dict[str, Any]: Complete forecasting results including:
                - optimization_results: Best parameters and optimization metrics
                - cross_validation_metrics: Performance metrics across CV folds
                - forecasts: Next period predictions for all entities
                - model_insights: Feature importance and model characteristics
        """


        # Run original validation
        results = self.optimize_and_forecast(data, target_column, n_trials = n_trials, cv_splits= cv_splits)
        
        # Extract feedback information
        if 'cross_validation_metrics' in results:
            feedback_content = {
                'metrics': results['cross_validation_metrics'].get('average_metrics', {}),
                'features_used': list(data.columns),
                'feature_importance': results['model_insights'].get('top_feature_importance', {}),
                'model_used': results['model_insights'].get('model_type'),
                'best_parameters': results['optimization_results'].get('best_params'),
                'best_cv_score': results['optimization_results'].get('best_cv_score'),
                'conversation_id': conversation_id
            }

            # Send feedback to feature engineer
            self.send_message(
                receiver='feature_engineer',
                message_type=MessageType.PERFORMANCE_FEEDBACK,
                content=feedback_content,
                conversation_id=conversation_id
            )
        
        return results

#### Orchestrator

In [15]:
class CommunicatingOrchestrator(CommunicatingAgent):
    """Orchestrator that coordinates multi-agent feature engineering and forecasting workflows.
    
    This orchestrator manages communication between feature engineering and forecasting agents
    to create an iterative improvement loop for quantitative finance modeling. It sets up
    the message bus, registers agents, and coordinates the complete workflow from data
    analysis through forecasting with performance feedback.
    
    The orchestrator tracks conversation history and enables iterative model improvement
    through agent-to-agent communication and performance monitoring.
    
    Attributes:
        feature_engineer (CommunicatingFeatureEngineerLLM): Feature engineering agent
        forecaster (CommunicatingForecastingAgentLLM): Forecasting agent  
        message_bus (SimpleMessageBus): Communication hub for agent messaging
    """

    def __init__(self, feature_engineer: CommunicatingFeatureEngineerLLM, 
                 forecaster: CommunicatingForecastingAgent):
        super().__init__()
        self.feature_engineer = feature_engineer
        self.forecaster = forecaster
        self.message_bus = SimpleMessageBus()
        
        # Register agents
        self.message_bus.register_agent('orchestrator', self)
        self.message_bus.register_agent('feature_engineer', feature_engineer)
        self.message_bus.register_agent('forecaster', forecaster)
    
    def execute_communicating_workflow(self, data: pd.DataFrame, target_column: str) -> Dict[str, Any]:
        """Execute complete multi-agent workflow with feature engineering and forecasting.
        
        Coordinates a full machine learning pipeline using communicating agents:
        1. Performs comprehensive data analysis
        2. Executes feature engineering with historical performance context
        3. Runs forecasting with hyperparameter optimization
        4. Sends performance feedback for continuous improvement
        5. Tracks communication history for workflow transparency
        
        Args:
            data (pd.DataFrame): Panel data with ('PERMNO', 'date') MultiIndex
            target_column (str): Name of the target variable for forecasting
            
        Returns:
            Dict[str, Any]: Comprehensive workflow results containing:
                - conversation_id: Unique identifier for this workflow execution
                - data_analysis: Summary statistics and data characteristics
                - engineered_features: Feature engineering results and strategy used
                - forecast_results: Complete forecasting results with metrics
                - communication_log: Record of all agent-to-agent messages
        """

        conversation_id = str(uuid.uuid4())
        
        print(f"Starting communicating workflow (ID: {conversation_id[:8]}...)")
        
        # Step 1: Data Analysis
        data_analysis = {
            "shape": data.shape,
            "columns": list(data.columns),
            "target_column": target_column,
            "numeric_columns": list(data.select_dtypes(include=['number']).columns),
            "categorical_columns": list(data.select_dtypes(include=['object', 'category']).columns),
            "date_range": (data.index.get_level_values('date').min(), data.index.get_level_values('date').max()),
            "entities": len(data.index.get_level_values('PERMNO').unique())
        }
        
        # Step 2: Feature Engineering with Communication
        print("Feature Engineering Agent working...")
        engineered_data = self.feature_engineer.process_features_with_communication(
            data, target_column, data_analysis, conversation_id
        )
        
        # Step 3: Forecasting with Feedback
        print("Forecasting Agent working...")
        forecast_results = self.forecaster.validate_with_feedback(
            engineered_data["processed_data"], target_column, conversation_id, n_trials=20, cv_splits=3
        )
        
        # Step 4: Track overall performance
        if 'cross_validation_metrics' in forecast_results:
            metrics = forecast_results['cross_validation_metrics'].get('average_metrics', {})
            self.message_bus.track_performance(conversation_id, metrics)
        
        return {
            "conversation_id": conversation_id,
            "data_analysis": data_analysis,
            "engineered_features": engineered_data,
            "forecast_results": forecast_results,
            "communication_log": [
                {
                    "sender": msg.sender,
                    "receiver": msg.receiver, 
                    "type": msg.message_type.value,
                    "timestamp": msg.timestamp
                }
                for msg in self.message_bus.message_history 
                if msg.conversation_id == conversation_id
            ]
        }
    
    
    def run_iterative_improvement(self, data: pd.DataFrame, target_column: str, iterations: int = 3):
        """Execute multiple workflow iterations to demonstrate continuous improvement.
        
        Runs the complete communicating workflow multiple times to show how agents
        learn and improve performance through feedback loops. Each iteration builds
        on the performance history from previous runs.
        
        Args:
            data (pd.DataFrame): Panel data with ('PERMNO', 'date') MultiIndex
            target_column (str): Name of the target variable for forecasting
            iterations (int, optional): Number of improvement iterations. Defaults to 3.
            
        Returns:
            List[Dict[str, Any]]: List of workflow results for each iteration,
                                enabling performance comparison across iterations
        """

        results = []
        
        for i in range(iterations):
            print(f"\n=== Iteration {i+1}/{iterations} ===")
            result = self.execute_communicating_workflow(data, target_column)
            results.append(result)
            
            # Print progress
            if 'forecast_results' in result and 'validation_results' in result['forecast_results']:
                metrics = result['forecast_results']['validation_results'].get('test_metrics', {})
                print(f"Iteration {i+1} - R2: {metrics.get('R2', 'N/A'):.3f}, MAE: {metrics.get('MAE', 'N/A'):.3f}")
        
        return results


#### Run the framework

In [19]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from unittest.mock import patch

# 1) Build synthetic panel data
dates = pd.bdate_range('2022-01-03', periods=40)
permnos = [10001, 10002, 10003]
rows = []
for p in permnos:
    prc = np.linspace(10, 12, len(dates)) + np.random.normal(0, 0.2, len(dates))
    vol = np.random.randint(1e5, 2e5, len(dates))
    ret = np.random.normal(0.0005, 0.01, len(dates))
    # Bi-weekly-ish target: step function that changes every 10 biz days
    steps = (np.arange(len(dates)) // 10)
    short_qty = 1_000 + 50 * steps + np.random.randint(-5, 5, len(dates))
    for d, a, b, c, t in zip(dates, prc, vol, ret, short_qty):
        rows.append((p, d, a, b, c, t))

panel_df = pd.DataFrame(rows, columns=['PERMNO', 'date', 'PRC', 'VOL', 'RET', 'currentShortPositionQuantity'])
panel_df = panel_df.set_index(['PERMNO', 'date']).sort_index()

# 2) Construct agents/orchestrator (ensure corrected class name usage)
feature_engineer = CommunicatingFeatureEngineerLLM(model_name=None)  # LLM will be patched
forecaster = CommunicatingForecastingAgent(model_name=None)
orchestrator = CommunicatingOrchestrator(feature_engineer, forecaster)

# 3) Patch the LLM call to deterministic JSON
fake_strategy = """
{
  "lag_features": [1, 2, 7, 14],
  "rolling_features": ["mean_7", "std_14"],
  "interaction_features": [],
  "time_features": ["month", "quarter", "day_of_week"],
  "reasoning": "deterministic-mock"
}
"""

with patch.object(CommunicatingFeatureEngineerLLM, "_call_llm", return_value=fake_strategy):
    result = orchestrator.execute_communicating_workflow(
        panel_df, "currentShortPositionQuantity"
    )

# 4) Inspect key outputs (asserts in a unit test)
print(result["forecast_results"]["optimization_results"]["best_params"])
print(result["forecast_results"]["cross_validation_metrics"]["average_metrics"])
print(result["engineered_features"]["strategy"])
print(result["communication_log"][:3])


[I 2025-09-15 21:05:13,385] A new study created in memory with name: no-name-704aea05-1ef6-4753-9ee1-14da67a177eb


Starting communicating workflow (ID: 110d53e6...)
Feature Engineering Agent working...
Forecasting Agent working...
Starting XGBoost optimization and forecasting...
Training data: 105 samples, 30 features
Running 20 optimization trials...


[I 2025-09-15 21:05:13,725] Trial 0 finished with value: -13.255250295003256 and parameters: {'n_estimators': 106, 'max_depth': 6, 'learning_rate': 0.15979909127171077, 'subsample': 0.8795975452591109, 'colsample_bytree': 0.7468055921327309, 'reg_alpha': 0, 'reg_lambda': 0, 'min_child_weight': 1}. Best is trial 0 with value: -13.255250295003256.
[I 2025-09-15 21:05:14,235] Trial 1 finished with value: -13.090790589650473 and parameters: {'n_estimators': 180, 'max_depth': 5, 'learning_rate': 0.15621088666940683, 'subsample': 0.7061753482887407, 'colsample_bytree': 0.9909729556485983, 'reg_alpha': 0, 'reg_lambda': 1, 'min_child_weight': 2}. Best is trial 1 with value: -13.090790589650473.
[I 2025-09-15 21:05:14,395] Trial 2 finished with value: -12.013821919759115 and parameters: {'n_estimators': 77, 'max_depth': 3, 'learning_rate': 0.09563633644393067, 'subsample': 0.8574269294896714, 'colsample_bytree': 0.8295835055926347, 'reg_alpha': 0, 'reg_lambda': 0, 'min_child_weight': 4}. Best i

Best parameters found: {'n_estimators': 137, 'max_depth': 5, 'learning_rate': 0.19753690038993088, 'subsample': 0.9625659107838825, 'colsample_bytree': 0.8934598467451235, 'reg_alpha': 0, 'reg_lambda': 1, 'min_child_weight': 3}
Tracked performance for conversation 110d53e6...
{'n_estimators': 137, 'max_depth': 5, 'learning_rate': 0.19753690038993088, 'subsample': 0.9625659107838825, 'colsample_bytree': 0.8934598467451235, 'reg_alpha': 0, 'reg_lambda': 1, 'min_child_weight': 3}
{'avg_MAE': np.float64(10.702038764953613), 'std_MAE': np.float64(4.53178480287444), 'avg_RMSE': np.float64(17.5551606934487), 'std_RMSE': np.float64(4.643364322744539), 'avg_MAPE': np.float64(0.9703525026481841), 'std_MAPE': np.float64(0.4145505111951835), 'avg_R2': np.float64(0.8755619938354805), 'std_R2': np.float64(0.0738505233524686)}
{'lag_features': [1, 2, 7, 14], 'rolling_features': ['mean_7', 'std_14'], 'interaction_features': [], 'time_features': ['month', 'quarter', 'day_of_week'], 'reasoning': 'determ