# Setup

In [1]:
from imports import *
from utils import *
from agents import UrnAgent, QLearningAgent, TDLearningAgent
from environment import NetMultiAgentEnv, TempNetMultiAgentEnv
from simulation_function import simulation_function, temp_simulation_function

In [2]:
def analyze_results(search_results, top_n=5, sort_by='mean_reward'):
    """
    Tabulate parameter-search results and print the best configurations.

    Args:
        search_results: Iterable of result dicts. Each must contain
            ``'params'`` (a dict of hyperparameter name -> value),
            ``'mean_reward'`` and ``'std_reward'``. ``'mean_final_nmi'`` is
            optional and is recorded as ``None`` when absent.
        top_n: Number of best rows to print and return.
        sort_by: Column (or columns) to rank by, in descending order.

    Returns:
        ``(df, top_df)``: the full results table and its ``top_n`` best rows.
        Both are empty DataFrames when ``search_results`` is empty.

    Raises:
        KeyError: propagated from ``DataFrame.sort_values`` when ``sort_by``
            is not a column of the results table.
    """
    # Flatten each result: hyperparameters become columns next to the metrics.
    records = [
        {
            **res['params'],
            'mean_reward': res['mean_reward'],
            'std_reward': res['std_reward'],
            'mean_final_nmi': res.get('mean_final_nmi', None),
        }
        for res in search_results
    ]

    # An empty table has no columns, so sorting by `sort_by` would raise.
    if not records:
        print("No search results to analyze.")
        return pd.DataFrame(), pd.DataFrame()

    df = pd.DataFrame(records)

    # Sort and show top-N
    top_df = df.sort_values(by=sort_by, ascending=False).head(top_n)
    print(f"🔝 Top {top_n} Configurations by {sort_by}:\n")
    print(top_df.to_string(index=False))

    return df, top_df

In [3]:
def plot_param_sensitivity(df, reward_col='mean_reward', error_col='std_reward', nmi_col='mean_final_nmi'):
    """
    Plot mean reward against each hyperparameter, one subplot per parameter.

    Every column of ``df`` other than ``reward_col``, ``error_col`` and
    ``nmi_col`` is treated as a hyperparameter. For each one, rows are grouped
    by parameter value and averaged; the averaged ``error_col`` is drawn as
    error bars and, when ``nmi_col`` is present, each point is annotated with
    its averaged NMI.

    Args:
        df: Results table with ``reward_col`` and ``error_col`` columns plus
            one column per hyperparameter.
        reward_col: Column plotted on the y-axis.
        error_col: Column used for the error bars.
        nmi_col: Optional column used for point annotations; skipped when the
            column is missing.

    Returns:
        None. The figure is displayed with ``plt.show()``. If ``df`` has no
        hyperparameter columns a message is printed and nothing is drawn.
    """
    metric_cols = (reward_col, error_col, nmi_col)
    param_cols = [col for col in df.columns if col not in metric_cols]

    # plt.subplots cannot build a zero-row grid, so stop before creating a figure.
    if not param_cols:
        print("No parameter columns to plot.")
        return

    # Imported here (as in the original) and only once there is something to draw.
    import matplotlib.pyplot as plt

    # Aggregate NMI only when the column exists, matching the annotation guard below.
    has_nmi = nmi_col in df.columns
    agg_spec = {reward_col: 'mean', error_col: 'mean'}
    if has_nmi:
        agg_spec[nmi_col] = 'mean'

    num_params = len(param_cols)
    # squeeze=False always returns a 2-D array of axes, even for a single subplot.
    _, axes = plt.subplots(num_params, 1, figsize=(8, 4 * num_params), squeeze=False)

    for ax, param in zip(axes[:, 0], param_cols):
        grouped = df.groupby(param).agg(agg_spec).reset_index()

        ax.errorbar(grouped[param], grouped[reward_col], yerr=grouped[error_col], fmt='o-', capsize=5, label='Mean Reward')
        ax.set_xlabel(param)
        ax.set_ylabel('Mean Reward')
        ax.set_title(f'Effect of {param} on Reward')

        if has_nmi:
            for _, row in grouped.iterrows():
                # Skip points with no NMI value instead of labelling them "NMI=nan".
                if pd.notna(row[nmi_col]):
                    ax.annotate(f"NMI={row[nmi_col]:.2f}", (row[param], row[reward_col]), fontsize=8)

        ax.grid(True)

    plt.tight_layout()
    plt.show()

def plot_pairwise_performance(results):
    """
    Show a seaborn pair plot of the sampled hyperparameters, reward and NMI.

    Args:
        results: Iterable of result dicts, each with ``'params'`` (dict of
            hyperparameter name -> value), ``'mean_reward'`` and
            ``'mean_final_nmi'``.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # One flat row per configuration: hyperparameters first, then the two metrics.
    rows = []
    for entry in results:
        row = dict(entry['params'])
        row['mean_reward'] = entry['mean_reward']
        row['mean_final_nmi'] = entry['mean_final_nmi']
        rows.append(row)
    table = pd.DataFrame(rows)

    scatter_style = {'alpha': 0.7}
    sns.pairplot(
        table,
        diag_kind='kde',
        corner=True,
        plot_kws=scatter_style,
        hue=None,
    )
    # Lift the title slightly above the top of the grid.
    plt.suptitle('Pairwise Parameter Exploration (Reward & NMI)', y=1.02)
    plt.show()

# QLearning Agent

In [4]:
def parameter_search(
    param_ranges,
    n_simulations=20,
    n_trials=10,
    n_episodes=5000,
    base_seed=42
):
    """
    Random hyperparameter search for two Q-learning agents on a two-node graph.

    For each of ``n_simulations`` parameter sets (every value drawn uniformly
    from its ``(low, high)`` range) this runs ``n_trials`` separately seeded
    simulations of ``n_episodes`` episodes each, then summarises the reward and
    the normalized mutual information over the final tenth of the episodes.

    Args:
        param_ranges: Mapping of hyperparameter name -> ``(low, high)``. Must
            provide ``'exploration_rate'``, ``'exploration_decay'`` and
            ``'min_exploration_rate'``.
        n_simulations: Number of parameter sets to sample.
        n_trials: Number of simulations run per parameter set.
        n_episodes: Episodes per simulation.
        base_seed: Seed offset; simulation ``i`` uses ``base_seed + i``.

    Returns:
        List of result dicts with keys ``'params'``, ``'mean_reward'``,
        ``'std_reward'`` and ``'mean_final_nmi'``, sorted by descending
        ``'mean_reward'``.

    Side effects:
        Reseeds the global ``random`` and ``numpy.random`` generators and
        prints one line per finished simulation. Finished simulations are
        written to ``q_search_results_<unix-time>.csv`` in the current working
        directory, also when the run is cut short by an exception or a
        ``KeyboardInterrupt`` (which still propagates). No file is written
        when no simulation finished.
    """
    # Fixed experimental setup shared by every trial.
    n_agents = 2
    n_features = 2
    n_signaling_actions = 2
    n_final_actions = 4

    # Unary minus binds tighter than //, so this is (-n_episodes) // 10:
    # slicing from it keeps the last ceil(n_episodes / 10) entries.
    tail_start = -n_episodes // 10
    min_nmi_length = n_episodes // 10

    results = []

    try:
        for sim_id in range(n_simulations):
            seed = base_seed + sim_id
            np.random.seed(seed)
            random.seed(seed)

            # Sample one parameter set from ranges
            params = {
                k: random.uniform(*v) for k, v in param_ranges.items()
            }

            trial_rewards = []
            trial_nmis = []

            for trial in range(n_trials):
                # Trial 0 reuses `seed`; later trials are offset by multiples of 1000.
                trial_seed = seed + trial * 1000
                np.random.seed(trial_seed)
                random.seed(trial_seed)

                # Setup graph: two nodes with an edge in each direction
                G = nx.DiGraph()
                G.add_nodes_from([0, 1])
                G.add_edges_from([(0, 1), (1, 0)])

                # Setup game and env
                agents_observed_variables = {0: [0], 1: [1]}
                game = {i: create_random_canonical_game(n_features, n_final_actions, n=1, m=0)
                        for i in range(n_agents)}

                env = NetMultiAgentEnv(
                    n_agents=n_agents,
                    n_features=n_features,
                    n_signaling_actions=n_signaling_actions,
                    n_final_actions=n_final_actions,
                    full_information=False,
                    game_dicts=game,
                    observed_variables=agents_observed_variables,
                    agent_type=QLearningAgent,
                    initialize=False,
                    graph=G
                )

                # Override agents manually with hyperparameters
                env.agents = [
                    QLearningAgent(
                        n_signaling_actions=n_signaling_actions,
                        n_final_actions=n_final_actions,
                        exploration_rate=params['exploration_rate'],
                        exploration_decay=params['exploration_decay'],
                        min_exploration_rate=params['min_exploration_rate'],
                        initialize=False
                    ) for _ in range(n_agents)
                ]

                _, rewards_history, signal_information_history, _, _ = simulation_function(
                    n_agents=n_agents,
                    n_features=n_features,
                    n_signaling_actions=n_signaling_actions,
                    n_final_actions=n_final_actions,
                    n_episodes=n_episodes,
                    with_signals=True,
                    plot=False,
                    env=env,
                    verbose=False
                )

                # Measure average reward in last 10% of episodes
                final_rewards = [
                    np.mean(rewards[tail_start:]) for rewards in rewards_history
                ]
                trial_rewards.append(np.mean(final_rewards))

                # Measure final normalized mutual information
                # (histories shorter than a tenth of the run count as 0.0)
                final_nmi = [
                    np.mean(agent_nmi[tail_start:]) if len(agent_nmi) >= min_nmi_length else 0.0
                    for agent_nmi in signal_information_history
                ]
                trial_nmis.append(np.mean(final_nmi))

            result = {
                'params': params,
                'mean_reward': np.mean(trial_rewards),
                'std_reward': np.std(trial_rewards),
                'mean_final_nmi': np.mean(trial_nmis)
            }
            results.append(result)
            print(f"Simulation {sim_id + 1}: {params} => Mean Reward: {result['mean_reward']:.3f}, Mean NMI: {result['mean_final_nmi']:.3f}, Std Reward: {result['std_reward']:.3f}")
    finally:
        # Save to CSV in `finally` so an interrupted run keeps its finished simulations.
        if results:
            df = pd.DataFrame([
                {**r['params'], 'mean_reward': r['mean_reward'], 'std_reward': r['std_reward'], 'mean_final_nmi': r['mean_final_nmi']}
                for r in results
            ])
            # "q_" prefix: this is the Q-learning search, not the TD one.
            save_path = f"q_search_results_{int(time.time())}.csv"
            df.to_csv(save_path, index=False)
            print(f"Results saved to {save_path}")
        else:
            print("No completed simulations; nothing saved.")

    return sorted(results, key=lambda r: -r['mean_reward'])


# Example usage: (low, high) sampling bounds for each Q-learning hyperparameter
param_ranges = dict(
    exploration_rate=(0.5, 1.0),
    exploration_decay=(0.95, 1.0),
    min_exploration_rate=(0.0001, 0.05),
)

# 100 sampled configurations, each evaluated over 100 trials of 5000 episodes
q_search_results = parameter_search(
    param_ranges,
    n_simulations=100,
    n_trials=100,
    n_episodes=5000,
)

# parameter_search returns the results sorted by descending mean reward
for r in q_search_results:
    print(r)

Simulation 1: {'exploration_rate': 0.8197133992289418, 'exploration_decay': 0.9512505377611333, 'min_exploration_rate': 0.01382396298661905} => Mean Reward: 0.716, Mean NMI: 0.410, Std Reward: 0.161
Simulation 2: {'exploration_rate': 0.51927591966869, 'exploration_decay': 0.9848112161318526, 'min_exploration_rate': 0.007282267747628515} => Mean Reward: 0.723, Mean NMI: 0.442, Std Reward: 0.172
Simulation 3: {'exploration_rate': 0.7042679396272469, 'exploration_decay': 0.9770986004552298, 'min_exploration_rate': 0.043115627162315734} => Mean Reward: 0.723, Mean NMI: 0.392, Std Reward: 0.159
Simulation 4: {'exploration_rate': 0.6359377071920453, 'exploration_decay': 0.9744014103925454, 'min_exploration_rate': 0.004185779184321212} => Mean Reward: 0.717, Mean NMI: 0.502, Std Reward: 0.175
Simulation 5: {'exploration_rate': 0.944134038226244, 'exploration_decay': 0.9699997197985506, 'min_exploration_rate': 0.02947214712043418} => Mean Reward: 0.706, Mean NMI: 0.366, Std Reward: 0.173
Simul

KeyboardInterrupt: 

In [None]:
# Pair plot of the sampled Q-learning hyperparameters against reward and NMI
plot_pairwise_performance(results=q_search_results)

In [None]:
# Tabulate every sampled Q-learning configuration and print the 15 best.
full_df, top_configs = analyze_results(q_search_results, top_n=15)
# analyze_results already returns the top rows sorted by descending mean_reward
# (its default sort key), so reuse them instead of sorting full_df a second time.
top_k_df = top_configs
plot_param_sensitivity(top_k_df)

# TD Agent

In [None]:
def parameter_search(
    param_ranges,
    n_simulations=200,
    n_trials=100,
    n_episodes=5000,
    base_seed=42
):
    """
    Random hyperparameter search for two TD-learning agents on a two-node graph.

    For each of ``n_simulations`` parameter sets (every value drawn uniformly
    from its ``(low, high)`` range) this runs ``n_trials`` separately seeded
    simulations of ``n_episodes`` episodes each, then summarises the reward and
    the normalized mutual information over the final tenth of the episodes.

    Args:
        param_ranges: Mapping of hyperparameter name -> ``(low, high)``. Must
            provide ``'learning_rate'``, ``'exploration_rate'``,
            ``'exploration_decay'`` and ``'min_exploration_rate'``.
        n_simulations: Number of parameter sets to sample.
        n_trials: Number of simulations run per parameter set.
        n_episodes: Episodes per simulation.
        base_seed: Seed offset; simulation ``i`` uses ``base_seed + i``.

    Returns:
        List of result dicts with keys ``'params'``, ``'mean_reward'``,
        ``'std_reward'`` and ``'mean_final_nmi'``, sorted by descending
        ``'mean_reward'``.

    Side effects:
        Reseeds the global ``random`` and ``numpy.random`` generators and
        prints one line per finished simulation. Finished simulations are
        written to ``td_search_results_<unix-time>.csv`` in the current working
        directory, also when the run is cut short by an exception or a
        ``KeyboardInterrupt`` (which still propagates). No file is written
        when no simulation finished.
    """
    # Fixed experimental setup shared by every trial.
    n_agents = 2
    n_features = 2
    n_signaling_actions = 2
    n_final_actions = 4

    # Unary minus binds tighter than //, so this is (-n_episodes) // 10:
    # slicing from it keeps the last ceil(n_episodes / 10) entries.
    tail_start = -n_episodes // 10
    min_nmi_length = n_episodes // 10

    results = []

    try:
        for sim_id in range(n_simulations):
            seed = base_seed + sim_id
            np.random.seed(seed)
            random.seed(seed)

            # Sample one parameter set from ranges
            params = {
                k: random.uniform(*v) for k, v in param_ranges.items()
            }

            trial_rewards = []
            trial_nmis = []

            for trial in range(n_trials):
                # Trial 0 reuses `seed`; later trials are offset by multiples of 1000.
                trial_seed = seed + trial * 1000
                np.random.seed(trial_seed)
                random.seed(trial_seed)

                # Setup graph: two nodes with an edge in each direction
                G = nx.DiGraph()
                G.add_nodes_from([0, 1])
                G.add_edges_from([(0, 1), (1, 0)])

                # Setup game and env
                agents_observed_variables = {0: [0], 1: [1]}
                game = {i: create_random_canonical_game(n_features, n_final_actions, n=1, m=0)
                        for i in range(n_agents)}

                env = TempNetMultiAgentEnv(
                    n_agents=n_agents,
                    n_features=n_features,
                    n_signaling_actions=n_signaling_actions,
                    n_final_actions=n_final_actions,
                    full_information=False,
                    game_dicts=game,
                    observed_variables=agents_observed_variables,
                    agent_type=TDLearningAgent,
                    graph=G
                )

                # Override agents manually with hyperparameters
                env.agents = [
                    TDLearningAgent(
                        n_actions=env.max_actions,
                        learning_rate=params['learning_rate'],
                        exploration_rate=params['exploration_rate'],
                        exploration_decay=params['exploration_decay'],
                        min_exploration_rate=params['min_exploration_rate']
                    ) for _ in range(n_agents)
                ]

                _, rewards_history, signal_information_history, _, _ = temp_simulation_function(
                    n_agents=n_agents,
                    n_features=n_features,
                    n_signaling_actions=n_signaling_actions,
                    n_final_actions=n_final_actions,
                    n_episodes=n_episodes,
                    with_signals=True,
                    plot=False,
                    env=env,
                    verbose=False
                )

                # Measure average reward in last 10% of episodes
                final_rewards = [
                    np.mean(rewards[tail_start:]) for rewards in rewards_history
                ]
                trial_rewards.append(np.mean(final_rewards))

                # Measure final normalized mutual information
                # (histories shorter than a tenth of the run count as 0.0)
                final_nmi = [
                    np.mean(agent_nmi[tail_start:]) if len(agent_nmi) >= min_nmi_length else 0.0
                    for agent_nmi in signal_information_history
                ]
                trial_nmis.append(np.mean(final_nmi))

            result = {
                'params': params,
                'mean_reward': np.mean(trial_rewards),
                'std_reward': np.std(trial_rewards),
                'mean_final_nmi': np.mean(trial_nmis)
            }
            results.append(result)
            print(f"Simulation {sim_id + 1}: {params} => Mean Reward: {result['mean_reward']:.3f}, Mean NMI: {result['mean_final_nmi']:.3f}, Std Reward: {result['std_reward']:.3f}")
    finally:
        # Save to CSV in `finally` so an interrupted run keeps its finished simulations.
        if results:
            df = pd.DataFrame([
                {**r['params'], 'mean_reward': r['mean_reward'], 'std_reward': r['std_reward'], 'mean_final_nmi': r['mean_final_nmi']}
                for r in results
            ])
            save_path = f"td_search_results_{int(time.time())}.csv"
            df.to_csv(save_path, index=False)
            print(f"Results saved to {save_path}")
        else:
            print("No completed simulations; nothing saved.")

    return sorted(results, key=lambda r: -r['mean_reward'])

def plot_pairwise_performance(results):
    """
    Draw a seaborn pair plot over hyperparameters, mean reward and final NMI.

    This re-definition shadows the earlier ``plot_pairwise_performance`` in
    this notebook and behaves the same way.

    Args:
        results: Iterable of result dicts, each with ``'params'`` (dict of
            hyperparameter name -> value), ``'mean_reward'`` and
            ``'mean_final_nmi'``.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    def _flatten(entry):
        # Hyperparameters become columns, followed by the two metrics.
        flat = dict(entry['params'])
        flat.update(mean_reward=entry['mean_reward'], mean_final_nmi=entry['mean_final_nmi'])
        return flat

    table = pd.DataFrame([_flatten(entry) for entry in results])

    pairplot_options = dict(
        diag_kind='kde',
        corner=True,
        plot_kws={'alpha': 0.7},
        hue=None,
    )
    sns.pairplot(table, **pairplot_options)
    # Lift the title slightly above the top of the grid.
    plt.suptitle('Pairwise Parameter Exploration (Reward & NMI)', y=1.02)
    plt.show()

# Example usage: (low, high) sampling bounds for each TD-learning hyperparameter
param_ranges = dict(
    learning_rate=(0.01, 0.2),
    exploration_rate=(0.25, 1.0),
    exploration_decay=(0.9, 1.0),
    min_exploration_rate=(0.0, 0.05),
)

# 200 sampled configurations, each evaluated over 100 trials of 5000 episodes
td_search_results = parameter_search(
    param_ranges,
    n_simulations=200,
    n_trials=100,
    n_episodes=5000,
)

# parameter_search returns the results sorted by descending mean reward
for r in td_search_results:
    print(r)

In [None]:
# Pair plot of the sampled TD-learning hyperparameters against reward and NMI
plot_pairwise_performance(results=td_search_results)


In [None]:
# Tabulate every sampled TD-learning configuration and print the 15 best.
full_df, top_configs = analyze_results(td_search_results, top_n=15)
# analyze_results already returns the top rows sorted by descending mean_reward
# (its default sort key), so reuse them instead of sorting full_df a second time.
top_k_df = top_configs
plot_param_sensitivity(top_k_df)
