# Setup

In [1]:
from imports import *
from utils import *
from agents import UrnAgent, QLearningAgent, TDLearningAgent
from environment import NetMultiAgentEnv, TempNetMultiAgentEnv
from simulation_function import simulation_function, temp_simulation_function

# QLearning Agent

In [None]:
def parameter_search(
    param_grid,
    n_trials=3,
    n_episodes=10000,
    base_seed=42
):
    results = []

    # Generate all combinations of hyperparameters
    keys = list(param_grid.keys())
    combos = list(itertools.product(*[param_grid[k] for k in keys]))

    for combo in combos:
        params = dict(zip(keys, combo))
        avg_rewards = []
        avg_final_nmi = []

        for trial in range(n_trials):
            seed = base_seed + trial
            np.random.seed(seed)

            # Setup graph
            G = nx.DiGraph()
            G.add_nodes_from([0, 1])
            G.add_edges_from([(0, 1), (1, 0)])

            # Setup game and env
            n_agents = 2
            n_features = 2
            n_signaling_actions = 2
            n_final_actions = 4
            agents_observed_variables = {0: [0], 1: [1]}
            game = {i: create_random_canonical_game(n_features, n_final_actions, n=1, m=0)
                    for i in range(n_agents)}

            env = NetMultiAgentEnv(
                n_agents=n_agents,
                n_features=n_features,
                n_signaling_actions=n_signaling_actions,
                n_final_actions=n_final_actions,
                full_information=False,
                game_dicts=game,
                observed_variables=agents_observed_variables,
                agent_type=QLearningAgent,
                initialize=False,
                graph=G
            )

            # Override agents manually with hyperparameters
            env.agents = [
                QLearningAgent(
                    n_signaling_actions=n_signaling_actions,
                    n_final_actions=n_final_actions,
                    exploration_rate=params['exploration_rate'],
                    exploration_decay=params['exploration_decay'],
                    min_exploration_rate=params['min_exploration_rate'],
                    initialize=False
                ) for _ in range(n_agents)
            ]

            _, rewards_history, signal_information_history, _, _ = simulation_function(
                n_agents=n_agents,
                n_features=n_features,
                n_signaling_actions=n_signaling_actions,
                n_final_actions=n_final_actions,
                n_episodes=n_episodes,
                with_signals=True,
                plot=False,
                env=env,
                verbose=False
            )

            # Measure average reward in last 10% of episodes
            final_rewards = [
                np.mean(rewards[-n_episodes // 10:]) for rewards in rewards_history
            ]
            avg_rewards.append(np.mean(final_rewards))

            # Measure final normalized mutual information
            final_nmi = [
                np.mean(agent_nmi[-n_episodes // 10:]) if len(agent_nmi) >= n_episodes // 10 else 0.0
                for agent_nmi in signal_information_history
            ]
            avg_final_nmi.append(np.mean(final_nmi))

        result = {
            'params': params,
            'mean_reward': np.mean(avg_rewards),
            'std_reward': np.std(avg_rewards),
            'mean_final_nmi': np.mean(avg_final_nmi)
        }
        results.append(result)
        print(f"Tested: {params} => Mean Final Reward: {result['mean_reward']:.3f}, Mean Final NMI: {result['mean_final_nmi']:.3f}")

    return sorted(results, key=lambda r: -r['mean_reward'])


# Example usage
param_grid = {
    'exploration_rate': [1.0, 0.5],
    'exploration_decay': [0.995, 0.90, 0.9],
    'min_exploration_rate': [0.001, 0.0001]
}

search_results = parameter_search(param_grid, n_trials=100, n_episodes=5000)

for r in search_results:
    print(r)


Tested: {'exploration_rate': 1.0, 'exploration_decay': 0.995, 'min_exploration_rate': 0.001} => Mean Final Reward: 0.999, Mean Final NMI: 0.837
Tested: {'exploration_rate': 1.0, 'exploration_decay': 0.995, 'min_exploration_rate': 0.01} => Mean Final Reward: 0.745, Mean Final NMI: 0.374
Tested: {'exploration_rate': 1.0, 'exploration_decay': 0.99, 'min_exploration_rate': 0.001} => Mean Final Reward: 0.754, Mean Final NMI: 0.446
Tested: {'exploration_rate': 1.0, 'exploration_decay': 0.99, 'min_exploration_rate': 0.01} => Mean Final Reward: 0.986, Mean Final NMI: 0.888
{'params': {'exploration_rate': 1.0, 'exploration_decay': 0.995, 'min_exploration_rate': 0.001}, 'mean_reward': 0.999, 'std_reward': 0.0, 'mean_final_nmi': 0.8365360108951392}
{'params': {'exploration_rate': 1.0, 'exploration_decay': 0.99, 'min_exploration_rate': 0.01}, 'mean_reward': 0.9855, 'std_reward': 0.0015000000000000013, 'mean_final_nmi': 0.8876236179061014}
{'params': {'exploration_rate': 1.0, 'exploration_decay': 0

# TD Agent

In [7]:
def parameter_search(
    param_grid,
    n_trials=3,
    n_episodes=10000,
    base_seed=42
):
    results = []

    # Generate all combinations of hyperparameters
    keys = list(param_grid.keys())
    combos = list(itertools.product(*[param_grid[k] for k in keys]))

    for combo in combos:
        params = dict(zip(keys, combo))
        avg_rewards = []
        avg_final_nmi = []

        for trial in range(n_trials):
            seed = base_seed + trial
            np.random.seed(seed)

            # Setup graph
            G = nx.DiGraph()
            G.add_nodes_from([0, 1])
            G.add_edges_from([(0, 1), (1, 0)])

            # Setup game and env
            n_agents = 2
            n_features = 2
            n_signaling_actions = 2
            n_final_actions = 4
            agents_observed_variables = {0: [0], 1: [1]}
            game = {i: create_random_canonical_game(n_features, n_final_actions, n=1, m=0)
                    for i in range(n_agents)}

            env = TempNetMultiAgentEnv(
                n_agents=n_agents,
                n_features=n_features,
                n_signaling_actions=n_signaling_actions,
                n_final_actions=n_final_actions,
                full_information=False,
                game_dicts=game,
                observed_variables=agents_observed_variables,
                agent_type=TDLearningAgent,
                graph=G
            )

            # Override agents manually with hyperparameters
            env.agents = [
                TDLearningAgent(
                    n_actions=env.max_actions,
                    learning_rate=params['learning_rate'],
                    exploration_rate=params['exploration_rate'],
                    exploration_decay=params['exploration_decay'],
                    min_exploration_rate=params['min_exploration_rate']
                ) for _ in range(n_agents)
            ]

            _, rewards_history, signal_information_history, _, _ = temp_simulation_function(
                n_agents=n_agents,
                n_features=n_features,
                n_signaling_actions=n_signaling_actions,
                n_final_actions=n_final_actions,
                n_episodes=n_episodes,
                with_signals=True,
                plot=False,
                env=env,
                verbose=False
            )

            # Measure average reward in last 10% of episodes
            final_rewards = [
                np.mean(rewards[-n_episodes // 10:]) for rewards in rewards_history
            ]
            avg_rewards.append(np.mean(final_rewards))

            # Measure final normalized mutual information
            final_nmi = [
                np.mean(agent_nmi[-n_episodes // 10:]) if len(agent_nmi) >= n_episodes // 10 else 0.0
                for agent_nmi in signal_information_history
            ]
            avg_final_nmi.append(np.mean(final_nmi))

        result = {
            'params': params,
            'mean_reward': np.mean(avg_rewards),
            'std_reward': np.std(avg_rewards),
            'mean_final_nmi': np.mean(avg_final_nmi)
        }
        results.append(result)
        print(f"Tested: {params} => Mean Final Reward: {result['mean_reward']:.3f}, Mean Final NMI: {result['mean_final_nmi']:.3f}")

    return sorted(results, key=lambda r: -r['mean_reward'])


# Example usage
param_grid = {
    'learning_rate': [0.1, 0.05, 0.01],
    'exploration_rate': [1.0, 0.5],
    'exploration_decay': [0.995, 0.90, 0.9],
    'min_exploration_rate': [0.001, 0.0001]
}

search_results = parameter_search(param_grid, n_trials=100, n_episodes=5000)

for r in search_results:
    print(r)


Tested: {'learning_rate': 0.1, 'exploration_rate': 1.0, 'exploration_decay': 0.995, 'min_exploration_rate': 0.001} => Mean Final Reward: 0.841, Mean Final NMI: 0.548
Tested: {'learning_rate': 0.1, 'exploration_rate': 1.0, 'exploration_decay': 0.995, 'min_exploration_rate': 0.0001} => Mean Final Reward: 0.754, Mean Final NMI: 0.327
Tested: {'learning_rate': 0.1, 'exploration_rate': 1.0, 'exploration_decay': 0.9, 'min_exploration_rate': 0.001} => Mean Final Reward: 0.432, Mean Final NMI: 0.180
Tested: {'learning_rate': 0.1, 'exploration_rate': 1.0, 'exploration_decay': 0.9, 'min_exploration_rate': 0.0001} => Mean Final Reward: 0.372, Mean Final NMI: 0.198
Tested: {'learning_rate': 0.1, 'exploration_rate': 1.0, 'exploration_decay': 0.9, 'min_exploration_rate': 0.001} => Mean Final Reward: 0.452, Mean Final NMI: 0.327
Tested: {'learning_rate': 0.1, 'exploration_rate': 1.0, 'exploration_decay': 0.9, 'min_exploration_rate': 0.0001} => Mean Final Reward: 0.358, Mean Final NMI: 0.273
Tested: {

KeyboardInterrupt: 