This notebook gives an example of how to train A2C, TD3, and PPO agents on the AdCraft environment, using the agent configurations provided in experiment_utils/agent_configs.py and the environment configurations provided in experiment_utils/experiment_configs.py.

#### Load the necessary libraries

In [None]:
import csv
import json
import os
import re
import shutil
from datetime import datetime
from pathlib import Path

import numpy as np
import yaml
from scipy.stats import sem

from ray.tune.registry import register_env
from ray.rllib.algorithms.algorithm import Algorithm
from ray.rllib.policy.sample_batch import SampleBatch
from ray.rllib.policy.torch_policy_template import build_torch_policy
from ray.rllib.utils.torch_utils import apply_grad_clipping, sequence_mask
from ray.rllib.evaluation.episode import Episode
from ray.rllib.evaluation.postprocessing import (
    compute_gae_for_sample_batch,
    Postprocessing)
from ray.rllib.models.modelv2 import ModelV2
from ray.rllib.models.torch.torch_action_dist import TorchDistributionWrapper
from ray.rllib.policy.sample_batch import SampleBatch
from ray.rllib.policy.torch_mixins import (
    EntropyCoeffSchedule,
    LearningRateSchedule,
    ValueNetworkMixin)
from ray.rllib.utils.typing import (
    LocalOptimizer,
    SpaceStruct,
    TensorStructType,
    TensorType)
from ray.rllib.policy.torch_policy_v2 import TorchPolicyV2
from ray.rllib.utils.annotations import override
from ray.rllib.utils.framework import try_import_torch
from ray.rllib.utils.numpy import convert_to_numpy
from ray.rllib.utils.typing import AgentID, TensorType
from ray.rllib.algorithms.ppo import PPO, PPOConfig
from ray.rllib.algorithms.a2c import A2C, A2CConfig
# NOTE(review): RLlib's package directory is lowercase (`ray.rllib.algorithms.td3`);
# this spelling likely fails on import -- confirm against the installed Ray version.
from ray.rllib.algorithms.TD3 import TD3, TD3Config

from adcraft.experiment_utils import experiment_configs
import adcraft.gymnasium_kw_env as kw_sim
from adcraft.experiment_utils.experiment_quantiles import (
    make_experiment_quantiles, load_experiment_quantiles)
from adcraft.experiment_utils.experiment_metrics import (
    get_implicit_kw_bid_cpc_impressions, get_max_expected_bid_profits, compute_AKNCP, compute_NCP)
from adcraft.wrappers.flat_array import FlatArrayWrapper
from adcraft.experiment_utils.experiment_configs import (dense_env_config, semi_dense_env_config,
very_sparse_env_config, sparse_env_config, non_stationary_sparse_env_config,
non_stationary_dense_env_config)

from adcraft.experiment_utils.agent_configs import sem_ppo_config
from adcraft.experiment_utils.agent_configs import sem_a2c_config
from adcraft.experiment_utils.agent_configs import sem_td3_config

from adcraft.experiment_utils.experiment_configs import NUM_KEYWORDS
from adcraft.experiment_utils.experiment_configs import MAX_DAYS
from adcraft.experiment_utils.experiment_configs import experiment_mode

#### Experiment configuration

In [None]:
# Values imported from experiment_configs. The two self-assignments are
# no-ops; `experiment_type` is the name the rest of the notebook uses for
# the imported `experiment_mode`.
NUM_KEYWORDS = NUM_KEYWORDS
MAX_DAYS = MAX_DAYS
experiment_type = experiment_mode
# Number of agent.train() calls, and how many training iterations pass
# between two evaluation rounds.
NUM_ITERATIONS = 500
evaluation_interval = 5

# Agent to build: 'A2C' or 'TD3'; any other value selects the PPO config.
model = 'A2C'
# When is_from_checkpoint is True the agent is loaded from checkpoint_path
# instead of being built from a config; is_new_config additionally applies
# new_model_config to the loaded agent.
is_from_checkpoint = False
is_new_config = False
new_model_config = None
checkpoint_path = ""
# Number of evaluation rounds in a full run (not referenced elsewhere in the
# visible notebook).
bound = NUM_ITERATIONS // evaluation_interval
# Only the first entry is read, to name the checkpoint directory.
updater_params = [0.3]

#### Environment Configuration

In [None]:
# Settings for each named experiment type, stored as
# (env_config, volume, cvr).
_SCENARIO_SETTINGS = {
    "semi_dense": (semi_dense_env_config, 64, 0.8),
    "sparse": (sparse_env_config, 64, 0.1),
    "very_sparse": (very_sparse_env_config, 16, 0.1),
    "non_stationary_dense": (non_stationary_dense_env_config, 128, 0.8),
    "non_stationary_sparse": (non_stationary_sparse_env_config, 64, 0.1),
}

# Any experiment type that is not listed above uses the dense settings.
env_config, volume, cvr = _SCENARIO_SETTINGS.get(
    experiment_type, (dense_env_config, 128, 0.8)
)
mean_volumes = [volume]
conversion_rates = [cvr]

#### Experiment Helper Functions

In [None]:
def run_agent_for_max_days(env, agent, irs, cpcs):
    """Run the agent's default policy for ``env.max_days`` steps and score it.

    Before every step the best achievable expected profit of each keyword is
    computed with ``get_max_expected_bid_profits``; after the step the realised
    per-keyword profit is read back from the flat observation.

    Args:
        env: Flattened bidding environment exposing ``reset``, ``step``,
            ``max_days`` and ``keyword_params``.
        agent: Trained algorithm exposing ``get_policy``.
        irs: Per-keyword values indexed like ``env.keyword_params`` and passed
            as the third argument of ``get_max_expected_bid_profits``.
        cpcs: Per-keyword values indexed like ``env.keyword_params`` and passed
            as the second argument of ``get_max_expected_bid_profits``.

    Returns:
        A 6-tuple ``(rewards, kw_profits, ideal_profits, AKNCP, AKNCP_mean,
        NCP)``. The first three are per-step lists; the last three are the
        results of ``compute_AKNCP``, ``compute_AKNCP_mean`` and
        ``compute_NCP`` on the profit arrays.

    NOTE(review): ``compute_AKNCP_mean`` is not among the imports visible in
    this notebook -- confirm where it is defined.
    """
    # The policy object does not change during evaluation, so look it up once.
    policy = agent.get_policy(policy_id="default_policy")

    # NOTE(review): this reset is unseeded and happens after the caller derived
    # `irs`/`cpcs` from a seeded reset -- confirm the keywords stay the same.
    obs, infos = env.reset()
    action, _, _ = policy.compute_single_action(obs=obs, info=infos)

    rewards = []
    kw_profits = []
    ideal_profits = []
    for _ in range(env.max_days):
        # Ideal profit is evaluated before stepping, on the current parameters.
        step_ideal = []
        for kw_index, kw_params in enumerate(env.keyword_params):
            max_exp_profit, _ = get_max_expected_bid_profits(
                kw_params, cpcs[kw_index], irs[kw_index]
            )
            step_ideal.append(max_exp_profit)
        ideal_profits.append(step_ideal)

        obs, reward, _, _, info = env.step(action)
        rewards.append(reward)

        # The observation is a flat array rather than a dict, so the cost and
        # revenue entries have to be sliced out by position.
        revenue = obs[3 * NUM_KEYWORDS + 2:4 * NUM_KEYWORDS + 2]
        cost = obs[1 * NUM_KEYWORDS:2 * NUM_KEYWORDS]
        kw_profits.append(revenue - cost)

        action, _, _ = policy.compute_single_action(obs=obs, info=info)

    # Convert once and share the arrays between the three metrics.
    kw_profit_array = np.array(kw_profits)
    ideal_profit_array = np.array(ideal_profits)
    return (
        rewards,
        kw_profits,
        ideal_profits,
        compute_AKNCP(kw_profit_array, ideal_profit_array),
        compute_AKNCP_mean(kw_profit_array, ideal_profit_array),
        compute_NCP(kw_profit_array, ideal_profit_array),
    )

def run_sparsity_experiments(model, agent, mean_volumes, conversion_rates, num_keywords=NUM_KEYWORDS, time_steps=MAX_DAYS, results_root=None):
    """Evaluate ``agent`` over several seeded episodes and store the results.

    For every ``(volume, cvr)`` pair an environment is built from the
    module-level ``env_config``, the agent is run once for each seed in
    ``range(5, 10)``, and the mean and standard error (``sem``) of the
    per-episode mean reward, AKNCP, AKNCP_mean and NCP are passed to
    ``pile_values_to_json``.

    Args:
        model: Agent name; used only in output directory and file names.
        agent: Trained algorithm handed to ``run_agent_for_max_days``.
        mean_volumes: Iterable of volume labels.
        conversion_rates: Iterable of conversion-rate labels.
        num_keywords: Unused; kept for interface compatibility.
        time_steps: Unused; kept for interface compatibility.
        results_root: Directory under which ``{model}_{volume}_{cvr}/`` is
            created. Defaults to
            ``<cwd>/OpenMail/ds-scratch/maziar/sem-se/experiment_results``.

    NOTE(review): ``volume`` and ``cvr`` only label the output directory; they
    are not passed to the environment. ``pile_values_to_json`` is not defined
    or imported in the visible notebook -- confirm where it comes from.
    """
    if results_root is None:
        # Join with a real path separator; plain string concatenation glued
        # "OpenMail" directly onto the last component of the working directory.
        results_root = Path.cwd() / "OpenMail/ds-scratch/maziar/sem-se/experiment_results"
    results_root = Path(results_root)

    allowed_bids = np.arange(0.01, 3.00, 0.01)
    file_stem = f"{model}_{NUM_ITERATIONS}_iteration_{experiment_type}"

    for volume in mean_volumes:
        for cvr in conversion_rates:
            env = FlatArrayWrapper(kw_sim.bidding_sim_creator(env_config))

            results_dir = results_root / f"{model}_{volume}_{cvr}"
            # Create missing parent directories too, and tolerate reruns.
            results_dir.mkdir(parents=True, exist_ok=True)

            eps_rewards = []
            eps_AKNCP = []
            eps_AKNCP_mean = []
            eps_NCP = []

            for env_seed in range(5, 10):
                env.reset(seed=env_seed)
                irs, cpcs = [], []
                for kw in env.keywords:
                    ir, cpc = get_implicit_kw_bid_cpc_impressions(kw, allowed_bids)
                    irs.append(ir)
                    cpcs.append(cpc)
                rewards, kw_profits, ideal_profits, AKNCP, AKNCP_mean, NCP = run_agent_for_max_days(env, agent, irs, cpcs)
                # NOTE(review): the file name does not contain the seed, so each
                # seed overwrites the previous one and only the last is kept.
                np.savez(
                    str(results_dir / f"{file_stem}.npz"),
                    env_seed=env_seed,
                    kw_profits=kw_profits,
                    ideal_profits=ideal_profits,
                )
                eps_rewards.append(np.mean(rewards))
                eps_AKNCP.append(AKNCP)
                eps_AKNCP_mean.append(AKNCP_mean)
                eps_NCP.append(NCP)
                print(f" {env_seed}", end=" ")

            # Flat list of (mean, standard error) pairs in the order
            # rewards, AKNCP, AKNCP_mean, NCP.
            values = []
            for series in (eps_rewards, eps_AKNCP, eps_AKNCP_mean, eps_NCP):
                series = np.array(series)
                values.extend([np.mean(series), sem(series)])

            pile_values_to_json(values, str(results_dir / f"{file_stem}.json"))
            print(f"vol, cvr: ({volume}, {cvr}) evaluation done!")


#### Environment Registration

In [None]:
def env_creator(env_config=env_config):
    """Create the bidding simulator and return it wrapped in FlatArrayWrapper.

    The default argument is the module-level ``env_config`` as it was when this
    function was defined.
    """
    simulator = kw_sim.bidding_sim_creator(env_config=env_config)
    return FlatArrayWrapper(simulator)


# Register the creator under the name "FlatArrayAuction".
register_env("FlatArrayAuction", env_creator)

#### Model Instantiation

In [None]:
if is_from_checkpoint:
    # Algorithm.from_checkpoint returns an algorithm whose state is already
    # restored from the checkpoint, so no separate restore() call is needed.
    agent = Algorithm.from_checkpoint(checkpoint_path)
    if is_new_config:
        # NOTE(review): confirm reset_config actually applies the new config
        # for this algorithm class and check its return value if it matters.
        agent.reset_config(new_model_config)
else:
    # 'A2C' and 'TD3' select their own configs; anything else falls back to PPO.
    model_config = {
        "A2C": sem_a2c_config,
        "TD3": sem_td3_config,
    }.get(model, sem_ppo_config)

    agent = model_config.build()

# Stand-alone, unwrapped environment instance, reset once.
env = kw_sim.bidding_sim_creator(env_config=env_config)
obs, infos = env.reset()


##### Run Experiment

In [None]:
# Train for NUM_ITERATIONS iterations, running the evaluation after every
# `evaluation_interval`-th iteration.
for i in range(NUM_ITERATIONS):
    agent.train()
    print(f"Iteration number {i} of {model} is completed")
    if (i + 1) % evaluation_interval == 0:
        run_sparsity_experiments(model, agent, mean_volumes, conversion_rates)

# Save a checkpoint of the trained agent. The target is a nested directory, so
# missing parents must be created as well; os.mkdir would raise
# FileNotFoundError here and lose the trained model.
results_dir = f"./{model}_{updater_params[0]}/model/"
os.makedirs(results_dir, exist_ok=True)
checkpoint_path = agent.save(results_dir)
print(f"An Algorithm checkpoint has been created inside directory: {checkpoint_path}.")
agent.stop()