In [None]:
import sys
import numpy as np
import gymnasium as gym

In [None]:
pip install -r /path/to/rl_papers/rl-rtb-papers/requirements.txt

Specify the path to these folders.

Make sure you also change the path inside the following config file:

/path/to/rl_papers/rl-rtb-papers/scripts/gym-rtb/rtb_env/envs/config.cfg

In [None]:
# Locations of the gym-rtb environment package and of the agent modules.
# Both are placeholders: replace the '/path/to' prefix with your local checkout.
path_to_env, path_to_agents = (
    '/path/to/rl-rtb-papers/scripts/gym-rtb',
    '/path/to/rl-rtb-papers/scripts/agents/',
)

In [None]:
# Make the environment package and the agent modules importable.
# Entries that are already present are skipped so that re-running this cell
# does not keep growing sys.path with duplicates.
for extra_path in (path_to_env, path_to_agents):
    if extra_path not in sys.path:
        sys.path.append(extra_path)

In [None]:
import rtb_env

from constant_bid_agent import ConstantBidAgent
from random_bid_agent import RandomBidAgent
from linear_bid_agent import LinearBidAgent
from random_part_bid_agent import RandomParticipationBidAgent
from budget_pacing_bid_agent import BudgetPacingAgent

In [None]:
from epsilon_bid_agent import EpsilonGreedyBidAgent
from thompson_bid_agent import ThompsonSamplingBidAgent
from ucb_bid_agent import UCBBidAgent
from q_learning_bid_agent import QLearningBidAgent

In [None]:
from dqn_agent.dqn_agent import DQNBidAgent
from ddpg_agent.ddpg_agent import DDPGBidAgent
from td3_agent.td3_agent import TD3BidAgent
from sac_agent.sac_agent import SACBidAgent

Setting up the environment

`ENV` is the environment built on a standardized, preprocessed toy dataset of OpenRTB bid requests.

In [None]:
# Create the 'RTBEnv-v0' auction environment for 18 competing agents
# (presumably registered by `import rtb_env` -- confirm); gymnasium's
# environment checker is switched off.
ENV = gym.make('RTBEnv-v0', disable_env_checker=True, num_agents=18)

# Display the auction type the wrapped environment is configured with.
ENV.env.auction_type

In [None]:
# Reset the environment; the value returned by reset() is shown as the cell output.
ENV.reset()

In [None]:
# Bid-request data held by the wrapped environment (presumably a pandas
# DataFrame -- confirm); the cells below read its columns to derive bids and budget.
df = ENV.env.bid_requests

In [None]:
# Dimensions of the bid-request data.
df.shape

Set the initial budget, the base bid and the minimum possible bid

The budget is set so that an agent placing a constant bid can buy all the desired impressions.

In [None]:
# Reference columns of the bid-request data.
bidfloor = df['slotprice'].values
second_price = df['payprice'].values
total_clicks = df['click'].sum()

# base_bid: highest logged bid price, truncated to an integer.
# budget:   enough to pay base_bid on every single request.
# min_cpm:  60% of the highest logged bid price, truncated to an integer.
base_bid = int(df['bidprice'].max())
budget = len(df) * base_bid
min_cpm = int(df['bidprice'].max() * 0.6)

print(f'base bid: {base_bid}')
print(f'budget: {budget}')
print(f'min CPM: {min_cpm}')

In [None]:
# Number of rows in the bid-request data (the factor used for the budget above).
df.shape[0]

Initialize agents and run experiments

In [None]:
# Random seed passed to the agents below through their `seed` argument.
SEED = 43

In [None]:
# Deep-RL bidders (DDPG, TD3, SAC). All three are configured identically and
# are constructed in the order listed.
ddpg_bid_agent, td3_bid_agent, sac_bid_agent = (
    agent_cls(base_bid=base_bid, budget=budget, seed=SEED)
    for agent_cls in (DDPGBidAgent, TD3BidAgent, SACBidAgent)
)

In [None]:
# Bandit-style bidders plus the DQN bidder, all sharing the same base bid and budget.
eps_agent = EpsilonGreedyBidAgent(base_bid=base_bid, budget=budget, seed=SEED)

# NOTE(review): the Thompson-sampling and UCB agents are created without a seed --
# confirm whether their constructors accept one.
# The variable name `thomspon_agent` (sic) is referenced by the agents list below.
thomspon_agent = ThompsonSamplingBidAgent(base_bid=base_bid, budget=budget)
ucb_agent = UCBBidAgent(base_bid=base_bid, budget=budget)

dqn_agent = DQNBidAgent(base_bid=base_bid, budget=budget, seed=SEED)

In [None]:
# Heuristic baselines.
random_bid_agent = RandomBidAgent(base_bid=base_bid, min_cpm=min_cpm, budget=budget, seed=SEED)
linear_bid_agent = LinearBidAgent(base_bid=base_bid, budget=budget)

# These two baselines bid with 80% of the base bid.
random_part_agent = RandomParticipationBidAgent(base_bid=base_bid * 0.8, budget=budget, seed=SEED)
budget_pacing_agent = BudgetPacingAgent(
    base_bid=base_bid * 0.8,
    budget=budget,
    total_steps=df['timestamp'].nunique(),  # number of distinct timestamps in the data
)

# Q-learning bidders: one with the constructor's default exploration strategy,
# then one per explicitly named strategy (constructed in the order listed).
q_learning_agent_exp = QLearningBidAgent(base_bid=base_bid, budget=budget, seed=SEED)
q_learning_agent_ucb, q_learning_agent_bolt, q_learning_agent_pursuit = (
    QLearningBidAgent(
        base_bid=base_bid,
        budget=budget,
        seed=SEED,
        exploration_strategy=strategy,
    )
    for strategy in ('ucb', 'boltzmann', 'pursuit')
)

In [None]:
# Every bidder taking part in the auction. The position in this list is the
# agent index used for actions, rewards, costs and the "Agent i" plot labels.
agents = [
    # Three constant bidders with fixed bids of 250, 240 and 230.
    *(ConstantBidAgent(base_bid=fixed_bid, budget=budget) for fixed_bid in (250, 240, 230)),
    dqn_agent,
    # Heuristic baselines.
    random_bid_agent, linear_bid_agent, random_part_agent, budget_pacing_agent,
    # Q-learning variants.
    q_learning_agent_exp, q_learning_agent_ucb, q_learning_agent_bolt, q_learning_agent_pursuit,
    # Bandit-style bidders.
    eps_agent, thomspon_agent, ucb_agent,
    # Deep-RL bidders.
    ddpg_bid_agent, td3_bid_agent, sac_bid_agent,
]

In [None]:
# Number of bidders; presumably this has to match the `num_agents=18` given to
# gym.make above -- confirm against the environment.
len(agents)

In [None]:
# Run one full auction simulation: for every batch of requests each agent
# submits a bid, the environment resolves the auctions, and each agent is then
# updated with its own per-request reward and cost.
obs, reward, cost, done = ENV.reset()
done = False  # always enter the loop, whatever flag reset() returned
# Per-request logs (one inner list per request, one entry per agent) that the
# visualization cells read later.
all_actions, bidfloors, real_prices = [], [], []

try:
    while not done:
        actions = []

        for request_obs in obs:
            action_list, bidfloor_list, real_price_list = [], [], []

            for agent in agents:
                # NOTE(review): `reward` and `cost` still hold the values returned
                # by ENV.reset(); they are never refreshed from ENV.step().
                # Confirm that the agents do not rely on them inside act().
                action = agent.act(request_obs, reward, cost)
                action_list.append(action)
                bidfloor_list.append(request_obs.get('slotprice'))
                real_price_list.append(request_obs.get('payprice'))

            actions.append(action_list)
            all_actions.append(action_list)
            bidfloors.append(bidfloor_list)
            real_prices.append(real_price_list)

        next_obs, rewards, costs, done = ENV.step(actions)

        # Feed every agent its own reward/cost for each request of the batch.
        for request_idx, request_obs in enumerate(next_obs):
            request_rewards = rewards[request_idx]
            request_costs = costs[request_idx]

            for agent_idx, agent in enumerate(agents):
                if agent_idx < len(request_rewards) and agent_idx < len(request_costs):
                    reward_val = request_rewards[agent_idx]
                    cost_val = request_costs[agent_idx]
                    agent.update(request_obs, reward_val, cost_val)
                else:
                    print(f"Error: there is no rewards/costs data for agent {agent_idx} and request {request_idx}")

        ENV.env.render_frame()
        obs = next_obs

        # Stop as soon as any agent reports done() or the environment's block
        # index has reached its total number of steps.
        if np.any([agent.done() for agent in agents]) or ENV.env._total_steps == ENV.env._block_index:
            print('Simulation completed')
            break
finally:
    # Close the environment even when the loop is interrupted by an error.
    ENV.env.close()

# Quick stats

In [None]:
# Print per-agent summary statistics and accumulate totals over all agents.
clicks, wins, bids = 0, 0, 0

for agent in agents:
    print(
        '\n',
        agent,
        '\nbids:', agent.total_bids,
        '\nwins:', agent.total_wins,
        '\nclicks:', agent.total_clicks,
        # The conditional expression binds looser than `*`, so the ratio must be
        # scaled inside the "if" branch to really be a percentage.
        '\nCTR:', round(agent.total_clicks / agent.total_wins * 100 if agent.total_wins else 0, 2), '%',
        # Agent clicks relative to all clicks in the data.
        '\npart of total amount of clicks:', round(agent.total_clicks / df['click'].sum() * 100, 2), '%',
        # Agent clicks relative to the number of rows (requests) in the data.
        '\npart of total amount of requests:', round(agent.total_clicks / df.shape[0] * 100, 2), '%',
        '\nrewards:', round(agent.total_rewards, 2),
        '\nremaining budget:', round(((budget - agent.total_budget_spend) / budget * 100), 2), '%'
    )
    clicks += agent.total_clicks
    wins += agent.total_wins
    bids += agent.total_bids

# Totals over all agents next to the reference values taken from the data.
print('\ntotal bids:', bids, 'total wins:', wins, 'total clicks:', clicks,)
print('ref bids:', df.shape[0], 'ref wins:', df.shape[0], 'ref clicks:', df['click'].sum())

## Visualization

In [None]:
import matplotlib.pyplot as plt

from matplotlib.ticker import PercentFormatter

# Bids distribution per agent

In [None]:
# Regroup the logged actions from one list per request into one array per agent.
n_agents = len(agents)
agent_bids = [
    np.array([request[agent_idx] for request in all_actions])
    for agent_idx in range(n_agents)
]

# Overlay one semi-transparent histogram per agent on a shared axis.
fig, ax = plt.subplots(figsize=(18, 6), dpi=125)
for agent_idx, bid_values in enumerate(agent_bids):
    ax.hist(x=bid_values, bins=50, alpha=0.5, label=f'Agent {agent_idx}')

ax.set_title("Agent's bid distribution", fontsize=14)
ax.set_xlabel('Bid value', fontsize=12)
ax.set_ylabel('Count', fontsize=12)
ax.legend(loc='upper left')
ax.grid(True)
fig.tight_layout()

# Bids distribution per agent (box plot)

In [None]:
# One box per agent showing the spread of its bids.
fig, ax = plt.subplots(figsize=(18, 6), dpi=125)

# NOTE(review): newer Matplotlib releases rename `labels` to `tick_labels` --
# confirm against the installed version before switching.
ax.boxplot(
    x=[agent_bids[i] for i in range(n_agents)],
    labels=[f'Agent {i}' for i in range(n_agents)],
)

# Clip the y-axis at 120% of the base bid.
ax.set_ylim(0, base_bid * 1.2)
ax.set_title("Agent's bid distribution", fontsize=14)
ax.set_xlabel('Agents', fontsize=12)
ax.set_ylabel('Bid value', fontsize=12)
ax.grid(True)
fig.tight_layout()

# Bids vs bidfloor

In [None]:
# Every inner list of `bidfloors` / `real_prices` repeats the same request-level
# value once per agent, so the first entry is the value for that request.
bidfloor_per_request = np.array([sublist[0] for sublist in bidfloors])
real_prices_per_request = np.array([sublist[0] for sublist in real_prices])

# Percentage of requests on which each agent bid strictly above the bid floor
# and strictly above the logged pay price.
above_bidfloor = [np.mean(bids > bidfloor_per_request) * 100 for bids in agent_bids]
above_payprice = [np.mean(bids > real_prices_per_request) * 100 for bids in agent_bids]
x = np.arange(n_agents)

plt.figure(
    figsize=(18, 6),
    dpi=125
)
# Two bars per agent, shifted left/right of the tick position.
plt.bar(
    x=x-0.2,
    height=above_bidfloor,
    width=0.4,
    label='Above bidfloor'
)
plt.bar(
    x=x+0.2,
    height=above_payprice,
    width=0.4,
    label='Above payprice'
)
plt.xticks(x, [f'Agent {i}' for i in range(n_agents)])
# Plain ASCII title so it renders with any font.
plt.title('Comparison of the effectiveness of strategies', fontsize=14)
plt.xlabel('Agents', fontsize=12)
plt.ylabel('Proportion of cases, %', fontsize=12)
plt.legend(loc='upper right')
plt.grid(True)
plt.tight_layout()

# Clicks plot

In [None]:
# Clicks collected by each agent, compared with all clicks present in the data.
clicks = [agent.total_clicks for agent in agents]
agent_positions = range(len(agents))

fig, ax = plt.subplots(figsize=(18, 6), dpi=125)
ax.bar(x=agent_positions, height=clicks, label='Agents')

# Dashed red reference line: total number of clicks in the data.
ax.axhline(y=df['click'].sum(), color='r', linestyle='--', label='Total amount')

ax.set_xticks(agent_positions)
ax.set_xticklabels([f'Agent {i}' for i in agent_positions])
# Leave 20% headroom above the reference line.
ax.set_ylim(0, df['click'].sum() * 1.2)
ax.set_title('Total Clicks', fontsize=14)
ax.set_xlabel('Agent', fontsize=12)
ax.set_ylabel('Clicks count', fontsize=12)
ax.legend(loc='upper right')
fig.tight_layout()

# Remaining budget

In [None]:
# Unspent part of the initial budget per agent, in percent.
budget_left = [(budget - agent.total_budget_spend) / budget * 100 for agent in agents]
agent_positions = range(len(agents))

fig, ax = plt.subplots(figsize=(18, 6), dpi=125)
ax.bar(x=agent_positions, height=budget_left)

# Values are already percentages, so format the y ticks accordingly.
ax.yaxis.set_major_formatter(PercentFormatter())
ax.set_title('Remaining Budget, %', fontsize=14)
ax.set_xlabel('Agent', fontsize=12)
ax.set_ylabel('Budget share, %', fontsize=12)
ax.set_xticks(agent_positions)
ax.set_xticklabels([f'Agent {i}' for i in agent_positions])
ax.set_ylim(0, 100)
fig.tight_layout()

# Share of total metrics

In [None]:
# Totals over all agents; each agent's share is expressed relative to these.
total_bids = sum(agent.total_bids for agent in agents)
total_wins = sum(agent.total_wins for agent in agents)
total_clicks = sum(agent.total_clicks for agent in agents)

# Per-agent shares in percent; a zero total yields a share of 0 for everyone.
bid_shares = [
    agent.total_bids / total_bids * 100 if total_bids > 0 else 0
    for agent in agents
]
win_shares = [
    agent.total_wins / total_wins * 100 if total_wins > 0 else 0
    for agent in agents
]
click_shares = [
    agent.total_clicks / total_clicks * 100 if total_clicks > 0 else 0
    for agent in agents
]

bar_width = 0.2
x = np.arange(n_agents)

# Three bars per agent (bids, wins, clicks) placed side by side.
fig, ax = plt.subplots(figsize=(18, 6), dpi=125)
ax.bar(x=x - bar_width, height=bid_shares, width=bar_width, label='Bids')
ax.bar(x=x, height=win_shares, width=bar_width, label='Wins')
ax.bar(x=x + bar_width, height=click_shares, width=bar_width, label='Clicks')

ax.yaxis.set_major_formatter(PercentFormatter())
ax.set_title('Proportional comparison of bids, wins and clicks by agent', fontsize=14)
ax.set_xlabel('Agents', fontsize=12)
ax.set_ylabel('Share, %', fontsize=12)
ax.set_xticks(x)
ax.set_xticklabels([f'Agent {i}' for i in range(n_agents)])
# Top of the axis: 10 points above the largest share, capped at 100%.
ax.set_ylim(0, min(100, max(max(bid_shares), max(win_shares), max(click_shares)) + 10))
ax.legend(loc='upper right')
fig.tight_layout()
plt.show()