In [9]:
import sys

sys.path.append("../")

%load_ext autoreload
%autoreload 2

import random

import numpy as np
import pandas as pd
from simulator.model.robust_mse import RobustBidMSE
from simulator.model.simple import SimpleBid
from simulator.simulation.utils_visualization import plot_metric_with_error_CTR
from simulator.validation.check_results import autobidder_check
from tqdm import tqdm

pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', None)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# This notebook provides a guideline how to simulate and check Robust PID Bidder for all the campaigns in the dataset

In [10]:
loss_type = 'MSE'

# Load data

In [11]:
auction_mode = 'FPA'

campaigns_path = "../data/subsample_campaigns.csv"
stats_path = "../data/subsample_stats.csv"

In [12]:
campaign_df = pd.read_csv(campaigns_path)
stats_df = pd.read_csv(stats_path)

In [13]:
campaign_df.describe()

Unnamed: 0,campaign_id,loc_id,item_id,campaign_start,campaign_end,auction_budget,microcat_ext,logical_category,region_id
count,70.0,70.0,70.0,70.0,70.0,70.0,70.0,70.0,70.0
mean,76952220.0,644788.385714,3515224000.0,529847800.0,530151400.0,1238.622143,784475.1,2.769286,644358.0
std,4957660.0,10712.608247,825097600.0,312618.7,384583.8,1906.618616,1013575.0,1.485689,10649.319309
min,59869900.0,621630.0,1073987000.0,529528300.0,529718400.0,75.84,24.0,1.0,621590.0
25%,73077960.0,637672.5,2998627000.0,529571800.0,529808900.0,228.96,3779.75,1.38,637650.0
50%,75679550.0,646410.0,3915603000.0,529750000.0,529963700.0,547.2,5533.5,2.35,645790.0
75%,80770540.0,653240.0,4063275000.0,530104400.0,530520700.0,1934.6875,2096862.0,3.41,653240.0
max,87761450.0,661950.0,4577216000.0,530565100.0,530824300.0,13829.76,2278193.0,5.37,661460.0


## Robust LP Bid vs LP Bid

In [None]:
# """
# Exact implementation of noise generation based on Wang & Dong (2023)
# using formulas from the paper with entropy and logarithms

# From the paper Wang, X., & Dong, H. (2023):
# - Formula (15): H(y*|x*,D) = MI(y*;ω|x*,D) + E(H(y*|x*,ω))
# - Formula (17): Ĥ(y*|x*,D) = -∑_c (1/T ∑_t p(y=c|x*,ω̂_t) log(1/T ∑_t p(y=c|x*,ω̂_t)))
# - Formula (18): E(Ĥ(y*|x*,ω)) = -1/T ∑_{t,c} p(y=c|x*,ω̂_t) log p(y=c|x*,ω̂_t)
# - Formula (19): M̂I(y*;ω|x*,D) = Ĥ(y*|x*,D) - E(Ĥ(y*|x*,ω))
# """

# import numpy as np
# import pandas as pd
# from scipy import stats
# import matplotlib.pyplot as plt

# class WangDongCTRNoise:
#     """
#     CTR noise generator based on exact formulas from Wang & Dong (2023)
#     """

#     def __init__(self, random_state=1337):
#         np.random.seed(random_state)
#         self.random_state = random_state

#     def monte_carlo_dropout_predictions(self, ctr_predicted, T=50, dropout_rate=0.05):
#         """
#         Generate T predictions via Monte Carlo Dropout
#         Corresponds to the procedure from the paper for obtaining ω̂_t

#         Parameters:
#         -----------
#         ctr_predicted : array-like
#             Base CTR predictions (p(y=1|x*,ω))
#         T : int
#             Number of MC samples (as in the paper)
#         dropout_rate : float
#             Dropout rate (0.05 recommended based on experiments in the paper)

#         Returns:
#         --------
#         mc_predictions : np.array shape (T, N)
#             T predictions for each of N samples
#         """
#         n_samples = len(ctr_predicted)
#         ctr_clipped = np.clip(ctr_predicted, 1e-7, 1-1e-7)  # Avoid log(0)

#         mc_predictions = []

#         for t in range(T):
#             # Simulate dropout via logit modification
#             # This is an approximation to real MC Dropout from the paper
#             logits = np.log(ctr_clipped / (1 - ctr_clipped))

#             # Dropout mask - randomly zero out some "neurons"
#             dropout_mask = np.random.binomial(1, 1-dropout_rate, n_samples)

#             # Add noise proportional to uncertainty
#             # More noise where the model is less confident (mid-range CTR values)
#             uncertainty = 4 * ctr_clipped * (1 - ctr_clipped)  # Maximum at CTR=0.5
#             noise_std = dropout_rate * uncertainty

#             logit_noise = np.random.normal(0, noise_std)
#             noisy_logits = logits + logit_noise * dropout_mask

#             # Inverse transform back to probabilities
#             mc_ctr = 1 / (1 + np.exp(-noisy_logits))
#             mc_predictions.append(mc_ctr)

#         return np.array(mc_predictions)

#     def calculate_predictive_entropy(self, mc_predictions):
#         """
#         Calculate predictive entropy Ĥ(y*|x*,D)
#         Formula (17) from the paper

#         Parameters:
#         -----------
#         mc_predictions : np.array shape (T, N)
#             MC predictions

#         Returns:
#         --------
#         predictive_entropy : np.array shape (N,)
#             Predictive entropy for each sample
#         """
#         T, N = mc_predictions.shape

#         # Averaged predictions: 1/T ∑_t p(y=c|x*,ω̂_t)
#         avg_p1 = np.mean(mc_predictions, axis=0)  # P(y=1)
#         avg_p0 = 1 - avg_p1  # P(y=0)

#         # Avoid log(0)
#         avg_p1 = np.clip(avg_p1, 1e-7, 1-1e-7)
#         avg_p0 = np.clip(avg_p0, 1e-7, 1-1e-7)

#         # Formula (17): Ĥ(y*|x*,D) = -∑_c (avg_p_c * log(avg_p_c))
#         predictive_entropy = -(avg_p1 * np.log(avg_p1) + avg_p0 * np.log(avg_p0))

#         return predictive_entropy

#     def calculate_expected_entropy(self, mc_predictions):
#         """
#         Calculate expected entropy E(Ĥ(y*|x*,ω))
#         Formula (18) from the paper

#         Parameters:
#         -----------
#         mc_predictions : np.array shape (T, N)
#             MC predictions

#         Returns:
#         --------
#         expected_entropy : np.array shape (N,)
#             Expected entropy for each sample
#         """
#         T, N = mc_predictions.shape

#         # Avoid log(0)
#         mc_predictions_safe = np.clip(mc_predictions, 1e-7, 1-1e-7)
#         mc_predictions_0 = 1 - mc_predictions_safe

#         # Formula (18): E(Ĥ(y*|x*,ω)) = -1/T ∑_{t,c} p(y=c|x*,ω̂_t) log p(y=c|x*,ω̂_t)
#         entropy_per_sample = -(mc_predictions_safe * np.log(mc_predictions_safe) +
#                               mc_predictions_0 * np.log(mc_predictions_0))

#         expected_entropy = np.mean(entropy_per_sample, axis=0)

#         return expected_entropy

#     def calculate_mutual_information(self, predictive_entropy, expected_entropy):
#         """
#         Calculate mutual information M̂I(y*;ω|x*,D)
#         Formula (19) from the paper

#         Parameters:
#         -----------
#         predictive_entropy : np.array
#             Predictive entropy
#         expected_entropy : np.array
#             Expected entropy

#         Returns:
#         --------
#         mutual_information : np.array
#             Mutual information (epistemic uncertainty)
#         """
#         # Formula (19): M̂I(y*;ω|x*,D) = Ĥ(y*|x*,D) - E(Ĥ(y*|x*,ω))
#         mutual_information = predictive_entropy - expected_entropy

#         # MI cannot be negative (clamp from below)
#         mutual_information = np.maximum(mutual_information, 0)

#         return mutual_information

#     def generate_wang_dong_noise(self, ctr_predicted, T=50, dropout_rate=0.05,
#                                 noise_type='total', scale_factor=1.0):
#         """
#         Generate noise using the exact methodology of Wang & Dong (2023)

#         Parameters:
#         -----------
#         ctr_predicted : array-like
#             Predicted CTR values
#         T : int
#             Number of MC samples (from the paper: typically 50)
#         dropout_rate : float
#             Dropout rate (from the paper: typically 0.05)
#         noise_type : str
#             'epistemic' - epistemic uncertainty only (MI)
#             'aleatoric' - aleatoric uncertainty only (expected entropy)
#             'total' - total uncertainty (predictive entropy)
#         scale_factor : float
#             Scaling coefficient for noise

#         Returns:
#         --------
#         noise : np.array
#             Generated noise
#         uncertainty_info : dict
#             Detailed uncertainty information
#         """
#         # Step 1: Generate MC predictions (dropout simulation)
#         mc_predictions = self.monte_carlo_dropout_predictions(
#             ctr_predicted, T=T, dropout_rate=dropout_rate
#         )

#         # Step 2: Compute three types of uncertainty using formulas from the paper
#         predictive_entropy = self.calculate_predictive_entropy(mc_predictions)
#         expected_entropy = self.calculate_expected_entropy(mc_predictions)
#         mutual_information = self.calculate_mutual_information(
#             predictive_entropy, expected_entropy
#         )

#         # Step 3: Select the uncertainty type for noise generation
#         if noise_type == 'epistemic':
#             # Epistemic uncertainty = MI (model uncertainty)
#             uncertainty_values = mutual_information
#         elif noise_type == 'aleatoric':
#             # Aleatoric uncertainty = Expected entropy (data uncertainty)
#             uncertainty_values = expected_entropy
#         elif noise_type == 'total':
#             # Total uncertainty = Predictive entropy
#             uncertainty_values = predictive_entropy
#         else:
#             raise ValueError(f"Unknown noise_type: {noise_type}")

#         # Step 4: Generate noise with variance proportional to uncertainty
#         # The higher the uncertainty, the larger the potential noise
#         noise_std = scale_factor * np.sqrt(uncertainty_values)
#         noise = np.random.normal(0, noise_std)

#         # Information for analysis
#         uncertainty_info = {
#             'predictive_entropy': predictive_entropy,
#             'expected_entropy': expected_entropy,
#             'mutual_information': mutual_information,
#             'noise_std': noise_std,
#             'mc_predictions_std': np.std(mc_predictions, axis=0),
#             'method_params': {
#                 'T': T,
#                 'dropout_rate': dropout_rate,
#                 'noise_type': noise_type,
#                 'scale_factor': scale_factor
#             }
#         }

#         return noise, uncertainty_info

#     def suggest_scale_factor(self, ctr_predicted, target_noise_std=0.05):
#         """
#         Automatically select scale_factor to achieve the desired noise level

#         Parameters:
#         -----------
#         ctr_predicted : array-like
#             Predicted CTR values
#         target_noise_std : float
#             Desired standard deviation of the noise

#         Returns:
#         --------
#         optimal_scale_factor : float
#             Recommended scale_factor
#         """
#         # Test different scale_factor values
#         test_scales = np.logspace(-2, 1, 20)  # from 0.01 to 10

#         best_scale = 1.0
#         min_diff = float('inf')

#         for scale in test_scales:
#             noise, _ = self.generate_wang_dong_noise(
#                 ctr_predicted, scale_factor=scale, T=20  # Quick estimate
#             )
#             actual_std = np.std(noise)
#             diff = abs(actual_std - target_noise_std)

#             if diff < min_diff:
#                 min_diff = diff
#                 best_scale = scale

#         return best_scale

# # Main function for use
# def generate_wang_dong_2023_noise(ctr_values, target_noise_std=0.05,
#                                  noise_type='total', auto_scale=True):
#     """
#     Simple function for noise generation based on Wang & Dong (2023)

#     Parameters:
#     -----------
#     ctr_values : array-like
#         Your predicted CTR values
#     target_noise_std : float
#         Desired standard deviation of the noise
#     noise_type : str
#         'epistemic', 'aleatoric', or 'total'
#     auto_scale : bool
#         Automatically select scale_factor

#     Returns:
#     --------
#     noise : np.array
#         Generated noise
#     citation_info : str
#         Ready-made text for citation in the paper
#     """
#     generator = WangDongCTRNoise(random_state=42)

#     if auto_scale:
#         scale_factor = generator.suggest_scale_factor(ctr_values, target_noise_std)
#         # print(f"Automatically selected scale_factor: {scale_factor:.3f}")
#     else:
#         scale_factor = 1.0

#     noise, info = generator.generate_wang_dong_noise(
#         ctr_values,
#         noise_type=noise_type,
#         scale_factor=scale_factor,
#         T=50,  # As in the paper
#         dropout_rate=0.05  # Optimal value from the paper
#     )

#     return noise, _ # citation_info.strip()

# # Usage example
# if __name__ == "__main__":
#     # Test data - your CTR values
#     np.random.seed(42)
#     test_ctr = np.random.uniform(0.01, 0.3, 1000)

#     print("=== Noise generation based on Wang & Dong (2023) ===")
#     noise, citation = generate_wang_dong_2023_noise(
#         test_ctr,
#         target_noise_std=0.05,
#         noise_type='total'
#     )

#     print("\n" + "="*50)
#     print("TEXT FOR THE PAPER:")
#     print("="*50)
#     print(citation)

#     # Validation
#     generator = WangDongCTRNoise()
#     # info = generator.validate_methodology(test_ctr, plot=False)

In [7]:
def create_noised_stats_mse(stats_df, stats_save_path, old_ctr, eps, auction_mode, seed):
    random.seed(seed)
    np.random.seed(seed)

    grouped = stats_df.groupby('campaign_id')
    for campaign_id, group in grouped:
        old_ctr = group['CTRPredicts'].values

        noise, citation = generate_wang_dong_2023_noise(
            old_ctr,
            target_noise_std=eps,
            noise_type='total'
        )

        # noise = np.random.rand(old_ctr.size)
        # noise = noise / np.linalg.norm(noise) * np.sqrt(2*eps)
        stats_df.loc[stats_df.campaign_id == campaign_id, 'CTRPredicts_noised'] = np.clip(old_ctr + noise, 0.01, 0.1)

    stats_df.to_csv(stats_save_path)


In [15]:
eps_set_data = [0.1, 0.05, 0.01, 0.005, 0.001, 0.0005, 1e-12]
# eps_set_model = [0.1, 0.05, 0.01, 0.005, 0.001, 0.0005, 1e-12]

bidder_types = ['simple', 'robust']
seeds = list(range(10))

metrics_list = []

old_ctr = np.array(stats_df.CTRPredicts.copy())

stats_save_path = f"../data/data/{auction_mode.lower()}/stats_{auction_mode.lower()}_filtered_train_noised.csv"

for eps in tqdm(eps_set_data):
    # for factor in [0.1, 1., 10, 100]:
    # eps_data = eps_model # * factor
    for seed in seeds:
        # print(seed)
        # Add noise to data
        create_noised_stats_mse(stats_df, stats_save_path, old_ctr, eps, auction_mode, seed)

        cpc = 300.

        # Simple bid
        res_simple = autobidder_check(
            bidder=SimpleBid,
            params={"input_campaigns": campaigns_path,
                    "input_stats": stats_save_path,
                    'eps': eps,
                    'p': 1,
                    'q': 1,
                    'LP': True,
                    'CPC': cpc},
            loss_type=loss_type
        )
        metrics_list.append({
            'eps': eps,
            # 'eps_model': eps,
            'bidder_type': 'simple',
            'seed': seed,
            'tvc': res_simple['score'][0],
            'cpc_percent': res_simple['score'][1],
            'cpc_avg': res_simple['score'][2]
        })

        # Robust bid
        res_robust = autobidder_check(
            bidder=RobustBidMSE,
            params={"input_campaigns": campaigns_path,
                    "input_stats": stats_save_path,
                    'eps': eps,
                    'gamma': 1.,
                    'beta': 1.,
                    'lambda_': 1.,
                    'chi': 1.,
                    'theta': 1.,
                    'delta': 1.,
                    'kappa': 1.,
                    'LP': True,
                    'CPC': cpc},
            loss_type=loss_type
        )

        metrics_list.append({
            'eps': eps,
            # 'eps_model': eps_model,
            'bidder_type': 'robust',
            'seed': seed,
            'tvc': res_robust['score'][0],
            'cpc_percent': res_robust['score'][1],
            'cpc_avg': res_robust['score'][2]
        })

metrics_df = pd.DataFrame(metrics_list, columns=['eps', 'bidder_type', 'seed', 'tvc', 'cpc_percent', 'cpc_avg'])

  0%|          | 0/7 [00:00<?, ?it/s]

100%|██████████| 7/7 [15:47<00:00, 135.36s/it]


In [16]:
agg_metrics = metrics_df.groupby(['eps', 'bidder_type']).agg(
    mean_tvc=('tvc', 'mean'),
    std_tvc=('tvc', 'std'),
    mean_cpc_percent=('cpc_percent', 'mean'),
    std_cpc_percent=('cpc_percent', 'std'),
    mean_cpc_avg=('cpc_avg', 'mean'),
    std_cpc_avg=('cpc_avg', 'std')
).reset_index()

agg_metrics

Unnamed: 0,eps,bidder_type,mean_tvc,std_tvc,mean_cpc_percent,std_cpc_percent,mean_cpc_avg,std_cpc_avg
0,1e-12,robust,2.260368,0.0,0.0,0.0,468.064523,0.0
1,1e-12,simple,2.260151,0.0,0.0,0.0,468.108536,0.0
2,0.0005,robust,2.78704,0.0,0.0,0.0,331.573948,0.0
3,0.0005,simple,2.260151,0.0,0.0,0.0,468.108536,0.0
4,0.001,robust,3.072124,0.0,0.0,0.0,312.071788,0.0
5,0.001,simple,2.260151,0.0,0.0,0.0,468.108536,0.0
6,0.005,robust,3.407976,0.0,0.0,0.0,291.231927,0.0
7,0.005,simple,2.251925,0.0,0.0,0.0,469.767519,0.0
8,0.01,robust,3.546123,0.0,0.0,0.0,282.363859,0.0
9,0.01,simple,2.172647,0.0,0.0,0.0,474.291485,0.0


In [17]:
agg_metrics.to_csv(f'../results/metrics_{loss_type.lower()}_BAT_CTR.csv')

### TVC

In [None]:
mean_tvc_simple_ctr = np.array(agg_metrics[agg_metrics.bidder_type == 'simple'].mean_tvc)
mean_tvc_robust_ctr = np.array(agg_metrics[agg_metrics.bidder_type == 'robust'].mean_tvc)

# results from another notebook with normal noise
mean_tvc_simple_norm = np.array([2.26495932, 2.26782824, 2.19687101, 2.08576958, 2.05811226,
       1.93404816, 1.89193563])
mean_tvc_robust_norm = np.array([2.26483722, 2.79915243, 3.03667371, 3.34679585, 3.48146273,
       3.91136865, 4.0522365 ])

In [18]:
(mean_tvc_robust_ctr / mean_tvc_simple_ctr), (mean_tvc_robust_norm / mean_tvc_simple_norm)

(array([1.00009584, 1.23312083, 1.35925602, 1.51516363, 1.63840515,
        1.94609533, 1.93573035]),
 array([0.99994609, 1.23428767, 1.38227219, 1.60458561, 1.69158058,
        2.02237397, 2.14184692]))

In [17]:
(mean_tvc_robust_ctr / mean_tvc_simple_ctr).mean(), (mean_tvc_robust_norm / mean_tvc_simple_norm).mean()

(1.5182667368113836, 1.5824132896383467)

### CPC

In [None]:
mean_cpc_simple_ctr = np.array(agg_metrics[agg_metrics.bidder_type == 'simple'].mean_cpc_avg)
mean_cpc_robust_ctr = np.array(agg_metrics[agg_metrics.bidder_type == 'robust'].mean_cpc_avg)

# results from another notebook with normal noise
mean_cpc_simple_norm = np.array([515.13199505, 514.81091864, 518.49015618, 523.06691103,
       524.31452656, 525.98242848, 525.52466516])
mean_cpc_robust_norm = np.array([515.05495171, 361.7245655 , 339.32337875, 316.72874209,
       306.37696331, 273.53070906, 256.42070281])

In [21]:
(mean_cpc_robust_ctr / mean_cpc_simple_ctr), (mean_cpc_robust_norm / mean_cpc_simple_norm)

(array([0.99989679, 0.69479282, 0.65160455, 0.60698901, 0.57981732,
        0.49342042, 0.46011147]),
 array([0.99985044, 0.70263577, 0.65444517, 0.60552242, 0.58433812,
        0.52003773, 0.48793276]))

In [20]:
(mean_cpc_robust_ctr / mean_cpc_simple_ctr).mean(), (mean_cpc_robust_norm / mean_cpc_simple_norm).mean()

(0.6409474830510654, 0.6506803447485854)

In [None]:
plot_metric_with_error_CTR(
    eps_set=eps_set,
    agg_metrics=agg_metrics,
    metric_mean_col='mean_tvc',
    metric_std_col='std_tvc',
    metric_name='TVC',
    y_label='Total Value Clicks',
    loss_type=loss_type
)

plot_metric_with_error_CTR(
    eps_set=eps_set,
    agg_metrics=agg_metrics,
    metric_mean_col='mean_cpc_percent',
    metric_std_col='std_cpc_percent',
    metric_name='CPC Percent',
    y_label='Cost per Click (%)',
    loss_type=loss_type
)

plot_metric_with_error_CTR(
    eps_set=eps_set,
    agg_metrics=agg_metrics,
    metric_mean_col='mean_cpc_avg',
    metric_std_col='std_cpc_avg',
    metric_name='CPC Avg',
    y_label='Average Cost per Click',
    loss_type=loss_type
)
