In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import pickle
from utils import *


# For ignoring the warnings
from warnings import simplefilter, filterwarnings
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)
pd.options.mode.chained_assignment = None
filterwarnings("ignore", message="Loky-backed parallel loops cannot be called in a multiprocessing, setting n_jobs=1")



Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)


rank 20 model loaded!
rank 40 model loaded!
rank 60 model loaded!
rank 80 model loaded!
rank 100 model loaded!
rank 20 model loaded!
rank 40 model loaded!
rank 60 model loaded!
rank 80 model loaded!
rank 100 model loaded!


In [6]:

def calc_tes_sub(data, user_visit_no, ranks_list):
    """
    This function calculates the treatment effects for the ads with ranks in "rank_list" for the subset of DataFrame "data" for which the "user_visit_no" is a specific number.
    The output is saved in columns te_1, ..., te_{max_adv_rank} of the dataframe "data"
    """
    start_time = time.perf_counter()
    for rank in ranks_list:
        X = construct_X(data, user_visit_no=user_visit_no, ad_rank=rank)
        var_name = f"te_{rank}_sub"
        exec(f"data.loc[data['user_visit_no'] == user_visit_no, 'temp'] = config.cf_{rank}_sub.const_marginal_effect(X[data['user_visit_no'] == user_visit_no])")
        data.loc[data['user_visit_no'] == user_visit_no, var_name] = data.loc[data['user_visit_no'] == user_visit_no, 'temp']
        var_name = f"te_{rank}"
        exec(f"data.loc[data['user_visit_no'] == user_visit_no, 'temp'] = config.cf_{rank}.const_marginal_effect(X[data['user_visit_no'] == user_visit_no])")
        data.loc[data['user_visit_no'] == user_visit_no, var_name] = data.loc[data['user_visit_no'] == user_visit_no, 'temp']
        # if rank % 10 == 1:
        #     print(f"rank {rank} done!")
    data = data.drop(['temp'], axis=1)
    finish_time = time.perf_counter()
    print(f"finished calculating te's for rank {rank} in {finish_time - start_time} seconds")



def calc_base_ad_ctr_sub(data, user_visit_no):
    """
    This function calculates E(y0|X=x) for the subset of DataFrame "data" for which the "user_visit_no" is a specific number.
    The output is saved in columns y_{base_ad} of the dataframe "data"
    """
    start_time = time.perf_counter()
    y_0 = calc_base_ad_ctr_vector(data, user_visit_no)
    var_name = f"y_{base_ad}_sub"
    
    data.loc[data['user_visit_no'] == user_visit_no, var_name] = np.maximum(y_0, 0)
    var_name = f"y_{base_ad}"
    data.loc[data['user_visit_no'] == user_visit_no, var_name] = np.maximum(y_0, 0)



def calc_ctrs_sub(data, vals_data, user_visit_no):
    """
    This function calculates the click rates of all ads for the subset of DataFrame "data" for which the "user_visit_no" is a specific number by adding y_{base_ad} and treatment effects.
    The output is saved in columns y_1, ..., y_{max_adv_rank} of the dataframe "data"
    """
    start_time = time.perf_counter()
    for rank in config.ranks_list:
        y_var_name = f'y_{rank}_sub'
        te_var_name = f'te_{rank}_sub'
        y_base_ad = f'y_{base_ad}_sub'
        rev_var_name = f'rev_{rank}_sub'
        data.loc[data['user_visit_no'] == user_visit_no, y_var_name] = data.loc[data['user_visit_no'] == user_visit_no, te_var_name] + data.loc[data['user_visit_no'] == user_visit_no, y_base_ad]
        # set y_{rank} to 0 if it is negative
        data.loc[data['user_visit_no'] == user_visit_no, y_var_name] = data.loc[data['user_visit_no'] == user_visit_no, y_var_name].apply(lambda x: max(x, 0))
        # revenue = ctr * valuation 
        data.loc[data['user_visit_no'] == user_visit_no, rev_var_name] = data.loc[data['user_visit_no'] == user_visit_no, y_var_name] * vals_data.loc[vals_data['advertiser_rank'] == rank].advertiser_val_cents


        y_var_name = f'y_{rank}'
        te_var_name = f'te_{rank}'
        y_base_ad = f'y_{base_ad}'
        rev_var_name = f'rev_{rank}'
        data.loc[data['user_visit_no'] == user_visit_no, y_var_name] = data.loc[data['user_visit_no'] == user_visit_no, te_var_name] + data.loc[data['user_visit_no'] == user_visit_no, y_base_ad]
        # set y_{rank} to 0 if it is negative
        data.loc[data['user_visit_no'] == user_visit_no, y_var_name] = data.loc[data['user_visit_no'] == user_visit_no, y_var_name].apply(lambda x: max(x, 0))
        # revenue = ctr * valuation 
        data.loc[data['user_visit_no'] == user_visit_no, rev_var_name] = data.loc[data['user_visit_no'] == user_visit_no, y_var_name] * vals_data.loc[vals_data['advertiser_rank'] == rank].advertiser_val_cents



    finish_time = time.perf_counter()
    print(f"finished calculating y_i's in {finish_time - start_time} seconds")


def create_chosen_ad_vars(data):
    """
    This functions initializes three sets of variable in the dataframe "data":
    1) chosen_ad_{ad}: shows the rank of the the top {ad} chosen ad, ex: chosen_ad_1 is the rank of the top ad chosen to be shown
    2)chosen_ad_y_{ad}: shows the corresponding treatment effect of that ad
    Initially, all these columns are NaN
    3) num_ads:  number of ads to be shown (currently nan)

    Inputs:
    - data: the dataframe

    """
    for ad in range(1, 16):
        var_name1 = f"chosen_ad_{ad}"
        data.loc[:, var_name1] = np.nan


    for ad in range(1, 16):
        var_name2 = f"chosen_ad_y_{ad}_sub"
        data.loc[:, var_name2] = np.nan


    for ad in range(1, 16):
        var_name2 = f"chosen_ad_rev_{ad}_sub"
        data.loc[:, var_name2] = np.nan


    for ad in range(1, 16):
        var_name2 = f"chosen_ad_click_dummy_{ad}_sub"
        data.loc[:, var_name2] = np.nan
    data.loc[:, 'num_ads'] = np.nan


    for ad in range(1, config.max_ads_per_page + 1):
        var_name = f"chosen_ad_y_{ad}" 
        data.loc[:, var_name] = np.nan

    for ad in range(1, config.max_ads_per_page + 1):
        var_name = f"chosen_ad_te_{ad}" 
        data.loc[:, var_name] = np.nan

    for ad in range(1, config.max_ads_per_page + 1):
        var_name = f"chosen_ad_rev_{ad}" 
        data.loc[:, var_name] = np.nan

  



def find_optimal_ads(row, criteria):
    """
    This functions calculates optimal ads (based on highest treatment effects) to be shown to the impression in each row. based on the calculated treatment effects y_i s
    Inputs: 
        - row: the row of the dataframe that it is applied to
        it has to include indices y_cols and "ads_on_page" (determines how many ads to choose)
    
    Returns: 
        - chosen_ads: a list of ads to be shown
        - chosen_ad_ys: a list of the corresponding treatment effects
    """
    # y_cols = data.loc[0: 1, :].filter(regex="^y_", axis=1).columns
    # rev_cols = data.loc[0: 1, :].filter(regex="^rev_", axis=1).columns

    chosen_ad_ys_actual = []
    chosen_ad_revs_actual = []

    y_cols = row.filter(regex="^y_", axis=0).index
    rev_cols = row.filter(regex="^rev_", axis=0).index


    # sort the values by the value of the criteria
    if criteria == "CTR":
        sorted_ads = row[y_cols].sort_values(ascending=False).index.to_list()
        l = min(row['ads_on_page'], 15)    # number of ads to be shown on each visit
        chosen_ads = sorted_ads[0 : l]
        chosen_ads = [int(element.strip("y_sub")) for element in chosen_ads]

    if criteria == "revenue":
        sorted_ads = row[rev_cols].sort_values(ascending=False).index.to_list() 
        l = min(row['ads_on_page'], 15)    # number of ads to be shown on each visit  
        chosen_ads = sorted_ads[0 : l]
        chosen_ads = [int(element.strip("rev_sub")) for element in chosen_ads]

    for chosen_ad in chosen_ads:
        y_var_name = f"y_{chosen_ad}"
        chosen_ad_ys_actual.append(row[y_var_name])

        rev_var_name = f"rev_{chosen_ad}"
        chosen_ad_revs_actual.append(row[rev_var_name])
    # creates a list of chosen ad ranks

    chosen_ad_ys = row[y_cols].sort_values(ascending=False).values[0:l]
    chosen_ad_revs = row[rev_cols].sort_values(ascending=False).values[0:l]
    return chosen_ads, chosen_ad_ys, chosen_ad_revs, chosen_ad_ys_actual, chosen_ad_revs_actual



def create_chosen_ad_columns(data, user_visit_no, criteria):
    """
    This function finds the optimal ads for the subsection of "data" for which user_visit_no == user_visit_no
    The chosen ads and their corresponding click rates are saved in 'chosen_ad_{ad}' and 'chosen_ad_y_{ad}'
    """
    # select treatment effect columns
    # te_cols = data.loc[0: 1, :].filter(regex="^te_", axis=1).columns
    # select ctr columns:


    for index, row in data[data['user_visit_no'] == user_visit_no].iterrows():
        
        chosen_ads, chosen_ad_ys, chosen_ad_revs, chosen_ad_ys_actual, chosen_ad_revs_actual = find_optimal_ads(row, criteria)
        chosen_ads = [int(element) for element in chosen_ads]
        l = len(chosen_ads)
        last_chosen_ad_name = f"chosen_ad_{l}"
        # last_chosen_ad_te_name = f"chosen_ad_te_{l}"
        last_chosen_ad_y_name = f"chosen_ad_y_{l}_sub"
        last_chosen_ad_rev_name = f"chosen_ad_rev_{l}_sub"
        data.loc[index, 'chosen_ad_1': last_chosen_ad_name] = chosen_ads
        data.loc[index, 'chosen_ad_y_1_sub' : last_chosen_ad_y_name] = chosen_ad_ys
        data.loc[index, 'chosen_ad_rev_1_sub' : last_chosen_ad_rev_name] = chosen_ad_revs
        data.at[index, 'num_ads'] = int(l)
        last_chosen_ad_y_name = f"chosen_ad_y_{l}"
        last_chosen_ad_rev_name = f"chosen_ad_rev_{l}"
        data.loc[index, 'chosen_ad_y_1' : last_chosen_ad_y_name] = chosen_ad_ys_actual
        data.loc[index, 'chosen_ad_rev_1' : last_chosen_ad_rev_name] = chosen_ad_revs_actual
        # if index % 10000 == 0:
        #     print(f"index {index} done!")



In [3]:
# Load The Data
data = pd.read_stata("..\\data\\Full Model\\Simulation Data - Full Model - Monopoly - Subsample.dta")
data = data[data['user_visit_no'] == 1]
vals_data = pd.read_stata(f"..\\data\\Full Model\\Advertiser Valuations.dta")



In [7]:

def simulate_monopoly_sub(data, vals_data, criteria, subsampling_ratio):
    
    
        

    # file_name = f"data_chunk_{chunk}"
    # create empty columns in the dataframe to fill later
    create_chosen_ad_vars(data)
    # print(f"\n\n\n=======> Chunk #{chunk}")
    start_time = time.perf_counter()

    print("Calculating TEs Started!")
    calc_tes_sub(data, user_visit_no=1, ranks_list=config.ranks_list)
    finish_time = time.perf_counter()
    print(f"TEs calcualted in {finish_time - start_time} seconds!")
    
    calc_base_ad_ctr_sub(data, user_visit_no=1)
    finish_time_1 = time.perf_counter()
    print(f"Base Ad CTR calcualted in {finish_time_1 - finish_time} seconds!")    

    calc_ctrs_sub(data, vals_data, user_visit_no=1)
    finish_time_2 = time.perf_counter()
    print(f"CTRs calcualted in {finish_time_2 - finish_time_1} seconds!")  

    # 2) determine what ads are chosen
    # find the optimal ads and save them and their corresponding ctr's in the dataframe
    create_chosen_ad_columns(data, user_visit_no=1, criteria=criteria)
    finish_time_3 = time.perf_counter()
    print(f"Choosing Ads Finished in {finish_time_3 - finish_time_2} seconds!") 


    finish_time = time.perf_counter()
    print(f"All Repeats finished in {finish_time - start_time} seconds!")
    filename = f"..\\results\\Full Model\\Simulation Results\\Simluation Results - SQRT N Sub {int(subsampling_ratio * 100)}.dta"
    data.to_stata(filename)
    return data

In [8]:

simulate_monopoly_sub(data, vals_data, criteria=config.my_criteria, subsampling_ratio=config.subsampling_ratio)


Calculating TEs Started!
finished calculating te's for rank 101 in 741.3126891000002 seconds
TEs calcualted in 741.3694718000002 seconds!
finished calculating base ad ctr in 9.146428000000014 seconds
Base Ad CTR calcualted in 9.154519600000185 seconds!
finished calculating y_i's in 29.130377299999964 seconds
CTRs calcualted in 29.130573499999628 seconds!
Choosing Ads Finished in 1055.7877404000005 seconds!
All Repeats finished in 1835.4426026000006 seconds!


Unnamed: 0,global_token_new,c_1,c_2,c_3,c_4,c_5,c_6,c_7,c_8,c_9,...,y_99,rev_99,y_100_sub,rev_100_sub,y_100,rev_100,y_101_sub,rev_101_sub,y_101,rev_101
0,12.0,0,0,0,0,0,0,0,0,0,...,0.004700,,0.004603,,0.003774,,0.003272,,0.003763,
2,19.0,0,0,0,0,0,0,0,0,0,...,0.010352,,0.010255,,0.009426,,0.009261,,0.009513,
5,21.0,0,0,0,0,0,0,0,0,0,...,0.005990,,0.005893,,0.005064,,0.003734,,0.005068,
7,38.0,0,0,0,0,0,0,0,0,0,...,0.000827,,0.000730,,0.000000,,0.000000,,0.000000,
10,43.0,0,0,0,0,0,0,0,0,0,...,0.002513,,0.002415,,0.001586,,0.001026,,0.001584,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
965255,1617510.0,0,0,0,0,0,0,0,0,0,...,0.005674,,0.005576,,0.004748,,0.002553,,0.005183,
965256,1617512.0,0,0,0,0,0,0,0,0,0,...,0.002410,,0.002313,,0.001484,,0.000758,,0.001392,
965258,1617513.0,0,0,0,0,0,0,0,0,0,...,0.001177,,0.001079,,0.000251,,0.000000,,0.000147,
965259,1617546.0,0,0,0,0,0,0,0,0,0,...,0.003769,,0.003671,,0.002843,,0.002197,,0.002900,
