In [1]:
import numpy as np
import pandas as pd
from econml.dml import CausalForestDML
import matplotlib.pyplot as plt
import os
from sklearn.linear_model import Lasso, LassoCV, LogisticRegression, LogisticRegressionCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split, RandomizedSearchCV
from sklearn.base import BaseEstimator
from econml.sklearn_extensions.model_selection import GridSearchCVList
import time
import joblib
import multiprocessing
import pickle




import config
from utils import *


# NOTE(review): presumably the worker count for a chunked/parallel run; it is not used in the cells visible here.
n_processes = 8

# Ad-selection criterion taken from the project config; the functions below branch on "CTR" and "revenue".
criteria = config.my_criteria


# Silence pandas PerformanceWarning and every UserWarning, and switch off the
# chained-assignment (SettingWithCopy) warning for the whole session.
from warnings import simplefilter, filterwarnings
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)
filterwarnings("ignore", category=UserWarning)
pd.options.mode.chained_assignment = None



base_ad = 50  # NOTE(review): presumably the rank of the baseline ad -- confirm against utils
max_adv_rank = 100
max_visit_no = 100 # max number of page visits by each user

split_no_1 = config.split_no_1
split_no_2 = config.split_no_2


# read data
# Paths are assembled with os.path.join so they resolve on any OS; on Windows the
# resulting strings are identical to the previous hard-coded "..\\data\\Full Model\\..." ones.
_full_model_data_dir = os.path.join("..", "data", "Full Model")
data = pd.read_stata(os.path.join(_full_model_data_dir, f"Simulation Data - Full Model - Split {split_no_1} {split_no_2} - Subsample.dta"))
vals_data = pd.read_stata(os.path.join(_full_model_data_dir, "Advertiser Valuations.dta"))


Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)


rank 20 model loaded!
rank 40 model loaded!
rank 60 model loaded!
rank 80 model loaded!
rank 100 model loaded!
rank 20 model loaded!
rank 40 model loaded!
rank 60 model loaded!
rank 80 model loaded!
rank 100 model loaded!
rank 20 model loaded!
rank 40 model loaded!
rank 60 model loaded!
rank 80 model loaded!
rank 100 model loaded!


In [2]:
# Reload the subsample from disk and keep only rows with global_token_new <= 200.
# The path is built with os.path.join so it resolves on any OS (same string as before on Windows).
data = pd.read_stata(os.path.join("..", "data", "Full Model", f"Simulation Data - Full Model - Split {split_no_1} {split_no_2} - Subsample.dta"))
# Take an explicit copy: the simulation helpers write into `data` in place, so it
# should be an independent frame rather than the result of a filter on another one.
# The original row labels are kept (no reset_index).
data = data.loc[data['global_token_new'] <= 200].copy()

In [3]:

def simulate_duopoly(data, vals_data, criteria):
    """
    Runs the visit-by-visit simulation for both splits and returns `data`.

    For every visit number from 1 to the module-level `max_visit_no`, four steps
    are executed, each first for `config.split_no_1` and then for `config.split_no_2`:
        1) treatment effects, base-ad CTR and per-ad CTRs
        2) choosing the ads (according to `criteria`)
        3) updating repeats
        4) updating clicks
    Wall-clock timings of each step, each repeat and the whole run are printed.

    Inputs:
        - data: dataframe handed to every helper; the helpers' return values are
          ignored, so they are presumably expected to modify it in place
        - vals_data: forwarded to calc_split_ctrs
        - criteria: forwarded to create_chosen_ad_columns_split

    Returns:
        - data: the same object that was passed in
    """
    # the author's note for this helper: create empty columns in the dataframe to fill later
    create_chosen_split_ad_vars(data)

    both_splits = (config.split_no_1, config.split_no_2)
    run_started = time.perf_counter()

    for visit_no in range(1, max_visit_no + 1):
        repeat_started = time.perf_counter()
        print(f"\n\n --->Repeat #{visit_no}:")

        # 1) treatment effects, base ad CTR, then CTRs for all ads -- all three for s1, then all three for s2
        step_started = time.perf_counter()
        for split_no in both_splits:
            calc_split_tes(data, split_no=split_no, user_visit_no=visit_no, ranks_list=config.ranks_list)
            calc_base_ad_split_ctr(data, split_no=split_no, user_visit_no=visit_no)
            calc_split_ctrs(data, vals_data, split_no=split_no, user_visit_no=visit_no, ranks_list=config.ranks_list)
        step_elapsed = time.perf_counter() - step_started
        print(f"Step 1 of repeat {visit_no} finished in {step_elapsed} seconds!")

        # 2) pick the ads for this visit and store them (with their CTRs) in the dataframe
        step_started = time.perf_counter()
        for split_no in both_splits:
            create_chosen_ad_columns_split(data, split_no=split_no, user_visit_no=visit_no, criteria=criteria)
        step_elapsed = time.perf_counter() - step_started
        print(f"Step 2 of repeat {visit_no} finished in {step_elapsed} seconds!")

        # 3) update repeats
        step_started = time.perf_counter()
        for split_no in both_splits:
            update_repeats_on_main_and_split(data, split_no=split_no, user_visit_no=visit_no)
        step_elapsed = time.perf_counter() - step_started
        print(f"Step 3 of repeat {visit_no} finished in {step_elapsed} seconds!")

        # 4) update clicks
        step_started = time.perf_counter()
        for split_no in both_splits:
            update_clicks_on_main_and_split(data, split_no=split_no, user_visit_no=visit_no)
        step_elapsed = time.perf_counter() - step_started
        print(f"Step 4 of repeat {visit_no} finished in {step_elapsed} seconds!")

        repeat_elapsed = time.perf_counter() - repeat_started
        print(f"Repeat {visit_no}  finished in  {repeat_elapsed} seconds!")

    run_elapsed = time.perf_counter() - run_started
    print(f"All Repeats finished in {run_elapsed} seconds!")
    return data


def simulate_and_save_chunk(chunk_data, chunk_id, criteria):
    """
    Runs simulate_duopoly on one chunk of the data and writes the result to a .dta file.

    Inputs:
        - chunk_data: dataframe chunk to simulate
        - chunk_id: zero-based chunk number; the file name uses chunk_id + 1
        - criteria: "CTR" or "revenue"; selects the ad-choice rule and the file name
          (revenue runs get a " Revenue Max" marker in the name)

    Returns:
        - None (the result is only written to disk)

    Raises:
        - ValueError: if criteria is neither "CTR" nor "revenue"
    """
    import os  # local import keeps this block self-contained

    # Validate before the (long) simulation starts. Previously an unknown criteria
    # left `filename` unassigned and failed only after all the work had been done.
    name_marker_by_criteria = {"CTR": "", "revenue": " Revenue Max"}
    if criteria not in name_marker_by_criteria:
        raise ValueError(f"criteria must be 'CTR' or 'revenue', got {criteria!r}")

    chunk_data = simulate_duopoly(chunk_data, vals_data, criteria)

    # Create a unique filename for the chunk.
    # NOTE: "Simluation" (sic) is kept as-is so the file names stay the same as before.
    filename = os.path.join(
        "..", "results", "Full Model", "Simulation Results",
        f"Simluation Results - Split {split_no_1} {split_no_2}{name_marker_by_criteria[criteria]} - chunk {chunk_id+1}.dta",
    )

    # Save the processed DataFrame to DTA
    chunk_data.to_stata(filename)



In [17]:


def find_optimal_split_ads(row, criteria):
    """
    Chooses the ads to show for the impression in `row` by ranking the split-level
    scores, and returns the chosen ads together with their own metrics.

    Inputs:
        - row: the dataframe row (a Series) the function is applied to. It has to
          include "ads_on_page" (how many ads the page can show), the split-level
          columns y_{ad}_s / rev_{ad}_s, and the columns y_{ad} / rev_{ad} for every
          ad that can be chosen
        - criteria: "CTR" ranks the ads by y_{ad}_s, "revenue" ranks them by rev_{ad}_s

    Returns (all in the same, best-first order):
        - chosen_ads: list of the chosen ad numbers (ints)
        - chosen_ad_ys_split: array with y_{ad}_s of each chosen ad
        - chosen_ad_revs_split: array with rev_{ad}_s of each chosen ad
        - chosen_ad_ys_actual: list with y_{ad} of each chosen ad
        - chosen_ad_revs_actual: list with rev_{ad} of each chosen ad
      A split-level column that does not exist for a chosen ad yields NaN.

    Raises:
        - ValueError: if criteria is neither "CTR" nor "revenue"
    """
    prefix_by_criteria = {"CTR": "y_", "revenue": "rev_"}
    if criteria not in prefix_by_criteria:
        raise ValueError(f"criteria must be 'CTR' or 'revenue', got {criteria!r}")
    prefix = prefix_by_criteria[criteria]

    # Split-level score columns of the ranking criterion; anchored at both ends so
    # only names of the form "<prefix><ad>_s" are picked up.
    ranking_scores = row.filter(regex=rf'^{prefix}.*_s$', axis=0)
    ranked_columns = ranking_scores.sort_values(ascending=False).index.to_list()

    # number of ads to be shown on this visit; cast to int because the value read
    # from the row may be a float, which cannot be used as a slice bound
    n_ads = max(0, int(min(row['ads_on_page'], config.max_ads_per_page)))

    # strip the prefix and the trailing "_s": "y_25_s" / "rev_25_s" -> 25
    chosen_ads = [int(name[len(prefix):-2]) for name in ranked_columns[:n_ads]]

    # Look every metric up by the chosen ad itself. Taking the top values of
    # independently sorted y_*_s and rev_*_s series would pair the chosen ads with
    # values that belong to different ads whenever the two rankings differ.
    chosen_ad_ys_split = row.reindex([f"y_{ad}_s" for ad in chosen_ads]).values
    chosen_ad_revs_split = row.reindex([f"rev_{ad}_s" for ad in chosen_ads]).values
    chosen_ad_ys_actual = [row[f"y_{ad}"] for ad in chosen_ads]
    chosen_ad_revs_actual = [row[f"rev_{ad}"] for ad in chosen_ads]

    return chosen_ads, chosen_ad_ys_split, chosen_ad_revs_split, chosen_ad_ys_actual, chosen_ad_revs_actual



    
def create_chosen_ad_columns_split(data, split_no, user_visit_no, criteria): 
    """
    Finds the optimal ads for the rows of `data` with the given `user_visit_no` and
    `split`, and stores the outcome in `data` in place.

    For a row with n chosen ads, slot k = 1..n receives:
        - chosen_ad_{k}:        the k-th chosen ad
        - chosen_ad_y_{k}_s:    its split-level value from find_optimal_split_ads
        - chosen_ad_rev_{k}_s:  its split-level revenue value
        - chosen_ad_y_{k}:      its actual value
        - chosen_ad_rev_{k}:    its actual revenue value
    and 'num_ads' is set to n. Rows for which no ad is chosen only get num_ads = 0.

    Inputs:
        - data: dataframe holding 'user_visit_no', 'split', 'num_ads', the columns
          find_optimal_split_ads reads, and the chosen_ad_* columns listed above
        - split_no: value of 'split' to process
        - user_visit_no: value of 'user_visit_no' to process
        - criteria: forwarded to find_optimal_split_ads

    Returns:
        - None
    """
    rows_to_fill = (data['user_visit_no'] == user_visit_no) & (data['split'] == split_no)

    for index, row in data[rows_to_fill].iterrows():

        chosen_ads, chosen_ad_ys_split, chosen_ad_revs_split, chosen_ad_ys_actual, chosen_ad_revs_actual = find_optimal_split_ads(row, criteria)
        n_chosen = len(chosen_ads)

        if n_chosen:
            # Address the target columns by name instead of a 'first':'last' label
            # slice: the result then does not depend on the column order, and an
            # empty choice never refers to the non-existent 'chosen_ad_0'.
            slots = range(1, n_chosen + 1)
            data.loc[index, [f"chosen_ad_{k}" for k in slots]] = [int(ad) for ad in chosen_ads]
            data.loc[index, [f"chosen_ad_y_{k}_s" for k in slots]] = list(chosen_ad_ys_split)
            data.loc[index, [f"chosen_ad_rev_{k}_s" for k in slots]] = list(chosen_ad_revs_split)
            data.loc[index, [f"chosen_ad_y_{k}" for k in slots]] = list(chosen_ad_ys_actual)
            data.loc[index, [f"chosen_ad_rev_{k}" for k in slots]] = list(chosen_ad_revs_actual)

        data.at[index, 'num_ads'] = n_chosen





In [25]:
# Run the whole simulation on the filtered dataframe. simulate_duopoly returns the
# dataframe it was given; the return value is not bound to a name here.
# The run recorded below was stopped by hand (KeyboardInterrupt).
simulate_duopoly(data, vals_data, criteria)



 --->Repeat #1:
finished calculating te's for rank 101 in 38.38664640000002 seconds
finished calculating te's for rank 101 in 37.81934490000003 seconds
Step 1 of repeat 1 finished in 83.03002319999996 seconds!
y_50_s     0.005163
y_1_s      0.006136
y_2_s       0.00559
y_3_s      0.005154
y_4_s      0.005629
             ...   
y_96_s     0.004157
y_98_s     0.004973
y_99_s     0.006585
y_100_s    0.006478
y_101_s    0.005267
Name: 5, Length: 96, dtype: object
rev_1_s      0.015206
rev_2_s      0.009196
rev_3_s      0.010083
rev_4_s      0.027165
rev_5_s      0.009474
               ...   
rev_96_s     0.012547
rev_98_s     0.018905
rev_99_s     0.049308
rev_100_s    0.027148
rev_101_s    0.018059
Name: 5, Length: 95, dtype: object
['rev_45_s']
y_50_s          0.0
y_1_s      0.000947
y_2_s      0.000409
y_3_s      0.000334
y_4_s      0.000465
             ...   
y_96_s          0.0
y_98_s          0.0
y_99_s     0.001422
y_100_s    0.001315
y_101_s         0.0
Name: 9, Length: 96, dt

KeyboardInterrupt: 

In [5]:
# Manual run of step 1 of the simulation for the first visit only:
# treatment effects, base-ad CTR and per-ad CTRs, first on s1 and then on s2.
for split_no in (config.split_no_1, config.split_no_2):
    calc_split_tes(data, split_no=split_no, user_visit_no=1, ranks_list=config.ranks_list)
    calc_base_ad_split_ctr(data, split_no=split_no, user_visit_no=1)
    calc_split_ctrs(data, vals_data, split_no=split_no, user_visit_no=1, ranks_list=config.ranks_list)


finished calculating te's for rank 101 in 27.451940000000008 seconds
finished calculating te's for rank 101 in 36.130911999999995 seconds


In [24]:
# Manual run of step 2 (ad choice) for the first visit on split 1 only.
# NOTE(review): the text printed below is not produced by any print call in the
# function definitions visible in this notebook -- presumably it came from a
# debugging version of the function that was live in the kernel at the time.
create_chosen_ad_columns_split(data, split_no=config.split_no_1, user_visit_no=1, criteria=criteria)


y_50_s     0.005163
y_1_s      0.006136
y_2_s       0.00559
y_3_s      0.005154
y_4_s      0.005629
             ...   
y_96_s     0.004157
y_98_s     0.004973
y_99_s     0.006585
y_100_s    0.006478
y_101_s    0.005267
Name: 5, Length: 96, dtype: object
rev_1_s      0.015206
rev_2_s      0.009196
rev_3_s      0.010083
rev_4_s      0.027165
rev_5_s      0.009474
               ...   
rev_96_s     0.012547
rev_98_s     0.018905
rev_99_s     0.049308
rev_100_s    0.027148
rev_101_s    0.018059
Name: 5, Length: 95, dtype: object
['rev_45_s']
y_50_s          0.0
y_1_s      0.000947
y_2_s      0.000409
y_3_s      0.000334
y_4_s      0.000465
             ...   
y_96_s          0.0
y_98_s          0.0
y_99_s     0.001422
y_100_s    0.001315
y_101_s         0.0
Name: 9, Length: 96, dtype: object
rev_1_s      0.002345
rev_2_s      0.000673
rev_3_s      0.000653
rev_4_s      0.002246
rev_5_s           0.0
               ...   
rev_96_s          0.0
rev_98_s          0.0
rev_99_s     0.010649
re

In [22]:
# Scratch cell: list the column labels of the dataframe.
# (disabled earlier experiments kept for reference)
# last_chosen_ad_name = 'chosen_ad_6'
# data.loc[1, 'chosen_ad_1'] 
data.columns

Index(['global_token_new', 'mobile', 'impression_timestamp',
       'publisher_rank_sub', 'ads_on_page', 'user_visit_no_s',
       'user_visit_no_actual_s', 'c_1_s', 'c_2_s', 'c_3_s',
       ...
       'rev_99_s', 'rev_99', 'y_100_s', 'y_100', 'rev_100_s', 'rev_100',
       'y_101_s', 'y_101', 'rev_101_s', 'rev_101'],
      dtype='object', length=1059)

In [13]:
# Scratch cell: take the row labelled 1 and show its 15 largest split-level y values.
# (disabled earlier experiment kept for reference)
# sorted_ads = row[rev_cols.index].sort_values(ascending=False).index.to_list()
# sorted_ads
row = data.loc[1, :]
# a bare expression in the middle of a cell is not displayed; only the last line is
row
# same column pattern as in find_optimal_split_ads: names starting with "y_" and ending with "_s"
y_cols = row.filter(regex=r'^y_.*_s$', axis=0)
y_cols.sort_values(ascending=False).values[0:15]

array([0.0096171472699594, 0.006198201050081908, 0.005509540414408121,
       0.0054343158862862374, 0.0051910768885340755, 0.005155984294210044,
       0.00511785798337319, 0.00497956864950309, 0.004924602773008071,
       0.004917545630148194, 0.004874013229196629, 0.004828559944670336,
       0.004826678477374857, 0.0048136091922848494, 0.004793434404420359],
      dtype=object)

In [25]:
# Scratch cell: show the first 15 entries of `sorted_ads`.
# NOTE(review): no earlier cell in this notebook assigns `sorted_ads` at top level,
# so this only works with a value left in the kernel by out-of-order execution;
# it will raise NameError on a fresh Restart & Run All.
sorted_ads[0:15]

rev_45_s    0.059453
rev_15_s    0.041642
rev_73_s    0.041533
rev_70_s    0.041426
rev_38_s    0.037379
rev_71_s    0.033906
rev_87_s    0.032053
rev_94_s    0.031895
rev_99_s    0.030231
rev_80_s    0.030004
rev_66_s    0.029622
rev_31_s    0.028221
rev_49_s    0.026664
rev_18_s    0.025394
rev_26_s    0.024244
Name: 1, dtype: object

In [None]:
# Scratch cell: the entries of `row` selected by rev_cols.index, sorted from largest to smallest.
# NOTE(review): in the code visible here `rev_cols` is only assigned inside
# find_optimal_split_ads, never at top level -- confirm where it is meant to come from.
sorted_ads = row[rev_cols.index].sort_values(ascending=False)

In [63]:
# Scratch cell: recompute the split-level revenue column of the rank-2 ad by hand
# (its split-level y value times that advertiser's valuation) for the rows of the
# first visit on split 1.
y_var_name_split = 'y_2_s'
rev_var_name_split = 'rev_2_s'

first_visit_on_split_1 = (data['user_visit_no'] == 1) & (data['split'] == 1)
rank_2_valuation = vals_data.loc[vals_data['advertiser_rank'] == 2].advertiser_val_cents.squeeze()
data.loc[first_visit_on_split_1, rev_var_name_split] = (
    data.loc[first_visit_on_split_1, y_var_name_split] * rank_2_valuation
)

In [62]:
# Scratch cell: look up advertiser_val_cents for advertiser_rank == 2;
# .squeeze() reduces a one-row selection to a scalar.
# data['rev_2_s']
vals_data.loc[vals_data['advertiser_rank'] == 2].advertiser_val_cents.squeeze()

1.6451094