In [1]:
import numpy as np
import pandas as pd
from econml.dml import CausalForestDML
import matplotlib.pyplot as plt
import os
from sklearn.linear_model import Lasso, LassoCV, LogisticRegression, LogisticRegressionCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split, RandomizedSearchCV
from sklearn.base import BaseEstimator
from econml.sklearn_extensions.model_selection import GridSearchCVList
import time
import joblib
import multiprocessing
import pickle


import config
from utils import *


  def _pt_shuffle_rec(i, indexes, index_mask, partition_tree, M, pos):
  def delta_minimization_order(all_masks, max_swap_size=100, num_passes=2):
  def _reverse_window(order, start, length):
  def _reverse_window_score_gain(masks, order, start, length):
  def _mask_delta_score(m1, m2):
  def identity(x):
  def _identity_inverse(x):
  def logit(x):
  def _logit_inverse(x):
  def _build_fixed_single_output(averaged_outs, last_outs, outputs, batch_positions, varying_rows, num_varying_rows, link, linearizing_weights):
  def _build_fixed_multi_output(averaged_outs, last_outs, outputs, batch_positions, varying_rows, num_varying_rows, link, linearizing_weights):
  def _init_masks(cluster_matrix, M, indices_row_pos, indptr):
  def _rec_fill_masks(cluster_matrix, indices_row_pos, indptr, indices, M, ind):
  def _single_delta_mask(dind, masked_inputs, last_mask, data, x, noop_code):
  def _delta_masking(masks, x, curr_delta_inds, varying_rows_out,
  def _jit_build_partition_tree(xmin, xmax, ymi

rank 20 model loaded!
rank 40 model loaded!
rank 60 model loaded!
rank 80 model loaded!
rank 100 model loaded!


In [2]:
# create ranks_list
with open("..\\results\main_scenario\\ranks_list.pickle", "rb") as file:
    ranks_list = pickle.load(file)

ranks_list.pop(0)
ranks_list.pop(-1)



# import forests:
for rank in ranks_list:
    cf =  joblib.load(f'..\\results\\split 1\\CF - Rank {rank}.pkl')
    exec(f"cf_{rank}_s1 = cf")
    if rank % 20 == 0:
        print(f"rank {rank} model loaded!")


for rank in ranks_list:
    cf =  joblib.load(f'..\\results\\split 2\\CF - Rank {rank}.pkl')
    exec(f"cf_{rank}_s2 = cf")
    if rank % 20 == 0:
        print(f"rank {rank} model loaded!")

rank 20 model loaded!
rank 40 model loaded!
rank 60 model loaded!
rank 80 model loaded!
rank 100 model loaded!
rank 20 model loaded!
rank 40 model loaded!
rank 60 model loaded!
rank 80 model loaded!
rank 100 model loaded!


In [3]:

# For ignoring the warnings
from warnings import simplefilter 
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)
pd.options.mode.chained_assignment = None


base_ad = 50
max_adv_rank = 100
max_visit_no = 100 # max number of page visits by each user

In [4]:


# read data files
data_main = pd.read_stata("..\\data\\Simulation Data - Last 2 Days.dta")
data_s1 = pd.read_stata("..\\data\\Simulation Data - Last 2 Days - split 1.dta")
data_s2 = pd.read_stata("..\\data\\Simulation Data - Last 2 Days - split 2.dta")


In [5]:
for data in [data_main, data_s1, data_s2]:
    # Chunk the data
    chunk_users_num = 100000
    n_chunks = int(data.global_token_new.max() / chunk_users_num) + 1
    data['chunk'] = ((data['global_token_new'] / chunk_users_num).astype(int) + 1)


In [6]:
# replace user_visit_no with user_visit_no
data_s1['user_visit_no'] = data_s1['user_visit_no_actual']
data_s2['user_visit_no'] = data_s2['user_visit_no_actual']

In [7]:
# create data chunks
data_frames = [data_main, data_s1, data_s2]
df_names = ["data_main", "data_s1", "data_s2"]

for i in range(len(data_frames)):
    # create data chunks: data_chunk_1, ...
    # list_name = f"{data}_chunks"
    working_df = data_frames[i]
    # exec(f"{data}_chunks = []")
    
    for chunk in range(1, n_chunks + 1):
        var_name = f"{df_names[i]}_chunk_{chunk}"
        globals()[var_name] = working_df[working_df['chunk'] == chunk]



In [9]:
data_main.columns[-100:]

Index(['r_74', 'r_75', 'r_76', 'r_77', 'r_78', 'r_79', 'r_80', 'r_81', 'r_82',
       'r_83', 'r_84', 'r_85', 'r_86', 'r_87', 'r_88', 'r_89', 'r_90', 'r_91',
       'r_92', 'r_93', 'r_94', 'r_95', 'r_96', 'r_97', 'r_98', 'r_99', 'r_100',
       'impression_timestamp', 'publisher_id', 'dsp_new', 'publisher_rank',
       'publisher_subject', 'advertiser_rank', 'is_clicked',
       'impression_repeat', 'previous_clicks', 'previous_clicks_all_ads',
       'impression_repeat_base_ad', 'previous_clicks_base_ad', 'total_visits',
       'event_no', 'visit_s1', 'visit_s2', 'visit_s3', 'visit_s4', 'visit_s5',
       'visit_s6', 'visit_s7', 'visit_s8', 'visit_s9', 'visit_s10',
       'visit_s11', 'visit_s12', 'visit_s13', 'visit_s14', 'visit_s15',
       'visit_s16', 'visit_s17', 'visit_s18', 'visit_s19', 'visit_s20',
       'visit_s21', 'visit_s22', 'visit_s23', 'visit_s24', 'visit_s25',
       'visit_s26', 'sub_1', 'sub_2', 'sub_3', 'sub_4', 'sub_5', 'sub_6',
       'sub_7', 'sub_8', 'sub_9', '

In [13]:
start_time = time.perf_counter()
def simulate(data_main, data_s1, data_s2):
    # file_name = f"data_chunk_{chunk}"
    # create empty columns in the dataframe to fill later
    create_chosen_ad_vars(data_main)
    create_chosen_ad_vars(data_s1)
    create_chosen_ad_vars(data_s2)
    
    # print(f"\n\n\n=======> Chunk #{chunk}")
    start_time = time.perf_counter()


    for i in range(1, 2): #range(1, max_visit_no + 1):

        start_time_1 = time.perf_counter()
        print(f"\n\n --->Repeat #{i}:")
        # 1) calculate treatment effects, and base ad ctr, then sum them sup and create ctrs for all ads
        # start_time = time.perf_counter()
        # a) calc TEs and CTRs on s1
        calc_tes(data=data_s1, user_visit_no=i, ranks_list=config.ranks_list)
        calc_base_ad_ctr(data=data_s1, user_visit_no=i)
        calc_ctrs(data=data_s1, user_visit_no=i)

        # b) calc TEs and CTRs on s2
        calc_tes(data=data_s2, user_visit_no=i, ranks_list=config.ranks_list)
        calc_base_ad_ctr(data=data_s2, user_visit_no=i)
        calc_ctrs(data=data_s2, user_visit_no=i)


        # 2) determine what ads are chosen
        # a. create empty columns in the dataframe to fill later
        start_time_2 = time.perf_counter()
        # find the optimal ads and save them and their corresponding ctr's in the dataframe
        # on s1
        create_chosen_ad_columns(data= data_s1, user_visit_no=i)
        finish_time_2 = time.perf_counter()

        # on s2
        create_chosen_ad_columns(data= data_s2, user_visit_no=i)
  
        # 3) Update repeats and clicks for the next impressions
        # start_time_1 = time.perf_counter()
        # update_repeats(data, user_visit_no=i)
        # update_clicks(data, user_visit_no=i)

        # finish_time_1 = time.perf_counter()

        # print(f"Repeat {i} finished in {finish_time_1 - start_time_1} seconds!")


    finish_time = time.perf_counter()
    print(f"All Repeats finished in {finish_time - start_time} seconds!")
    return data


In [14]:

simulate(data_main_chunk_1, data_s1_chunk_1, data_s2_chunk_1)




 --->Repeat #1:
finished calculating te's for rank 100 in 65.86257810000006 seconds
finished calculating te's for rank 100 in 74.58812409999996 seconds
All Repeats finished in 252.20644700000003 seconds!


Unnamed: 0,global_token_new,impression_timestamp,publisher_rank,user_visit_no_actual,c_1,c_2,c_3,c_4,c_5,c_6,...,chosen_ad_click_dummy_7,chosen_ad_click_dummy_8,chosen_ad_click_dummy_9,chosen_ad_click_dummy_10,chosen_ad_click_dummy_11,chosen_ad_click_dummy_12,chosen_ad_click_dummy_13,chosen_ad_click_dummy_14,chosen_ad_click_dummy_15,num_ads
0,3.0,2012-09-07 17:31:04,631,1,0,0,0,0,0,0,...,,,,,,,,,,
1,3.0,2012-09-07 17:31:23,631,2,0,0,0,0,0,0,...,,,,,,,,,,
2,6.0,2012-09-07 08:59:56,258,1,0,0,0,0,0,0,...,,,,,,,,,,
3,6.0,2012-09-07 10:48:09,102,8,0,0,0,0,0,0,...,,,,,,,,,,
4,6.0,2012-09-07 10:57:51,27,9,0,0,0,0,0,0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2445552,1617553.0,2012-09-07 20:18:48,131,4,0,0,0,0,0,0,...,,,,,,,,,,
2445553,1617553.0,2012-09-08 14:54:59,170,6,0,0,0,0,0,0,...,,,,,,,,,,
2445554,1617554.0,2012-09-07 16:55:38,26,1,0,0,0,0,0,0,...,,,,,,,,,,
2445555,1617555.0,2012-09-08 10:39:36,358,1,0,0,0,0,0,0,...,,,,,,,,,,


In [154]:
def simulate_split_data(data_s, data_main, user_visit_no):
    """
    This function takes the chosen ads on data_s, and calculates their (actual) ctr's from the main data. 
    """

    for index, row in data_s[data_s['user_visit_no'] == user_visit_no].iterrows():
        user_id = row['global_token_new']
        visit_no = row['user_visit_no']
        # find the row in data_main with the same user_id and visit_no as row:
        data_main_index = data_main[(data_main['global_token_new'] == user_id) & (data_main['user_visit_no'] == visit_no)].index[0]
        data_main_row = data_main[(data_main['global_token_new'] == user_id) & (data_main['user_visit_no'] == visit_no)].iloc[0]
        last_var = f"chosen_ad_{config.max_ads_per_page}"
        data_main.loc[data_main_index, 'chosen_ad_1': last_var] = data_s.loc[index, 'chosen_ad_1': last_var]
        data_main.at[data_main_index, 'num_ads'] = data_s.at[index, 'num_ads']
        # print(data_main.loc[data_main_index, 'chosen_ad_1': last_var])
        # now calculate the y for the chosen ads
        # first calculate y0
        X = data_main.loc[data_main_index: data_main_index, ['previous_clicks_all_ads',
        'impression_repeat_base_ad', 'previous_clicks_base_ad', 'total_visits',
        'visit_s1', 'visit_s2', 'visit_s3', 'visit_s4', 'visit_s5', 'visit_s6',
        'visit_s7', 'visit_s8', 'visit_s9', 'visit_s10', 'visit_s11',
        'visit_s12', 'visit_s13', 'visit_s14', 'visit_s15', 'visit_s16',
        'visit_s17', 'visit_s18', 'visit_s19', 'visit_s20', 'visit_s21',
        'visit_s22', 'visit_s23', 'visit_s24', 'visit_s25', 'visit_s26',
        'sub_1', 'sub_2', 'sub_3', 'sub_4', 'sub_5', 'sub_6', 'sub_7', 'sub_8',
        'sub_9', 'sub_10', 'sub_11', 'sub_12', 'sub_13', 'sub_14', 'sub_15',
        'sub_16', 'sub_17', 'sub_18', 'sub_19', 'sub_20', 'sub_21', 'sub_22',
        'sub_23', 'sub_24', 'sub_25', 'sub_26', 'mobile']]

        
        base_ad_ctr_var = f"y_{base_ad}"
        base_ad_te_var = f"te_{base_ad}"

        data_main.loc[data_main_index, base_ad_ctr_var] = config.base_ad_y_model.predict(X)
        data_main.loc[data_main_index, base_ad_te_var] = 0

        # then, calculate TEs and CTRs for chosen ads
        for chosen_ad_no in range(1, int(row['num_ads']) + 1):
            var_name = f"chosen_ad_{chosen_ad_no}"
            chosen_ad = int(data_s.at[index, var_name])
            if chosen_ad != base_ad:
                te_var = f'te_{chosen_ad}'
                ctr_var = f'y_{chosen_ad}'


                # Construct X variable for the input to the causal forest
                X = construct_X(data=data_main.loc[data_main_index:data_main_index, :], user_visit_no=user_visit_no, ad_rank=chosen_ad)
                # estimate the TE for X and ad number {chosen_ad}
                exec(f"data_main.loc[data_main_index, te_var] = config.cf_{chosen_ad}.const_marginal_effect(X)")
                # calculate the ctr
                data_main.loc[data_main_index, ctr_var] = data_main.loc[data_main_index, te_var] + data_main.loc[data_main_index, base_ad_ctr_var]
                # copy the ctr to split dataset
                ctr_var_s = f'y_{chosen_ad}_actual'
                data_s.loc[index, ctr_var_s] = data_main.loc[data_main_index, ctr_var]

        #update repeats on main dataset
        update_repeats_on_main_data(data_main, data_main_index)

        # update clicks on both main and split datasets
        total_clicks_on_impression = 0
        for chosen_ad_no in range(1, int(row['num_ads']) + 1):
            var_name = f"chosen_ad_{chosen_ad_no}"
            chosen_ad = int(data_s.at[index, var_name])
            ctr_var = f'y_{chosen_ad}'
            col_name = f'c_{chosen_ad}' # the column name to be updated (if ad 5 is clicked on, c_5 will increase by 1 for all subsequent impressions)
            click_dummy_var =f'chosen_ad_click_dummy_{chosen_ad_no}'
            rand_click = np.random.rand()   # a random number simulating user's click. User will click if rand_click < y_{chosen_ad}
            data_main.at[data_main_index, click_dummy_var] = int(rand_click <= data_main.at[data_main_index, ctr_var])
            data_s.at[index, click_dummy_var] = data_main.at[data_main_index, click_dummy_var]
            total_clicks_on_impression += data_main.loc[data_main_index, click_dummy_var]
            
            # update click on main data
            data_main.loc[((data_main['global_token_new'] == data_main.at[data_main_index, 'global_token_new'])
                            & (data_main['user_visit_no'] > data_main.at[data_main_index, 'user_visit_no'])), col_name] = int(data_main.at[data_main_index, col_name] + data_main.at[data_main_index, click_dummy_var])

            # update click on split data
            data_s.loc[((data_s['global_token_new'] == data_s.at[index, 'global_token_new'])
                            & (data_s['user_visit_no'] > data_s.at[index, 'user_visit_no'])), col_name] = int(data_s.at[index, col_name] + data_s.at[index, click_dummy_var])


        # update total number of clicks on both datasets
        # a) main data
        data_main.loc[((data_main['global_token_new'] == data_main.at[data_main_index, 'global_token_new']) 
                        & (data_main['user_visit_no'] > data_main.at[data_main_index, 'user_visit_no'])), 'previous_clicks_all_ads'] = int(data_main.at[data_main_index, 'previous_clicks_all_ads'] + total_clicks_on_impression)

        # b) split data
        data_s.loc[((data_s['global_token_new'] == data_s.at[index, 'global_token_new']) 
                        & (data_s['user_visit_no'] > data_s.at[index, 'user_visit_no'])), 'previous_clicks_all_ads'] = int(data_s.at[index, 'previous_clicks_all_ads'] + total_clicks_on_impression)



    # update repeats on split dataset (Note that since I have a pre-written function for this, I am doing this update separately from other updates and outside the for loop for all rows)
    update_repeats(data_s, user_visit_no)






In [156]:

def update_repeats_on_main_data(data_main, data_main_index):
    """
    This function updates the number of previous impression on data after user visit number user_visit_no in the main dataset. This is used to update the actual number of repeats 
    For example, after a user visits a page for the first time, and observes optimal ads (say ads 2, 5, 10), the initial impressions for all subsequent visits of that user, the number of previous impressions on ads 2, 5, 10 increases by 1. 
    """
    for chosen_ad_no in range(1, int(data_main.at[data_main_index, 'num_ads']) + 1):
        var_name = f"chosen_ad_{chosen_ad_no}"
        chosen_ad = int(data_main.at[data_main_index, var_name])
        col_name = f'r_{chosen_ad}'
        data_main.loc[((data_main['global_token_new'] == (data_main.at[data_main_index, 'global_token_new'])) & (data_main['user_visit_no'] > data_main.at[data_main_index, 'user_visit_no'])), col_name] = data_main.at[data_main_index, col_name] + 1




In [157]:
simulate_split_data(data_s1_chunk_1, data_main_chunk_1, user_visit_no=1)

ValueError: cannot convert float NaN to integer

In [158]:
data_main_chunk_1.previous_clicks_all_ads

0         0
1         0
2         0
3         0
4         0
         ..
299622    0
299623    0
299624    0
299625    0
299626    0
Name: previous_clicks_all_ads, Length: 299627, dtype: int8

In [122]:
X = data_main.loc[0:0, ['impression_repeat', 'previous_clicks', 'previous_clicks_all_ads',
        'impression_repeat_base_ad', 'previous_clicks_base_ad', 'total_visits',
        'visit_s1', 'visit_s2', 'visit_s3', 'visit_s4', 'visit_s5', 'visit_s6',
        'visit_s7', 'visit_s8', 'visit_s9', 'visit_s10', 'visit_s11',
        'visit_s12', 'visit_s13', 'visit_s14', 'visit_s15', 'visit_s16',
        'visit_s17', 'visit_s18', 'visit_s19', 'visit_s20', 'visit_s21',
        'visit_s22', 'visit_s23', 'visit_s24', 'visit_s25', 'visit_s26',
        'sub_1', 'sub_2', 'sub_3', 'sub_4', 'sub_5', 'sub_6', 'sub_7', 'sub_8',
        'sub_9', 'sub_10', 'sub_11', 'sub_12', 'sub_13', 'sub_14', 'sub_15',
        'sub_16', 'sub_17', 'sub_18', 'sub_19', 'sub_20', 'sub_21', 'sub_22',
        'sub_23', 'sub_24', 'sub_25', 'sub_26', 'mobile']]

In [123]:
X

Unnamed: 0,impression_repeat,previous_clicks,previous_clicks_all_ads,impression_repeat_base_ad,previous_clicks_base_ad,total_visits,visit_s1,visit_s2,visit_s3,visit_s4,...,sub_18,sub_19,sub_20,sub_21,sub_22,sub_23,sub_24,sub_25,sub_26,mobile
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [109]:
base_ad_str = f"r_{base_ad}"
X['impression_repeat_base_ad'] = data_main.loc[data_main_index, base_ad_str] + 1  # +1 is because r_* shows previous impressions, but impression repeat is the number of repeats (including current one)

base_ad_str = f"c_{base_ad}"
X['previous_clicks_base_ad'] = data_main.loc[data_main_index, base_ad_str]

# b) construct each ad's initial clicks and repeats
str = f"r_{chosen_ad}"
X ['impression_repeat'] = data_main.loc[data_main_index, str] + 1  # +1 is because r_* shows previous impressions, but impression repeat is the number of repeats (including current one)
str = f"c_{chosen_ad}"
X['user_visit_no'] = data_main.loc[data_main_index, str]

NameError: name 'data_main_index' is not defined

In [121]:
config.cf_83.const_marginal_effect(X)

AssertionError: Dimension mis-match of X with fitted X