In [1]:
import numpy as np
import pandas as pd
from econml.dml import CausalForestDML
import matplotlib.pyplot as plt
import os
from sklearn.linear_model import Lasso, LassoCV, LogisticRegression, LogisticRegressionCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split, RandomizedSearchCV
from sklearn.base import BaseEstimator
from econml.sklearn_extensions.model_selection import GridSearchCVList
import time
import joblib
import multiprocessing
import pickle


import config
from utils import *


  def _pt_shuffle_rec(i, indexes, index_mask, partition_tree, M, pos):
  def delta_minimization_order(all_masks, max_swap_size=100, num_passes=2):
  def _reverse_window(order, start, length):
  def _reverse_window_score_gain(masks, order, start, length):
  def _mask_delta_score(m1, m2):
  def identity(x):
  def _identity_inverse(x):
  def logit(x):
  def _logit_inverse(x):
  def _build_fixed_single_output(averaged_outs, last_outs, outputs, batch_positions, varying_rows, num_varying_rows, link, linearizing_weights):
  def _build_fixed_multi_output(averaged_outs, last_outs, outputs, batch_positions, varying_rows, num_varying_rows, link, linearizing_weights):
  def _init_masks(cluster_matrix, M, indices_row_pos, indptr):
  def _rec_fill_masks(cluster_matrix, indices_row_pos, indptr, indices, M, ind):
  def _single_delta_mask(dind, masked_inputs, last_mask, data, x, noop_code):
  def _delta_masking(masks, x, curr_delta_inds, varying_rows_out,
  def _jit_build_partition_tree(xmin, xmax, ymi

rank 20 model loaded!
rank 40 model loaded!
rank 60 model loaded!
rank 80 model loaded!
rank 100 model loaded!
rank 20 model loaded!
rank 40 model loaded!
rank 60 model loaded!
rank 80 model loaded!
rank 100 model loaded!
rank 20 model loaded!
rank 40 model loaded!
rank 60 model loaded!
rank 80 model loaded!
rank 100 model loaded!


In [3]:

# For ignoring the warnings
from warnings import simplefilter 
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)
pd.options.mode.chained_assignment = None


base_ad = 50
max_adv_rank = 100
max_visit_no = 100 # max number of page visits by each user

In [4]:


# read data files
data = pd.read_stata("..\\data\\Simulation Data - Last 2 Days - Aggregate.dta")


In [5]:
# Chunk the data
chunk_users_num = 100000
n_chunks = int(data.global_token_new.max() / chunk_users_num) + 1
data['chunk'] = ((data['global_token_new'] / chunk_users_num).astype(int) + 1)

for chunk in range(1, n_chunks + 1):
        var_name = f"data_chunk_{chunk}"
        globals()[var_name] = data[data['chunk'] == chunk]

In [None]:

# def create_chosen_ad_vars_duopoly(data):
#     """
#     This functions initializes three sets of variable in the dataframe "data":
#     1) chosen_ad_{ad}: shows the rank of the the top {ad} chosen ad, ex: chosen_ad_1 is the rank of the top ad chosen to be shown
#     2)chosen_ad_y_{ad}: shows the corresponding treatment effect of that ad
#     Initially, all these columns are NaN
#     3) num_ads:  number of ads to be shown (currently nan)

#     Inputs:
#     - data: the dataframe

#     """
#     for ad in range(1, config.max_ads_per_page + 1):
#         var_name1 = f"chosen_ad_{ad}"
#         data.loc[:, var_name1] = np.nan


#     for ad in range(1, config.max_ads_per_page + 1):
#         var_name2 = f"chosen_ad_y_estimated_{ad}"
#         data.loc[:, var_name2] = np.nan

#     for ad in range(1, config.max_ads_per_page + 1):
#         var_name2 = f"chosen_ad_y_actual_{ad}"
#         data.loc[:, var_name2] = np.nan

#     for ad in range(1, config.max_ads_per_page + 1):
#         var_name2 = f"chosen_ad_click_dummy_{ad}"
#         data.loc[:, var_name2] = np.nan
#     data.loc[:, 'num_ads'] = np.nan


In [28]:

def construct_split_X(data, split_no, user_visit_no, ad_rank):
    """ 
    This function updates the inputs for estimation so the estimates are for all user visits with a specific user_visit_no, and a specific ad_rank.
    After calling this function, you can estimate the treatment effect for ad ad_rank and the subset of data for which user_visit_no = user_visit_no.
    """
    # Define X variables
    X = data.loc[data['split'] ==split_no, ['impression_repeat_s', 'previous_clicks_s', 'previous_clicks_all_ads_s',
        'impression_repeat_base_ad_s', 'previous_clicks_base_ad_s', 'total_visits_s',
        'visit_s1_s', 'visit_s2_s', 'visit_s3_s', 'visit_s4_s', 'visit_s5_s', 'visit_s6_s',
        'visit_s7_s', 'visit_s8_s', 'visit_s9_s', 'visit_s10_s', 'visit_s11_s',
        'visit_s12_s', 'visit_s13_s', 'visit_s14_s', 'visit_s15_s', 'visit_s16_s',
        'visit_s17_s', 'visit_s18_s', 'visit_s19_s', 'visit_s20_s', 'visit_s21_s',
        'visit_s22_s', 'visit_s23_s', 'visit_s24_s', 'visit_s25_s', 'visit_s26_s',
        'sub_1_s', 'sub_2_s', 'sub_3_s', 'sub_4_s', 'sub_5_s', 'sub_6_s', 'sub_7_s', 'sub_8_s',
        'sub_9_s', 'sub_10_s', 'sub_11_s', 'sub_12_s', 'sub_13_s', 'sub_14_s', 'sub_15_s',
        'sub_16_s', 'sub_17_s', 'sub_18_s', 'sub_19_s', 'sub_20_s', 'sub_21_s', 'sub_22_s',
        'sub_23_s', 'sub_24_s', 'sub_25_s', 'sub_26_s', 'mobile_s']]
    # remove "_s" from column names in X to be able to run the causal forest model
    X.columns = X.columns.str[:-2]

    #################################### this is for fixing the missing variable sub_24, sub_25, sub_26 on the second split when training the model. Remove this when you fix this problem:
    if split_no == 2:
        X = X.drop(['sub_24', 'sub_25', 'sub_26'], axis=1)

    # Construct X variable for the input to the causal forest
    # a) construct base ad initial clicks and repeats

    base_ad_str = f"r_{base_ad}_s"
    X.loc[(data['user_visit_no_s'] == user_visit_no) & (data['split'] == split_no), 'impression_repeat_base_ad'] = data[(data['user_visit_no_s'] == user_visit_no)  & (data['split'] == split_no)][base_ad_str] + 1  # +1 is because r_* shows previous impressions, but impression repeat is the number of repeats (including current one)

    base_ad_str = f"c_{base_ad}_s"
    X.loc[(data['user_visit_no_s'] == user_visit_no)  & (data['split'] == split_no), 'previous_clicks_base_ad'] = data[(data['user_visit_no_s'] == user_visit_no)  & (data['split'] == split_no)][base_ad_str]

# b) construct each ad's initial clicks and repeats
    str = f"r_{ad_rank}_s"
    X.loc[(data['user_visit_no_s'] == user_visit_no) & (data['split'] == split_no), 'impression_repeat'] = data[(data['user_visit_no_s'] == user_visit_no)  & (data['split'] == split_no)][str] + 1  # +1 is because r_* shows previous impressions, but impression repeat is the number of repeats (including current one)
    str = f"c_{ad_rank}_s"
    X.loc[(data['user_visit_no_s'] == user_visit_no) & (data['split'] == split_no), 'previous_clicks'] = data[(data['user_visit_no_s'] == user_visit_no)  & (data['split'] == split_no)][str]
    return X



def calc_split_tes(data, split_no, user_visit_no, ranks_list):
    """
    This function calculates the treatment effects for the ads with ranks in "rank_list" for the subset of DataFrame "data" for which the "user_visit_no" is a specific number.
    The output is saved in columns te_1, ..., te_{max_adv_rank} of the dataframe "data"
    """
    start_time = time.perf_counter()
    for rank in ranks_list:
        X = construct_split_X(data, split_no, user_visit_no=user_visit_no, ad_rank=rank)
        var_name = f"te_{rank}_s"
        if (len(data[(data['user_visit_no'] == user_visit_no) & (data['split'] == split_no)]) > 0):
            exec(f"data.loc[((data['user_visit_no_s'] == user_visit_no) & (data['split'] == split_no)) , var_name] = config.cf_{rank}_s{split_no}.const_marginal_effect(X.loc[(data['user_visit_no_s'] == user_visit_no) & (data['split'] == split_no)])")
        # if rank % 10 == 1:
        #     print(f"rank {rank} done!")
    finish_time = time.perf_counter()
    print(f"finished calculating te's for rank {rank} in {finish_time - start_time} seconds")


def calc_base_ad_split_ctr(data, split_no, user_visit_no):
    """
    This function calculates E(y0|X=x) for the subset of DataFrame "data" for which the "user_visit_no" is a specific number.
    The output is saved in columns y_{base_ad} of the dataframe "data"
    """
    start_time = time.perf_counter()
    # Define X variables (Note that I am not using previous_clicks and i mpression_repeat variables here, because I'm only using base ad repeats and clicks here)
    X = data[['previous_clicks_all_ads_s',
        'impression_repeat_base_ad_s', 'previous_clicks_base_ad_s', 'total_visits_s',
        'visit_s1_s', 'visit_s2_s', 'visit_s3_s', 'visit_s4_s', 'visit_s5_s', 'visit_s6_s',
        'visit_s7_s', 'visit_s8_s', 'visit_s9_s', 'visit_s10_s', 'visit_s11_s',
        'visit_s12_s', 'visit_s13_s', 'visit_s14_s', 'visit_s15_s', 'visit_s16_s',
        'visit_s17_s', 'visit_s18_s', 'visit_s19_s', 'visit_s20_s', 'visit_s21_s',
        'visit_s22_s', 'visit_s23_s', 'visit_s24_s', 'visit_s25_s', 'visit_s26_s',
        'sub_1_s', 'sub_2_s', 'sub_3_s', 'sub_4_s', 'sub_5_s', 'sub_6_s', 'sub_7_s', 'sub_8_s',
        'sub_9_s', 'sub_10_s', 'sub_11_s', 'sub_12_s', 'sub_13_s', 'sub_14_s', 'sub_15_s',
        'sub_16_s', 'sub_17_s', 'sub_18_s', 'sub_19_s', 'sub_20_s', 'sub_21_s', 'sub_22_s',
        'sub_23_s', 'sub_24_s', 'sub_25_s', 'sub_26_s', 'mobile_s']]
    


    # remove "_s" from column names in X to be able to run the causal forest model
    X.columns = X.columns.str[:-2]

    # #################################### this is for fixing the missing variable sub_24, sub_25, sub_26 on the second split when training the model. Remove this when you fix this problem:
    # if (split_no == 2):
    #     X = X.drop(['sub_24', 'sub_25', 'sub_26'], axis=1) 

    var_name = f"y_{base_ad}_s"
    if (len(data[(data['user_visit_no'] == user_visit_no) & (data['split'] == split_no)]) > 0):
        data.loc[(data['user_visit_no_s'] == user_visit_no) & (data['split'] == split_no), var_name] = config.base_ad_y_model.predict(X.loc[(data['user_visit_no_s'] == user_visit_no) & (data['split'] == split_no)])
    finish_time = time.perf_counter()
    # print(f"finished calculating y0 in {finish_time - start_time} seconds")


def calc_split_ctrs(data, split_no, user_visit_no, ranks_list):
    """
    This function calculates the click rates of all ads for the subset of DataFrame "data" for which the "user_visit_no" is a specific number by adding y_{base_ad} and treatment effects.
    The output is saved in columns y_1, ..., y_{max_adv_rank} of the dataframe "data"
    """
    start_time = time.perf_counter()
    for rank in ranks_list:
        y_var_name = f'y_{rank}_s'
        te_var_name = f'te_{rank}_s'
        y_base_ad = f'y_{base_ad}_s'
        if (len(data[(data['user_visit_no'] == user_visit_no) & (data['split'] == split_no)]) > 0):
            data.loc[(data['user_visit_no'] == user_visit_no) & (data['split'] == split_no), y_var_name] = data.loc[(data['user_visit_no'] == user_visit_no) & (data['split'] == split_no), te_var_name] + data.loc[(data['user_visit_no'] == user_visit_no) & (data['split'] == split_no), y_base_ad]
        # set y_{rank} to 0 if it is negative
            data.loc[(data['user_visit_no'] == user_visit_no) & (data['split'] == split_no), y_var_name] = data.loc[data['user_visit_no'] == user_visit_no, y_var_name].apply(lambda x: max(x, 0))
    finish_time = time.perf_counter()
    # print(f"finished calculating y_i's in {finish_time - start_time} seconds")





def create_chosen_split_ad_vars(data):
    """
    This functions initializes three sets of variable in the dataframe "data":
    1) chosen_ad_{ad}: shows the rank of the the top {ad} chosen ad, ex: chosen_ad_1 is the rank of the top ad chosen to be shown
    2)chosen_ad_y_{ad}: shows the corresponding treatment effect of that ad
    Initially, all these columns are NaN
    3) num_ads:  number of ads to be shown (currently nan)

    Inputs:
    - data: the dataframe

    """
    for ad in range(1, config.max_ads_per_page + 1):
        var_name = f"chosen_ad_{ad}"
        data.loc[:, var_name] = np.nan


    for ad in range(1, config.max_ads_per_page + 1):
        var_name = f"chosen_ad_y_{ad}_s" 
        data.loc[:, var_name] = np.nan


    for ad in range(1, config.max_ads_per_page + 1):
        var_name = f"chosen_ad_y_{ad}" 
        data.loc[:, var_name] = np.nan

    for ad in range(1, config.max_ads_per_page + 1):
        var_name = f"chosen_ad_te_{ad}" 
        data.loc[:, var_name] = np.nan

    for ad in range(1, config.max_ads_per_page + 1):
        var_name = f"chosen_ad_click_dummy_{ad}"
        data.loc[:, var_name] = np.nan
    data.loc[:, 'num_ads'] = np.nan




# This function is the same for split and non-split
def find_optimal_ads(row, y_cols):
    """
    This functions calculates optimal ads (based on highest treatment effects) to be shown to the impression in each row. based on the calculated treatment effects y_i s
    Inputs: 
        - row: the row of the dataframe that it is applied to
        it has to include indices y_cols and "ads_on_page" (determines how many ads to choose)
    
    Returns: 
        - chosen_ads: a list of ads to be shown
        - chosen_ad_ys: a list of the corresponding treatment effects
    """


    # sort the values by the value of the criteria
    sorted_ads = row[y_cols].sort_values(ascending=False).index.to_list()
    l = min(row['ads_on_page'], config.max_ads_per_page)    # number of ads to be shown on each visit
    chosen_ads = sorted_ads[0 : l]
    # creates a list of chosen ad ranks
    chosen_ads = [int(element[2: -2]) for element in chosen_ads] # this will turn y_25_s into 25!
    chosen_ad_ys = row[y_cols].sort_values(ascending=False).values[0:l]
    return chosen_ads, chosen_ad_ys



def create_chosen_ad_columns_split(data, split_no, user_visit_no):
    """
    This function finds the optimal ads for the subsection of "data" for which user_visit_no == user_visit_no
    The chosen ads and their corresponding click rates are saved in 'chosen_ad_{ad}' and 'chosen_ad_y_{ad}'
    """
    # select treatment effect columns
    # te_cols = data.loc[0: 1, :].filter(regex="^te_", axis=1).columns
    # select ctr columns:
    # y_cols = data.loc[0: 1, :].filter(regex="^y_", axis=1).columns
    y_cols = data.columns[data.columns.str.match(r"^y_.*_s$")]

    for index, row in data[(data['user_visit_no'] == user_visit_no) & (data['split'] == split_no)].iterrows():
        
        chosen_ads, chosen_ad_ys = find_optimal_ads(row, y_cols)
        chosen_ads = [int(element) for element in chosen_ads]
        l = len(chosen_ads)
        last_chosen_ad_name = f"chosen_ad_{l}"
        # last_chosen_ad_te_name = f"chosen_ad_te_{l}"
        last_chosen_ad_y_name = f"chosen_ad_y_{l}_s"
        data.loc[index, 'chosen_ad_1': last_chosen_ad_name] = chosen_ads
        data.loc[index, 'chosen_ad_y_1_s' : last_chosen_ad_y_name] = chosen_ad_ys
        data.at[index, 'num_ads'] = int(l)
        # if index % 10000 == 0:
        #     print(f"index {index} done!")

        
def calc_base_ad_actual_ctr(data, split_no, user_visit_no):
    """
    This function calculates E(y0|X=x) for the subset of DataFrame "data" for which the "user_visit_no" is a specific number.
    The output is saved in columns y_{base_ad} of the dataframe "data"
    """
    start_time = time.perf_counter()
    # Define X variables (Note that I am not using previous_clicks and i mpression_repeat variables here, because I'm only using base ad repeats and clicks here)
    X = data[['previous_clicks_all_ads',
        'impression_repeat_base_ad', 'previous_clicks_base_ad', 'total_visits',
        'visit_s1', 'visit_s2', 'visit_s3', 'visit_s4', 'visit_s5', 'visit_s6',
        'visit_s7', 'visit_s8', 'visit_s9', 'visit_s10', 'visit_s11',
        'visit_s12', 'visit_s13', 'visit_s14', 'visit_s15', 'visit_s16',
        'visit_s17', 'visit_s18', 'visit_s19', 'visit_s20', 'visit_s21',
        'visit_s22', 'visit_s23', 'visit_s24', 'visit_s25', 'visit_s26',
        'sub_1', 'sub_2', 'sub_3', 'sub_4', 'sub_5', 'sub_6', 'sub_7', 'sub_8',
        'sub_9', 'sub_10', 'sub_11', 'sub_12', 'sub_13', 'sub_14', 'sub_15',
        'sub_16', 'sub_17', 'sub_18', 'sub_19', 'sub_20', 'sub_21', 'sub_22',
        'sub_23', 'sub_24', 'sub_25', 'sub_26', 'mobile']]
    

    var_name = f"y_{base_ad}"
    if (len(data[(data['user_visit_no'] == user_visit_no) & (data['split'] == split_no)]) > 0):
        data.loc[(data['user_visit_no_s'] == user_visit_no) & (data['split'] == split_no), var_name] = config.base_ad_y_model.predict(X.loc[(data['user_visit_no_s'] == user_visit_no) & (data['split'] == split_no)])
    finish_time = time.perf_counter()
    # print(f"finished calculating y0 in {finish_time - start_time} seconds")



def calc_actual_tes_for_chosen_ads(data, index):
    tes_list =[]
    for chosen_ad_no in range(1, int(data.loc[index, 'num_ads']) + 1):
        # var_name = f"chosen_ad_te_{chosen_ad_no}"
        chosen_ad_var = f"chosen_ad_{chosen_ad_no}"
        chosen_ad = int(data.at[index, chosen_ad_var])
        X = data.loc[index: index, ['impression_repeat', 'previous_clicks', 'previous_clicks_all_ads',
        'impression_repeat_base_ad', 'previous_clicks_base_ad', 'total_visits',
        'visit_s1', 'visit_s2', 'visit_s3', 'visit_s4', 'visit_s5', 'visit_s6',
        'visit_s7', 'visit_s8', 'visit_s9', 'visit_s10', 'visit_s11',
        'visit_s12', 'visit_s13', 'visit_s14', 'visit_s15', 'visit_s16',
        'visit_s17', 'visit_s18', 'visit_s19', 'visit_s20', 'visit_s21',
        'visit_s22', 'visit_s23', 'visit_s24', 'visit_s25', 'visit_s26',
        'sub_1', 'sub_2', 'sub_3', 'sub_4', 'sub_5', 'sub_6', 'sub_7', 'sub_8',
        'sub_9', 'sub_10', 'sub_11', 'sub_12', 'sub_13', 'sub_14', 'sub_15',
        'sub_16', 'sub_17', 'sub_18', 'sub_19', 'sub_20', 'sub_21', 'sub_22',
        'sub_23', 'sub_24', 'sub_25', 'sub_26', 'mobile']]
        
        # #################################### this is for fixing the missing variable sub_24, sub_25, sub_26 on the second split when training the model. Remove this when you fix this problem:
        # if row['split'] == 2:
        #     X = X.drop(['sub_24', 'sub_25', 'sub_26'])

    # a) construct base ad's initial clicks and repeats
        base_ad_str = f"r_{base_ad}"
        X['impression_repeat_base_ad'] = data.loc[index, base_ad_str] + 1  # +1 is because r_* shows previous impressions, but impression repeat is the number of repeats (including current one)

        base_ad_str = f"c_{base_ad}"
        X['previous_clicks_base_ad'] =data.loc[index, base_ad_str]

    # b) construct chosen ad's initial clicks and repeats
        str = f"r_{chosen_ad}"
        X['impression_repeat'] = data.loc[index, str] + 1  # +1 is because r_* shows previous impressions, but impression repeat is the number of repeats (including current one)
        str = f"c_{chosen_ad}"
        X['previous_clicks'] = data.loc[index, str]
        if chosen_ad != base_ad:
            exec(f"tes_list.append(config.cf_{chosen_ad}.const_marginal_effect(X))")
        else:
            tes_list.append(np.array([[0]]))
    return np.concatenate(tes_list).flatten()



def calc_actual_ctrs_for_chosen_ads(data, split_no, user_visit_no):
    for index, row in (data[(data['split'] == split_no) & (data['user_visit_no'] == user_visit_no)]).iterrows():
        # if (index % 100 == 0):
        #     print(index)
        tes_list =  calc_actual_tes_for_chosen_ads(data, index)
        l = len(tes_list)
        last_chosen_ad_te_name = f"chosen_ad_te_{l}"
        data.loc[index, 'chosen_ad_te_1' : last_chosen_ad_te_name] = tes_list
        last_chosen_ad_y_name = f"chosen_ad_y_{l}"
        base_ad_ctr_var = f"y_{base_ad}"
        data.loc[index, 'chosen_ad_y_1' : last_chosen_ad_y_name] = data.loc[index, 'chosen_ad_te_1' : last_chosen_ad_te_name] + data.loc[index, base_ad_ctr_var]
        



def update_repeats_on_main_and_split(data, split_no, user_visit_no):

    """
    This function updates the number of previous impression on data after user visit number user_visit_no.
    For example, after a user visits a page for the first time, and observes optimal ads (say ads 2, 5, 10), the initial impressions for all subsequent visits of that user, the number of previous impressions on ads 2, 5, 10 increases by 1. 
    """
    for index, row in data[(data['user_visit_no'] == user_visit_no) & (data['split'] == split_no)].iterrows():

        for chosen_ad_no in range(1, int(row['num_ads']) + 1):
            var_name = f"chosen_ad_{chosen_ad_no}"
            chosen_ad = int(row[var_name])
            col_name_main = f'r_{chosen_ad}'
            col_name_split = f'r_{chosen_ad}_s'
            print(col_name_main)
            data.loc[((data['global_token_new'] == row['global_token_new']) & (data['user_visit_no'] > row['user_visit_no'])), col_name_main] = row[col_name_main] + 1 # update actual repeats on all subsequent impressions of the user
            data.loc[((data['global_token_new'] == row['global_token_new']) & (data['user_visit_no'] > row['user_visit_no']) & (data['split'] == row['split'])), col_name_split] = row[col_name_split] + 1 # update split repeats on subsequent impressions of the user only if it is on the same split (platform)



def update_clicks_on_main_and_split(data, split_no, user_visit_no):
    """
    This function updates the number of previous clicks on data after user visit number user_visit_no.
    For example, after a user visits a page for the first time, and clicks on ad 5, c_5 increases by 1 for all subsequent user impressions. 
    It also updates the column "previous_clicks_all_ads"
    """

    for index, row in data[(data['user_visit_no'] == user_visit_no) & (data['split'] == split_no)].iterrows():
        total_clicks_on_impression = 0
        for chosen_ad_no in range(1, int(row['num_ads']) + 1):
            var_name = f"chosen_ad_{chosen_ad_no}"
            chosen_ad = int(row[var_name])
            ctr_var = f'chosen_ad_y_{chosen_ad_no}'
            col_name_main = f'c_{chosen_ad}' # the column name to be updated (if ad 5 is clicked on, c_5 will increase by 1 for all subsequent impressions)
            col_name_split = f'c_{chosen_ad}_s' # the column name to be updated (if ad 5 is clicked on, c_5_s will increase by 1 for all subsequent impressions)
            click_dummy_var =f'chosen_ad_click_dummy_{chosen_ad_no}'
            rand_click = np.random.rand()   # a random number simulating user's click. User will click if rand_click < y_{chosen_ad}
            print(data.at[index, ctr_var])
            data.loc[index, click_dummy_var] = int(rand_click <= row[ctr_var])
            total_clicks_on_impression += data.loc[index, click_dummy_var]
            data.loc[((data['global_token_new'] == row['global_token_new']) & (data['user_visit_no'] > row['user_visit_no'])), col_name_main] = int(row[col_name_main] + data.loc[index, click_dummy_var])
            data.loc[((data['global_token_new'] == row['global_token_new']) & (data['user_visit_no'] > row['user_visit_no']) & (data['split'] == row['split'])), col_name_split] = int(row[col_name_split] + data.loc[index, click_dummy_var]) # update only if it is on the same split (platform)
        data.loc[((data['global_token_new'] == row['global_token_new']) & (data['user_visit_no'] > row['user_visit_no'])), 'previous_clicks_all_ads'] = int(row['previous_clicks_all_ads'] + total_clicks_on_impression)
        data.loc[((data['global_token_new'] == row['global_token_new']) & (data['user_visit_no'] > row['user_visit_no']) & (data['split'] == row['split'])), 'previous_clicks_all_ads_s'] = int(row['previous_clicks_all_ads'] + total_clicks_on_impression)  # update only if it is on the same split (platform)





In [26]:

def simulate(data):
    # file_name = f"data_chunk_{chunk}"
    # create empty columns in the dataframe to fill later
    create_chosen_split_ad_vars(data)

    
    # print(f"\n\n\n=======> Chunk #{chunk}")
    start_time = time.perf_counter()


    for i in range(1, range(1, max_visit_no + 1):

        start_time_1 = time.perf_counter()
        print(f"\n\n --->Repeat #{i}:")
        # 1) calculate treatment effects, and base ad ctr, then sum them sup and create ctrs for all ads
        start_time = time.perf_counter()
        # a) calc TEs and CTRs on s1
        calc_split_tes(data, split_no=1, user_visit_no=i, ranks_list=config.ranks_list)
        calc_base_ad_split_ctr(data,split_no=1, user_visit_no=i)
        calc_split_ctrs(data, split_no=1, user_visit_no=i, ranks_list=config.ranks_list)

        # b) calc TEs and CTRs on s2
        calc_split_tes(data, split_no=2, user_visit_no=i, ranks_list=config.ranks_list)
        calc_base_ad_split_ctr(data,split_no=2, user_visit_no=i)
        calc_split_ctrs(data, split_no=2, user_visit_no=i, ranks_list=config.ranks_list)

        finish_time = time.perf_counter()
        print(f"Step 1 of repeat {i} finished in {finish_time - start_time} seconds!")
        # 2) determine what ads are chosen
        start_time = time.perf_counter()
        # find the optimal ads and save them and their corresponding ctr's in the dataframe
        # on s1
        create_chosen_ad_columns_split(data, split_no=1, user_visit_no=i)
        

        # on s2
        create_chosen_ad_columns_split(data, split_no=2, user_visit_no=i)
        finish_time = time.perf_counter()
        print(f"Step 2 of repeat {i} finished in {finish_time - start_time} seconds!")
        # 3) Calculate actual tes and ctrs for the chosen ads
        start_time = time.perf_counter()

        calc_base_ad_actual_ctr(data, split_no=1, user_visit_no=i)
        calc_base_ad_actual_ctr(data, split_no=2, user_visit_no=i)
        calc_actual_ctrs_for_chosen_ads(data, split_no=1, user_visit_no=i)
        calc_actual_ctrs_for_chosen_ads(data, split_no=2, user_visit_no=i)

        finish_time = time.perf_counter()
        print(f"Step 3 of repeat {i} finished in {finish_time - start_time} seconds!")
        # 4) Update repeats
        start_time = time.perf_counter()

        update_repeats_on_main_and_split(data, split_no=1, user_visit_no=i)
        update_repeats_on_main_and_split(data, split_no=2, user_visit_no=i)

        finish_time = time.perf_counter()
        print(f"Step 4 of repeat {i} finished in {finish_time - start_time} seconds!")

        # 5) Update clicks
        start_time = time.perf_counter()

        update_clicks_on_main_and_split(data, split_no=1, user_visit_no=i)
        update_clicks_on_main_and_split(data, split_no=2, user_visit_no=i)        

        finish_time = time.perf_counter()
        print(f"Step 5 of repeat {i} finished in {finish_time - start_time} seconds!")
        finish_time_1 = time.perf_counter()
        print("Repeat {i} of {data} finished in  {finish_time - start_time} seconds!")

    finish_time = time.perf_counter()
    print(f"All Repeats finished in {finish_time - start_time} seconds!")
    return data



In [29]:
simulate(data_chunk_17)



 --->Repeat #1:
finished calculating te's for rank 100 in 53.29604760000075 seconds
finished calculating te's for rank 100 in 51.42708649999986 seconds
Step 1 of repeat 1 finished in 115.83069590000378 seconds!
Step 2 of repeat 1 finished in 18.351519100004225 seconds!
Step 3 of repeat 1 finished in 2337.200825599997 seconds!
r_66
r_18
r_13
r_9
r_11
r_66
r_13
r_45
r_73
r_23
r_11
r_15
r_63
r_73
r_66
r_42
r_18
r_73
r_94
r_20
r_81
r_87
r_45
r_63
r_11
r_73
r_23
r_66
r_18
r_46
r_66
r_45
r_13
r_64
r_12
r_62
r_23
r_15
r_4
r_66
r_13
r_45
r_23
r_11
r_64
r_99
r_15
r_83
r_18
r_30
r_20
r_45
r_63
r_73
r_34
r_23
r_1
r_42
r_70
r_31
r_73
r_45
r_70
r_83
r_74
r_34
r_23
r_42
r_18
r_45
r_70
r_83
r_42
r_45
r_63
r_11
r_73
r_73
r_95
r_20
r_94
r_78
r_45
r_87
r_18
r_83
r_90
r_43
r_23
r_4
r_66
r_46
r_73
r_63
r_9
r_18
r_83
r_11
r_70
r_45
r_73
r_74
r_23
r_13
r_34
r_63
r_94
r_4
r_38
r_70
r_100
r_42
r_42
r_83
r_45
r_80
r_18
r_9
r_11
r_70
r_23
r_45
r_1
r_83
r_13
r_20
r_87
r_63
r_56
r_81
r_73
r_90
r_95
r_42
r_42
r_

Unnamed: 0,global_token_new,impression_timestamp,publisher_rank,user_visit_no_actual,c_1_s,c_2_s,c_3_s,c_4_s,c_5_s,c_6_s,...,y_91_s,y_92_s,y_93_s,y_94_s,y_95_s,y_96_s,y_98_s,y_99_s,y_100_s,y_50
2419358,1600000.0,2012-09-07 07:32:44,10,1,0,0,0,0,0,0,...,0.00165,0.000728,0.000225,0.003314,0.000947,0.000341,0.000339,0.001748,0.001391,0.001417
2419359,1600000.0,2012-09-07 07:36:01,39,3,0,0,0,0,0,0,...,,,,,,,,,,
2419360,1600000.0,2012-09-07 07:37:16,255,4,0,0,0,0,0,0,...,,,,,,,,,,
2419361,1600000.0,2012-09-07 07:37:47,46,5,0,0,0,0,0,0,...,,,,,,,,,,
2419362,1600000.0,2012-09-07 08:34:27,35,6,0,0,0,0,0,0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4816658,1617554.0,2012-09-08 08:16:29,140,67,0,0,0,0,0,0,...,,,,,,,,,,
4816659,1617554.0,2012-09-08 08:17:30,140,68,0,0,0,0,0,0,...,,,,,,,,,,
4816660,1617554.0,2012-09-08 08:17:49,140,69,0,0,0,0,0,0,...,,,,,,,,,,
4816661,1617554.0,2012-09-08 08:19:14,140,70,0,0,0,0,0,0,...,,,,,,,,,,


In [207]:
def calc_actual_tes_for_chosen_ads(data, index):
    tes_list =[]
    for chosen_ad_no in range(1, int(data.loc[index, 'num_ads']) + 1):
        # var_name = f"chosen_ad_te_{chosen_ad_no}"
        chosen_ad_var = f"chosen_ad_{chosen_ad_no}"
        chosen_ad = int(data.at[index, chosen_ad_var])
        X = data.loc[index: index, ['impression_repeat', 'previous_clicks', 'previous_clicks_all_ads',
        'impression_repeat_base_ad', 'previous_clicks_base_ad', 'total_visits',
        'visit_s1', 'visit_s2', 'visit_s3', 'visit_s4', 'visit_s5', 'visit_s6',
        'visit_s7', 'visit_s8', 'visit_s9', 'visit_s10', 'visit_s11',
        'visit_s12', 'visit_s13', 'visit_s14', 'visit_s15', 'visit_s16',
        'visit_s17', 'visit_s18', 'visit_s19', 'visit_s20', 'visit_s21',
        'visit_s22', 'visit_s23', 'visit_s24', 'visit_s25', 'visit_s26',
        'sub_1', 'sub_2', 'sub_3', 'sub_4', 'sub_5', 'sub_6', 'sub_7', 'sub_8',
        'sub_9', 'sub_10', 'sub_11', 'sub_12', 'sub_13', 'sub_14', 'sub_15',
        'sub_16', 'sub_17', 'sub_18', 'sub_19', 'sub_20', 'sub_21', 'sub_22',
        'sub_23', 'sub_24', 'sub_25', 'sub_26', 'mobile']]
        
        # #################################### this is for fixing the missing variable sub_24, sub_25, sub_26 on the second split when training the model. Remove this when you fix this problem:
        # if row['split'] == 2:
        #     X = X.drop(['sub_24', 'sub_25', 'sub_26'])

    # a) construct base ad's initial clicks and repeats
        base_ad_str = f"r_{base_ad}"
        X['impression_repeat_base_ad'] = data.loc[index, base_ad_str] + 1  # +1 is because r_* shows previous impressions, but impression repeat is the number of repeats (including current one)

        base_ad_str = f"c_{base_ad}"
        X['previous_clicks_base_ad'] =data.loc[index, base_ad_str]

    # b) construct chosen ad's initial clicks and repeats
        str = f"r_{chosen_ad}"
        X['impression_repeat'] = data.loc[index, str] + 1  # +1 is because r_* shows previous impressions, but impression repeat is the number of repeats (including current one)
        str = f"c_{chosen_ad}"
        X['previous_clicks'] = data.loc[index, str]

        exec(f"tes_list.append(config.cf_{chosen_ad}.const_marginal_effect(X))")
    return np.concatenate(tes_list).flatten()


    


In [238]:
def calc_actual_ctrs_for_chosen_ads(data, split_no, user_visit_no):
    for index, row in (data[(data['split'] == split_no) & (data['user_visit_no'] == user_visit_no)]).iterrows():
        if (index % 100 == 0):
            print(index)
        tes_list =  calc_actual_tes_for_chosen_ads(data, index)
        l = len(tes_list)
        last_chosen_ad_te_name = f"chosen_ad_te_{l}"
        data.loc[index, 'chosen_ad_te_1' : last_chosen_ad_te_name] = tes_list
        last_chosen_ad_y_name = f"chosen_ad_y_{l}"
        base_ad_ctr_var = f"y_{base_ad}"
        data.loc[index, 'chosen_ad_y_1' : last_chosen_ad_y_name] = data.loc[index, 'chosen_ad_te_1' : last_chosen_ad_te_name] + data.loc[index, base_ad_ctr_var]
        

        

In [218]:
calc_base_ad_ctr(data_mini, user_visit_no=1)

In [248]:
data_mini.filter(regex="y_50").columns

Index(['y_50_s', 'y_50'], dtype='object')

In [251]:
data_mini['y_50']

0       0.003252
1            NaN
2       0.003066
3            NaN
4            NaN
          ...   
996          NaN
997          NaN
998          NaN
999          NaN
1000         NaN
Name: y_50, Length: 1001, dtype: float64

In [None]:
# start_time = time.perf_counter()
# def simulate(data):
#     # file_name = f"data_chunk_{chunk}"
#     # create empty columns in the dataframe to fill later
#     create_chosen_ad_vars_duopoly(data)

    
#     # print(f"\n\n\n=======> Chunk #{chunk}")
#     start_time = time.perf_counter()


#     for i in range(1, 2): #range(1, max_visit_no + 1):

#         start_time_1 = time.perf_counter()
#         print(f"\n\n --->Repeat #{i}:")
#         # 1) calculate treatment effects, and base ad ctr, then sum them sup and create ctrs for all ads
#         # start_time = time.perf_counter()
#         # a) calc TEs and CTRs on s1
#         calc_tes(data=data_s1, user_visit_no=i, ranks_list=config.ranks_list)
#         calc_base_ad_ctr(data=data_s1, user_visit_no=i)
#         calc_ctrs(data=data_s1, user_visit_no=i)

#         # b) calc TEs and CTRs on s2
#         calc_tes(data=data_s2, user_visit_no=i, ranks_list=config.ranks_list)
#         calc_base_ad_ctr(data=data_s2, user_visit_no=i)
#         calc_ctrs(data=data_s2, user_visit_no=i)


#         # 2) determine what ads are chosen
#         # a. create empty columns in the dataframe to fill later
#         start_time_2 = time.perf_counter()
#         # find the optimal ads and save them and their corresponding ctr's in the dataframe
#         # on s1
#         create_chosen_ad_columns(data= data_s1, user_visit_no=i)
#         finish_time_2 = time.perf_counter()

#         # on s2
#         create_chosen_ad_columns(data= data_s2, user_visit_no=i)
  
#         # 3) Update repeats and clicks for the next impressions
#         # start_time_1 = time.perf_counter()
#         # update_repeats(data, user_visit_no=i)
#         # update_clicks(data, user_visit_no=i)

#         # finish_time_1 = time.perf_counter()

#         # print(f"Repeat {i} finished in {finish_time_1 - start_time_1} seconds!")


#     finish_time = time.perf_counter()
#     print(f"All Repeats finished in {finish_time - start_time} seconds!")
#     return data


In [None]:

simulate(data_main_chunk_1, data_s1_chunk_1, data_s2_chunk_1)




 --->Repeat #1:
finished calculating te's for rank 100 in 65.86257810000006 seconds
finished calculating te's for rank 100 in 74.58812409999996 seconds
All Repeats finished in 252.20644700000003 seconds!


Unnamed: 0,global_token_new,impression_timestamp,publisher_rank,user_visit_no_actual,c_1,c_2,c_3,c_4,c_5,c_6,...,chosen_ad_click_dummy_7,chosen_ad_click_dummy_8,chosen_ad_click_dummy_9,chosen_ad_click_dummy_10,chosen_ad_click_dummy_11,chosen_ad_click_dummy_12,chosen_ad_click_dummy_13,chosen_ad_click_dummy_14,chosen_ad_click_dummy_15,num_ads
0,3.0,2012-09-07 17:31:04,631,1,0,0,0,0,0,0,...,,,,,,,,,,
1,3.0,2012-09-07 17:31:23,631,2,0,0,0,0,0,0,...,,,,,,,,,,
2,6.0,2012-09-07 08:59:56,258,1,0,0,0,0,0,0,...,,,,,,,,,,
3,6.0,2012-09-07 10:48:09,102,8,0,0,0,0,0,0,...,,,,,,,,,,
4,6.0,2012-09-07 10:57:51,27,9,0,0,0,0,0,0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2445552,1617553.0,2012-09-07 20:18:48,131,4,0,0,0,0,0,0,...,,,,,,,,,,
2445553,1617553.0,2012-09-08 14:54:59,170,6,0,0,0,0,0,0,...,,,,,,,,,,
2445554,1617554.0,2012-09-07 16:55:38,26,1,0,0,0,0,0,0,...,,,,,,,,,,
2445555,1617555.0,2012-09-08 10:39:36,358,1,0,0,0,0,0,0,...,,,,,,,,,,


In [None]:
def simulate_split_data(data_s, data_main, user_visit_no):
    """
    This function takes the chosen ads on data_s, and calculates their (actual) ctr's from the main data. 
    """

    for index, row in data_s[data_s['user_visit_no'] == user_visit_no].iterrows():
        user_id = row['global_token_new']
        visit_no = row['user_visit_no']
        # find the row in data_main with the same user_id and visit_no as row:
        data_main_index = data_main[(data_main['global_token_new'] == user_id) & (data_main['user_visit_no'] == visit_no)].index[0]
        data_main_row = data_main[(data_main['global_token_new'] == user_id) & (data_main['user_visit_no'] == visit_no)].iloc[0]
        last_var = f"chosen_ad_{config.max_ads_per_page}"
        data_main.loc[data_main_index, 'chosen_ad_1': last_var] = data_s.loc[index, 'chosen_ad_1': last_var]
        data_main.at[data_main_index, 'num_ads'] = data_s.at[index, 'num_ads']
        # print(data_main.loc[data_main_index, 'chosen_ad_1': last_var])
        # now calculate the y for the chosen ads
        # first calculate y0
        X = data_main.loc[data_main_index: data_main_index, ['previous_clicks_all_ads',
        'impression_repeat_base_ad', 'previous_clicks_base_ad', 'total_visits',
        'visit_s1', 'visit_s2', 'visit_s3', 'visit_s4', 'visit_s5', 'visit_s6',
        'visit_s7', 'visit_s8', 'visit_s9', 'visit_s10', 'visit_s11',
        'visit_s12', 'visit_s13', 'visit_s14', 'visit_s15', 'visit_s16',
        'visit_s17', 'visit_s18', 'visit_s19', 'visit_s20', 'visit_s21',
        'visit_s22', 'visit_s23', 'visit_s24', 'visit_s25', 'visit_s26',
        'sub_1', 'sub_2', 'sub_3', 'sub_4', 'sub_5', 'sub_6', 'sub_7', 'sub_8',
        'sub_9', 'sub_10', 'sub_11', 'sub_12', 'sub_13', 'sub_14', 'sub_15',
        'sub_16', 'sub_17', 'sub_18', 'sub_19', 'sub_20', 'sub_21', 'sub_22',
        'sub_23', 'sub_24', 'sub_25', 'sub_26', 'mobile']]

        
        base_ad_ctr_var = f"y_{base_ad}"
        base_ad_te_var = f"te_{base_ad}"

        data_main.loc[data_main_index, base_ad_ctr_var] = config.base_ad_y_model.predict(X)
        data_main.loc[data_main_index, base_ad_te_var] = 0

        # then, calculate TEs and CTRs for chosen ads
        for chosen_ad_no in range(1, int(row['num_ads']) + 1):
            var_name = f"chosen_ad_{chosen_ad_no}"
            chosen_ad = int(data_s.at[index, var_name])
            if chosen_ad != base_ad:
                te_var = f'te_{chosen_ad}'
                ctr_var = f'y_{chosen_ad}'


                # Construct X variable for the input to the causal forest
                X = construct_X(data=data_main.loc[data_main_index:data_main_index, :], user_visit_no=user_visit_no, ad_rank=chosen_ad)
                # estimate the TE for X and ad number {chosen_ad}
                exec(f"data_main.loc[data_main_index, te_var] = config.cf_{chosen_ad}.const_marginal_effect(X)")
                # calculate the ctr
                data_main.loc[data_main_index, ctr_var] = data_main.loc[data_main_index, te_var] + data_main.loc[data_main_index, base_ad_ctr_var]
                # copy the ctr to split dataset
                ctr_var_s = f'y_{chosen_ad}_actual'
                data_s.loc[index, ctr_var_s] = data_main.loc[data_main_index, ctr_var]

        #update repeats on main dataset
        update_repeats_on_main_data(data_main, data_main_index)

        # update clicks on both main and split datasets
        total_clicks_on_impression = 0
        for chosen_ad_no in range(1, int(row['num_ads']) + 1):
            var_name = f"chosen_ad_{chosen_ad_no}"
            chosen_ad = int(data_s.at[index, var_name])
            ctr_var = f'y_{chosen_ad}'
            col_name = f'c_{chosen_ad}' # the column name to be updated (if ad 5 is clicked on, c_5 will increase by 1 for all subsequent impressions)
            click_dummy_var =f'chosen_ad_click_dummy_{chosen_ad_no}'
            rand_click = np.random.rand()   # a random number simulating user's click. User will click if rand_click < y_{chosen_ad}
            data_main.at[data_main_index, click_dummy_var] = int(rand_click <= data_main.at[data_main_index, ctr_var])
            data_s.at[index, click_dummy_var] = data_main.at[data_main_index, click_dummy_var]
            total_clicks_on_impression += data_main.loc[data_main_index, click_dummy_var]
            
            # update click on main data
            data_main.loc[((data_main['global_token_new'] == data_main.at[data_main_index, 'global_token_new'])
                            & (data_main['user_visit_no'] > data_main.at[data_main_index, 'user_visit_no'])), col_name] = int(data_main.at[data_main_index, col_name] + data_main.at[data_main_index, click_dummy_var])

            # update click on split data
            data_s.loc[((data_s['global_token_new'] == data_s.at[index, 'global_token_new'])
                            & (data_s['user_visit_no'] > data_s.at[index, 'user_visit_no'])), col_name] = int(data_s.at[index, col_name] + data_s.at[index, click_dummy_var])


        # update total number of clicks on both datasets
        # a) main data
        data_main.loc[((data_main['global_token_new'] == data_main.at[data_main_index, 'global_token_new']) 
                        & (data_main['user_visit_no'] > data_main.at[data_main_index, 'user_visit_no'])), 'previous_clicks_all_ads'] = int(data_main.at[data_main_index, 'previous_clicks_all_ads'] + total_clicks_on_impression)

        # b) split data
        data_s.loc[((data_s['global_token_new'] == data_s.at[index, 'global_token_new']) 
                        & (data_s['user_visit_no'] > data_s.at[index, 'user_visit_no'])), 'previous_clicks_all_ads'] = int(data_s.at[index, 'previous_clicks_all_ads'] + total_clicks_on_impression)



    # update repeats on split dataset (Note that since I have a pre-written function for this, I am doing this update separately from other updates and outside the for loop for all rows)
    update_repeats(data_s, user_visit_no)






In [None]:

def update_repeats_on_main_data(data_main, data_main_index):
    """
    This function updates the number of previous impression on data after user visit number user_visit_no in the main dataset. This is used to update the actual number of repeats 
    For example, after a user visits a page for the first time, and observes optimal ads (say ads 2, 5, 10), the initial impressions for all subsequent visits of that user, the number of previous impressions on ads 2, 5, 10 increases by 1. 
    """
    for chosen_ad_no in range(1, int(data_main.at[data_main_index, 'num_ads']) + 1):
        var_name = f"chosen_ad_{chosen_ad_no}"
        chosen_ad = int(data_main.at[data_main_index, var_name])
        col_name = f'r_{chosen_ad}'
        data_main.loc[((data_main['global_token_new'] == (data_main.at[data_main_index, 'global_token_new'])) & (data_main['user_visit_no'] > data_main.at[data_main_index, 'user_visit_no'])), col_name] = data_main.at[data_main_index, col_name] + 1




In [None]:
simulate_split_data(data_s1_chunk_1, data_main_chunk_1, user_visit_no=1)

ValueError: cannot convert float NaN to integer

In [None]:
data_main_chunk_1.previous_clicks_all_ads

0         0
1         0
2         0
3         0
4         0
         ..
299622    0
299623    0
299624    0
299625    0
299626    0
Name: previous_clicks_all_ads, Length: 299627, dtype: int8

In [None]:
X = data_main.loc[0:0, ['impression_repeat', 'previous_clicks', 'previous_clicks_all_ads',
        'impression_repeat_base_ad', 'previous_clicks_base_ad', 'total_visits',
        'visit_s1', 'visit_s2', 'visit_s3', 'visit_s4', 'visit_s5', 'visit_s6',
        'visit_s7', 'visit_s8', 'visit_s9', 'visit_s10', 'visit_s11',
        'visit_s12', 'visit_s13', 'visit_s14', 'visit_s15', 'visit_s16',
        'visit_s17', 'visit_s18', 'visit_s19', 'visit_s20', 'visit_s21',
        'visit_s22', 'visit_s23', 'visit_s24', 'visit_s25', 'visit_s26',
        'sub_1', 'sub_2', 'sub_3', 'sub_4', 'sub_5', 'sub_6', 'sub_7', 'sub_8',
        'sub_9', 'sub_10', 'sub_11', 'sub_12', 'sub_13', 'sub_14', 'sub_15',
        'sub_16', 'sub_17', 'sub_18', 'sub_19', 'sub_20', 'sub_21', 'sub_22',
        'sub_23', 'sub_24', 'sub_25', 'sub_26', 'mobile']]

In [None]:
X

Unnamed: 0,impression_repeat,previous_clicks,previous_clicks_all_ads,impression_repeat_base_ad,previous_clicks_base_ad,total_visits,visit_s1,visit_s2,visit_s3,visit_s4,...,sub_18,sub_19,sub_20,sub_21,sub_22,sub_23,sub_24,sub_25,sub_26,mobile
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [None]:
base_ad_str = f"r_{base_ad}"
X['impression_repeat_base_ad'] = data_main.loc[data_main_index, base_ad_str] + 1  # +1 is because r_* shows previous impressions, but impression repeat is the number of repeats (including current one)

base_ad_str = f"c_{base_ad}"
X['previous_clicks_base_ad'] = data_main.loc[data_main_index, base_ad_str]

# b) construct each ad's initial clicks and repeats
str = f"r_{chosen_ad}"
X ['impression_repeat'] = data_main.loc[data_main_index, str] + 1  # +1 is because r_* shows previous impressions, but impression repeat is the number of repeats (including current one)
str = f"c_{chosen_ad}"
X['user_visit_no'] = data_main.loc[data_main_index, str]

NameError: name 'data_main_index' is not defined

In [None]:
config.cf_83.const_marginal_effect(X)

AssertionError: Dimension mis-match of X with fitted X