In [1]:
import numpy as np
import pandas as pd
from econml.dml import CausalForestDML
import matplotlib.pyplot as plt
import os
from sklearn.linear_model import Lasso, LassoCV, LogisticRegression, LogisticRegressionCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split, RandomizedSearchCV
from sklearn.base import BaseEstimator
from econml.sklearn_extensions.model_selection import GridSearchCVList
import time
import joblib
import multiprocessing
import pickle


from warnings import simplefilter 
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

import config
from utils import *

  def _pt_shuffle_rec(i, indexes, index_mask, partition_tree, M, pos):
  def delta_minimization_order(all_masks, max_swap_size=100, num_passes=2):
  def _reverse_window(order, start, length):
  def _reverse_window_score_gain(masks, order, start, length):
  def _mask_delta_score(m1, m2):
  def identity(x):
  def _identity_inverse(x):
  def logit(x):
  def _logit_inverse(x):
  def _build_fixed_single_output(averaged_outs, last_outs, outputs, batch_positions, varying_rows, num_varying_rows, link, linearizing_weights):
  def _build_fixed_multi_output(averaged_outs, last_outs, outputs, batch_positions, varying_rows, num_varying_rows, link, linearizing_weights):
  def _init_masks(cluster_matrix, M, indices_row_pos, indptr):
  def _rec_fill_masks(cluster_matrix, indices_row_pos, indptr, indices, M, ind):
  def _single_delta_mask(dind, masked_inputs, last_mask, data, x, noop_code):
  def _delta_masking(masks, x, curr_delta_inds, varying_rows_out,
  def _jit_build_partition_tree(xmin, xmax, ymi

rank 20 model loaded!
rank 40 model loaded!
rank 60 model loaded!
rank 80 model loaded!
rank 100 model loaded!


In [2]:

# NOTE(review): base_ad and max_adv_rank are not referenced in any other cell visible
# here -- confirm whether they are still needed.
base_ad = 50
max_adv_rank = 100

# read data
# Build the path with os.path.join so the notebook also runs on non-Windows hosts;
# on Windows this produces exactly the same "..\data\..." path as before.
data = pd.read_stata(os.path.join("..", "data", "Simulation Data - Last 2 Days.dta"))


In [3]:

# Chunk the data: rows are grouped into chunks by global_token_new, in blocks of
# chunk_users_num consecutive token values.
chunk_users_num = 100
# n_chunks is pinned to 1, so only data_chunk_1 is created below; the commented
# expression derives the chunk count from the data instead.
n_chunks = 1 #int(data.global_token_new.max() / chunk_users_num) + 1

data['chunk'] = ((data['global_token_new'] / chunk_users_num).astype(int) + 1)

# create data chunks: data_chunk_1, ...
for chunk in range(1, n_chunks + 1):
    # Bind the dynamically named variable directly in the notebook namespace
    # instead of exec-ing a formatted source string.
    globals()[f"data_chunk_{chunk}"] = data[data['chunk'] == chunk]


In [4]:
# Silence pandas' chained-assignment warning for the rest of the session.
pd.options.mode.chained_assignment = None

start_time_1 = time.perf_counter()
for chunk in range(1, n_chunks + 1):

    file_name = f"data_chunk_{chunk}"
    # Look the chunk DataFrame up once and call the helpers directly, rather than
    # exec-ing a formatted source string for every call.
    chunk_df = globals()[file_name]
    create_chosen_ad_vars(chunk_df)
    start_time = time.perf_counter()

    # user_visit_no runs from 1 through 9.
    for i in range(1, 10):

        # print(f"Repeat #{i}:")
        start_time = time.perf_counter()

        # 1) calculate treatment effects, and base ad ctr, then sum them up and create ctrs for all ads
        # start_time = time.perf_counter()
        calc_tes(data=chunk_df, user_visit_no=i, ranks_list=config.ranks_list)
        calc_base_ad_ctr(chunk_df, user_visit_no=i)
        calc_ctrs(chunk_df, user_visit_no=i)
        # finish_time = time.perf_counter()
        # print(f"Stage 1 of repeat {i} finished in {finish_time - start_time} seconds!")

        # 2) determine what ads are chosen

        # a. find the optimal ads and save them and their corresponding ctr's in the dataframe
        create_chosen_ad_columns(chunk_df, user_visit_no=i)

        # finish_time = time.perf_counter()
        # print(f"Stage 2 of repeat {i} finished in {finish_time - start_time} seconds!")

        # 3) Update repeats and clicks for the next impressions
        update_repeats(chunk_df, user_visit_no=i)
        # Debug output: prints the c_94 column after every visit.
        print(chunk_df.loc[:,'c_94'])
        # exec(f"update_clicks({file_name}, user_visit_no=i)")
        # df = globals() [file_name]
        # update_clicks(df, user_visit_no=i)
        # update_clicks(data_chunk_1, user_visit_no=1)
        # print(globals()[file_name].loc[:,'c_94'])
  
#     finish_time = time.perf_counter()
#     print(f"Chunk {chunk} out of {n_chunks} finished in {finish_time - start_time} seconds!")

# finish_time_1 = time.perf_counter()
# print(f"Repeat {i} finished in {finish_time_1 - start_time_1} seconds!")

# update_clicks(data_chunk_1, user_visit_no=1)
# update_clicks(data_chunk_1, user_visit_no=1)




finished calculating te's for rank 100 in 8.948497500000002 seconds
0      0
1      0
2      0
3      0
4      0
      ..
347    0
348    0
349    0
350    0
351    0
Name: c_94, Length: 352, dtype: int8
finished calculating te's for rank 100 in 9.612329799999998 seconds
0      0
1      0
2      0
3      0
4      0
      ..
347    0
348    0
349    0
350    0
351    0
Name: c_94, Length: 352, dtype: int8
finished calculating te's for rank 100 in 9.710162300000007 seconds
0      0
1      0
2      0
3      0
4      0
      ..
347    0
348    0
349    0
350    0
351    0
Name: c_94, Length: 352, dtype: int8
finished calculating te's for rank 100 in 9.480399399999996 seconds
0      0
1      0
2      0
3      0
4      0
      ..
347    0
348    0
349    0
350    0
351    0
Name: c_94, Length: 352, dtype: int8
finished calculating te's for rank 100 in 9.881804500000001 seconds
0      0
1      0
2      0
3      0
4      0
      ..
347    0
348    0
349    0
350    0
351    0
Name: c_94, Lengt

In [84]:

def update_clicks(data, user_visit_no):
    """
    Simulate clicks for one visit number and propagate the click counts forward.

    For every impression (row) whose ``user_visit_no`` equals the given value, each of
    the row's ``num_ads`` displayed ads (``chosen_ad_1`` ... ``chosen_ad_<num_ads>``) is
    clicked when a uniform random draw is <= that ad's ``y_<ad>`` value. The outcome
    (0 or 1) is stored in ``chosen_ad_click_dummy_<slot>`` on the impression itself.
    Then, for all later visits of the same user (same ``global_token_new``, larger
    ``user_visit_no``):

    * ``c_<ad>`` is set to this impression's ``c_<ad>`` plus the click, and
    * ``previous_clicks_all_ads`` is set to this impression's value plus the total
      number of clicks over all ads shown on the impression.

    For example, after a user visits a page for the first time and clicks on ad 5,
    ``c_5`` is one higher on all of that user's subsequent impressions.

    Parameters
    ----------
    data : pandas.DataFrame
        Impression-level data; modified in place.
    user_visit_no : int
        The visit number whose impressions are simulated.

    Returns
    -------
    None

    Notes
    -----
    Draws come from NumPy's global random state (``np.random.rand``), so results are
    only reproducible if the caller seeds it beforehand.
    """

    # Boolean indexing yields a copy, so writing to `data` while iterating is safe.
    current_visit_rows = data[data['user_visit_no'] == user_visit_no]

    for index, row in current_visit_rows.iterrows():
        # Later impressions of the same user. The mask does not depend on the ad
        # slot, so build it once per impression instead of once per ad.
        later_visits = (
            (data['global_token_new'] == row['global_token_new'])
            & (data['user_visit_no'] > row['user_visit_no'])
        )

        total_clicks_on_impression = 0
        for chosen_ad_no in range(1, int(row['num_ads']) + 1):
            chosen_ad = int(row[f"chosen_ad_{chosen_ad_no}"])
            ctr_var = f'y_{chosen_ad}'
            col_name = f'c_{chosen_ad}' # the column name to be updated (if ad 5 is clicked on, c_5 will increase by 1 for all subsequent impressions)
            click_dummy_var = f'chosen_ad_click_dummy_{chosen_ad_no}'

            # a random number simulating user's click. User will click if the draw <= y_{chosen_ad}
            clicked = int(np.random.rand() <= row[ctr_var])
            data.loc[index, click_dummy_var] = clicked

            # Accumulate over all ad slots; a plain assignment here would keep only
            # the last slot's click and under-count previous_clicks_all_ads.
            total_clicks_on_impression += clicked

            data.loc[later_visits, col_name] = int(row[col_name] + clicked)

        data.loc[later_visits, 'previous_clicks_all_ads'] = int(
            row['previous_clicks_all_ads'] + total_clicks_on_impression
        )




In [85]:
# Simulate clicks on the first-visit impressions of chunk 1 and push the counts to
# each user's later visits. update_clicks draws fresh random numbers and no seed is
# set in the visible cells, so re-running this cell changes the result.
update_clicks(data_chunk_1, user_visit_no=1)

In [None]:
# Same update as a direct update_clicks(data_chunk_1, user_visit_no=1) call, but the
# chunk is looked up by its generated name; no exec of a source string is needed.
i = 1
file_name = f"data_chunk_{i}"
print(file_name)
update_clicks(globals()[file_name], user_visit_no=i)

data_chunk_1
index 0 done!


In [None]:
# Runs the first-visit click simulation on chunk 1 again. Each call draws new random
# clicks and overwrites the counts on later visits, so this is not idempotent.
update_clicks(data_chunk_1, user_visit_no=1)

In [None]:
# Column total of c_94 over every impression in chunk 1.
data_chunk_1['c_94'].sum()

3.0