## Input Processing for Propensity Estimators

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

Causal forests and X-Learner input

In [2]:
# Fold sub-directory name interpolated into every ../build/simulation/{fold}/ path in this notebook.
fold = "Fold1"

In [3]:
# Rankings table for the selected fold; its 'qid' column is used below to enumerate the queries.
sim_exp_train_vali_rankings = pd.read_csv(
    f'../build/simulation/{fold}/sim_exp_train_vali_rankings.csv'
)

In [4]:
# Distinct query ids in the rankings table, and how many of them there are.
# nunique() is kept (rather than len(unique_qids)) because it ignores NaN by default.
unique_qids = sim_exp_train_vali_rankings['qid'].unique()
nqids = sim_exp_train_vali_rankings['qid'].nunique()

In [5]:
# Echo the array of distinct query ids (the whole array is displayed).
unique_qids

array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
        14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
        27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,
        40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,
        53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,
        66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,
        79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,
        92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103, 104,
       105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117,
       118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130,
       131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
       144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156,
       157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169,
       170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 18

In [6]:
# Query-sample sizes used by the cells below: the number of distinct queries
# floor-divided by 100, 10, 2 and 1.
[nqids // divisor for divisor in (100, 10, 2, 1)]

[3, 38, 190, 381]

In [7]:
# Draw one random subset of query ids per sample size and persist it, so the
# cells below all load the same subset for a given size.
# A dedicated, seeded generator is used so that re-running this cell writes the
# same subsets again instead of silently replacing the pickles with a new draw.
QID_SAMPLE_SEED = 0
qid_rng = np.random.RandomState(QID_SAMPLE_SEED)
for nq in [nqids // i for i in [100, 10, 2, 1]]:
    # Sampling is without replacement. Each size is drawn independently, so a
    # smaller subset is not guaranteed to be contained in a larger one.
    sampled_qids = qid_rng.choice(unique_qids, size=nq, replace=False)
    with open(f'../build/simulation/{fold}/sim_exp_swap_query_ids_{nq}.pkl', 'wb') as f:
        pickle.dump(sampled_qids, f)

In [8]:
# Examination features for the selected fold; joined onto the aggregated clicks
# further down via the ('partition', 'qd_id') columns.
examination_features = pd.read_csv(
    f'../build/simulation/{fold}/examination_features.csv'
)

In [9]:
def random_sample_one_intervention(data, arms, random_state=None):
    """Keep, for every query-document pair, the click rows of one randomly chosen rank.

    Rows are first restricted to those whose ``arm`` and ``swapped_rank`` both
    belong to ``arms``. Among the ``swapped_rank`` values still observed for a
    ``qd_id``, one is drawn uniformly at random, and only the rows of that
    ``(qd_id, swapped_rank)`` combination are returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Click rows with at least the columns ``qd_id``, ``arm`` and
        ``swapped_rank``.
    arms : list-like
        Rank values that make up the comparison, e.g. ``(1, 3)``.
    random_state : int, numpy.random.RandomState or None, optional
        Forwarded to the sampling step. ``None`` (the default) draws from
        NumPy's global random state; pass a seed for a repeatable selection.

    Returns
    -------
    pandas.DataFrame
        The retained rows (left order of ``data`` preserved, fresh 0..n-1
        index) with an extra integer column ``treatment`` that is 1 where
        ``swapped_rank == 1`` and 0 otherwise.
    """
    in_both_arms = data['arm'].isin(arms) & data['swapped_rank'].isin(arms)
    control_and_treatment_data = data[in_both_arms]
    observed_intervention = control_and_treatment_data[['qd_id', 'swapped_rank']].drop_duplicates()
    if observed_intervention.empty:
        # Nothing to choose from; skip sampling so the merge below simply yields no rows.
        sampled_intervention = observed_intervention
    else:
        # One (qd_id, swapped_rank) row per qd_id.
        sampled_intervention = observed_intervention.groupby('qd_id').sample(n=1, random_state=random_state)
    # An inner merge acts as the filter: only rows of the selected combinations survive.
    sampled_cases = control_and_treatment_data.merge(sampled_intervention, how='inner', on=['qd_id', 'swapped_rank'])
    sampled_cases['treatment'] = (sampled_cases['swapped_rank'] == 1).astype(int)
    return sampled_cases

In [10]:
# Build the causal-forest / X-learner input table for every combination of
# click file (avg_clicks) and query-sample size (nq) and write it to CSV.
# random_sample_one_intervention samples without an explicit random_state, i.e.
# from NumPy's global generator, so that generator is seeded here; otherwise
# every re-run of this cell would write different CSVs.
SWAP_SAMPLING_SEED = 0
np.random.seed(SWAP_SAMPLING_SEED)
for avg_clicks in [5, 10, 25, 50]:
    print(f'Loading swap clicks with {avg_clicks} sessions')
    swap_clicks = pd.read_csv(f'../build/simulation/{fold}/sim_exp_swap_train_vali_clicks_avg_clicks_{avg_clicks}.csv', low_memory=False)
    for nq in [nqids // i for i in [100, 10, 2, 1]]:
        print(f'Loading sampled qids with sample size {nq}')
        # NOTE: pickle.load can execute arbitrary code; only load files written by this notebook.
        with open(f'../build/simulation/{fold}/sim_exp_swap_query_ids_{nq}.pkl', 'rb') as f:
            sampled_qids = pickle.load(f)
        print(f'Loaded {sampled_qids.shape[0]} unique queries')
        # Restrict the clicks to the sampled queries.
        sample_clicks = swap_clicks[swap_clicks['qid'].isin(sampled_qids)]
        # Label each list with its "arm": the swapped_rank of the row whose pred_rank is 1.
        sample_clicks_arms = sample_clicks.loc[sample_clicks['pred_rank'] == 1, ['list_id', 'swapped_rank']].rename(columns={'swapped_rank': 'arm'})
        # No `on=` is given, so pandas joins on the columns both frames share.
        # NOTE(review): if a list_id can have several pred_rank == 1 rows, this
        # merge multiplies the click rows of that list - confirm against the data.
        sample_clicks = sample_clicks.merge(sample_clicks_arms, how='left')
        print('Sampling one of two intervention pairs for each query document pair')
        randomized_ctr_list = []
        # One comparison per rank i in 2..10, each against rank 1.
        for i in range(2, 11):
            randomized_clicks = random_sample_one_intervention(sample_clicks, (1, i))
            # Per-(partition, qd_id) ground-truth columns, re-attached after aggregation.
            meta_data = randomized_clicks[['partition', 'qd_id', 'true_click_probability', 'true_propensity']].drop_duplicates()
            # Mean click (and mean treatment flag) per query-document pair and rank.
            randomized_ctr = randomized_clicks.groupby(['partition', 'qd_id', 'swapped_rank'])[['click', 'treatment']].mean().reset_index()
            randomized_ctr = randomized_ctr.merge(meta_data, how='left', on=['partition', 'qd_id'])
            # Remember which rank was compared against rank 1.
            randomized_ctr['treatment_group'] = i
            randomized_ctr_list.append(randomized_ctr)
        randomized_ctr = pd.concat(randomized_ctr_list, ignore_index=True)
        randomized_ctr = randomized_ctr.merge(examination_features, on=['partition', 'qd_id'], how='left')
        print('Saving random pairs')
        randomized_ctr.to_csv(f'../build/simulation/{fold}/sim_exp_swap_causal_forests_train_clicks_{avg_clicks}_{nq}.csv', index=False)

Loading swap clicks with 5 sessions
Loading sampled qids with sample size 3
Loaded 3 unique queries
Sampling one of two intervention pairs for each query document pair
Saving random pairs
Loading sampled qids with sample size 38
Loaded 38 unique queries
Sampling one of two intervention pairs for each query document pair
Saving random pairs
Loading sampled qids with sample size 190
Loaded 190 unique queries
Sampling one of two intervention pairs for each query document pair
Saving random pairs
Loading sampled qids with sample size 381
Loaded 381 unique queries
Sampling one of two intervention pairs for each query document pair
Saving random pairs
Loading swap clicks with 10 sessions
Loading sampled qids with sample size 3
Loaded 3 unique queries
Sampling one of two intervention pairs for each query document pair
Saving random pairs
Loading sampled qids with sample size 38
Loaded 38 unique queries
Sampling one of two intervention pairs for each query document pair
Saving random pairs
Loa

An existing Contextual Position Based Model

In [11]:
# Load the pickled examination feature column names for the selected fold.
# NOTE: pickle.load can execute arbitrary code; only load trusted build artifacts.
with open(f'../build/simulation/{fold}/examination_fc_names.pkl', mode='rb') as f:
    print('Loading examination feature columns')
    examination_fc = pickle.load(f)

Loading examination feature columns


In [12]:
# Echo the loaded examination feature column names.
examination_fc

array(['53', '54', '63', '99', '103', '106', '108', '126', '129', '133'],
      dtype='<U3')

In [13]:
# Build the CPBM training table for every combination of click file
# (avg_clicks) and query-sample size (nq) and write it to CSV.
for avg_clicks in [5, 10, 25, 50]:
    print(f'Loading swap clicks with {avg_clicks} avg clicks')
    swap_clicks = pd.read_csv(
        f'../build/simulation/{fold}/sim_exp_swap_train_vali_clicks_avg_clicks_{avg_clicks}.csv',
        low_memory=False,
    )
    # Zero-based counterpart of swapped_rank.
    swap_clicks['rank_idx'] = swap_clicks['swapped_rank'].astype(int) - 1
    for nq in [nqids // divisor for divisor in [100, 10, 2, 1]]:
        print(f'Loading sampled qids with sample size {nq}')
        with open(f'../build/simulation/{fold}/sim_exp_swap_query_ids_{nq}.pkl', mode='rb') as f:
            sampled_qids = pickle.load(f)
        print(f'Loaded {sampled_qids.shape[0]} unique queries')

        # Clicks of the sampled queries, numbered 0..n-1 in their current order.
        sample_clicks = swap_clicks[swap_clicks['qid'].isin(sampled_qids)].copy()
        sample_clicks['click_idx'] = np.arange(len(sample_clicks))

        # How often each query-document pair was shown at each rank.
        doc_rank_count = (
            sample_clicks.groupby('qd_id')['rank_idx']
            .value_counts()
            .rename('doc_rank_count')
            .reset_index()
        )
        # Every distinct rank at which each query-document pair was observed.
        doc_intervention_ranks = (
            sample_clicks[['qd_id', 'rank_idx']]
            .drop_duplicates()
            .rename(columns={'rank_idx': 'intervention_rank_idx'})
        )
        # All (rank, other observed rank) combinations per query-document pair.
        doc_stats = (
            doc_rank_count
            .merge(doc_intervention_ranks, on=['qd_id'])
            .sort_values(['qd_id', 'rank_idx', 'intervention_rank_idx'])
        )
        intervention_set = doc_stats[doc_stats['rank_idx'].ne(doc_stats['intervention_rank_idx'])]

        # Attach the intervention set to the clicks; clicks without a match are dropped.
        cpbm_train_click = (
            sample_clicks
            .merge(intervention_set, on=['qd_id', 'rank_idx'], how='left')
            .dropna(subset=['doc_rank_count'])
        )
        # Negative labels.
        cpbm_train_click['neg_click'] = 1 - cpbm_train_click['click']
        # Labels weighted by the inverse of the assignment frequency.
        cpbm_train_click['inverse_frequency_weighted_pos_click'] = (
            cpbm_train_click['click'] / cpbm_train_click['doc_rank_count']
        )
        cpbm_train_click['inverse_frequency_weighted_neg_click'] = (
            cpbm_train_click['neg_click'] / cpbm_train_click['doc_rank_count']
        )
        # Renumber click_idx densely: rows stemming from the same click keep a common id.
        cpbm_train_click['click_idx'] = cpbm_train_click.groupby('click_idx').ngroup()
        cpbm_train_click = cpbm_train_click.sort_values(
            ['click_idx', 'rank_idx', 'intervention_rank_idx']
        )

        print(f'Saving cpbm clicks {avg_clicks}; {nq}')
        cpbm_train_click.to_csv(
            f'../build/simulation/{fold}/sim_exp_swap_cpbm_train_clicks_{avg_clicks}_{nq}.csv',
            index=False,
        )

Loading swap clicks with 5 avg clicks
Loading sampled qids with sample size 3
Loaded 3 unique queries
Saving cpbm clicks 5; 3
Loading sampled qids with sample size 38
Loaded 38 unique queries
Saving cpbm clicks 5; 38
Loading sampled qids with sample size 190
Loaded 190 unique queries
Saving cpbm clicks 5; 190
Loading sampled qids with sample size 381
Loaded 381 unique queries
Saving cpbm clicks 5; 381
Loading swap clicks with 10 avg clicks
Loading sampled qids with sample size 3
Loaded 3 unique queries
Saving cpbm clicks 10; 3
Loading sampled qids with sample size 38
Loaded 38 unique queries
Saving cpbm clicks 10; 38
Loading sampled qids with sample size 190
Loaded 190 unique queries
Saving cpbm clicks 10; 190
Loading sampled qids with sample size 381
Loaded 381 unique queries
Saving cpbm clicks 10; 381
Loading swap clicks with 25 avg clicks
Loading sampled qids with sample size 3
Loaded 3 unique queries
Saving cpbm clicks 25; 3
Loading sampled qids with sample size 38
Loaded 38 unique