In [8]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from pandas import DataFrame
from tqdm import tqdm
from sklearn.utils import check_random_state
import seaborn as sns
import matplotlib.pyplot as plt
import japanize_matplotlib
plt.style.use('ggplot')
y_label_dict = {"se": "平均二乗誤差", "bias": "二乗バイアス", "variance": "バリアンス", "selection": "方策選択"}

from dataset import generate_synthetic_data, calc_true_value
from estimators import calc_online, calc_ips, calc_new
from utils import eps_greedy_policy, softmax_policy, aggregate_simulation_results

In [9]:
## シミュレーション設定
num_runs = 1000 # シミュレーションの繰り返し回数
dim_context = 10 # 特徴量xの次元
num_data = 500 # ログデータのサイズ
num_actions = 4 # 行動数, |A|
T = 12 # 総時点数
eps = 0.0 # データ収集方策のパラメータ, これは共通サポートの仮定を満たさない
beta = -5 # 評価方策のパラメータ
random_state = 12345
random_ = check_random_state(random_state)
num_data_list = [250, 500, 1000, 2000, 4000] # ログデータのサイズ

In [13]:
def run_simulation():
    ## 期待報酬関数を定義するためのパラメータを抽出
    random_ = check_random_state(random_state)
    theta = random_.normal(size=(dim_context, num_actions))
    M = random_.normal(size=(dim_context, num_actions))
    b = random_.normal(size=(1, num_actions))
    W = random_.uniform(0, 1, size=(T, T))
    ## データ収集方策と評価方策の真の性能(policy value)を近似
    policy_value_of_pi0, policy_value_of_pi = calc_true_value(
        dim_context=dim_context, num_actions=num_actions,
        theta=theta, M=M, b=b, W=W, T=T, beta=beta, eps=eps,
    )

    estimated_policy_value_list, selection_result_list = [], []
    for _ in tqdm(range(num_runs), desc=f"num_data={num_data}..."):
        ## データ収集方策が形成する分布に従いログデータを生成
        offline_logged_data = generate_synthetic_data(
            num_data=num_data, dim_context=dim_context, num_actions=num_actions,
            theta=theta, M=M, b=b, W=W, T=T, eps=eps, random_state=_
        )
        online_experiment_data = generate_synthetic_data(
            num_data=num_data, dim_context=dim_context, num_actions=num_actions,
            theta=theta, M=M, b=b, W=W, T=1, beta=beta, is_online=True, random_state=_
        )

        ## ログデータ上における評価方策の行動選択確率を計算
        pi = softmax_policy(beta * offline_logged_data["base_q_func"])

        ## ログデータを用いてオフ方策評価を実行する
        estimated_policy_values, selection_result = dict(), dict()
        V_hat_online, selection_result_online = calc_online(online_experiment_data)
        estimated_policy_values["online"] = V_hat_online
        selection_result["online"] = selection_result_online
        V_hat_ips, selection_result_ips = calc_ips(offline_logged_data, pi)
        estimated_policy_values["ips"] = V_hat_ips
        selection_result["ips"] = selection_result_ips
        V_hat_new, selection_result_new = calc_new(offline_logged_data, online_experiment_data, pi)
        estimated_policy_values["new"] = V_hat_new
        selection_result["new"] = selection_result_new
        estimated_policy_value_list.append(estimated_policy_values)
        selection_result_list.append(selection_result)
    return estimated_policy_value_list, selection_result_list, policy_value_of_pi, num_data

In [14]:
estimated_policy_value_list, selection_result_list, policy_value_of_pi, num_data = run_simulation()

num_data=250...: 100%|██████████| 1000/1000 [00:11<00:00, 84.50it/s]


In [24]:
estimation_result_df

Unnamed: 0,est,value
0,online,0.232115
0,ips,0.001358
0,new,0.118677
1,online,0.143445
1,ips,0.000615
...,...,...
998,ips,0.002086
998,new,0.129588
999,online,0.235560
999,ips,0.003244


In [25]:
selection_result_df

Unnamed: 0,est2,selection
0,online,False
0,ips,False
0,new,True
1,online,False
1,ips,False
...,...,...
998,ips,False
998,new,True
999,online,False
999,ips,False


In [22]:
experiment_config_name = "num_data"
experiment_config_value = num_data
"""各推定量の推定値から平均二乗誤差や二乗バイアス、バリアンスなどの実験結果を集計する."""
estimation_result_df = (
    DataFrame(estimated_policy_value_list)
    .stack()
    .reset_index(1)
    .rename(columns={"level_1": "est", 0: "value"})
)
selection_result_df = (
    DataFrame(selection_result_list)
    .stack()
    .reset_index(1)
    .rename(columns={"level_1": "est2", 0: "selection"})
)
result_df = pd.concat([estimation_result_df, selection_result_df], axis=1)
result_df[experiment_config_name] = experiment_config_value
result_df["se"] = (result_df.value - policy_value_of_pi) ** 2
result_df["bias"] = 0
result_df["variance"] = 0
result_df["true_value"] = policy_value_of_pi
sample_mean = DataFrame(result_df.groupby(["est"]).mean().value).reset_index()
# for est_ in sample_mean["est"]:
#     estimates = result_df.loc[result_df["est"] == est_, "value"].values
#     mean_estimates = sample_mean.loc[sample_mean["est"] == est_, "value"].values
#     mean_estimates = np.ones_like(estimates) * mean_estimates
#     result_df.loc[result_df["est"] == est_, "bias"] = (
#         policy_value_of_pi - mean_estimates
#     ) ** 2
#     result_df.loc[result_df["est"] == est_, "variance"] = (
#         estimates - mean_estimates
#     ) ** 2

In [23]:
result_df

Unnamed: 0,est,value,est2,selection,num_data,se,bias,variance,true_value
0,online,0.232115,online,False,250,0.018484,0,0,0.09616
0,ips,0.001358,ips,False,250,0.008988,0,0,0.09616
0,new,0.118677,new,True,250,0.000507,0,0,0.09616
1,online,0.143445,online,False,250,0.002236,0,0,0.09616
1,ips,0.000615,ips,False,250,0.009129,0,0,0.09616
...,...,...,...,...,...,...,...,...,...
998,ips,0.002086,ips,False,250,0.008850,0,0,0.09616
998,new,0.129588,new,True,250,0.001117,0,0,0.09616
999,online,0.235560,online,False,250,0.019432,0,0,0.09616
999,ips,0.003244,ips,False,250,0.008633,0,0,0.09616


In [11]:
result_df_list = []
for num_data in num_data_list:
    estimated_policy_value_list, selection_result_list, policy_value_of_pi, num_data = run_simulation()
    result_df_list.append(
        aggregate_simulation_results(
            estimated_policy_value_list, selection_result_list,
            policy_value_of_pi, "num_data", num_data,
        )
    )
result_df_data = pd.concat(result_df_list).reset_index(level=0)

num_data=250...: 100%|██████████| 1000/1000 [00:07<00:00, 131.46it/s]


[{'online': 0.23211491808567591, 'ips': 0.0013576263154699705, 'new': 0.11867650776545866}, {'online': 0.14344490065007154, 'ips': 0.0006150809300100408, 'new': 0.11454719045702068}, {'online': 0.1621270359455173, 'ips': 0.0006369218608180331, 'new': 0.17136478069427646}, {'online': 0.202556922059429, 'ips': 0.0014018535254589435, 'new': 0.19034598481414308}, {'online': 0.2642290816965516, 'ips': 0.0017924557590253286, 'new': 0.1305933347827425}, {'online': 0.23095121695519927, 'ips': 0.0017530508744428006, 'new': 0.11206355002025674}, {'online': 0.18913839183497316, 'ips': 0.001823657030706941, 'new': 0.11761798128093105}, {'online': 0.24766373886955773, 'ips': 0.0017633769528312216, 'new': 0.09182147526486346}, {'online': 0.26223090048057063, 'ips': 0.0019382596619204869, 'new': 0.14560643963460063}, {'online': 0.18424398081126003, 'ips': -0.00029603160422617304, 'new': 0.08602628059638172}, {'online': 0.2133983442176339, 'ips': 0.0007913555192463694, 'new': 0.15156882274798245}, {'o

TypeError: agg function failed [how->mean,dtype->object]