In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression, LogisticRegression

In [173]:
n_actions = 10
n_features = 20
n_data_list = [125, 250, 500, 1000, 2000, 4000, 8000, 16000, 32000, 64000]

In [174]:
Wx0 = np.random.normal(size=(n_actions, n_features))
Bx0 = np.random.normal(size=(n_actions, 1))

Wx = np.random.normal(size=(n_actions, n_features))
Bx = np.random.normal(size=(n_actions, 1))

Wxa_x = np.random.normal(size=(n_features,))
Wxa_a = np.random.normal(size=(n_actions,))

In [177]:
class Estimator():
    def __init__(self, n_data):
        self.x = np.random.normal(size=(n_data, n_features))
        self.data_generator()

    def sigmoid(self, x):
        return 1./(1.+np.exp(x))

    def feature2action0(self, x):
        prob_a = self.sigmoid(Wx0@x.T+Bx0).T
        prob_a = prob_a/prob_a.sum(axis=1, keepdims=True)
        return prob_a

    def feature2action(self, x):
        prob_a = self.sigmoid(Wx@x.T+Bx).T
        prob_a = prob_a/prob_a.sum(axis=1, keepdims=True)
        return prob_a

    def sample_actions(self, prob_a):
        data_size, action_size = prob_a.shape
        actions = np.array([np.random.choice(action_size, p=prob_a[i]) for i in range(data_size)])
        result = np.zeros_like(prob_a)
        result[np.arange(data_size), actions] = 1
        return result, actions

    def action2reward(self, x, a):
        seeds = self.sigmoid((Wxa_x@x.T+Wxa_a@a.T).T)
        return (np.random.uniform(low=0, high=1.0, size=len(seeds)) > seeds)*1.0

    def data_generator(self):
        prob_a0 = self.feature2action0(self.x)
        a0, actions0 = self.sample_actions(prob_a0)
        r0 = self.action2reward(self.x, a0)

        prob_a = self.feature2action(self.x)
        a, actions = self.sample_actions(prob_a)
        r = self.action2reward(self.x, a)

        model = LogisticRegression(multi_class='multinomial', max_iter=200, solver='lbfgs')
        model.fit(X=self.x, y=actions0)

        a_pred0 = model.predict_proba(self.x)
        a_pred0 = a_pred0 / a_pred0.sum(axis=1, keepdims=True)
    
        self.prob_a = prob_a
        self.actions0 = actions0
        self.actions = actions
        self.a_pred0 = a_pred0
        self.r0 = r0
        self.r = r
    
    def DM_estimator(self):
        r0 = self.r0
        r = self.r
        a_pred0 = self.a_pred0
        prob_a = self.prob_a

        model = LogisticRegression()
        model.fit(X=np.concatenate([self.x, a_pred0], axis=1), y=r0)

        r_pred = model.predict_proba(X=np.concatenate([self.x, prob_a], axis=1))[:,1]

        return abs(r.mean()-r_pred.mean())

    def IPS_estimator(self):
        r0 = self.r0
        r = self.r
        a_pred0 = self.a_pred0
        prob_a = self.prob_a
        actions = self.actions

        r_pred = r0 * np.array([
            prob_a_elm[actions_elm] / a_pred0_elm[actions_elm]
        for prob_a_elm, actions_elm, a_pred0_elm in zip(prob_a, actions, a_pred0)])

        return abs(r.mean()-r_pred.mean())

In [178]:
for n_data in n_data_list:
    estimator = Estimator(n_data)
    print("DM", n_data, estimator.DM_estimator())
    print("IPS", n_data, estimator.IPS_estimator())

DM 125 0.025620829056584027
IPS 125 8.71128131920488
DM 250 0.035654765569021474
IPS 250 51.36207313486178
DM 500 0.03334706585221292
IPS 500 4.785111532743227
DM 1000 0.0037378919061957783
IPS 1000 2.5595522407746802
DM 2000 0.013360177740327273
IPS 2000 2.4439093883362526
DM 4000 0.009321812920575767
IPS 4000 2.327193403892534
DM 8000 0.0049184229958944
IPS 8000 1.9427964299000937
DM 16000 0.0009611410715669733
IPS 16000 1.9241693942554767
DM 32000 0.00407535502498646
IPS 32000 2.082217939444149
DM 64000 0.003180852401700962
IPS 64000 1.979827273537869


42.66458776552922