In [1]:
import numpy as np
import pickle
from tqdm import tqdm
from sklearn import utils
from joblib import Parallel, delayed

In [2]:
# Sizes of the discrete state and action spaces; used as the shape of the
# (nS, nA) policy tables built below.
nS = 750
nA = 25
# Discount factor applied when accumulating returns.
gamma = 0.99

In [3]:
# Mask of actions that clinicians never took.
# NOTE(review): Q_mask is not referenced again in the visible cells — confirm
# whether it is still needed.
Q_mask = np.load('action_mask.npy')

In [4]:
# Load the clinician policy and expand it into a dense (nS, nA) table.
# clinician_policy is iterated as a nested mapping {state: {action: probability}};
# any (state, action) pair absent from it keeps probability 0 in pi_0.
# NOTE: pickle can execute arbitrary code while loading — only open trusted files.
with open('clinician_policy.p', 'rb') as f:  # context manager closes the handle
    clinician_policy = pickle.load(f)

pi_0 = np.zeros((nS, nA))
for s, probs in clinician_policy.items():
    for a, p in probs.items():
        pi_0[s, a] = p

In [5]:
# Inspect the clinician policy's action probabilities for state 0.
pi_0[0]

array([0.3056872 , 0.        , 0.        , 0.        , 0.        ,
       0.50236967, 0.02369668, 0.        , 0.03791469, 0.02132701,
       0.09241706, 0.        , 0.        , 0.        , 0.        ,
       0.01658768, 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ])

In [6]:
# Load the Q-table from disk; it is indexed below as Q_star[s] and Q[s, a].
# NOTE(review): presumably produced by Q-learning (per the file name) — confirm.
Q_star = np.load('qlearn_Q.npy')

In [7]:
# Calculate evaluation policy pi_e by softening the greedy policy of Q_star.
# For every state present in clinician_policy:
#   * the greedy action (nanargmax of Q_star[s], i.e. NaN entries are ignored)
#     receives probability 0.99;
#   * the remaining 0.01 is split evenly over the other actions listed for
#     that state in clinician_policy;
#   * if only one action is listed, it receives probability 1.
# States absent from clinician_policy keep an all-zero row.
pi_e = np.zeros((nS, nA))
for s, probs in clinician_policy.items():
    a_star = np.nanargmax(Q_star[s])
    # The greedy action must be one listed for this state in clinician_policy.
    assert a_star in probs.keys()
    if len(probs) == 1:
        # The single listed action is necessarily a_star (asserted above).
        pi_e[s, a_star] = 1.0
        continue
    p_other = 0.01 / (len(probs) - 1)
    for a in probs:
        pi_e[s, a] = 0.99 if a == a_star else p_other

In [8]:
# Inspect the softened evaluation policy's action probabilities for state 0.
pi_e[0]

array([0.00166667, 0.        , 0.        , 0.        , 0.        ,
       0.00166667, 0.00166667, 0.        , 0.00166667, 0.99      ,
       0.00166667, 0.        , 0.        , 0.        , 0.        ,
       0.00166667, 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ])

In [9]:
# Sanity check: every row of pi_e sums to 1 (i.e. is a valid distribution).
np.isclose(pi_e.sum(axis=1), 1.0).all()

True

In [10]:
# Load the test-set trajectories. In the cells below each trajectory is used
# as a sequence of transition dicts with keys 's', 'a' and 'r'.
# NOTE: pickle can execute arbitrary code while loading — only open trusted files.
with open('trajD_te.pkl', 'rb') as f:  # context manager closes the handle
    traj_te = pickle.load(f)

# Change rewards from {0, 1} to {-100, +100}
# Only the final transition of each trajectory is rescaled; earlier rewards
# are left as loaded.
for traj in traj_te:
    traj[-1]['r'] = traj[-1]['r']*200-100

In [11]:
# Keep only the usable test trajectories: a trajectory is dropped as soon as
# one of its (s, a) pairs has (near-)zero probability under pi_0, i.e. the
# pair was not observed in the training set.
trajectories = [
    candidate
    for candidate in traj_te
    if not any(
        np.isclose(pi_0[step['s'], step['a']], 0.0)
        for step in candidate
    )
]

In [12]:
# Number of test trajectories that survived the filter above; also used as
# the row count of the cumulative importance-ratio matrix later on.
N = len(trajectories)
print('Effective sample size of test set', N)

Effective sample size of test set 2801


## Behavior policy value

In [13]:
# Discounted return actually observed on each usable test trajectory.
V_TEST = []
for episode in trajectories:
    ret = 0
    # Walk the episode backwards: G_t = r_t + gamma * G_{t+1}.
    for step in reversed(episode):
        ret = step['r'] + gamma * ret
    V_TEST.append(ret)

In [14]:
# Mean observed discounted return across the usable test trajectories.
np.mean(V_TEST)

73.16317503128536

In [15]:
def get_behavior_value(trajectories, discount=None):
    """Average observed discounted return over a collection of trajectories.

    Parameters
    ----------
    trajectories : sequence
        Each element is a sequence of transition dicts; only the key 'r'
        (reward) is read.
    discount : float, optional
        Discount factor. When omitted, the notebook-level ``gamma`` is used,
        which is what this function always did before the parameter existed.

    Returns
    -------
    The mean of the per-trajectory discounted returns, as computed by
    ``np.mean`` (NaN, with a NumPy warning, when ``trajectories`` is empty).
    """
    if discount is None:
        discount = gamma
    returns = []
    for traj in trajectories:
        G = 0
        # Backward recursion: G_t = r_t + discount * G_{t+1}.
        for step in reversed(traj):
            G = step['r'] + discount * G
        returns.append(G)
    return np.mean(returns)

In [16]:
def get_behavior_value_run_boot(trajectories, i):
    """Compute one bootstrap replicate of the behaviour-policy value.

    ``trajectories`` is resampled with replacement, using ``i`` as the random
    state so the replicate is reproducible, and the resample is passed to
    ``get_behavior_value``.
    """
    return get_behavior_value(
        utils.resample(trajectories, replace=True, random_state=i)
    )

In [17]:
# Bootstrap distribution of the behaviour-policy value: 1000 replicates,
# seeded 0..999.
# NOTE: this rebinds V_TEST, which previously held per-trajectory returns.
V_TEST = [
    get_behavior_value_run_boot(trajectories, seed)
    for seed in tqdm(range(1000))
]

100%|██████████| 1000/1000 [02:04<00:00,  8.00it/s]


In [18]:
# Mean and standard deviation of the bootstrap replicates of the behaviour value.
np.mean(V_TEST), np.std(V_TEST)

(73.12608064951573, 0.9749614964346808)

## DR, WDR

In [19]:
# Per-step importance sampling ratios pi_e(a|s) / pi_0(a|s), one array per trajectory
rho_all = [
    np.array([
        pi_e[step['s'], step['a']] / pi_0[step['s'], step['a']]
        for step in episode
    ])
    for episode in trajectories
]

# Length of the longest trajectory
max_H = max(map(len, trajectories))

# Cumulative importance ratio rho_{1:t} for each trajectory at each timestep.
# Rows are padded with 1s past the end of a trajectory, so the cumulative
# product simply stays at its final value for the remaining columns.
rho_cum = np.ones((N, max_H))
for row, ratios in zip(rho_cum, rho_all):
    row[:len(ratios)] = ratios
rho_cum = np.cumprod(rho_cum, axis=1)

# Average cumulative importance ratio at every horizon t (used as WDR normaliser)
weights = rho_cum.mean(axis=0)

In [20]:
def doubly_robust_estimator(trajectory, Q, pi_0, pi_e, rho_cumulative, gamma):
    """Doubly-robust off-policy value estimate for a single trajectory.

    Accumulates, over the steps t = 0 .. T-1 of the trajectory,

        gamma**t * (rho_{1:t} * r_t - (rho_{1:t} * Q[s_t, a_t] - rho_{1:t-1} * V_hat(s_t)))

    where V_hat(s) = nansum(Q[s] * pi_e[s]) and the ratio before the first
    step is taken to be 1.

    Parameters
    ----------
    trajectory : sequence of dicts with keys 's', 'a', 'r'.
    Q : array indexed as Q[s, a] and Q[s]; NaN entries are ignored in V_hat.
    pi_0 : array indexed as pi_0[s, a]; only used to assert that each observed
        (s, a) pair has non-zero probability.
    pi_e : array indexed as pi_e[s]; evaluation-policy probabilities.
    rho_cumulative : indexable; rho_cumulative[t] is the cumulative importance
        ratio through step t of this trajectory.
    gamma : discount factor.

    Returns
    -------
    The accumulated estimate (0 for an empty trajectory).

    Raises
    ------
    AssertionError
        If pi_0[s, a] is close to zero for some observed step.
    """
    estimate = 0
    rho_prev = 1.0  # cumulative ratio before the first step
    for t, step in enumerate(trajectory):
        s, a, r = step['s'], step['a'], step['r']

        q_sa = Q[s, a]
        v_s = np.nansum(Q[s] * pi_e[s])
        assert not np.isclose(pi_0[s, a], 0.0)
        rho_now = rho_cumulative[t]

        correction = rho_now * q_sa - rho_prev * v_s
        estimate = estimate + np.power(gamma, t) * (rho_now * r - correction)
        rho_prev = rho_now

    return estimate

In [21]:
def weighted_doubly_robust_estimator(trajectory, Q, pi_0, pi_e, rho_cumulative, weight_t, gamma):
    """Weighted doubly-robust off-policy value estimate for a single trajectory.

    Same accumulation as the doubly-robust estimator, except that every
    cumulative importance ratio is first divided by the per-horizon weight:
    w_t = rho_cumulative[t] / weight_t[t]. The weight before the first step
    is taken to be 1. Each step contributes

        gamma**t * (w_t * r_t - (w_t * Q[s_t, a_t] - w_{t-1} * V_hat(s_t)))

    with V_hat(s) = nansum(Q[s] * pi_e[s]).

    Parameters
    ----------
    trajectory : sequence of dicts with keys 's', 'a', 'r'.
    Q : array indexed as Q[s, a] and Q[s]; NaN entries are ignored in V_hat.
    pi_0 : array indexed as pi_0[s, a]; only used to assert that each observed
        (s, a) pair has non-zero probability.
    pi_e : array indexed as pi_e[s]; evaluation-policy probabilities.
    rho_cumulative : indexable; cumulative importance ratio through each step.
    weight_t : indexable; normaliser for each horizon t.
    gamma : discount factor.

    Returns
    -------
    The accumulated estimate (0 for an empty trajectory).

    Raises
    ------
    AssertionError
        If pi_0[s, a] is close to zero for some observed step.
    """
    estimate = 0
    w_prev = 1.0  # normalised ratio before the first step
    for t, step in enumerate(trajectory):
        s, a, r = step['s'], step['a'], step['r']

        q_sa = Q[s, a]
        v_s = np.nansum(Q[s] * pi_e[s])
        assert not np.isclose(pi_0[s, a], 0.0)
        w_now = rho_cumulative[t] / weight_t[t]

        correction = w_now * q_sa - w_prev * v_s
        estimate = estimate + np.power(gamma, t) * (w_now * r - correction)
        w_prev = w_now

    return estimate

## DR

In [22]:
# Per-trajectory DR estimates; each trajectory is paired with its own row of
# cumulative importance ratios.
V_DR = []
for traj, rho_cumulative in zip(trajectories, rho_cum):
    V_DR.append(
        doubly_robust_estimator(traj, Q_star, pi_0, pi_e, rho_cumulative, gamma)
    )

In [23]:
# DR value estimate: per-trajectory estimates are clipped to [-100, 100]
# (the range of the rescaled terminal rewards) before averaging.
np.mean(np.clip(V_DR, -100, 100))

91.58035245184213

In [24]:
# Bootstrap the clipped-mean DR estimate: 1000 resamples (with replacement)
# of the per-trajectory estimates, seeded 0..999.
V_DR_b = [
    np.mean(np.clip(utils.resample(V_DR, replace=True, random_state=seed), -100, 100))
    for seed in tqdm(range(1000))
]

100%|██████████| 1000/1000 [00:01<00:00, 627.22it/s]


In [25]:
# Mean and standard deviation of the bootstrap replicates of the DR estimate.
np.mean(V_DR_b), np.std(V_DR_b)

(91.56139534310027, 0.30995180180626747)

## WDR

In [26]:
# Per-trajectory WDR estimates; `weights` (the per-horizon mean cumulative
# ratio) normalises each trajectory's own cumulative importance ratios.
V_WDR = []
for traj, rho_cumulative in zip(trajectories, rho_cum):
    V_WDR.append(
        weighted_doubly_robust_estimator(traj, Q_star, pi_0, pi_e, rho_cumulative, weights, gamma)
    )

In [27]:
# WDR value estimate: per-trajectory estimates are clipped to [-100, 100]
# (the range of the rescaled terminal rewards) before averaging.
np.mean(np.clip(V_WDR, -100, 100))

92.21003694963349

In [28]:
# Bootstrap the clipped-mean WDR estimate: 1000 resamples (with replacement)
# of the per-trajectory estimates, seeded 0..999.
V_WDR_b = [
    np.mean(np.clip(utils.resample(V_WDR, replace=True, random_state=seed), -100, 100))
    for seed in tqdm(range(1000))
]

100%|██████████| 1000/1000 [00:01<00:00, 755.87it/s]


In [29]:
# Mean and standard deviation of the bootstrap replicates of the WDR estimate.
np.mean(V_WDR_b), np.std(V_WDR_b)

(92.20258930477415, 0.22843724565578843)