In [1]:
import gym 
import numpy as np
from tqdm import tqdm
import time
from function_approximators.replay import ReplayBuffer
import torch

from sklearn.gaussian_process.kernels import RBF, ConstantKernel, RationalQuadratic, Matern
from sklearn.gaussian_process import GaussianProcessRegressor

# Alternative environment kept for reference; WindyGridworldEnv is not imported
# in the cells visible here, so it cannot simply be uncommented.
# env = WindyGridworldEnv()
# Notebook-level environment instance shared by the cells below.
env = gym.make("CartPole-v1")

In [2]:

def act(env, model, s, epsilon, explore):
    """Choose an action epsilon-greedily from the model's value estimates.

    Args:
        env: Environment exposing ``action_space.sample()`` and ``action_space.n``.
        model: Regressor whose ``predict`` is called with a ``(1, n_features)`` row
            built from the state followed by one action encoding.
        s: Current state as a 1-D array-like.
        epsilon: Probability of taking a random action when ``explore`` is true.
        explore: Enables epsilon-random exploration.

    Returns:
        Either a random sample from the action space or the index of the action
        with the largest predicted value.

    Note:
        The per-action encodings come from the notebook-level ``actions`` list,
        which must be defined before this function is called.
    """
    if explore and np.random.random_sample() < epsilon:
        return env.action_space.sample()

    try:
        q_values = [
            model.predict(np.concatenate([s, actions[i]], -1).reshape(1, -1))
            for i in range(env.action_space.n)
        ]
        return np.argmax(q_values)
    except Exception:
        # Best-effort fallback (e.g. a model that cannot predict yet): act randomly.
        # Deliberately `Exception`, not a bare `except`, so that Ctrl-C
        # (KeyboardInterrupt) and SystemExit still stop a long-running cell.
        return env.action_space.sample()

def update(env, model, batch, gamma):
    """Refit ``model`` on one-step Q-learning targets built from ``batch``.

    The regression inputs are the batch states concatenated with the encoding of
    the action taken; the targets are ``r + gamma * (1 - done) * max_a Q(s', a)``,
    where ``Q`` is the current ``model.predict``.

    Args:
        env: Environment; only ``action_space.n`` is read.
        model: Regressor exposing ``predict(X)`` and ``fit(X, y)``.
        batch: Object with ``states``, ``actions``, ``next_states``, ``rewards`` and
            ``done`` attributes. Each must be convertible with ``np.asarray``;
            every element of ``actions`` must support ``.item()``.
        gamma: Discount factor.

    Note:
        Reads the notebook-level ``actions`` list for the action encodings.
        All batch fields are converted to numpy up front so the target
        arithmetic never mixes array libraries, and rewards/done are flattened
        so both ``(B,)`` and ``(B, 1)`` layouts give one target per sample.
    """
    states = np.asarray(batch.states)
    next_states = np.asarray(batch.next_states)
    rewards = np.asarray(batch.rewards, dtype=float).reshape(-1)
    done = np.asarray(batch.done, dtype=float).reshape(-1)

    taken_encodings = [actions[int(a.item())] for a in batch.actions]
    inputs = np.concatenate([states, taken_encodings], -1)

    try:
        preds = []
        for i in range(env.action_space.n):
            # Broadcast the i-th action encoding onto every next state.
            encoding = np.zeros((len(next_states), 1)) + actions[i]
            next_inputs = np.concatenate([next_states, encoding], -1)
            preds.append(np.asarray(model.predict(next_inputs)).reshape(-1))
        next_q = np.array(preds).T  # shape: (batch, n_actions)
        outputs = rewards + gamma * (1.0 - done) * np.max(next_q, 1)
    except Exception:
        # Best-effort bootstrap: if the model cannot predict yet, regress on the
        # immediate rewards only. `Exception` (not a bare `except`) keeps
        # KeyboardInterrupt/SystemExit able to stop training.
        outputs = rewards

    model.fit(inputs, outputs)


def play_episode(env, model, replay_buffer, batch_size, gamma, epsilon, explore, train, episode_length):
    """Run a single episode and report its length and undiscounted return.

    Each step asks ``act`` for an action and advances ``env`` (``reset()`` is
    expected to return the observation and ``step()`` a 4-tuple). When ``train``
    is true the transition is pushed to ``replay_buffer`` as float32 arrays and,
    once the buffer holds at least ``batch_size`` items, ``update`` is called on
    a freshly sampled batch.

    Args:
        env: Environment to roll out.
        model: Value model forwarded to ``act`` and ``update``.
        replay_buffer: Buffer supporting ``push``, ``sample`` and ``len``.
        batch_size: Minimum buffer size, and sample size, for an update.
        gamma: Discount factor forwarded to ``update``.
        epsilon: Exploration rate forwarded to ``act``.
        explore: Whether ``act`` may take random actions.
        train: Whether transitions are stored and the model is updated.
        episode_length: Step cap after which the episode is cut off.

    Returns:
        ``(steps_taken, total_reward)`` for the episode.
    """
    state = env.reset()
    steps_taken = 0
    total_reward = 0

    while True:
        action = act(env, model, state, epsilon, explore=explore)
        next_state, reward, done, _ = env.step(action)

        if train:
            transition = (state, [action], next_state, [reward], [done])
            replay_buffer.push(*(np.array(part, dtype=np.float32) for part in transition))
            if len(replay_buffer) >= batch_size:
                update(env, model, replay_buffer.sample(batch_size), gamma)

        steps_taken += 1
        total_reward += reward

        # Stop on a terminal step or when the step cap is reached.
        if done or steps_taken == episode_length:
            break
        state = next_state

    return steps_taken, total_reward


In [30]:
# Value-function regressor: a Gaussian process with an RBF kernel whose length
# scale is pinned at 0.05 (length_scale_bounds="fixed", so it is not tuned in fit).
model = GaussianProcessRegressor(kernel = RBF(length_scale=0.05, length_scale_bounds="fixed"))
# Alternative regressors tried earlier; their imports are not in the visible cells.
# model = RandomForestRegressor(n_estimators=5, max_depth=20)
# model = MLPRegressor()
# Transition store used by play_episode; 1000 is presumably the capacity — TODO confirm
# against function_approximators.replay.ReplayBuffer.
replay_buffer = ReplayBuffer(1000)
# Four-action encoding, matching the commented-out WindyGridworldEnv:
# actions = [[1,0,0,0],[0,1,0,0],[0,0,1,0],[0,0,0,1]]
# One-hot encoding per discrete action index; act() and update() read this
# notebook-level list to build the regressor inputs.
actions =  [[1,0],[0,1]]

In [31]:
# Training configuration. These are notebook-level names; other cells read some
# of them (e.g. `gamma`), so this cell must run before those cells.
max_timesteps = 20000   # total environment steps to train for
timesteps_elapsed = 0   # running count of training steps taken so far
episode_length = 200    # per-episode step cap passed to play_episode
eval_freq = 1000        # evaluate roughly every this many training steps
eval_episodes = 5       # number of greedy episodes averaged per evaluation
gamma = 0.99            # discount factor
epsilon = 1             # initial exploration rate (decayed after each evaluation)
batch_size = 512        # replay sample size / minimum buffer fill before updates

with tqdm(total=max_timesteps) as pbar:

    while timesteps_elapsed < max_timesteps:
        # One exploratory training episode; its length advances the progress bar.
        episode_timesteps, _ = play_episode(env, model, replay_buffer, batch_size=batch_size, gamma=gamma, epsilon=epsilon, 
                                            explore=True, train=True, episode_length=episode_length)
        timesteps_elapsed += episode_timesteps
        pbar.update(episode_timesteps)

        # True exactly when this episode crossed a multiple of eval_freq.
        if timesteps_elapsed % eval_freq < episode_timesteps:
            eval_returns = 0
            for _ in range(eval_episodes):
                # Evaluation: no exploration, no buffer pushes, no model updates.
                _ , episode_return = play_episode(env, model, replay_buffer, batch_size=batch_size, gamma=gamma, epsilon=epsilon, 
                                                explore=False, train=False, episode_length=episode_length)
                eval_returns += episode_return / eval_episodes

            # Multiplicative epsilon decay with a floor of 0.03, applied once per evaluation.
            epsilon = max(epsilon*0.7,0.03)
            # tree.plot_tree(model)
            pbar.write(f"Evaluation at timestep {timesteps_elapsed} returned a mean returns of {eval_returns}")
            pbar.write(f"Epsilon = {epsilon}")


  3%|▎         | 521/20000 [00:04<02:31, 128.27it/s]


KeyboardInterrupt: 

In [32]:
# Ad-hoc check of the GP's predictive mean/std on one sampled state.
# Requires `replay_buffer`, `model`, `actions` and `env` from earlier cells, and
# refits the shared `model` in place.
batch = replay_buffer.sample(512)
obs = batch.states[0]  # first state of the sampled batch, used as the query point

# Fit on (state, action encoding) -> immediate reward only (no bootstrapped term).
inputs = np.concatenate([batch.states, [actions[int(i.item())] for i in batch.actions]], -1)
outputs = np.array(batch.rewards).reshape(-1)
model.fit(inputs, outputs)

# With return_std=True each prediction is a (mean, std) pair, one per action.
q_values = [model.predict(np.concatenate([obs, actions[i]],-1).reshape(1,-1), return_std=True) for i in range(env.action_space.n)]
print(q_values)
# Optimistic score per action: predictive mean plus two standard deviations.
Q = [q[0]+2*q[1] for q in q_values]
print(Q)
action = np.argmax(Q)
print(action)


[(array([1.73210007e-08]), array([1.])), (array([1.]), array([5.77351255e-06]))]
[array([2.00000002]), array([1.00001155])]
0


In [97]:
from sklearn.metrics.pairwise import rbf_kernel


In [95]:
def act(env, epsilon, obs, X, alpha, explore):
    """Epsilon-greedy action selection for the kernel-based value estimate.

    Note: this definition replaces the earlier model-based ``act`` in the
    notebook namespace once the cell has run.

    Args:
        env: Environment exposing ``action_space.sample()`` and ``action_space.n``.
        epsilon: Probability of a random action when ``explore`` is true.
        obs: Current observation as a 1-D array-like.
        X: Stored input rows forwarded to ``_predict``.
        alpha: Coefficient vector forwarded to ``_predict``.
        explore: Enables epsilon-random exploration.

    Returns:
        A random action sample, or the index of the action whose predicted
        value is largest. Action encodings come from the notebook-level
        ``actions`` list.
    """
    take_random = explore and np.random.random_sample() < epsilon
    if take_random:
        return env.action_space.sample()

    q_per_action = []
    for action_index in range(env.action_space.n):
        features = np.concatenate([obs, actions[action_index]], -1).reshape(1, -1)
        q_per_action.append(_predict(features, X, alpha))
    return np.argmax(q_per_action)

def _predict(x, X, alpha):
    return kernel(X, x).T @ alpha

def _inc_dim_v(v):
    return np.pad(v, ((0,1),(0,0)))

def _inc_dim_m(m):
    return np.pad(m, ((0,1),(0,1)))



In [99]:
# Online kernel-based Q-learning experiment on CartPole.
# The value estimate is Q(x) = kernel(X, x).T @ alpha, where X stacks every
# (observation, action encoding) row seen so far. Each step appends one row to X
# and grows alpha and C by one dimension via a rank-one update.
# NOTE(review): `gamma` is not defined in this cell; it is read from the training
# configuration cell, so that cell must have been run first.
env = gym.make("CartPole-v1")
epsilon = 0.1
actions =  [[1,0],[0,1]]
learning_rate = 0.01

kernel = rbf_kernel

obs = env.reset()
action = env.action_space.sample()

sigma_0 = 1
# First stored input: the initial observation with the encoding of a random action.
x = np.concatenate([obs, actions[action]],-1).reshape(1,-1)
X = x
alpha = np.array([[1]])
C = np.array([[1]])
# NOTE(review): `mew`, `sigma` and `r` are reassigned inside the loop before being
# read, so these starting values have no effect; `mew` is never read at all.
mew = 1
sigma = 1
r = -0.5
# Column vector that gains a leading zero each step, keeping its single 1 in the
# last (newest) position.
e = np.array([[1]])


for i in range(1000):

    # NOTE(review): `done` is never read and the environment is never reset, so
    # the loop keeps stepping after an episode has terminated — confirm intent.
    next_obs, reward, done, _ = env.step(action)
    
    # Current estimates at the next observation for every action; their maximum is
    # the bootstrap term of the TD error below.
    Q_values = [_predict(np.concatenate([next_obs, actions[i]],-1).reshape(1,-1), X, alpha) for i in range(env.action_space.n)]
    print(f"Q_values: {Q_values}")
    Q_max = np.max(Q_values)
    print(f"Q_max: {Q_max}")
    # Estimate at the input row stored on the previous iteration.
    Q_prev = _predict(x, X, alpha).item()
    # print(f"Q_prev: {Q_prev}")
    
    # NOTE(review): the new row pairs `next_obs` with the action that was just
    # executed, not with the action chosen for `next_obs` at the end of this
    # iteration — verify this is the intended state-action pairing.
    x = np.concatenate([next_obs, actions[action]],-1).reshape(1,-1)
    # print(f"x: {x}")
    k = kernel(x, x)
    # print(f"k: {k}")
    kk = kernel(X, x)
    # print(f"kk: {kk}")
    # NOTE(review): `K` is computed over all stored rows every step but never used
    # in this cell.
    K = kernel(X)
    # print(f"K: {K}")
    
    mew = kk.T @ alpha
    # print(f"mew: {mew}")
    sigma = k + kk.T @ C @ kk
    # print(f"sigma: {sigma}")
    
    # Scalar factors shared by the C and alpha updates.
    r = -1/(sigma_0**2 + sigma)
    # print(f"r: {r}")
    # Scaled TD error: learning_rate * (reward + gamma*Q_max - Q_prev) / (sigma_0^2 + sigma).
    y = learning_rate * (reward + gamma*Q_max - Q_prev) / (sigma_0**2 + sigma)
    # print(f"y: {y}")
    e = np.vstack([[0], e])
    # print(f"e: {e}")
    # Update direction: C @ kk padded with a zero row, plus the unit vector for the new row.
    s = _inc_dim_v(C@kk) + e
    # print(f"s: {s}")
    # Grow C by one row/column, then apply the rank-one correction r * s s^T.
    C = _inc_dim_m(C) + r*(s@s.T)
    # print(f"C: {C}")
    # Grow alpha by one entry and move it along s by the scaled TD error.
    alpha = _inc_dim_v(alpha) + y*s  
    # print(f"alpha: {alpha}")
    
    # Store the new row; X (and the kernel evaluations) grow by one row per step.
    X = np.vstack([X, x])
    # print(f"X: {X}")

    obs = next_obs
    # Epsilon-greedy choice using the updated X and alpha.
    action = act(env, epsilon, obs, X, alpha, explore=True)




Q_values: [array([[0.70166577]]), array([[0.97925346]])]
Q_max: 0.9792534635547575
Q_values: [array([[0.66333759]]), array([[0.92576219]])]
Q_max: 0.9257621868167007
Q_values: [array([[0.59974264]]), array([[0.83700828]])]
Q_max: 0.8370082843016494
Q_values: [array([[0.51902347]]), array([[0.7243556]])]
Q_max: 0.7243556044660061
Q_values: [array([[0.4296068]]), array([[0.59956459]])]
Q_max: 0.5995645917142819
Q_values: [array([[0.33962808]]), array([[0.47398917]])]
Q_max: 0.473989166312608
Q_values: [array([[0.25605768]]), array([[0.35735729]])]
Q_max: 0.3573572854337468
Q_values: [array([[0.1839801]]), array([[0.25676492]])]
Q_max: 0.25676491872383506
Q_values: [array([[0.12618578]]), array([[0.17610644]])]
Q_max: 0.1761064436732574
Q_values: [array([[0.08316794]]), array([[0.11607021]])]
Q_max: 0.11607021326394025
Q_values: [array([[0.0513574]]), array([[0.07167502]])]
Q_max: 0.07167502290626614
Q_values: [array([[0.03025924]]), array([[0.04223017]])]
Q_max: 0.04223017060654178
Q_val

In [None]:
# Display whatever callable `kernel` is currently bound to (set in the cell above).
kernel