In [1]:
import time

import gym
import numpy as np
import torch
from sklearn import tree
from sklearn.ensemble import RandomForestRegressor
from sklearn.exceptions import NotFittedError
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.utils.validation import check_is_fitted
from tqdm import tqdm

from function_approximators.replay import ReplayBuffer

# Environment the agent is trained and evaluated on.
# env = WindyGridworldEnv()
ENV_ID = "CartPole-v1"
env = gym.make(ENV_ID)

In [2]:

def act(env, model, s, epsilon, explore):
    """Pick an action for state ``s`` with an epsilon-greedy rule.

    When ``explore`` is true, a uniformly random action is drawn from
    ``env.action_space`` with probability ``epsilon``. Otherwise ``model`` is
    queried once per discrete action on ``s`` concatenated with that action's
    row of the notebook-level ``actions`` table, and the index of the largest
    prediction is returned.

    Args:
        env: Environment exposing ``action_space.n`` and ``action_space.sample()``.
        model: Regressor with a scikit-learn style ``predict`` method.
        s: 1-D state vector.
        epsilon: Probability of a random action while exploring.
        explore: Whether random exploration is allowed at all.

    Returns:
        The chosen action index.

    Raises:
        Any exception raised while building the inputs or predicting, except
        ``NotFittedError``. Earlier revisions swallowed every error and
        silently acted at random.
    """
    if explore and np.random.random_sample() < epsilon:
        return env.action_space.sample()

    try:
        q_values = [
            model.predict(np.concatenate([s, actions[i]], -1).reshape(1, -1))
            for i in range(env.action_space.n)
        ]
    except NotFittedError:
        # Before the first call to fit() there is no Q estimate to be greedy
        # about, so fall back to a random action. Only this specific error is
        # handled: real bugs must surface instead of degrading the policy.
        return env.action_space.sample()

    return np.argmax(q_values)

def update(env, model, batch, gamma):
    """Refit ``model`` on one-step TD targets computed from a replay batch.

    The regression inputs are the batch states concatenated with the row of
    the notebook-level ``actions`` table for the action that was taken. The
    target for each transition is
    ``reward + gamma * (1 - done) * max_a model.predict([next_state, actions[a]])``.
    If the model has not been fitted yet, the targets are the rewards alone.

    Args:
        env: Environment exposing ``action_space.n``.
        model: Regressor with scikit-learn style ``fit`` and ``predict``.
        batch: Object with ``states``, ``actions``, ``next_states``, ``rewards``
            and ``done`` fields; each must be convertible with ``np.asarray``,
            and elements of ``actions`` must support ``.item()``.
        gamma: Discount factor.

    Raises:
        Any exception other than ``NotFittedError`` raised while computing the
        bootstrap targets. Earlier revisions swallowed every error and
        silently trained on rewards only.
    """
    inputs = np.concatenate(
        [np.asarray(batch.states), [actions[int(a.item())] for a in batch.actions]], -1
    )
    # Convert explicitly so the target arithmetic is plain NumPy rather than a
    # mix of replay-batch containers and arrays.
    rewards = np.asarray(batch.rewards, dtype=np.float64).reshape(-1)
    next_states = np.asarray(batch.next_states)
    batch_len = next_states.shape[0]

    try:
        # One column of predicted Q-values per discrete action.
        next_q = np.stack(
            [
                model.predict(
                    np.concatenate([next_states, np.zeros((batch_len, 1)) + actions[i]], -1)
                )
                for i in range(env.action_space.n)
            ],
            axis=1,
        )
    except NotFittedError:
        # First update: nothing to bootstrap from yet.
        targets = rewards
    else:
        not_done = 1.0 - np.asarray(batch.done, dtype=np.float64).reshape(-1)
        targets = rewards + gamma * not_done * next_q.max(axis=1)

    model.fit(inputs, targets)


def play_episode(env, model, replay_buffer, batch_size, gamma, epsilon, explore, train, episode_length):
    """Roll out one episode and return ``(timesteps, summed reward)``.

    Actions come from ``act``. When ``train`` is true every transition is
    pushed to ``replay_buffer`` as float32 arrays, and once the buffer holds at
    least ``batch_size`` items a sampled batch is passed to ``update`` after
    each step. The roll-out stops when the environment reports ``done`` or
    after ``episode_length`` steps, whichever comes first.
    """
    state = env.reset()
    steps_taken, total_reward = 0, 0
    finished = False

    while not finished:
        action = act(env, model, state, epsilon, explore=explore)
        next_state, reward, finished, _ = env.step(action)

        if train:
            # Field order expected by push(): state, action, next state, reward, done.
            transition = (state, [action], next_state, [reward], [finished])
            replay_buffer.push(*(np.array(field, dtype=np.float32) for field in transition))
            if len(replay_buffer) >= batch_size:
                update(env, model, replay_buffer.sample(batch_size), gamma)

        steps_taken += 1
        total_reward += reward

        # Truncate episodes that reach the step budget.
        if steps_taken == episode_length:
            break
        state = next_state

    return steps_taken, total_reward


In [3]:
# Q-function approximator: a shallow regression tree.
tree_params = {"max_depth": 5, "min_samples_split": 5, "min_samples_leaf": 5}
model = DecisionTreeRegressor(**tree_params)
# Alternative regressors, kept for reference:
# model = RandomForestRegressor(n_estimators=5, max_depth=20)
# model = MLPRegressor()

# Replay memory built with argument 1000 (presumably its capacity; confirm against ReplayBuffer).
replay_buffer = ReplayBuffer(1000)

# Per-action encoding rows, indexed by action id; act() and update() read this as a global.
# actions = [[1,0,0,0],[0,1,0,0],[0,0,1,0],[0,0,0,1]]
actions = [
    [1, 0],
    [0, 1],
]

In [4]:
# Training budget and evaluation schedule.
max_timesteps, eval_freq, eval_episodes = 20000, 1000, 5
episode_length = 200
batch_size = 512

# Learning hyper-parameters; epsilon is decayed after every evaluation round.
gamma = 0.99
epsilon = 1

timesteps_elapsed = 0

with tqdm(total=max_timesteps) as pbar:
    while timesteps_elapsed < max_timesteps:
        # One exploratory training episode.
        episode_timesteps, _ = play_episode(
            env, model, replay_buffer,
            batch_size=batch_size, gamma=gamma, epsilon=epsilon,
            explore=True, train=True, episode_length=episode_length,
        )
        timesteps_elapsed += episode_timesteps
        pbar.update(episode_timesteps)

        # Evaluate only when this episode crossed a multiple of eval_freq.
        if timesteps_elapsed % eval_freq >= episode_timesteps:
            continue

        # Greedy roll-outs with no training; accumulate the mean return.
        eval_returns = 0
        for _ in range(eval_episodes):
            _, episode_return = play_episode(
                env, model, replay_buffer,
                batch_size=batch_size, gamma=gamma, epsilon=epsilon,
                explore=False, train=False, episode_length=episode_length,
            )
            eval_returns += episode_return / eval_episodes

        # Multiplicative decay with a floor of 0.03.
        epsilon = max(epsilon * 0.7, 0.03)
        # tree.plot_tree(model)
        pbar.write(f"Evaluation at timestep {timesteps_elapsed} returned a mean returns of {eval_returns}")
        pbar.write(f"Epsilon = {epsilon}")


  5%|▍         | 992/20000 [00:08<02:33, 123.47it/s]


IndexError: index 1000 is out of bounds for axis 0 with size 1000

In [5]:
import scipy.spatial.distance as dist


def _transition_row(index):
    """Flatten fields 0, 1 and 2 of the stored transition at ``index`` into one vector.

    Presumably these fields are states, actions and next states; confirm against ReplayBuffer.
    """
    memory = replay_buffer.memory
    return np.concatenate((memory[0][index], memory[1][index], memory[2][index]))


n = replay_buffer.memory.states.shape[0]

# Reference transition that every stored row is compared against.
a = _transition_row(0)
print(a)

# Euclidean distance from the reference row to each row (the reference itself included).
distances = [dist.euclidean(a, _transition_row(i)) for i in range(n)]

# Bare mid-cell expressions are evaluated but not displayed; only the last one is.
[(i, d) for (i, d) in enumerate(distances) if d < 0.1]

print(_transition_row(2))

distances

min(distances)

[-0.08800144 -0.8159531   0.07608863  1.172605    1.         -0.10432051
 -0.62189823  0.09954073  0.9047134 ]
[ 0.00810471  0.01968517  0.00894894  0.0262062   0.          0.00849841
 -0.17556398  0.00947307  0.32169914]


0.0

In [22]:
# Same nearest-row check as a single cdist call, on state+action features only.
memory = replay_buffer.memory
a = np.concatenate((memory[0][0], memory[1][0])).reshape(1, 5)
b = np.concatenate((memory.states, memory.actions), axis=1)

c = dist.cdist(a, b, 'euclidean')
np.min(c)

0.0

In [6]:
# Scratch experiment: fit a fresh tree on one small batch in which a single
# reward has been perturbed, then inspect the resulting targets and tree.
#
# Earlier revisions left the definition of `model1` commented out and called
# predict() and plot_tree() before any fit(), so the cell could only run
# against state left over from a previous kernel session.
#
# NOTE(review): the saved output of this cell is `ValueError: low >= high`.
# That presumably comes from `replay_buffer.sample`; confirm against ReplayBuffer.
model1 = DecisionTreeRegressor()

batch1 = replay_buffer.sample(32)
batch1.rewards[3] += 1

inputs1 = np.concatenate([batch1.states, [actions[int(i.item())] for i in batch1.actions]], -1)
rewards1 = np.asarray(batch1.rewards).reshape(-1)

# Step 1: initial fit on immediate rewards so the tree can be queried.
model1.fit(inputs1, rewards1)

# Step 2: one bootstrapped update, using the same target rule as update().
preds1 = []
for i in range(env.action_space.n):
    next_inputs1 = np.concatenate([batch1.next_states, np.zeros((batch1.actions.size()[0], 1)) + actions[i]], -1)
    preds1.append(model1.predict(next_inputs1))
preds1 = np.array(preds1).T
not_done1 = 1 - np.asarray(batch1.done).reshape(-1)
outputs1 = rewards1 + gamma * not_done1 * np.max(preds1, 1)
print(outputs1)

model1.fit(inputs1, outputs1)

# Plot only after fitting; the trailing semicolon hides the list-of-artists repr.
tree.plot_tree(model1);

ValueError: low >= high

In [22]:
batch = replay_buffer.sample(32)


def _next_state_action_inputs(action_index):
    """Pair every next state in ``batch`` with the encoding row of one action."""
    action_block = np.zeros((batch.actions.size()[0], 1)) + actions[action_index]
    return np.concatenate([batch.next_states, action_block], -1)


# Mid-cell expression: evaluated but not displayed.
_next_state_action_inputs(0).shape

# One column of predictions per discrete action.
preds = np.array(
    [model.predict(_next_state_action_inputs(i)) for i in range(env.action_space.n)]
).T
best_next_q = np.max(preds, 1).reshape(-1, 1)
outputs = np.array(batch.rewards + gamma * (1 - batch.done) * best_next_q).reshape(-1)

In [23]:
# Shape of the bootstrapped targets computed in the previous cell.
np.shape(outputs)

(32,)

In [28]:
# Shape of the rewards-only targets, for comparison with `outputs`.
np.array(batch.rewards).ravel().shape

(32,)