In [None]:
# Run configuration for the notebook.

# Gymnasium environment id passed to gym.make below.
env_name = "CartPole-v0"
# env_name = "MountainCar-v0"
# env_name = "LunarLander-v3"
# Seed passed to env.reset and embedded in the output CSV file names.
seed = 2
# Number of episodes each agent is trained on its own before joining the ensemble.
initial_training = 10
# Number of ensemble episodes run by the main loop.
max_evaluations = 100
# Optional paths to pickled agents; when None the matching agent is built and
# trained from scratch instead of being loaded.
dqn_path = None
model_dqn_path = None


In [None]:
import numpy as np

def softmax(q, temp=1.0):
    """Return the temperature-scaled softmax of ``q``.

    Args:
        q: Array-like of scores. It is never modified; integer arrays and
            plain sequences are accepted as well as float arrays.
        temp: Temperature the scores are divided by before exponentiation.

    Returns:
        Array of non-negative weights that sum to 1.
    """
    # Divide out-of-place so the caller's array is left untouched (an in-place
    # `/=` would also fail on integer arrays).
    scaled = np.asarray(q) / temp
    # Subtracting the maximum keeps exp() from overflowing without changing
    # the result.
    exp_q = np.exp(scaled - np.max(scaled))
    return exp_q / np.sum(exp_q)

class AccumulateErrorEnsemble:
    """Base ensemble that scores each member by its accumulated squared TD error.

    Every member must expose ``infer(obs)`` whose first element is that
    member's vector of action values. Subclasses implement ``act``.
    """

    def __init__(self, models, gamma=0.99, decay=1.0, temp=1.0):
        self.models = models
        self.gamma = gamma
        self.decay = decay
        self.temp = temp
        # One running squared-error total per ensemble member.
        self.cumulative_errors = np.zeros(len(models))
        # Action values each member produced for the most recent observation.
        self.prev_qs = None

    def _update_prev_qs(self, obs):
        """Query every member for ``obs`` unless action values are already cached."""
        if self.prev_qs is not None:
            return
        self.prev_qs = [member.infer(obs)[0] for member in self.models]

    def _get_weights(self):
        """Return softmax weights that favour members with lower accumulated error."""
        return softmax(-self.cumulative_errors, self.temp)

    def act(self, obs):
        """Choose an action for ``obs``; must be provided by subclasses."""
        raise NotImplementedError

    def observe(self, action, obs, reward, done):
        """Fold the outcome of ``action`` into every member's error total.

        When ``done`` is true the totals and the cached action values are
        reset and nothing else happens. Otherwise the totals are multiplied
        by ``decay`` and each member's squared one-step TD error is added.
        """
        if done:
            self.cumulative_errors *= 0.0
            self.prev_qs = None
            return

        self.cumulative_errors *= self.decay
        fresh_qs = []
        for idx, member in enumerate(self.models):
            q_next = member.infer(obs)[0]
            fresh_qs.append(q_next)
            td_target = reward + self.gamma * np.max(q_next)
            td_error = td_target - self.prev_qs[idx][action]
            self.cumulative_errors[idx] += td_error ** 2
        # The values just computed become the baseline for the next step.
        self.prev_qs = fresh_qs


class TDWAverageEnsemble(AccumulateErrorEnsemble):
    """Ensemble that acts greedily on the weighted average of member action values."""

    def __init__(self, models, gamma=0.99, decay=1.0, temp=1.0, visualizer=None):
        # Optional object whose ``update(weights)`` is called on every act().
        self.visualizer = visualizer
        super().__init__(models, gamma=gamma, decay=decay, temp=temp)

    def act(self, obs):
        """Return the action with the highest weighted-average value for ``obs``."""
        self._update_prev_qs(obs)
        member_weights = self._get_weights()

        if self.visualizer is not None:
            self.visualizer.update(member_weights)

        # Scale each member's row of action values by its weight, then add
        # the rows together.
        stacked_qs = np.array(self.prev_qs)
        column_weights = np.reshape(member_weights, [-1, 1])
        blended_q = np.sum(column_weights * stacked_qs, axis=0)

        chosen = np.argmax(blended_q)
        self.prev_action = chosen
        return chosen


class TDWVoteEnsemble(AccumulateErrorEnsemble):
    """Ensemble in which each member casts a weighted vote for its greedy action."""

    def __init__(self, models, gamma=0.99, decay=1.0, temp=1.0, visualizer=None):
        # Optional object whose ``update(weights)`` is called on every act().
        self.visualizer = visualizer
        super().__init__(models, gamma, decay, temp)

    def act(self, obs):
        """Pick the action with the largest total vote weight.

        Args:
            obs: Observation forwarded to each member's ``infer``.

        Returns:
            Tuple ``(action, most_contributing_agent)``: the winning action
            index and the index of the highest-weighted member that voted
            for that action.
        """
        self._update_prev_qs(obs)
        weights = self._get_weights()

        if self.visualizer is not None:
            self.visualizer.update(weights)

        # Tally all votes first so the winner is known before credit is given.
        preferred_actions = [np.argmax(q) for q in self.prev_qs]
        votes = np.zeros(len(self.prev_qs[0]), dtype=np.float32)
        for w, preferred in zip(weights, preferred_actions):
            votes[preferred] += w
        action = np.argmax(votes)

        # Credit only members that voted for the final winner. Comparing
        # against the running tally instead would make the result depend on
        # member order and could credit a member that backed a losing action.
        # Non-supporters start at -1 so a supporter wins even when its weight
        # has underflowed to 0.
        agent_contributions = np.full(len(self.prev_qs), -1.0, dtype=np.float32)
        for i, (w, preferred) in enumerate(zip(weights, preferred_actions)):
            if preferred == action:
                agent_contributions[i] = w
        most_contributing_agent = np.argmax(agent_contributions)

        self.prev_action = action
        return action, most_contributing_agent

In [None]:
import numpy as np
import random
from collections import deque
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import Adam

class Agent:
    """Minimal agent interface; every method here is a no-op returning None.

    Subclasses override the methods they need. The episode hooks are part of
    the interface because the training loop calls them on every agent.
    """

    def __init__(self, env):
        pass

    def on_episode_start(self):
        """Hook called before the first step of an episode."""
        pass

    def on_episode_end(self):
        """Hook called after the last step of an episode."""
        pass

    def act(self, state):
        """Return the action to take in ``state``."""
        pass

    def update_model(self, state, action, reward, new_state, done):
        """Learn from one observed transition."""
        pass

class DQN_Agent(Agent):
    """Deep Q-learning agent with a replay memory and a separate target network.

    States handed to ``act`` and ``update_model`` are expected to carry a
    leading batch axis of size one, because they are passed straight to the
    network and later concatenated along axis 0.
    """

    def __init__(self, env, name="DQN_Agent"):
        self.name = name
        self.env = env
        self.replay_memory = deque(maxlen=200000)

        self.gamma = 0.99
        self.epsilon = 1.0
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.9995
        self.learning_rate = 0.05
        self.target_update_counter = 0
        self.C = 8  # interval (in update_model calls) between target-network syncs
        self.initial_random_steps = 0
        self.actions_count = 0
        self.clip_errors = True

        self.q_network = self.init_q_network()
        self.target_q_network = self.init_q_network()

    def on_episode_start(self):
        """Hook called before the first step of an episode; no-op here."""
        pass

    def on_episode_end(self):
        """Hook called after the last step of an episode; no-op here."""
        pass

    def get_observation_space(self):
        """Return the environment's observation space."""
        return self.env.observation_space

    def get_action_space(self):
        """Return the environment's action space."""
        return self.env.action_space

    def init_q_network(self):
        """Build and compile a one-hidden-layer network with one output per action."""
        model = Sequential()
        state_shape = self.get_observation_space().shape
        model.add(Dense(48, input_shape=state_shape, activation="relu"))
        model.add(Dense(self.get_action_space().n, activation='linear'))
        model.compile(loss="mean_squared_error", optimizer=Adam(learning_rate=self.learning_rate))
        return model

    def act(self, state):
        """Return an epsilon-greedy action for ``state``, decaying epsilon each call."""
        self.actions_count += 1
        self.epsilon *= self.epsilon_decay
        self.epsilon = max(self.epsilon_min, self.epsilon)
        if np.random.random() < self.epsilon or self.actions_count < self.initial_random_steps:
            return self.get_action_space().sample()
        return np.argmax(self.q_network.predict(state)[0])

    def infer(self, obs_t):
        """Return the Q-network's output for ``obs_t``.

        Arrays with more than three dimensions are passed through unchanged;
        anything else is flattened into a single row first.
        """
        if isinstance(obs_t, np.ndarray) and obs_t.ndim > 3:
            return self.q_network.predict(obs_t)
        obs_t = np.array(obs_t).reshape(1, -1)
        q_values = self.q_network.predict(obs_t)
        return q_values

    def update_model(self, state, action, reward, new_state, done):
        """Store the transition, take one training step and maybe sync the target network."""
        self.replay_memory.append([state, action, reward, new_state, done])
        self.fit_q_network()
        self.update_target_q_network()

    def sample_replays(self, batch_size):
        """Return ``batch_size`` transitions drawn without replacement from memory."""
        return random.sample(self.replay_memory, batch_size)

    def fit_q_network(self):
        """Run one gradient step on a random mini-batch of stored transitions.

        Does nothing until the memory holds at least one full batch.
        """
        batch_size = 16
        if len(self.replay_memory) < batch_size:
            return

        samples = self.sample_replays(batch_size)
        states = np.concatenate([sample[0] for sample in samples], axis=0)
        next_states = np.concatenate([sample[3] for sample in samples], axis=0)

        # Three batched forward passes instead of three per sample.
        targets = self.target_q_network.predict(states)
        predicted = self.q_network.predict(states)
        next_q = self.target_q_network.predict(next_states)

        for row, (_, action, reward, _, done) in enumerate(samples):
            if done:
                # Terminal transition: nothing to bootstrap from.
                targets[row][action] = reward
                continue

            # Bellman target computed with the target network.
            bellman = reward + self.gamma * np.max(next_q[row])
            if self.clip_errors:
                # Limit the TD error to [-1, +1] around the current estimate.
                current = predicted[row][action]
                td_error = max(-1.0, min(1.0, bellman - current))
                bellman = current + td_error
            targets[row][action] = bellman

        self.q_network.fit(states, targets, epochs=1, verbose=0)

    def update_target_q_network(self):
        """Copy the Q-network weights into the target network every ``C`` calls."""
        self.target_update_counter += 1
        if self.target_update_counter >= self.C:
            self.target_update_counter = 0
            self.target_q_network.set_weights(self.q_network.get_weights())

In [None]:
import numpy as np
import random
from collections import deque
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import Adam
from scipy import stats

class DQN_Guided_Exploration(DQN_Agent):
    """DQN agent whose exploratory actions are steered by a learned dynamics model.

    Until the dynamics model is judged converged, exploration is uniformly
    random. Afterwards an exploratory step picks the action whose predicted
    next state has the lowest density under a Gaussian fitted to recently
    stored states.
    """

    def __init__(self, env, name="DQN_Guided_Exploration", gamma=0.99, epsilon=1.0, epsilon_min=0.01, epsilon_decay=0.9995, learning_rate=0.05):
        """Create the agent.

        Args:
            env: Environment providing ``observation_space`` and ``action_space``.
            name: Label stored on the agent.
            gamma: Discount factor used in the Bellman target.
            epsilon: Initial exploration probability.
            epsilon_min: Lower bound for ``epsilon``.
            epsilon_decay: Factor applied to ``epsilon`` on every ``act`` call.
            learning_rate: Learning rate of the Q-network optimizer.
        """
        # The parent constructor is not called: it would build the networks
        # with its own fixed learning rate before ours is in place.
        self.name = name
        self.env = env
        self.replay_memory = deque(maxlen=200000)

        # Settings noted by the original author for Mountain Car:
        #   explore sample = 50
        #   qnetwork = 1 hidden layer, 48 units
        #   convergence cutoff 0.0003
        #   dynamics network lr = 0.02
        #   dynamics network batchsize = 64
        #   scatter plot 2000 sample
        # For Lunar Lander the author used the same values as the defaults
        # except initial_random_steps = 5000.
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay
        self.learning_rate = learning_rate
        self.target_update_counter = 0
        self.C = 8  # interval for updating the target network
        self.initial_random_steps = 0
        self.actions_count = 0
        self.clip_errors = True

        self.q_network = self.init_q_network()
        self.target_q_network = self.init_q_network()
        self.dynamics_model = self.init_dynamics_model()
        self.update_count = 0
        self.dynamics_model_converged = False

    def update_model(self, state, action, reward, new_state, done):
        """Store the transition and train the Q-network and the dynamics model.

        The dynamics model is fitted on every 25th call and checked for
        convergence on every 500th call.
        """
        self.replay_memory.append([state, action, reward, new_state, done])
        self.fit_q_network()
        self.update_target_q_network()
        self.update_count += 1

        if self.update_count % 25 == 0:
            self.fit_dynamics_model()
        if self.update_count % 500 == 0:
            self.eval_dynamics_model()

    def act(self, state):
        """Return an epsilon-greedy action, exploring through ``explore``."""
        self.actions_count += 1
        self.epsilon *= self.epsilon_decay
        self.epsilon = max(self.epsilon_min, self.epsilon)
        if np.random.random() < self.epsilon or self.actions_count < self.initial_random_steps:
            return self.explore(state)
        return np.argmax(self.q_network.predict(state)[0])

    def explore(self, state):
        """Return an exploratory action for ``state``.

        Samples uniformly while the dynamics model has not converged.
        Afterwards returns the action whose predicted next state is least
        likely under the distribution of the most recently stored states.
        """
        if not self.dynamics_model_converged:
            return self.get_action_space().sample()

        num_samples = 50
        memory_size = len(self.replay_memory)
        # Clamp so a short memory cannot produce out-of-range indices.
        first = max(0, memory_size - num_samples)
        samples = [self.replay_memory[i][0] for i in range(first, memory_size)]

        least_p = np.inf
        best_a = -1
        for action in range(self.get_action_space().n):
            next_state = self.dynamics_model.predict(np.append(state, [[action]], axis=1))
            p = self.get_probability(next_state, samples)
            if p < least_p:
                best_a = action
                least_p = p
        if best_a < 0:
            # No density compared below infinity (e.g. NaN predictions);
            # fall back to a valid random action instead of returning -1.
            return self.get_action_space().sample()
        return best_a

    def get_probability(self, state, samples):
        """Return the Gaussian density of ``state[0]`` under ``samples``.

        Args:
            state: Batched state whose first row is evaluated.
            samples: Sequence of batched states; the first row of each is
                used to estimate the mean and covariance.
        """
        design = np.stack([s[0] for s in samples]).T
        cov = np.cov(design)
        mean = np.mean(design, axis=1)
        # allow_singular avoids an exception when the recent states are
        # degenerate (for example, one coordinate did not change).
        return stats.multivariate_normal.pdf(state[0], mean, cov, allow_singular=True)

    def init_dynamics_model(self):
        """Build the network that maps (state, action) to the next state."""
        model = Sequential()
        # Input is the state vector with the action appended as one extra value.
        state_shape = (self.get_observation_space().shape[0] + 1,)
        print(state_shape)
        model.add(Dense(24, input_shape=state_shape, activation="relu"))
        model.add(Dense(24, activation="relu"))
        model.add(Dense(self.get_observation_space().shape[0], activation='linear'))
        model.compile(loss="mean_squared_error", optimizer=Adam(learning_rate=0.02))
        return model

    def _build_dynamics_batch(self, samples):
        """Turn replay transitions into (inputs, targets) arrays for the dynamics model."""
        inputs = [np.append(state, [[action]], axis=1) for state, action, _, _, _ in samples]
        targets = [new_state for _, _, _, new_state, _ in samples]
        return np.concatenate(inputs, axis=0), np.concatenate(targets, axis=0)

    def fit_dynamics_model(self):
        """Train the dynamics model for one epoch on a random batch of 64 transitions."""
        batchsize = 64
        if len(self.replay_memory) < batchsize:
            return
        batched_inputs, batched_targets = self._build_dynamics_batch(self.sample_replays(batchsize))
        self.dynamics_model.fit(batched_inputs, batched_targets, epochs=1, verbose=0)

    def eval_dynamics_model(self):
        """Evaluate the dynamics model and flag convergence when the loss is small.

        This is more than a diagnostic: once the loss on a random batch of 32
        transitions drops below 0.005, ``dynamics_model_converged`` is set
        and ``explore`` switches from random to guided exploration.
        """
        eval_size = 32
        if len(self.replay_memory) < eval_size:
            return
        batched_inputs, batched_targets = self._build_dynamics_batch(self.sample_replays(eval_size))
        scores = self.dynamics_model.evaluate(batched_inputs, batched_targets, verbose=0)
        if scores < 0.005:
            self.dynamics_model_converged = True
            print('Dynamics model has converged!')
        print(self.dynamics_model.metrics_names, scores)

In [None]:
import pandas as pd

def save_actor_distribution(agent_distribution, path=None):
    """Append per-actor counts to a CSV file.

    Args:
        agent_distribution: Mapping from actor name to a count.
        path: Destination CSV file. Rows are appended without a header;
            when None nothing is written.
    """
    table = pd.DataFrame.from_dict(agent_distribution, orient='index', columns=['count'])
    table.index.name = 'actor'
    if path is None:
        return
    table.to_csv(path, mode='a', header=False)

def save_rewards_and_length(rewards, path=None):
    """Append ``rewards`` as rows to the CSV file at ``path``.

    Rows are written with their index and without a header. With
    ``path=None`` no file is written.
    """
    pd.DataFrame(rewards).to_csv(path, mode='a', header=False)

In [None]:
import gymnasium as gym
import ale_py
import numpy as np
import time
import pickle

# Register ale_py's environments with gymnasium (presumably so the "ALE/..."
# ids checked further down can be created by gym.make -- confirm).
gym.register_envs(ale_py)
# Build the environment selected in the configuration cell and seed it once.
env = gym.make(env_name)
env.reset(seed=seed)
# env = wrappers.Monitor(env, 'replay', video_callable=lambda e: e%record_video_every == 0,force=True)


# Observations are reshaped to a single-row batch of this shape before being
# handed to the agents.
state_shape = (1,env.observation_space.shape[0])
# Ensemble members; filled in below and read as a global by ensemble_training.
agents = []

def train_agent(agent, env, max_episodes, state_shape):
    """Train a single agent on the given environment.

    Args:
        agent: Object providing ``on_episode_start``, ``act``,
            ``update_model`` and ``on_episode_end``.
        env: Environment whose ``reset`` returns ``(obs, info)`` and whose
            ``step`` returns ``(obs, reward, terminated, truncated, info)``.
        max_episodes: Number of episodes to run.
        state_shape: Shape every observation is reshaped to before it is
            passed to the agent.

    Returns:
        Tuple ``(total_reward_list, episode_length_list)`` with one entry
        per episode.
    """
    start_time = time.time()
    total_reward_list = []
    episode_length_list = []
    for episode in range(max_episodes):
        agent.on_episode_start()
        state, _ = env.reset()
        cur_state = state.reshape(state_shape)
        steps = 0
        total_reward = 0
        episode_over = False
        while not episode_over:
            steps += 1
            action = agent.act(cur_state)
            new_state, reward, terminated, truncated, _ = env.step(action)
            # Either flag ends the episode, but only a genuine termination is
            # reported to the agent as terminal: after a truncation the next
            # state still has value and the target should bootstrap from it.
            episode_over = terminated or truncated
            new_state = new_state.reshape(state_shape)
            agent.update_model(cur_state, action, reward, new_state, terminated)
            cur_state = new_state
            total_reward += reward

        agent.on_episode_end()
        total_reward_list.append(total_reward)
        episode_length_list.append(steps)
        print('episode {} steps: {}, total reward: {},  elapsed time: {}s'.format(episode, steps, total_reward, int(time.time()-start_time)))
    return total_reward_list, episode_length_list

def ensemble_training(env, method, epsilon, rng):
    """Run one episode with the ensemble choosing actions and train the chosen member.

    Relies on the module-level ``seed``, ``agents`` and ``state_shape``.

    Args:
        env: Environment whose ``step`` returns
            ``(obs, reward, terminated, truncated, info)``.
        method: Ensemble whose ``act(obs)`` returns ``(action, actor_index)``
            and which exposes ``observe(action, obs, reward, done)``.
        epsilon: Probability of replacing the ensemble's action with a
            random one.
        rng: Random generator providing ``rand`` and ``randint``.

    Returns:
        The unclipped sum of rewards collected during the episode. The
        per-actor step counts are appended to ``actor_distribution_<seed>.csv``.
    """
    obs, _ = env.reset(seed=seed)
    cumulative_reward = 0.0
    agent_distribution = {agent.name: 0 for agent in agents}
    cur_state = obs.reshape(state_shape)
    episode_over = False
    while not episode_over:
        action, actor_index = method.act(obs)
        if rng.rand() < epsilon:
            action = rng.randint(env.action_space.n)
        obs, reward, terminated, truncated, info = env.step(action)
        episode_over = terminated or truncated
        agent_distribution[agents[actor_index].name] += 1
        new_state = obs.reshape(state_shape)
        # Only a genuine termination is stored as terminal; after a time-limit
        # truncation the member should still bootstrap from the next state.
        agents[actor_index].update_model(cur_state, action, reward, new_state, terminated)

        cur_state = new_state
        # The ensemble sees the clipped reward and must be told whenever the
        # episode ends, for either reason, so it resets its per-episode state.
        clipped_reward = np.clip(reward, -1.0, 1.0)
        method.observe(action, obs, clipped_reward, episode_over)
        cumulative_reward += reward

    save_actor_distribution(agent_distribution, f'actor_distribution_{seed}.csv')
    return cumulative_reward


def pixel_to_float(obs):
    """Convert ``obs`` to a float32 array and divide every value by 255."""
    pixels = np.array(obs, dtype=np.float32)
    return pixels / 255.0

def atari_evaluation(env, method, epsilon, rng):
    """Placeholder for Atari evaluation; not implemented, always returns None."""
    return None


# First ensemble member: load a pickled agent when a path is configured,
# otherwise build a plain DQN agent and pre-train it on its own.
# NOTE(review): pickle.load can execute arbitrary code from the file; only
# load checkpoints from a trusted source.
if dqn_path is not None:
    with open(dqn_path, 'rb') as f:
        agents.append(pickle.load(f))
else:
    agents.append(DQN_Agent(env=env, name='DQN_Agent'))
    train_agent(agents[0], env, initial_training, state_shape)

# Second ensemble member: same load-or-train choice for the guided-exploration agent.
if model_dqn_path is not None:
    with open(model_dqn_path, 'rb') as f:
        agents.append(pickle.load(f))
else:
    agents.append(DQN_Guided_Exploration(env=env, name='DQN_Guided_Exploration'))
    train_agent(agents[1], env, initial_training, state_shape)

# Combine both agents with weighted voting.
method = TDWVoteEnsemble(agents)

# train with ensemble
# NOTE(review): a fresh RandomState(0) is created on every iteration, so each
# episode draws the same sequence of exploration noise -- confirm this is intended.
for i in range(max_evaluations):
    if env_name.startswith("ALE/"):
        reward = atari_evaluation(env, method, epsilon=0.05, rng=np.random.RandomState(0))
    else:
        reward = ensemble_training(env, method, epsilon=0.05, rng=np.random.RandomState(0))
    # One row per episode is appended to the seed-specific rewards file.
    save_rewards_and_length([reward], f'tdw_rewards_{seed}.csv')