In [1]:
import numpy as np 
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Input,Dense
from tensorflow.keras.models import Model
import gym

In [2]:

from tqdm import tqdm
import os
import time 

In [10]:
# Epsilon-greedy exploration schedule.
# MIN_EPSILON   : floor that the exploration rate is never decayed below.
# EPSILON_DECAY : multiplicative factor applied to the agent's epsilon.
# epsilon       : starting exploration rate copied into each new Agent.
MIN_EPSILON = 0.01
EPSILON_DECAY = 0.998
epsilon = 0.90

In [11]:
# Global environment instance; the agent's episode reset and the training
# loop both use this module-level name directly.
env = gym.make('LunarLander-v2')

In [12]:
class ActionValueNetwork:
    """Factory for the Keras model that maps a state to one Q-value per action.

    Recognised ``network_config`` keys:
        state_dim: length of the observation vector (int) or an explicit
            shape sequence.
        num_hidden_units: width of the hidden layer; falls back to
            ``DEFAULT_HIDDEN_UNITS`` when missing or None.
        num_actions: number of output units (one Q-value per action).
        step_size: learning rate handed to the Adam optimizer.
    """

    # Hidden-layer width used when the config does not provide one.
    DEFAULT_HIDDEN_UNITS = 256

    def __init__(self, network_config):
        self.state_dim = network_config.get("state_dim")
        hidden_units = network_config.get("num_hidden_units")
        self.num_hidden_units = (
            self.DEFAULT_HIDDEN_UNITS if hidden_units is None else hidden_units
        )
        self.num_actions = network_config.get("num_actions")
        self.step_size = network_config.get('step_size')

    def _input_shape(self):
        """Return ``state_dim`` as the tuple shape expected by ``Input``."""
        if np.isscalar(self.state_dim):
            return (int(self.state_dim),)
        return tuple(self.state_dim)

    def create_model(self):
        """Create and compile a fresh one-hidden-layer Q-network.

        Returns:
            A compiled Keras ``Model`` with ``num_actions`` linear outputs,
            trained with mean-squared-error loss.
        """
        inputs = Input(shape=self._input_shape())
        hidden = Dense(self.num_hidden_units, activation='relu')(inputs)
        q_values = Dense(self.num_actions, activation='linear')(hidden)
        model = Model(inputs, q_values)
        # `learning_rate` is the supported keyword; the `lr` alias is deprecated.
        model.compile(optimizer=Adam(learning_rate=self.step_size), loss='mse')
        return model

In [13]:
class ReplayBuffer:
    """Bounded first-in-first-out store of transitions with seeded sampling."""

    def __init__(self, size, minibatch_size, seed):
        """Create an empty buffer.

        Args:
            size: maximum number of transitions kept at any time.
            minibatch_size: number of transitions returned by ``sample``.
            seed: seed for the random generator used when sampling.
        """
        self.max_size = size
        self.minibatch_size = minibatch_size
        self.rand_generator = np.random.RandomState(seed)
        self.buffer = []

    def append(self, state, action, reward, terminal, next_state):
        """Store one transition, dropping the oldest entry if the buffer is full."""
        transition = [state, action, reward, terminal, next_state]
        if len(self.buffer) == self.max_size:
            self.buffer.pop(0)
        self.buffer.append(transition)

    def sample(self):
        """Return ``minibatch_size`` stored transitions picked at random.

        Indices are drawn with the buffer's own generator using ``choice``
        defaults, so the same transition may appear more than once.
        """
        candidates = np.arange(len(self.buffer))
        chosen = self.rand_generator.choice(candidates, size=self.minibatch_size)
        return [self.buffer[position] for position in chosen]

    def size(self):
        """Return the number of transitions currently stored."""
        return len(self.buffer)

In [14]:

class Agent:
    """Deep Q-learning agent with epsilon-greedy exploration and experience replay.

    Holds an online network (``model``) that is trained, and a second network
    (``target_model``) whose weights are refreshed from the online network
    before each round of replay updates and used to compute training targets.
    """

    def __init__(self, agent_config):
        """Build the replay buffer, both networks and the per-episode state.

        Keys read from ``agent_config``: 'replay_buffer_size', 'minibatch_sz',
        'network_config' (must contain 'num_actions'),
        'num_replay_updates_per_step', 'gamma' and, optionally, 'seed'.
        """
        self.replay_buffer = ReplayBuffer(agent_config['replay_buffer_size'],
                                          agent_config['minibatch_sz'],
                                          agent_config.get("seed"))
        self.network = ActionValueNetwork(agent_config['network_config'])

        # Two separately initialised copies of the same architecture.
        self.model = self.network.create_model()
        self.target_model = self.network.create_model()

        self.num_actions = agent_config['network_config']['num_actions']
        self.num_replay = agent_config['num_replay_updates_per_step']
        self.discount = agent_config['gamma']

        # Seeded generator used for every exploration draw in `policy`.
        self.rand_generator = np.random.RandomState(agent_config.get("seed"))

        self.last_state = None
        self.last_action = None
        # Starting exploration rate comes from the module-level `epsilon`.
        self.epsilon = epsilon
        self.sum_rewards = 0
        self.episode_steps = 0

    def _random_action(self):
        """Draw a uniformly random action index from the seeded generator."""
        return int(self.rand_generator.randint(0, self.num_actions))

    def policy(self, state):
        """Choose an action epsilon-greedily from the online network.

        Args:
            state: a batch holding a single observation, shape (1, state_dim).

        Returns:
            Integer action index in ``[0, num_actions)``.
        """
        # Explore with probability epsilon. Using the agent's own generator
        # (rather than the global numpy state) makes runs with a fixed seed
        # reproducible, and skipping the forward pass here saves a predict call.
        if self.rand_generator.uniform() < self.epsilon:
            return self._random_action()

        action_values = self.model.predict(state, verbose=0)
        if not np.any(action_values):
            # An all-zero output expresses no preference; argmax would always
            # return action 0, so fall back to a random choice instead.
            return self._random_action()
        return int(np.argmax(action_values))

    def agent_start(self):
        """Begin a new episode on the global ``env`` and return the first action."""
        self.sum_rewards = 0
        self.episode_steps = 0

        observation = env.reset()
        if isinstance(observation, tuple):
            # Newer Gym releases return (observation, info) from reset().
            observation = observation[0]
        observation = np.asarray(observation)
        # Add a leading batch axis so the state can be fed to the network.
        self.last_state = np.reshape(observation, (-1, observation.shape[0]))
        self.last_action = self.policy(self.last_state)
        return self.last_action

    def agent_step(self, state, reward, terminal):
        """Store the latest transition, run replay updates, and pick the next action.

        Args:
            state: raw observation produced by the previous action.
            reward: reward received for the previous action.
            terminal: 1 if ``state`` ended the episode, otherwise 0.

        Returns:
            The action chosen for ``state``.
        """
        self.sum_rewards += reward
        self.episode_steps += 1

        # Add a leading batch axis, matching the shape used in agent_start.
        state = np.array([state])
        action = self.policy(state)

        self.replay_buffer.append(self.last_state, self.last_action, reward, terminal, state)

        # Only train once the buffer holds more than one minibatch of data.
        if self.replay_buffer.size() > self.replay_buffer.minibatch_size:
            # Snapshot the online weights; targets for all of this step's
            # replay updates are computed from this fixed copy.
            self.target_model.set_weights(self.model.get_weights())
            for _ in range(self.num_replay):
                self.agent_train(self.replay_buffer.sample())

        self.last_state = state
        self.last_action = action
        return action

    def agent_train(self, experiences):
        """Fit the online network on one minibatch of transitions.

        Args:
            experiences: list of ``[state, action, reward, terminal, next_state]``
                entries as produced by ``ReplayBuffer.sample``; states carry a
                leading batch axis of size one.
        """
        states, actions, rewards, terminals, next_states = map(list, zip(*experiences))
        states = np.concatenate(states)
        next_states = np.concatenate(next_states)
        actions = np.asarray(actions, dtype=int)
        rewards = np.asarray(rewards, dtype=float)
        terminals = np.asarray(terminals, dtype=float)

        # Bootstrapped value of the next state; zeroed for terminal transitions.
        q_next = self.target_model.predict(next_states, verbose=0)
        bootstrap = np.max(q_next, axis=1) * (1.0 - terminals)
        targets = rewards + self.discount * bootstrap

        # Start from the current predictions so that only the Q-value of the
        # action actually taken contributes to the MSE loss.
        q_target = self.model.predict(states, verbose=0)
        q_target[np.arange(q_target.shape[0]), actions] = targets

        self.model.fit(states, q_target, batch_size=states.shape[0],
                       verbose=0, shuffle=False)

    def agent_message(self, message):
        """Answer a query about the agent.

        Args:
            message: only "get_sum_reward" is understood.

        Returns:
            The reward accumulated in the current episode.

        Raises:
            Exception: if ``message`` is not recognised.
        """
        if message == "get_sum_reward":
            return self.sum_rewards
        else:
            raise Exception("Unrecognized Message!")
    

In [15]:
# Hyperparameters handed to Agent; the nested 'network_config' dictionary is
# forwarded unchanged to ActionValueNetwork.
agent_info = {
    'network_config': {
        'state_dim': 8,
        'num_actions': 4,
        'step_size': 0.001,
    },
    'replay_buffer_size': 50000,
    'minibatch_sz': 64,
    'num_replay_updates_per_step': 4,
    'gamma': 0.99,
    'seed': 0,
}

# NOTE(review): EPISODES is defined here but no code visible in this
# notebook reads it.
EPISODES = 500

In [16]:
# Build the agent (replay buffer plus online and target networks) from the
# `agent_info` hyperparameters.
agent=Agent(agent_info)

In [17]:
# Per-episode training history; each list receives one entry per finished
# episode: total reward, episode index, step count, and epsilon at episode end.
reward_episode, no_episodes, episode_steps, epsilon_history = [], [], [], []

In [18]:
# Number of training episodes to run (same count as the former range(0, 301)).
NUM_TRAIN_EPISODES = 301

for episode in range(NUM_TRAIN_EPISODES):
    action = agent.agent_start()
    terminal = 0
    while terminal != 1:
        step_result = env.step(action)
        if len(step_result) == 5:
            # Gym >= 0.26 returns (obs, reward, terminated, truncated, info).
            state, step_reward, terminated, truncated, info = step_result
            done = terminated or truncated
        else:
            # Older Gym returns (obs, reward, done, info).
            state, step_reward, done, info = step_result
        # The agent expects the terminal flag as the integer 0 or 1.
        terminal = int(bool(done))
        action = agent.agent_step(state, step_reward, terminal)
        # Exploration decays once per environment step, never below MIN_EPSILON.
        if agent.epsilon > MIN_EPSILON:
            agent.epsilon = max(MIN_EPSILON, agent.epsilon * EPSILON_DECAY)

    # Record the statistics of the finished episode.
    episode_return = agent.agent_message('get_sum_reward')
    reward_episode.append(episode_return)
    no_episodes.append(episode)
    episode_steps.append(agent.episode_steps)
    epsilon_history.append(agent.epsilon)
    print('episode: ', episode, 'score: ', episode_return,
          'epsilon %.2f' % agent.epsilon, 'steps', agent.episode_steps)

episode:  0 score:  -376.4650747979024 epsilon 0.74 steps 98
episode:  1 score:  -245.65637345479894 epsilon 0.61 steps 93
episode:  2 score:  -242.08280622446688 epsilon 0.51 steps 91
episode:  3 score:  -107.21888858077163 epsilon 0.40 steps 121
episode:  4 score:  -440.5047387011428 epsilon 0.28 steps 181
episode:  5 score:  -159.5893728175832 epsilon 0.21 steps 133
episode:  6 score:  -370.41653547917053 epsilon 0.12 steps 299
episode:  7 score:  -248.97037976315463 epsilon 0.08 steps 166
episode:  8 score:  -525.6762221901278 epsilon 0.03 steps 552
episode:  9 score:  -604.0056901938749 epsilon 0.02 steps 179
episode:  10 score:  -91.44277883153653 epsilon 0.02 steps 125
episode:  11 score:  -298.819831479821 epsilon 0.01 steps 224
episode:  12 score:  -403.88340207082564 epsilon 0.01 steps 296
episode:  13 score:  -278.14037387373537 epsilon 0.01 steps 166
episode:  14 score:  -285.8097967105814 epsilon 0.01 steps 327
episode:  15 score:  -366.32753519258875 epsilon 0.01 steps 17

KeyboardInterrupt: 