In [1]:
import numpy as np 
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Input,Dense
from tensorflow.keras.models import Model
import gym

In [2]:
from tqdm import tqdm
import os
import time 

In [3]:
# Epsilon-greedy exploration schedule.
epsilon = 1  # starting exploration probability; copied into Agent.epsilon at construction
EPSILON_DECAY = 0.998  # factor the training loop multiplies agent.epsilon by
MIN_EPSILON = 0.01  # lower bound the training loop clamps agent.epsilon to

In [4]:
# Module-level Gym environment; the Agent class and the training loop both
# reference this global directly.
env = gym.make('LunarLander-v2')

In [5]:
class ActionValueNetwork:
    """Builds the Keras model that maps a state to one value per action."""

    def __init__(self, network_config):
        """Store the network hyper-parameters.

        Args:
            network_config: dict read with ``.get``. Keys used are
                ``"state_dim"``, ``"num_hidden_units"``, ``"num_actions"``
                and ``"step_size"``; a missing key is stored as ``None``.
        """
        self.state_dim = network_config.get("state_dim")
        # Kept for reference only: create_model() uses fixed 256/128 hidden layers.
        self.num_hidden_units = network_config.get("num_hidden_units")
        self.num_actions = network_config.get("num_actions")
        self.step_size = network_config.get('step_size')

    def create_model(self):
        """Create and compile a fresh, independently initialised model.

        The network is Input -> Dense(256, relu) -> Dense(128, relu)
        -> Dense(num_actions, linear), compiled with Adam and MSE loss.

        Returns:
            The compiled Keras ``Model``.
        """
        # Keras documents the per-sample shape as a tuple; accept a bare
        # integer dimension (as used in the notebook config) as well.
        if np.ndim(self.state_dim) == 0:
            input_shape = (int(self.state_dim),)
        else:
            input_shape = tuple(self.state_dim)

        i = Input(shape=input_shape)
        x = Dense(256, activation='relu')(i)
        x = Dense(128, activation='relu')(x)
        x = Dense(self.num_actions, activation='linear')(x)
        model = Model(i, x)
        # `lr` is a deprecated alias that newer Keras releases reject;
        # `learning_rate` is the supported keyword.
        model.compile(optimizer=Adam(learning_rate=self.step_size), loss='mse')
        return model

In [6]:
class ReplayBuffer:
    """Bounded first-in-first-out store of transitions with seeded sampling."""

    def __init__(self, size, minibatch_size, seed):
        """Create an empty buffer.

        Args:
            size: maximum number of transitions kept.
            minibatch_size: number of transitions returned by ``sample``.
            seed: seed for the buffer's own ``np.random.RandomState``.
        """
        self.max_size = size
        self.minibatch_size = minibatch_size
        self.rand_generator = np.random.RandomState(seed)
        self.buffer = []

    def append(self, state, action, reward, terminal, next_state):
        """Add one transition, dropping the oldest one when the buffer is full."""
        transition = [state, action, reward, terminal, next_state]
        if len(self.buffer) == self.max_size:
            self.buffer.pop(0)
        self.buffer.append(transition)

    def sample(self):
        """Return ``minibatch_size`` stored transitions picked by the seeded generator.

        ``choice`` is called without ``replace``, so numpy's default
        (sampling with replacement) applies and a transition may repeat.
        """
        candidates = np.arange(len(self.buffer))
        picked = self.rand_generator.choice(candidates, size=self.minibatch_size)
        return [self.buffer[position] for position in picked]

    def size(self):
        """Return the number of transitions currently stored."""
        return len(self.buffer)

In [7]:

class Agent:
    """Epsilon-greedy Q-learning agent with experience replay and a target network."""

    def __init__(self, agent_config):
        """Build the replay buffer and the online/target networks.

        Args:
            agent_config: dict with keys ``'replay_buffer_size'``,
                ``'minibatch_sz'``, ``'network_config'`` (which must contain
                ``'num_actions'``), ``'num_replay_updates_per_step'``,
                ``'gamma'`` and optionally ``'seed'``.
        """
        self.replay_buffer = ReplayBuffer(agent_config['replay_buffer_size'],
                                          agent_config['minibatch_sz'],
                                          agent_config.get("seed"))
        self.network = ActionValueNetwork(agent_config['network_config'])

        # Online network (trained) and target network (source of bootstrap targets).
        self.model = self.network.create_model()
        self.target_model = self.network.create_model()

        self.num_actions = agent_config['network_config']['num_actions']
        self.num_replay = agent_config['num_replay_updates_per_step']
        self.discount = agent_config['gamma']

        # Seeded generator used for every exploration decision in policy().
        self.rand_generator = np.random.RandomState(agent_config.get("seed"))

        self.last_state = None
        self.last_action = None
        # Initial exploration rate comes from the module-level `epsilon`.
        self.epsilon = epsilon
        self.sum_rewards = 0
        self.episode_steps = 0

    def policy(self, state):
        """Choose an action epsilon-greedily.

        Args:
            state: batch holding a single state, as passed to ``model.predict``.

        Returns:
            int: a random action with probability ``self.epsilon`` (or when
            every predicted value is exactly zero), otherwise the arg-max
            of the online network's predictions.
        """
        action_values = self.model.predict(state, verbose=0)
        # Draw from the agent's own seeded generator so the configured seed
        # actually controls exploration.
        explore = self.rand_generator.uniform() < self.epsilon
        # Only fall back to a random action when *all* values are zero; the
        # previous `.all() == 0` test fired whenever any single value was zero.
        if explore or not np.any(action_values):
            return int(self.rand_generator.randint(0, self.num_actions))
        return int(np.argmax(action_values))

    def agent_start(self):
        """Reset the global ``env`` and pick the first action of an episode.

        Returns:
            The first action, also remembered as ``self.last_action``.
        """
        self.sum_rewards = 0
        self.episode_steps = 0
        self.last_state = env.reset()
        # Reshape the observation to one row so it is a batch of one.
        self.last_state = np.reshape(self.last_state, (-1, self.last_state.shape[0]))
        self.last_action = self.policy(self.last_state)
        return self.last_action

    def agent_step(self, state, reward, terminal):
        """Record a transition, run replay updates and pick the next action.

        Args:
            state: observation reached after ``self.last_action``.
            reward: reward received for that transition.
            terminal: 1 when ``state`` ended the episode, else 0.

        Returns:
            The action chosen for ``state``.
        """
        self.sum_rewards += reward
        self.episode_steps += 1

        # Add a leading batch axis, matching the layout used by agent_start().
        state = np.array([state])

        # The action is chosen before this step's replay updates change the model.
        action = self.policy(state)

        self.replay_buffer.append(self.last_state, self.last_action, reward, terminal, state)

        if self.replay_buffer.size() > self.replay_buffer.minibatch_size:
            # Sync the target network once, then hold it fixed across this
            # step's replay updates.
            self.target_model.set_weights(self.model.get_weights())
            for _ in range(self.num_replay):
                experiences = self.replay_buffer.sample()
                self.agent_train(experiences)

        self.last_state = state
        self.last_action = action

        return action

    def agent_train(self, experiences):
        """Fit the online network on one minibatch of transitions.

        Args:
            experiences: list of ``[state, action, reward, terminal,
                next_state]`` entries as produced by ``ReplayBuffer.sample``.
        """
        states, actions, rewards, terminals, next_states = map(list, zip(*experiences))
        # Each stored state already has a leading batch axis, so
        # concatenating stacks them row by row.
        states = np.concatenate(states)
        next_states = np.concatenate(next_states)
        rewards = np.array(rewards)
        terminals = np.array(terminals)
        batch_size = states.shape[0]

        # Bootstrap from the target network; terminal transitions contribute
        # no future value.
        q_next_mat = self.target_model.predict(next_states, verbose=0)
        v_next_vec = np.max(q_next_mat, axis=1) * (1 - terminals)
        target_vec = rewards + self.discount * v_next_vec

        # Start from the current predictions so only the taken action's
        # output has a non-zero error.
        q_mat = self.model.predict(states, verbose=0)
        batch_indices = np.arange(q_mat.shape[0])
        q_mat[batch_indices, actions] = target_vec

        self.model.fit(states, q_mat, batch_size=batch_size, verbose=0, shuffle=False)

    def agent_message(self, message):
        """Answer a query about the agent.

        Args:
            message: only ``"get_sum_reward"`` is recognised.

        Returns:
            The reward accumulated since the last ``agent_start``.

        Raises:
            Exception: for any other message.
        """
        if message == "get_sum_reward":
            return self.sum_rewards
        else:
            raise Exception("Unrecognized Message!")
    

In [8]:
# Configuration consumed by Agent.__init__ (and, for 'network_config',
# by ActionValueNetwork).
agent_info = {
             'network_config': {
                 'state_dim': 8,  # passed as the model's Input shape
                 'num_actions': 4,  # units of the linear output layer
                 'step_size':1e-3  # learning rate given to the Adam optimizer
             },
             'replay_buffer_size': 50000,  # maximum transitions kept by ReplayBuffer
             'minibatch_sz': 64,  # transitions drawn per replay sample
             'num_replay_updates_per_step': 4,  # agent_train calls per environment step
             'gamma': 0.90,  # discount applied to the bootstrapped next-state value
             'seed': 0}  # seeds the RandomState of both the agent and its replay buffer
# Intended number of training episodes.
EPISODES = 500

In [9]:
# Build the agent (replay buffer plus online and target networks) from agent_info.
agent=Agent(agent_info)

In [10]:
# Per-episode training history, filled by the training loop:
# total reward, episode index, step count and epsilon at episode end.
reward_episode, no_episodes, episode_steps, eps_history = [], [], [], []

In [None]:
# Train for EPISODES episodes (the configured constant, rather than a
# duplicated literal), recording per-episode statistics.
for episode in range(EPISODES):
    action = agent.agent_start()
    terminal = 0
    while terminal != 1:
        state, reward, done, info = env.step(action)
        # The agent expects the terminal flag as 0/1.
        terminal = 1 if done else 0
        action = agent.agent_step(state, reward, terminal)
        # Epsilon is decayed after every environment step and clamped at MIN_EPSILON.
        # NOTE(review): at 0.998 per step the floor is reached after roughly
        # 2300 steps (about ten episodes in the recorded run) - confirm that
        # per-step rather than per-episode decay is intended.
        if agent.epsilon > MIN_EPSILON:
            agent.epsilon = max(MIN_EPSILON, agent.epsilon * EPSILON_DECAY)
    # From here on `reward` holds the episode total, not the last step's reward.
    reward = agent.agent_message('get_sum_reward')
    reward_episode.append(reward)
    no_episodes.append(episode)
    episode_steps.append(agent.episode_steps)
    eps_history.append(agent.epsilon)
    print('episode: ', episode,'score: ',reward,
            'epsilon %.2f' % agent.epsilon, 'steps', agent.episode_steps)

episode:  0 score:  -81.04317158377712 epsilon 0.86 steps 73
episode:  1 score:  -119.37627152409348 epsilon 0.72 steps 88
episode:  2 score:  -272.45360775718507 epsilon 0.59 steps 99
episode:  3 score:  -34.9486287899219 epsilon 0.47 steps 119
episode:  4 score:  -353.07805096559764 epsilon 0.37 steps 111
episode:  5 score:  -179.58176618668378 epsilon 0.30 steps 112
episode:  6 score:  -121.3694969239962 epsilon 0.25 steps 92
episode:  7 score:  -307.29626626012214 epsilon 0.20 steps 120
episode:  8 score:  -98.7651312276115 epsilon 0.13 steps 206
episode:  9 score:  -264.99455528060696 epsilon 0.08 steps 237
episode:  10 score:  -97.02594286112897 epsilon 0.01 steps 1000
episode:  11 score:  -131.83021275977427 epsilon 0.01 steps 1000
episode:  12 score:  -117.81204189009395 epsilon 0.01 steps 1000
episode:  13 score:  -139.95008580996867 epsilon 0.01 steps 1000
episode:  14 score:  -112.44464426925596 epsilon 0.01 steps 1000
episode:  15 score:  -137.94856044540978 epsilon 0.01 st

episode:  128 score:  -83.32299992098699 epsilon 0.01 steps 1000
episode:  129 score:  -124.0438830855853 epsilon 0.01 steps 1000
episode:  130 score:  -127.52112580555085 epsilon 0.01 steps 1000
episode:  131 score:  -91.7522990301931 epsilon 0.01 steps 1000
episode:  132 score:  -106.76904786668416 epsilon 0.01 steps 1000
episode:  133 score:  -122.31570683546747 epsilon 0.01 steps 1000
episode:  134 score:  -373.4513252067527 epsilon 0.01 steps 993
episode:  135 score:  -99.4675892937309 epsilon 0.01 steps 1000
episode:  136 score:  -162.1371146277923 epsilon 0.01 steps 1000
episode:  137 score:  -134.13277857224315 epsilon 0.01 steps 1000
episode:  138 score:  -395.20293591725095 epsilon 0.01 steps 864
episode:  139 score:  -426.8945612751724 epsilon 0.01 steps 147
episode:  140 score:  -139.3158786597889 epsilon 0.01 steps 1000
episode:  141 score:  -102.30278971391184 epsilon 0.01 steps 1000
episode:  142 score:  -144.69235742868574 epsilon 0.01 steps 1000
episode:  143 score:  -

episode:  254 score:  -131.68584469769996 epsilon 0.01 steps 1000
episode:  255 score:  -78.84369929197999 epsilon 0.01 steps 1000
episode:  256 score:  -148.4385792778489 epsilon 0.01 steps 1000
episode:  257 score:  -101.54206046487782 epsilon 0.01 steps 1000
episode:  258 score:  -109.62390103293421 epsilon 0.01 steps 1000
episode:  259 score:  -153.48808831659989 epsilon 0.01 steps 217
episode:  260 score:  -121.60687178261207 epsilon 0.01 steps 1000
episode:  261 score:  -108.73764058096002 epsilon 0.01 steps 1000
episode:  262 score:  -108.93711645981818 epsilon 0.01 steps 218
episode:  263 score:  -382.59692619038697 epsilon 0.01 steps 436
episode:  264 score:  -98.2119029238384 epsilon 0.01 steps 1000
episode:  265 score:  -128.83827579051768 epsilon 0.01 steps 1000
episode:  266 score:  -141.10082232293757 epsilon 0.01 steps 1000
episode:  267 score:  -95.93235187790113 epsilon 0.01 steps 1000
episode:  268 score:  -104.87308466059682 epsilon 0.01 steps 190
episode:  269 score