In [1]:
import numpy as np 
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Input,Dense
from tensorflow.keras.models import Model
import gym

In [2]:

from tqdm import tqdm
import os
import time 

In [3]:
# Exploration schedule for the epsilon-greedy policy.
# `epsilon` is the starting exploration probability; Agent.__init__ copies it
# into `agent.epsilon`, which is what the training loop subsequently decays.
epsilon = 1  
# Multiplicative factor applied to `agent.epsilon` by the training loop.
EPSILON_DECAY = 0.998 
# Lower bound the training loop clamps `agent.epsilon` to.
MIN_EPSILON = 0.01

In [4]:
# Module-level environment handle. Agent.agent_start and the training loop
# reference this global `env` directly, so this cell must run before them.
env = gym.make('LunarLander-v2')

In [5]:
class ActionValueNetwork:
    """Factory for the Keras model that maps a state to one Q-value per action.

    Args:
        network_config: dict read with ``.get`` for the keys ``"state_dim"``,
            ``"num_hidden_units"``, ``"num_actions"`` and ``"step_size"``.
            Missing keys are stored as ``None``.
    """

    def __init__(self, network_config):
        self.state_dim = network_config.get("state_dim")
        # Stored for callers that inspect it; create_model() currently uses
        # fixed hidden widths (256, 128) and does not read this value.
        self.num_hidden_units = network_config.get("num_hidden_units")
        self.num_actions = network_config.get("num_actions")
        self.step_size = network_config.get('step_size')

    def create_model(self):
        """Build and compile a fresh two-hidden-layer MLP.

        Returns:
            A compiled Keras ``Model`` with a linear output of width
            ``num_actions``, trained with Adam (learning rate ``step_size``)
            on mean-squared error. Each call returns a new, independently
            initialised model.
        """
        # Keras documents `shape` as a tuple; accept the plain-int form used in
        # the config (e.g. 8) by wrapping it, and pass anything else through.
        input_shape = self.state_dim
        if isinstance(input_shape, (int, np.integer)):
            input_shape = (int(input_shape),)

        i = Input(shape=input_shape)
        x = Dense(256, activation='relu')(i)
        x = Dense(128, activation='relu')(x)
        x = Dense(self.num_actions, activation='linear')(x)
        model = Model(i, x)
        # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
        model.compile(optimizer=Adam(learning_rate=self.step_size), loss='mse')
        return model

In [6]:
class ReplayBuffer:
    """Fixed-capacity FIFO store of transitions with seeded random sampling."""

    def __init__(self, size, minibatch_size, seed):
        """
        Args:
            size: maximum number of transitions kept in the buffer.
            minibatch_size: number of transitions returned by ``sample()``.
            seed: seed for the buffer's private ``np.random.RandomState``.
        """
        self.max_size = size
        self.minibatch_size = minibatch_size
        self.rand_generator = np.random.RandomState(seed)
        self.buffer = []

    def append(self, state, action, reward, terminal, next_state):
        """Store one transition, evicting the oldest entry when at capacity."""
        transition = [state, action, reward, terminal, next_state]
        if len(self.buffer) == self.max_size:
            self.buffer.pop(0)
        self.buffer.append(transition)

    def sample(self):
        """Return ``minibatch_size`` stored transitions chosen by the seeded RNG.

        Indices are drawn with ``RandomState.choice`` using its default
        settings, so the same transition may appear more than once.
        """
        candidate_positions = np.arange(len(self.buffer))
        chosen = self.rand_generator.choice(candidate_positions, size=self.minibatch_size)
        batch = []
        for position in chosen:
            batch.append(self.buffer[position])
        return batch

    def size(self):
        """Return the number of transitions currently stored."""
        return len(self.buffer)

In [7]:

class Agent:
    """Epsilon-greedy Q-learning agent with experience replay and a target network.

    The agent owns two Keras models built by ``ActionValueNetwork``: ``model``
    (trained) and ``target_model`` (used to compute bootstrap targets and
    re-synchronised from ``model`` before each round of replay updates).
    """

    def __init__(self, agent_config):
        """
        Args:
            agent_config: dict with keys ``'replay_buffer_size'``,
                ``'minibatch_sz'``, ``'network_config'`` (must contain
                ``'num_actions'``), ``'num_replay_updates_per_step'``,
                ``'gamma'`` and optionally ``'seed'``.
        """
        self.replay_buffer = ReplayBuffer(agent_config['replay_buffer_size'],
                                          agent_config['minibatch_sz'],
                                          agent_config.get("seed"))
        self.network = ActionValueNetwork(agent_config['network_config'])
        self.model = self.network.create_model()
        self.target_model = self.network.create_model()

        self.num_actions = agent_config['network_config']['num_actions']
        self.num_replay = agent_config['num_replay_updates_per_step']
        self.discount = agent_config['gamma']

        # Private RNG so exploration is governed by the configured seed.
        self.rand_generator = np.random.RandomState(agent_config.get("seed"))

        self.last_state = None
        self.last_action = None
        # Starting exploration rate comes from the module-level `epsilon`;
        # the training loop mutates this attribute afterwards.
        self.epsilon = epsilon
        self.sum_rewards = 0
        self.episode_steps = 0

    def policy(self, state):
        """Choose an action epsilon-greedily.

        Args:
            state: batch-of-one array accepted by ``self.model.predict``.

        Returns:
            int action index in ``[0, num_actions)``. A random action is
            returned with probability ``epsilon``, and also when every
            predicted action value is exactly zero; otherwise the argmax.
        """
        # Decide on exploration first so the forward pass is skipped when the
        # result would be discarded anyway.
        if self.rand_generator.uniform() < self.epsilon:
            return int(self.rand_generator.randint(self.num_actions))

        action_values = self.model.predict(state, verbose=0)
        # Fall back to a random action only if *all* values are zero. The
        # previous test (`action_values.all() == 0`) fired when *any* value
        # was zero.
        if not np.any(action_values):
            return int(self.rand_generator.randint(self.num_actions))
        return int(np.argmax(action_values))

    def agent_start(self):
        """Reset the module-level ``env`` and pick the first action of an episode.

        Returns:
            The action chosen for the initial observation.
        """
        self.sum_rewards = 0
        self.episode_steps = 0
        self.last_state = env.reset()
        # Reshape the flat observation into a batch of one row for the model.
        self.last_state = np.reshape(self.last_state, (-1, self.last_state.shape[0]))
        self.last_action = self.policy(self.last_state)
        return self.last_action

    def agent_step(self, state, reward, terminal):
        """Record one transition, run replay updates, and choose the next action.

        Args:
            state: observation returned by the environment for the last action.
            reward: reward received for the last action.
            terminal: 1 if ``state`` ended the episode, else 0.

        Returns:
            The action selected for ``state``.
        """
        self.sum_rewards += reward
        self.episode_steps += 1

        # Wrap the observation as a batch of one, matching agent_start().
        state = np.array([state])
        action = self.policy(state)

        self.replay_buffer.append(self.last_state, self.last_action, reward, terminal, state)

        if self.replay_buffer.size() > self.replay_buffer.minibatch_size:
            # Freeze a copy of the current weights so every replay update in
            # this step bootstraps from the same target network.
            self.target_model.set_weights(self.model.get_weights())
            for _ in range(self.num_replay):
                experiences = self.replay_buffer.sample()
                self.agent_train(experiences)

        self.last_state = state
        self.last_action = action
        return action

    def agent_train(self, experiences):
        """Fit ``model`` on one minibatch of Q-learning targets.

        Args:
            experiences: list of ``[state, action, reward, terminal,
                next_state]`` entries as produced by ``ReplayBuffer.sample``;
                states are batch-of-one arrays and ``terminal`` is 0 or 1.
        """
        states, actions, rewards, terminals, next_states = map(list, zip(*experiences))
        states = np.concatenate(states)
        next_states = np.concatenate(next_states)
        actions = np.asarray(actions, dtype=int)
        rewards = np.array(rewards)
        terminals = np.array(terminals)
        batch_size1 = states.shape[0]

        # Bootstrap value of the next state; zeroed for terminal transitions.
        q_next_mat = self.target_model.predict(next_states, verbose=0)
        v_next_vec = np.max(q_next_mat, axis=1) * (1 - terminals)
        target_vec = rewards + self.discount * v_next_vec

        # Start from the current predictions so only the taken action's output
        # contributes to the MSE loss.
        q_mat = self.model.predict(states, verbose=0)
        batch_indices = np.arange(q_mat.shape[0])
        q_mat[batch_indices, actions] = target_vec

        self.model.fit(states, q_mat, batch_size=batch_size1, verbose=0, shuffle=False)

    def agent_message(self, message):
        """Answer a query about the agent.

        Args:
            message: only ``"get_sum_reward"`` is recognised.

        Returns:
            The reward accumulated since the last ``agent_start()``.

        Raises:
            Exception: if ``message`` is not recognised.
        """
        if message == "get_sum_reward":
            return self.sum_rewards
        else:
            raise Exception("Unrecognized Message!")
    

In [8]:
# Hyperparameters consumed by Agent.__init__ (and, via 'network_config',
# by ActionValueNetwork).
agent_info = {
             'network_config': {
                 'state_dim': 8,  # passed to the Keras Input layer as its shape
                 'num_actions': 4,  # width of the network's linear output layer
                 'step_size':1e-3  # learning rate handed to the Adam optimizer
             },
             'replay_buffer_size': 50000,  # max transitions kept by ReplayBuffer
             'minibatch_sz': 64,  # transitions drawn per ReplayBuffer.sample()
             'num_replay_updates_per_step': 4,  # agent_train() calls per agent_step()
             'gamma': 0.95,  # discount applied to the bootstrapped next-state value
             'seed': 0}  # seeds the RandomState of both the buffer and the agent
# Number of training episodes.
# NOTE(review): confirm the training cell iterates over this rather than a literal.
EPISODES = 500

In [9]:
# Build the agent; this constructs both Keras models and an empty replay buffer.
agent=Agent(agent_info)

In [10]:
# Per-episode training history; the training loop appends one entry to each
# list at the end of every episode.
reward_episode=[]  # summed reward reported by the agent for the episode
no_episodes=[]  # episode index
episode_steps=[]  # number of agent_step() calls in the episode
eps_history=[]  # value of agent.epsilon when the episode ended

In [11]:
# Train for EPISODES episodes, recording the score, length and exploration
# rate of each one in the history lists.
for episode in range(EPISODES):
    action = agent.agent_start()
    terminal = 0
    while terminal != 1:
        state, reward, done, info = env.step(action)
        # The agent multiplies bootstrapped values by (1 - terminal), so hand
        # it a 0/1 integer rather than the environment's boolean flag.
        terminal = 1 if done else 0
        action = agent.agent_step(state, reward, terminal)
        # Exploration is annealed once per environment step (not per episode)
        # and never drops below MIN_EPSILON.
        if agent.epsilon > MIN_EPSILON:
            agent.epsilon = max(MIN_EPSILON, agent.epsilon * EPSILON_DECAY)

    # Keep the episode total under its own name instead of overwriting the
    # per-step `reward`.
    episode_reward = agent.agent_message('get_sum_reward')
    reward_episode.append(episode_reward)
    no_episodes.append(episode)
    episode_steps.append(agent.episode_steps)
    eps_history.append(agent.epsilon)
    print('episode: ', episode,'score: ',episode_reward,
            'epsilon %.2f' % agent.epsilon, 'steps', agent.episode_steps)

episode:  0 score:  -115.17095984196763 epsilon 0.83 steps 91
episode:  1 score:  -94.78368746910266 epsilon 0.70 steps 86
episode:  2 score:  -271.6819623466136 epsilon 0.60 steps 82
episode:  3 score:  -44.363561492489836 epsilon 0.46 steps 133
episode:  4 score:  -140.9167389738966 epsilon 0.37 steps 106
episode:  5 score:  -270.37136693209436 epsilon 0.30 steps 106
episode:  6 score:  13.416376741490794 epsilon 0.22 steps 157
episode:  7 score:  -96.17115855063275 epsilon 0.19 steps 60
episode:  8 score:  11.958474636660242 epsilon 0.16 steps 93
episode:  9 score:  -505.7816597670661 epsilon 0.14 steps 81
episode:  10 score:  -471.3396999477431 epsilon 0.11 steps 127
episode:  11 score:  -147.31030553000284 epsilon 0.08 steps 141
episode:  12 score:  -167.71087189354458 epsilon 0.06 steps 166
episode:  13 score:  -146.8974771557461 epsilon 0.05 steps 71
episode:  14 score:  -173.39529418595123 epsilon 0.04 steps 102
episode:  15 score:  -175.19281870932997 epsilon 0.03 steps 204
ep

episode:  128 score:  -4.355837832270507 epsilon 0.01 steps 1000
episode:  129 score:  -73.48774262077458 epsilon 0.01 steps 1000
episode:  130 score:  -59.57773192848421 epsilon 0.01 steps 1000
episode:  131 score:  -29.42821166543853 epsilon 0.01 steps 1000
episode:  132 score:  -8.486920000578385 epsilon 0.01 steps 1000
episode:  133 score:  -31.341419690203043 epsilon 0.01 steps 1000
episode:  134 score:  -9.705483691007451 epsilon 0.01 steps 1000
episode:  135 score:  -92.9857168081292 epsilon 0.01 steps 1000
episode:  136 score:  -62.46891573969598 epsilon 0.01 steps 1000
episode:  137 score:  -112.11220033255901 epsilon 0.01 steps 1000
episode:  138 score:  -67.54577566499393 epsilon 0.01 steps 1000
episode:  139 score:  -25.28226456638182 epsilon 0.01 steps 1000
episode:  140 score:  -56.59300658526132 epsilon 0.01 steps 1000
episode:  141 score:  -66.56057385697572 epsilon 0.01 steps 1000
episode:  142 score:  -33.976579456698985 epsilon 0.01 steps 1000
episode:  143 score:  -

episode:  255 score:  -126.60780945787616 epsilon 0.01 steps 1000
episode:  256 score:  -68.9734468328711 epsilon 0.01 steps 1000
episode:  257 score:  -22.687267841568875 epsilon 0.01 steps 1000
episode:  258 score:  -186.94515392497345 epsilon 0.01 steps 825
episode:  259 score:  -11.160244719291773 epsilon 0.01 steps 1000
episode:  260 score:  -125.87464276702424 epsilon 0.01 steps 1000
episode:  261 score:  -87.95091020088252 epsilon 0.01 steps 1000
episode:  262 score:  -112.905239250243 epsilon 0.01 steps 1000
episode:  263 score:  -87.16999191230859 epsilon 0.01 steps 1000
episode:  264 score:  -52.88632365069614 epsilon 0.01 steps 1000
episode:  265 score:  -118.37844387522982 epsilon 0.01 steps 1000
episode:  266 score:  -86.02556002797198 epsilon 0.01 steps 1000
episode:  267 score:  -101.98531959347866 epsilon 0.01 steps 1000
episode:  268 score:  -132.1208501184523 epsilon 0.01 steps 743
episode:  269 score:  -44.40616671113344 epsilon 0.01 steps 1000
episode:  270 score:  

Exception ignored in: <function WeakKeyDictionary.__init__.<locals>.remove at 0x000002637B55C168>
Traceback (most recent call last):
  File "C:\ProgramData\Miniconda3\envs\tensorflow\lib\weakref.py", line 359, in remove
    self = selfref()
KeyboardInterrupt: 
ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "C:\ProgramData\Miniconda3\envs\tensorflow\lib\site-packages\IPython\core\interactiveshell.py", line 3417, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-11-2ce8589acebc>", line 10, in <module>
    action=agent.agent_step(state,reward,terminal)
  File "<ipython-input-7-f0a14aa2cbe8>", line 74, in agent_step
    action = self.policy(state)
  File "<ipython-input-7-f0a14aa2cbe8>", line 46, in policy
    action_values =self.model.predict(state)
  File "C:\ProgramData\Miniconda3\envs\tensorflow\lib\site-packages\tensorflow_core\python\keras\engine\training.py", line 1013, in predict
    use_multiprocessing=use_multiprocessing)
  File "C:\ProgramData\Miniconda3\envs\tensorflow\lib\site-packages\tensorflow_core\python\keras\engine\training_v2.py", line 498, in predict
    workers=workers, use_multiprocessing=use_multiprocessing, **kwargs)
  File "C:\ProgramData\Miniconda3\envs\tensorflow\lib\site-packages\ten

TypeError: object of type 'NoneType' has no len()

In [14]:

# Average number of steps per recorded episode.
np.mean(episode_steps)

840.5940959409594

In [15]:
# Display every recorded per-episode step count. The output is long;
# a slice such as episode_steps[-10:] is easier to read.
episode_steps

[91,
 86,
 82,
 133,
 106,
 106,
 157,
 60,
 93,
 81,
 127,
 141,
 166,
 71,
 102,
 204,
 206,
 132,
 563,
 1000,
 1000,
 1000,
 941,
 1000,
 1000,
 278,
 627,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 81,
 1000,
 1000,
 1000,
 1000,
 1000,
 113,
 1000,
 1000,
 1000,
 1000,
 1000,
 274,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 858,
 1000,
 273,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 116,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 148,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 

In [16]:
# Display every recorded per-episode summed reward. The output is long;
# a slice such as reward_episode[-10:] is easier to read.
reward_episode

[-115.17095984196763,
 -94.78368746910266,
 -271.6819623466136,
 -44.363561492489836,
 -140.9167389738966,
 -270.37136693209436,
 13.416376741490794,
 -96.17115855063275,
 11.958474636660242,
 -505.7816597670661,
 -471.3396999477431,
 -147.31030553000284,
 -167.71087189354458,
 -146.8974771557461,
 -173.39529418595123,
 -175.19281870932997,
 -114.28528186955161,
 -31.87208890419734,
 -149.57195642219034,
 -91.87271411378522,
 -156.5597962027561,
 -88.5205424594497,
 -515.1322744500776,
 -190.44588694374966,
 -73.82752554699451,
 -56.873707463841676,
 -190.15513904697772,
 -122.69529526037017,
 -75.44828332553858,
 -63.57068200864758,
 -110.51693021433941,
 -48.06230828928637,
 -43.33477619054013,
 -51.158677202506524,
 -115.23811558064642,
 -25.75519711984166,
 -113.74743731294333,
 -81.92497475833127,
 -21.232544839780584,
 -323.8692243556696,
 -42.36902778945438,
 -87.33420477354012,
 -65.1787456408469,
 -126.59590983922715,
 -79.69977935933056,
 -223.40939606358052,
 -87.69585130735