In [1]:
import numpy as np 
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Input,Dense
from tensorflow.keras.models import Model
import gym

In [2]:

from tqdm import tqdm
import os
import time 

In [11]:
# Exploration schedule for the epsilon-greedy policy.
# `epsilon` is the starting exploration probability; Agent.__init__ copies it
# into `agent.epsilon`.
epsilon = 0.50  
# Multiplicative factor applied to `agent.epsilon` by the training loop.
EPSILON_DECAY = 0.998 
# Lower bound the training loop clamps `agent.epsilon` to.
MIN_EPSILON = 0.01

In [12]:
# Module-level environment shared by the whole notebook: Agent.agent_start
# calls `env.reset()` on it and the training loop calls `env.step(...)`.
env = gym.make('LunarLander-v2')

In [13]:
class ActionValueNetwork:
    """Builds the Keras model that maps a state to one Q-value per action.

    Parameters
    ----------
    network_config : dict
        Read with ``.get`` (missing keys become ``None``):
        ``"state_dim"``  - size (int) or shape (tuple) of one state,
        ``"num_actions"`` - number of output units,
        ``"step_size"``  - learning rate handed to Adam,
        ``"num_hidden_units"`` - stored but not used by ``create_model``.
    """

    def __init__(self, network_config):
        self.state_dim = network_config.get("state_dim")
        # Kept for reference only: create_model uses fixed widths (256, 128).
        self.num_hidden_units = network_config.get("num_hidden_units")
        self.num_actions = network_config.get("num_actions")
        self.step_size = network_config.get('step_size')

    def create_model(self):
        """Create and compile a fresh, independently initialised network.

        Architecture: Input -> Dense(256, relu) -> Dense(128, relu)
        -> Dense(num_actions, linear), compiled with Adam and MSE loss.

        Returns
        -------
        A compiled Keras ``Model``. Every call returns a new model, which is
        how the agent obtains separate online and target networks.
        """
        # Keras expects a shape tuple; accept a bare int such as 8 as well.
        input_shape = self.state_dim
        if isinstance(input_shape, (int, np.integer)):
            input_shape = (int(input_shape),)

        i = Input(shape=input_shape)
        x = Dense(256, activation='relu')(i)
        x = Dense(128, activation='relu')(x)
        # Linear output: Q-values are unbounded regression targets.
        x = Dense(self.num_actions, activation='linear')(x)
        model = Model(i, x)
        # `learning_rate` is the supported keyword; `lr` is a deprecated alias
        # that newer Keras releases reject.
        model.compile(optimizer=Adam(learning_rate=self.step_size), loss='mse')
        return model

In [14]:
class ReplayBuffer:
    """Bounded first-in-first-out store of transitions with random sampling.

    Parameters
    ----------
    size : int
        Maximum number of transitions kept; the oldest one is dropped when a
        new transition arrives at full capacity.
    minibatch_size : int
        Number of transitions returned by each ``sample`` call.
    seed : int or None
        Seed for the private ``np.random.RandomState`` used for sampling.
    """

    def __init__(self, size, minibatch_size, seed):
        self.max_size = size
        self.minibatch_size = minibatch_size
        self.rand_generator = np.random.RandomState(seed)
        self.buffer = []

    def append(self, state, action, reward, terminal, next_state):
        """Store one transition, evicting the oldest entry if the buffer is full."""
        transition = [state, action, reward, terminal, next_state]
        if len(self.buffer) == self.max_size:
            self.buffer.pop(0)
        self.buffer.append(transition)

    def sample(self):
        """Return ``minibatch_size`` stored transitions drawn with replacement."""
        positions = np.arange(len(self.buffer))
        chosen = self.rand_generator.choice(positions, size=self.minibatch_size)
        return [self.buffer[pos] for pos in chosen]

    def size(self):
        """Return the number of transitions currently stored."""
        return len(self.buffer)

In [15]:

class Agent:
    """Deep Q-learning agent with experience replay and a target network.

    Parameters
    ----------
    agent_config : dict
        Required keys: ``'replay_buffer_size'``, ``'minibatch_sz'``,
        ``'network_config'`` (itself containing ``'num_actions'``),
        ``'num_replay_updates_per_step'`` and ``'gamma'``.
        Optional key: ``'seed'`` (seeds both the replay buffer and the
        agent's own random generator).

    Notes
    -----
    The starting exploration rate is read from the module-level ``epsilon``
    and ``agent_start`` resets the module-level ``env``.
    """

    def __init__(self, agent_config):
        self.replay_buffer = ReplayBuffer(agent_config['replay_buffer_size'],
                                          agent_config['minibatch_sz'],
                                          agent_config.get("seed"))
        self.network = ActionValueNetwork(agent_config['network_config'])

        # Two separately built networks: `model` is trained, `target_model`
        # supplies the bootstrap values used in agent_train.
        self.model = self.network.create_model()
        self.target_model = self.network.create_model()

        self.num_actions = agent_config['network_config']['num_actions']
        self.num_replay = agent_config['num_replay_updates_per_step']
        self.discount = agent_config['gamma']

        self.rand_generator = np.random.RandomState(agent_config.get("seed"))

        self.last_state = None
        self.last_action = None
        self.epsilon = epsilon
        self.sum_rewards = 0
        self.episode_steps = 0

    def policy(self, state):
        """Choose an action epsilon-greedily for ``state``.

        Parameters
        ----------
        state : array of shape (1, state_dim)
            A single state with a leading batch axis, as produced by
            ``agent_start`` / ``agent_step``.

        Returns
        -------
        int
            A uniformly random action with probability ``self.epsilon`` (or
            when every predicted Q-value is exactly zero), otherwise the
            arg-max action of the online network.
        """
        # Draw from the agent's seeded generator so runs honour the
        # configured seed; skip the forward pass entirely when exploring.
        if self.rand_generator.uniform() < self.epsilon:
            return int(self.rand_generator.randint(0, self.num_actions))

        action_values = self.model.predict(state, verbose=0)
        # No preference at all between actions -> fall back to a random one.
        if not np.any(action_values):
            return int(self.rand_generator.randint(0, self.num_actions))
        return int(np.argmax(action_values))

    def agent_start(self):
        """Begin a new episode: reset counters and the environment.

        Returns
        -------
        int
            The first action of the episode.
        """
        self.sum_rewards = 0
        self.episode_steps = 0
        self.last_state = env.reset()
        # Add a leading batch axis: (state_dim,) -> (1, state_dim).
        self.last_state = np.reshape(self.last_state, (-1, self.last_state.shape[0]))
        self.last_action = self.policy(self.last_state)
        return self.last_action

    def agent_step(self, state, reward, terminal):
        """Record one transition, run replay updates, and pick the next action.

        Parameters
        ----------
        state : array of shape (state_dim,)
            Observation that followed ``self.last_action``.
        reward : float
            Reward received for ``self.last_action``.
        terminal : int
            1 if ``state`` ended the episode, else 0.

        Returns
        -------
        int
            The action to take in ``state``.
        """
        self.sum_rewards += reward
        self.episode_steps += 1

        # Same (1, state_dim) layout that agent_start stores.
        state = np.array([state])
        action = self.policy(state)

        self.replay_buffer.append(self.last_state, self.last_action, reward, terminal, state)

        if self.replay_buffer.size() > self.replay_buffer.minibatch_size:
            # Freeze a copy of the current weights as the target network for
            # this round of replay updates.
            self.target_model.set_weights(self.model.get_weights())
            for _ in range(self.num_replay):
                experiences = self.replay_buffer.sample()
                self.agent_train(experiences)

        self.last_state = state
        self.last_action = action
        return action

    def agent_train(self, experiences):
        """Fit the online network on one minibatch of transitions.

        Parameters
        ----------
        experiences : list of [state, action, reward, terminal, next_state]
            Transitions as stored by the replay buffer; each state is an
            array of shape (1, state_dim).

        The regression target equals the network's current prediction for
        every action except the one taken, which is replaced by
        ``reward + discount * max_a Q_target(next_state, a)`` (the bootstrap
        term is zeroed for terminal transitions).
        """
        states, actions, rewards, terminals, next_states = map(list, zip(*experiences))
        states = np.concatenate(states)
        next_states = np.concatenate(next_states)
        actions = np.asarray(actions)
        rewards = np.array(rewards)
        terminals = np.array(terminals)

        q_next_mat = self.target_model.predict(next_states, verbose=0)
        # (1 - terminals) removes the bootstrap value after episode end.
        v_next_vec = np.max(q_next_mat, axis=1) * (1 - terminals)
        target_vec = rewards + self.discount * v_next_vec

        q_mat = self.model.predict(states, verbose=0)
        batch_indices = np.arange(q_mat.shape[0])
        # Only the taken action's entry differs from the prediction, so the
        # MSE gradient is zero for all other actions.
        q_mat[batch_indices, actions] = target_vec

        self.model.fit(states, q_mat, batch_size=states.shape[0], verbose=0, shuffle=False)

    def agent_message(self, message):
        """Answer a query; only ``"get_sum_reward"`` is supported.

        Returns the reward accumulated in the current episode.

        Raises
        ------
        Exception
            If ``message`` is not recognised.
        """
        if message == "get_sum_reward":
            return self.sum_rewards
        else:
            raise Exception("Unrecognized Message!")
    

In [16]:
# Hyperparameters consumed by Agent(...). `state_dim` and `num_actions` are
# expected to match the environment's observation size and action count.
agent_info = dict(
    network_config=dict(
        state_dim=8,
        num_actions=4,
        step_size=1e-3,
    ),
    replay_buffer_size=50000,
    minibatch_sz=64,
    num_replay_updates_per_step=4,
    gamma=0.90,
    seed=0,
)
# Number of training episodes.
EPISODES = 300

In [17]:
# Build the agent (replay buffer plus online and target networks) from the
# hyperparameters in `agent_info`.
agent=Agent(agent_info)

In [18]:
# Per-episode training history, appended to once per episode by the training loop.
reward_episode=[]   # total reward of each episode
no_episodes=[]      # episode index
episode_steps=[]    # number of environment steps in each episode
eps_history=[]      # value of agent.epsilon at the end of each episode

In [19]:
# Train for EPISODES episodes, recording per-episode statistics in the
# history lists (reward_episode, no_episodes, episode_steps, eps_history).
for episode in range(EPISODES):
    action = agent.agent_start()
    terminal = 0
    while not terminal:
        state, reward, done, info = env.step(action)
        # The agent expects a 0/1 flag so it can compute (1 - terminal).
        terminal = 1 if done else 0
        action = agent.agent_step(state, reward, terminal)
        # NOTE(review): epsilon decays on every environment step, so it
        # reaches MIN_EPSILON within the first few episodes (see the log
        # below). Confirm whether a per-episode decay was intended.
        if agent.epsilon > MIN_EPSILON:
            agent.epsilon = max(MIN_EPSILON, agent.epsilon * EPSILON_DECAY)
    reward = agent.agent_message('get_sum_reward')
    reward_episode.append(reward)
    no_episodes.append(episode)
    episode_steps.append(agent.episode_steps)
    eps_history.append(agent.epsilon)
    print('episode: ', episode,'score: ',reward,
            'epsilon %.2f' % agent.epsilon, 'steps', agent.episode_steps)

episode:  0 score:  -29.82446158034645 epsilon 0.39 steps 129
episode:  1 score:  -321.883786029751 epsilon 0.30 steps 122
episode:  2 score:  -186.8153063989875 epsilon 0.21 steps 182
episode:  3 score:  -210.5208806556791 epsilon 0.14 steps 201
episode:  4 score:  -292.7349522027925 epsilon 0.09 steps 209
episode:  5 score:  -84.42265408261548 epsilon 0.07 steps 110
episode:  6 score:  -103.59464628879137 epsilon 0.01 steps 1000
episode:  7 score:  -72.3297971384106 epsilon 0.01 steps 1000
episode:  8 score:  -133.1480804593578 epsilon 0.01 steps 222
episode:  9 score:  -183.345134609099 epsilon 0.01 steps 293
episode:  10 score:  -253.73742894662695 epsilon 0.01 steps 342
episode:  11 score:  -147.61237216844987 epsilon 0.01 steps 1000
episode:  12 score:  -45.29568617101543 epsilon 0.01 steps 86
episode:  13 score:  -178.1397165778257 epsilon 0.01 steps 1000
episode:  14 score:  -178.57963251468416 epsilon 0.01 steps 1000
episode:  15 score:  -119.23897126672368 epsilon 0.01 steps 

episode:  128 score:  -117.20590402924604 epsilon 0.01 steps 1000
episode:  129 score:  -469.0419296989953 epsilon 0.01 steps 992
episode:  130 score:  -113.6337029071422 epsilon 0.01 steps 1000
episode:  131 score:  -131.43086516367464 epsilon 0.01 steps 1000
episode:  132 score:  -118.66066704240725 epsilon 0.01 steps 1000
episode:  133 score:  -123.69785348677567 epsilon 0.01 steps 1000
episode:  134 score:  -107.66379586762297 epsilon 0.01 steps 1000
episode:  135 score:  -57.075006317824204 epsilon 0.01 steps 1000
episode:  136 score:  -121.25396411118615 epsilon 0.01 steps 1000
episode:  137 score:  -88.43403803931629 epsilon 0.01 steps 1000
episode:  138 score:  -132.76462050150803 epsilon 0.01 steps 1000
episode:  139 score:  -130.58912319707937 epsilon 0.01 steps 1000
episode:  140 score:  -97.02765294126144 epsilon 0.01 steps 1000
episode:  141 score:  -86.72142755875994 epsilon 0.01 steps 1000
episode:  142 score:  -179.9172416474044 epsilon 0.01 steps 607
episode:  143 scor

KeyboardInterrupt: 

In [20]:
# Display the total reward recorded for each completed episode.
reward_episode

[-29.82446158034645,
 -321.883786029751,
 -186.8153063989875,
 -210.5208806556791,
 -292.7349522027925,
 -84.42265408261548,
 -103.59464628879137,
 -72.3297971384106,
 -133.1480804593578,
 -183.345134609099,
 -253.73742894662695,
 -147.61237216844987,
 -45.29568617101543,
 -178.1397165778257,
 -178.57963251468416,
 -119.23897126672368,
 -178.29876627188227,
 -148.45101069277376,
 -133.05012778978684,
 -125.38108153834564,
 -169.3197543022831,
 -133.4177768633609,
 -132.41334881857216,
 -148.3856544204037,
 -92.40265627173315,
 -165.7697331384475,
 -146.40530996482832,
 -128.05305134010658,
 -23.894577667000238,
 -105.31196514992077,
 -122.72796506718481,
 -51.928286741696056,
 -124.05677284825538,
 -258.6735189838123,
 -169.98175731102802,
 -160.77580148833678,
 -70.46190079696862,
 -142.54148253129017,
 -128.27570781933963,
 -67.64726156784779,
 -132.47926093277982,
 -59.10039451144962,
 -137.29107331835928,
 -318.742928191755,
 -156.3908660177088,
 -142.34913188999428,
 -141.27336085

In [21]:
# Display the number of environment steps recorded for each completed episode.
episode_steps

[129,
 122,
 182,
 201,
 209,
 110,
 1000,
 1000,
 222,
 293,
 342,
 1000,
 86,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 711,
 407,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 929,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 706,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 190,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 69,
 1000,
 250,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 184,
 1000,
 1000,
 930,
 1000,
 1000,
 622,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 517,
 1000,
 1000,
 1000,
 483,
 1000,
 992,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 1000,
 607,
 1000,
 1000,
 1000,
 100

In [22]:
# Average number of steps per recorded episode.
np.mean(episode_steps)

880.6904761904761