In [2]:
import numpy as np 
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Input,Dense
from tensorflow.keras.models import Model



In [2]:

from tqdm import tqdm
import os
import time 

In [3]:
import matplotlib.pyplot as plt

In [4]:
import gym
import time
# Shared module-level LunarLander-v2 environment: the Agent resets it and the
# training loop steps it.
env = gym.make('LunarLander-v2')

In [5]:
def plot_learning_curve(x, scores, epsilons, filename, lines=None):
    """Save a two-axis figure of epsilon and smoothed score to ``filename``.

    Epsilon is drawn as a line against the left y-axis. A trailing mean of
    ``scores`` (the current entry plus up to 20 preceding entries) is drawn as
    a scatter against the right y-axis. Both series use the same ``x`` values.

    Args:
        x: x coordinates, one per entry of ``scores`` / ``epsilons``.
        scores: sequence of scores to smooth and plot.
        epsilons: sequence of epsilon values to plot.
        filename: destination passed to ``plt.savefig``.
        lines: optional iterable of x positions for vertical marker lines.
    """
    figure = plt.figure()
    eps_axis = figure.add_subplot(111, label="1")
    # Second axes stacked on the same grid cell, frame hidden, for the scores.
    score_axis = figure.add_subplot(111, label="2", frame_on=False)

    # Left-hand axis: epsilon curve.
    eps_axis.plot(x, epsilons, color="C0")
    eps_axis.set_xlabel("Training Steps", color="C0")
    eps_axis.set_ylabel("Epsilon", color="C0")
    for axis_name in ('x', 'y'):
        eps_axis.tick_params(axis=axis_name, colors="C0")

    # Trailing mean over a window of at most 21 entries ending at each index.
    count = len(scores)
    smoothed = np.empty(count)
    for idx in range(count):
        window_start = max(0, idx - 20)
        smoothed[idx] = np.mean(scores[window_start:idx + 1])

    # Right-hand axis: smoothed scores.
    score_axis.scatter(x, smoothed, color="C1")
    score_axis.axes.get_xaxis().set_visible(False)
    score_axis.yaxis.tick_right()
    score_axis.set_ylabel('Score', color="C1")
    score_axis.yaxis.set_label_position('right')
    score_axis.tick_params(axis='y', colors="C1")

    for position in (lines if lines is not None else ()):
        plt.axvline(x=position)

    plt.savefig(filename)

In [6]:
# Epsilon-greedy exploration schedule.
MIN_EPSILON = 0.01     # floor below which epsilon is not decayed
EPSILON_DECAY = 0.995  # multiplicative factor applied when epsilon is decayed
epsilon = 1            # starting exploration rate, copied into each new Agent

In [5]:
class ActionValueNetwork:
    """Builder for a one-hidden-layer Keras network mapping states to action values.

    Configuration keys read from ``network_config``:
        state_dim: size of the state vector (an int) or an explicit shape tuple.
        num_hidden_units: width of the hidden layer; defaults to 256 when the
            key is missing or ``None``.
        num_actions: number of outputs (one value per action).
        step_size: learning rate handed to the Adam optimizer.
    """

    # Hidden-layer width used when the config does not specify one.
    DEFAULT_HIDDEN_UNITS = 256

    def __init__(self, network_config):
        """Store the architecture and optimizer settings from ``network_config``."""
        self.state_dim = network_config.get("state_dim")
        hidden_units = network_config.get("num_hidden_units")
        self.num_hidden_units = (
            self.DEFAULT_HIDDEN_UNITS if hidden_units is None else hidden_units
        )
        self.num_actions = network_config.get("num_actions")
        self.step_size = network_config.get('step_size')

    def create_model(self):
        """Build and compile a fresh model; each call returns an independent network.

        Returns:
            A compiled Keras ``Model`` with a ReLU hidden layer and a linear
            output layer of ``num_actions`` units, trained with MSE loss.
        """
        # Keras expects a shape tuple; accept a bare int for convenience.
        if np.ndim(self.state_dim) == 0:
            input_shape = (self.state_dim,)
        else:
            input_shape = tuple(self.state_dim)

        inputs = Input(shape=input_shape)
        hidden = Dense(self.num_hidden_units, activation='relu')(inputs)
        q_values = Dense(self.num_actions, activation='linear')(hidden)
        model = Model(inputs, q_values)
        # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
        model.compile(optimizer=Adam(learning_rate=self.step_size), loss='mse')
        return model

In [7]:
# Settings read by ActionValueNetwork: state size, number of actions and
# optimizer step size.
network_config = {
    'state_dim': 8,
    'num_actions': 4,
    'step_size': 1e-3,
}

In [10]:
# Instantiate the network builder from network_config.
k=ActionValueNetwork(network_config)

In [11]:
# Build and compile a Keras model from the builder.
model=k.create_model()

In [13]:
# Print the layer-by-layer summary of the model built above.
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 8)]               0         
_________________________________________________________________
dense (Dense)                (None, 256)               2304      
_________________________________________________________________
dense_1 (Dense)              (None, 4)                 1028      
Total params: 3,332
Trainable params: 3,332
Non-trainable params: 0
_________________________________________________________________


In [8]:
class ReplayBuffer:
    """Fixed-capacity FIFO store of transitions with seeded random sampling."""

    def __init__(self, size, minibatch_size, seed):
        """Create an empty buffer.

        Args:
            size: maximum number of transitions kept; once reached, the oldest
                transition is dropped for each new one appended.
            minibatch_size: number of transitions returned by ``sample``.
            seed: seed for the buffer's private ``np.random.RandomState``
                (``None`` lets numpy pick one).
        """
        self.buffer = []
        self.minibatch_size = minibatch_size
        self.rand_generator = np.random.RandomState(seed)
        self.max_size = size

    def append(self, state, action, reward, terminal, next_state):
        """Add one transition, evicting the oldest entry when the buffer is full."""
        if len(self.buffer) == self.max_size:
            del self.buffer[0]
        self.buffer.append([state, action, reward, terminal, next_state])

    def sample(self):
        """Return ``minibatch_size`` stored transitions chosen at random.

        Indices are drawn with ``RandomState.choice`` using its default
        settings, so the same transition may appear more than once.
        """
        idxs = self.rand_generator.choice(np.arange(len(self.buffer)), size=self.minibatch_size)
        return [self.buffer[idx] for idx in idxs]

    def size(self):
        """Return the number of transitions currently stored."""
        return len(self.buffer)

In [9]:
class Agent:
    """Epsilon-greedy Q-learning agent with experience replay and a target network.

    The agent owns two networks built by ``ActionValueNetwork``: ``model``,
    which is trained, and ``target_model``, which supplies bootstrap values and
    is re-synchronised with ``model`` before each round of replay updates.

    Relies on two module-level names: ``env`` (reset in ``agent_start``) and
    ``epsilon`` (the initial exploration rate).
    """

    def __init__(self, agent_config):
        """Build the replay buffer and networks and reset episode bookkeeping.

        Args:
            agent_config: dict with keys ``replay_buffer_size``,
                ``minibatch_sz``, ``network_config``,
                ``num_replay_updates_per_step``, ``gamma`` and optionally
                ``seed``.
        """
        seed = agent_config.get("seed")
        self.replay_buffer = ReplayBuffer(agent_config['replay_buffer_size'],
                                          agent_config['minibatch_sz'], seed)
        self.network = ActionValueNetwork(agent_config['network_config'])

        self.model = self.network.create_model()
        self.target_model = self.network.create_model()

        self.num_actions = agent_config['network_config']['num_actions']
        self.num_replay = agent_config['num_replay_updates_per_step']
        self.discount = agent_config['gamma']

        # Private generator so that exploration is reproducible from `seed`.
        self.rand_generator = np.random.RandomState(seed)

        self.last_state = None
        self.last_action = None
        self.epsilon = epsilon
        self.sum_rewards = 0
        self.episode_steps = 0

    def policy(self, state):
        """Choose an action epsilon-greedily for a batched single state.

        With probability ``self.epsilon`` a uniformly random action in
        ``[0, num_actions)`` is drawn from the agent's seeded generator;
        otherwise the action with the largest predicted value is returned.
        The network is only evaluated on the greedy branch.

        Args:
            state: array accepted by ``self.model.predict`` (a batch of one state).

        Returns:
            Integer action index.
        """
        if self.rand_generator.uniform() < self.epsilon:
            return self.rand_generator.randint(0, self.num_actions)
        action_values = self.model.predict(state, verbose=0)
        return np.argmax(action_values)

    def agent_start(self):
        """Reset the environment and bookkeeping, then pick the first action.

        Returns:
            The first action of the new episode.
        """
        self.sum_rewards = 0
        self.episode_steps = 0
        observation = env.reset()
        # Add a leading batch axis: (n,) -> (1, n).
        self.last_state = np.reshape(observation, (-1, observation.shape[0]))
        self.last_action = self.policy(self.last_state)
        return self.last_action

    def agent_step(self, state, reward, terminal):
        """Record a transition, run replay updates and choose the next action.

        Args:
            state: observation returned by the environment after the last action.
            reward: reward received for the last action.
            terminal: 1 if ``state`` ended the episode, otherwise 0.

        Returns:
            The action to take from ``state``.
        """
        self.sum_rewards += reward
        self.episode_steps += 1

        # Add a leading batch axis so the state can be fed to the network.
        state = np.array([state])
        action = self.policy(state)

        self.replay_buffer.append(self.last_state, self.last_action, reward, terminal, state)

        # Learn only once the buffer holds more than one minibatch of data.
        if self.replay_buffer.size() > self.replay_buffer.minibatch_size:
            # Freeze the current weights as the bootstrap target for this
            # step's replay updates.
            self.target_model.set_weights(self.model.get_weights())
            for _ in range(self.num_replay):
                self.agent_train(self.replay_buffer.sample())

        self.last_state = state
        self.last_action = action
        return action

    def agent_train(self, experiences):
        """Fit ``model`` on one minibatch of transitions.

        For each transition the target for the taken action is
        ``reward + discount * max_a target_model(next_state)[a]``, with the
        bootstrap term zeroed for terminal transitions. Values of the other
        actions are left at the model's current predictions so that they
        contribute no error.

        Args:
            experiences: list of ``[state, action, reward, terminal, next_state]``
                entries, where each state already has a leading batch axis of 1.
        """
        states, actions, rewards, terminals, next_states = map(list, zip(*experiences))
        states = np.concatenate(states)
        next_states = np.concatenate(next_states)
        actions = np.array(actions)
        rewards = np.array(rewards)
        terminals = np.array(terminals)

        q_next_mat = self.target_model.predict(next_states, verbose=0)
        v_next_vec = np.max(q_next_mat, axis=1) * (1 - terminals)
        target_vec = rewards + self.discount * v_next_vec

        q_mat = self.model.predict(states, verbose=0)
        batch_indices = np.arange(q_mat.shape[0])
        q_mat[batch_indices, actions] = target_vec

        self.model.fit(states, q_mat, batch_size=states.shape[0], verbose=0, shuffle=False)

    def agent_message(self, message):
        """Answer a query about the agent.

        Args:
            message: only ``"get_sum_reward"`` is supported.

        Returns:
            The sum of rewards accumulated in the current episode.

        Raises:
            Exception: if ``message`` is not recognised.
        """
        if message == "get_sum_reward":
            return self.sum_rewards
        else:
            raise Exception("Unrecognized Message!")
    

In [10]:
# Hyper-parameters passed to Agent(...); 'network_config' is forwarded to
# ActionValueNetwork.
agent_info = {
    'network_config': {
        'state_dim': 8,
        'num_actions': 4,
        'step_size': 1e-3,
    },
    'replay_buffer_size': 50000,
    'minibatch_sz': 64,
    'num_replay_updates_per_step': 4,
    'gamma': 0.99,
    'seed': 0,
}
EPISODES = 500

In [13]:
# Build the agent (replay buffer plus online and target networks) from agent_info.
agent=Agent(agent_info)

In [14]:
# Per-episode training history, filled in by the training loop.
reward_episode, no_episodes, episode_steps, epsilon_history = [], [], [], []

In [15]:
# Train for EPISODES episodes (the loop previously hard-coded 501 and ignored
# the EPISODES constant), logging one line per finished episode.
for episode in range(EPISODES):
    action = agent.agent_start()
    terminal = 0
    while terminal != 1:
        state, reward, done, info = env.step(action)
        # The agent expects the terminal flag as 0/1 rather than a bool.
        terminal = 1 if done else 0
        action = agent.agent_step(state, reward, terminal)
        # Epsilon is decayed once per environment step, clamped at MIN_EPSILON.
        # NOTE(review): per-step decay reaches the floor within a few episodes
        # — confirm this is intended rather than per-episode decay.
        if agent.epsilon > MIN_EPSILON:
            agent.epsilon = max(MIN_EPSILON, agent.epsilon * EPSILON_DECAY)

    episode_score = agent.agent_message('get_sum_reward')
    reward_episode.append(episode_score)
    no_episodes.append(episode)
    episode_steps.append(agent.episode_steps)
    epsilon_history.append(agent.epsilon)
    print('episode: ', episode, 'score: ', episode_score,
          'epsilon %.2f' % agent.epsilon, 'steps', agent.episode_steps)

episode:  0 score:  -352.6464069255609 epsilon 0.61 steps 98
episode:  1 score:  -360.8051091745731 epsilon 0.31 steps 134
episode:  2 score:  -221.198762818357 epsilon 0.17 steps 118
episode:  3 score:  -96.31208567747527 epsilon 0.07 steps 193
episode:  4 score:  -152.89634745560977 epsilon 0.03 steps 179
episode:  5 score:  -116.91340002696666 epsilon 0.01 steps 321
episode:  6 score:  -12.52097852451098 epsilon 0.01 steps 253
episode:  7 score:  -75.72012842864669 epsilon 0.01 steps 216
episode:  8 score:  -37.85200864642361 epsilon 0.01 steps 226
episode:  9 score:  28.46728851428915 epsilon 0.01 steps 249
episode:  10 score:  47.99370847988746 epsilon 0.01 steps 306
episode:  11 score:  -224.42196342842612 epsilon 0.01 steps 460
episode:  12 score:  -88.66180742943585 epsilon 0.01 steps 209
episode:  13 score:  -132.71609711483387 epsilon 0.01 steps 643
episode:  14 score:  -120.59914714847973 epsilon 0.01 steps 1000
episode:  15 score:  -96.63980214487773 epsilon 0.01 steps 607


episode:  130 score:  209.6475815750456 epsilon 0.01 steps 463
episode:  131 score:  -94.49704826640534 epsilon 0.01 steps 383
episode:  132 score:  258.67753069591913 epsilon 0.01 steps 380
episode:  133 score:  96.08160941178538 epsilon 0.01 steps 1000
episode:  134 score:  214.73535975089388 epsilon 0.01 steps 494
episode:  135 score:  148.87878942696327 epsilon 0.01 steps 1000
episode:  136 score:  225.07962649671276 epsilon 0.01 steps 595
episode:  137 score:  123.72584578496411 epsilon 0.01 steps 1000
episode:  138 score:  270.41581945853545 epsilon 0.01 steps 453
episode:  139 score:  263.649826002744 epsilon 0.01 steps 528
episode:  140 score:  141.3523723384864 epsilon 0.01 steps 1000
episode:  141 score:  126.44168514602333 epsilon 0.01 steps 1000
episode:  142 score:  -161.5644805765101 epsilon 0.01 steps 162
episode:  143 score:  -18.940759812855163 epsilon 0.01 steps 248
episode:  144 score:  -223.48085773390235 epsilon 0.01 steps 192
episode:  145 score:  263.265826107360

episode:  258 score:  134.62217872854566 epsilon 0.01 steps 1000
episode:  259 score:  153.00044894125972 epsilon 0.01 steps 1000
episode:  260 score:  144.25147006925297 epsilon 0.01 steps 1000
episode:  261 score:  -138.75963652585745 epsilon 0.01 steps 128
episode:  262 score:  -143.36334148449416 epsilon 0.01 steps 110
episode:  263 score:  -29.998919621267206 epsilon 0.01 steps 298
episode:  264 score:  -163.93270174785658 epsilon 0.01 steps 754
episode:  265 score:  206.0615789538214 epsilon 0.01 steps 393
episode:  266 score:  251.2257149640614 epsilon 0.01 steps 434
episode:  267 score:  126.25164983817527 epsilon 0.01 steps 1000
episode:  268 score:  -32.26639803961764 epsilon 0.01 steps 181
episode:  269 score:  -46.501452217020066 epsilon 0.01 steps 418
episode:  270 score:  -25.478563563738426 epsilon 0.01 steps 1000


KeyboardInterrupt: 

In [50]:
# NOTE(review): avg_reward_2 is not defined in any cell visible here — confirm
# where it comes from; this cell may fail on a fresh kernel.
len(avg_reward_2)

301