In [1]:
import numpy as np 
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Input,Dense
from tensorflow.keras.models import Model



In [2]:
from tqdm import tqdm
import os
import time 

In [3]:
import gym
import time
# Shared environment instance. Agent.agent_start() resets this module-level
# `env`, and the training loop steps it.
env = gym.make('LunarLander-v2')

In [5]:
import matplotlib.pyplot as plt

In [6]:
# Exploration schedule for the epsilon-greedy policy.
# `epsilon` is the starting exploration probability; Agent.__init__ copies it
# into `agent.epsilon`.
epsilon = 1
# Multiplicative factor applied to `agent.epsilon` by the training loop.
EPSILON_DECAY = 0.998
# Lower bound below which the training loop never decays `agent.epsilon`.
MIN_EPSILON = 0.01

In [7]:
class ActionValueNetwork:
    """Factory for the Keras Q-network that maps a state to one value per action.

    The model built by :meth:`create_model` is a single-hidden-layer MLP
    (``Input -> Dense(relu) -> Dense(linear, num_actions)``) compiled with
    the Adam optimiser and mean-squared-error loss.
    """

    # Hidden-layer width used when the config does not provide one.
    DEFAULT_HIDDEN_UNITS = 256

    def __init__(self, network_config):
        """Read architecture and optimiser settings from ``network_config``.

        Args:
            network_config: dict that may contain
                ``state_dim`` (int or tuple) - per-sample input shape,
                ``num_actions`` - number of output units,
                ``step_size`` - learning rate handed to Adam,
                ``num_hidden_units`` (optional) - hidden-layer width;
                falls back to ``DEFAULT_HIDDEN_UNITS`` when missing or None.
        """
        self.state_dim = network_config.get("state_dim")
        hidden_units = network_config.get("num_hidden_units")
        # Previously this value was stored but the layer width was hard-coded.
        self.num_hidden_units = (
            self.DEFAULT_HIDDEN_UNITS if hidden_units is None else hidden_units
        )
        self.num_actions = network_config.get("num_actions")
        self.step_size = network_config.get('step_size')

    def create_model(self):
        """Build and compile a fresh, independently-initialised Q-network.

        Returns:
            A compiled Keras ``Model`` with ``num_actions`` linear outputs.
        """
        # Keras documents `shape` as a tuple; accept a bare int for convenience.
        if isinstance(self.state_dim, int):
            input_shape = (self.state_dim,)
        else:
            input_shape = self.state_dim

        inputs = Input(shape=input_shape)
        hidden = Dense(self.num_hidden_units, activation='relu')(inputs)
        q_values = Dense(self.num_actions, activation='linear')(hidden)
        model = Model(inputs, q_values)
        # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
        model.compile(optimizer=Adam(learning_rate=self.step_size), loss='mse')
        return model

In [8]:
class ReplayBuffer:
    """Bounded first-in-first-out store of transitions with random sampling."""

    def __init__(self, size, minibatch_size, seed):
        """Create an empty buffer.

        Args:
            size: maximum number of transitions kept at once.
            minibatch_size: number of transitions returned by ``sample``.
            seed: seed for the buffer's private ``RandomState``.
        """
        self.max_size = size
        self.minibatch_size = minibatch_size
        self.rand_generator = np.random.RandomState(seed)
        self.buffer = []

    def append(self, state, action, reward, terminal, next_state):
        """Store one transition, evicting the oldest one when the buffer is full."""
        transition = [state, action, reward, terminal, next_state]
        if len(self.buffer) == self.max_size:
            self.buffer.pop(0)
        self.buffer.append(transition)

    def sample(self):
        """Return ``minibatch_size`` stored transitions picked at random.

        Indices are drawn with ``RandomState.choice`` using its default
        arguments, so the same transition may appear more than once.
        """
        population = np.arange(len(self.buffer))
        picks = self.rand_generator.choice(population, size=self.minibatch_size)
        return list(map(self.buffer.__getitem__, picks))

    def size(self):
        """Return the number of transitions currently stored."""
        return len(self.buffer)

In [9]:
class Agent:
    """Q-learning agent with experience replay and a separate target network.

    The agent resets the module-level ``env`` in :meth:`agent_start`; the
    caller is responsible for stepping the environment and feeding the
    results to :meth:`agent_step`.
    """

    def __init__(self, agent_config):
        """Build the replay buffer, the online network and the target network.

        Args:
            agent_config: dict with keys ``replay_buffer_size``,
                ``minibatch_sz``, ``network_config`` (must contain
                ``num_actions``), ``num_replay_updates_per_step``, ``gamma``
                and, optionally, ``seed``.
        """
        self.replay_buffer = ReplayBuffer(agent_config['replay_buffer_size'],
                                          agent_config['minibatch_sz'], agent_config.get("seed"))
        self.network = ActionValueNetwork(agent_config['network_config'])

        # Online network (trained) and target network (used for bootstrapping).
        self.model = self.network.create_model()
        self.target_model = self.network.create_model()

        self.num_actions = agent_config['network_config']['num_actions']

        self.num_replay = agent_config['num_replay_updates_per_step']
        self.discount = agent_config['gamma']

        # Private generator so exploration is reproducible for a given seed.
        self.rand_generator = np.random.RandomState(agent_config.get("seed"))

        self.last_state = None
        self.last_action = None
        # Starts from the module-level `epsilon`; decayed externally by the caller.
        self.epsilon = epsilon
        self.sum_rewards = 0
        self.episode_steps = 0

    def policy(self, state):
        """Pick an action epsilon-greedily.

        With probability ``self.epsilon`` a uniformly random action is drawn
        from the agent's seeded generator; otherwise the action with the
        highest predicted value is returned. The network is only evaluated
        on the greedy branch.

        Args:
            state: input accepted by ``self.model.predict`` (the callers in
                this class pass a batch containing a single state).

        Returns:
            int: action index in ``[0, self.num_actions)``.
        """
        if self.rand_generator.uniform() < self.epsilon:
            return int(self.rand_generator.randint(0, self.num_actions))
        action_values = self.model.predict(state)
        return int(np.argmax(action_values))

    def agent_start(self):
        """Reset the episode counters and the environment; return the first action."""
        self.sum_rewards = 0
        self.episode_steps = 0
        self.last_state = env.reset()
        # Add a leading batch axis: (n,) -> (1, n).
        self.last_state = np.reshape(self.last_state, (-1, self.last_state.shape[0]))
        self.last_action = self.policy(self.last_state)
        return self.last_action

    def agent_step(self, state, reward, terminal):
        """Record one transition, run replay updates, and choose the next action.

        Args:
            state: observation returned by the environment after the last action.
            reward: reward received for the last action.
            terminal: 1 if ``state`` ended the episode, else 0.

        Returns:
            The action chosen for ``state``.
        """
        self.sum_rewards += reward
        self.episode_steps += 1

        # Add a leading batch axis so the state matches `last_state`'s layout.
        state = np.array([state])

        action = self.policy(state)

        self.replay_buffer.append(self.last_state, self.last_action, reward, terminal, state)

        # Learn only once the buffer holds more than one minibatch of data.
        if self.replay_buffer.size() > self.replay_buffer.minibatch_size:
            # Freeze a copy of the online weights for this round of updates.
            self.target_model.set_weights(self.model.get_weights())
            for _ in range(self.num_replay):
                experiences = self.replay_buffer.sample()
                self.agent_train(experiences)

        self.last_state = state
        self.last_action = action

        return action

    def agent_train(self, experiences):
        """Fit the online network on one minibatch of transitions.

        Args:
            experiences: list of ``[state, action, reward, terminal,
                next_state]`` entries as produced by ``ReplayBuffer.sample``.
        """
        states, actions, rewards, terminals, next_states = map(list, zip(*experiences))
        # Each stored state already carries a batch axis, so concatenate rows.
        states = np.concatenate(states)
        next_states = np.concatenate(next_states)
        rewards = np.array(rewards)
        terminals = np.array(terminals)
        batch_size1 = states.shape[0]

        # Bootstrapped value of the next state; zeroed for terminal transitions.
        q_next_mat = self.target_model.predict(next_states)
        v_next_vec = np.max(q_next_mat, axis=1) * (1 - terminals)
        target_vec = rewards + self.discount * v_next_vec

        # Start from the current predictions so only the taken action's
        # output contributes to the loss.
        q_mat = self.model.predict(states)
        batch_indices = np.arange(q_mat.shape[0])
        q_mat[batch_indices, actions] = target_vec

        self.model.fit(states, q_mat, batch_size=batch_size1, verbose=0, shuffle=False)

    def agent_message(self, message):
        """Answer a query about the agent.

        Args:
            message: only ``"get_sum_reward"`` is supported.

        Returns:
            The reward accumulated in the current episode.

        Raises:
            Exception: if ``message`` is not recognised.
        """
        if message == "get_sum_reward":
            return self.sum_rewards
        else:
            raise Exception("Unrecognized Message!")
    

In [10]:
# Hyper-parameters consumed by Agent.__init__ (and, through it, by
# ActionValueNetwork and ReplayBuffer).
agent_info = {
             # Settings read by ActionValueNetwork.
             # presumably 8 / 4 match LunarLander-v2's observation / action sizes - TODO confirm
             'network_config': {
                 'state_dim': 8,
                 'num_actions': 4,
                 # Passed to the Adam optimiser as its learning rate.
                 'step_size':1e-2
             },
             # Maximum number of transitions kept by the replay buffer.
             'replay_buffer_size': 50000,
             # Number of transitions drawn per training batch.
             'minibatch_sz': 64,
             # Number of minibatch updates performed on each agent step.
             'num_replay_updates_per_step': 4,
             # Discount factor applied to the bootstrapped next-state value.
             'gamma': 0.99,
             # Seeds the RandomState of both the replay buffer and the agent.
             'seed': 0}
# NOTE(review): the training loop in this notebook hard-codes its own episode
# count and does not read EPISODES - confirm which value is intended.
EPISODES = 500

In [11]:
# Build the agent (online network, target network and replay buffer) from agent_info.
agent=Agent(agent_info)

In [12]:
# Per-episode training history, filled in by the training loop:
# total reward, episode index, number of steps, and epsilon at episode end.
reward_episode, no_episodes, episode_steps, epsilon_history = [], [], [], []

In [13]:
# Main training loop: one iteration per episode.
# NOTE(review): the episode count is hard-coded to 300 although the config
# cell defines EPISODES = 500 - confirm which one is intended.
for episode in range(300):
    action = agent.agent_start()
    terminal = 0
    while terminal != 1:
        state, reward, terminal, info = env.step(action)
        # The agent and replay buffer expect the done flag as 0/1.
        terminal = 1 if terminal == True else 0
        action = agent.agent_step(state, reward, terminal)
        # Epsilon is decayed once per environment step (not per episode),
        # and is never pushed below MIN_EPSILON.
        if agent.epsilon > MIN_EPSILON:
            agent.epsilon = max(MIN_EPSILON, agent.epsilon * EPSILON_DECAY)
    reward = agent.agent_message('get_sum_reward')

    # Record this episode's statistics.
    no_episodes.append(episode)
    reward_episode.append(agent.sum_rewards)
    episode_steps.append(agent.episode_steps)
    epsilon_history.append(agent.epsilon)
    print('episode: ', episode,'score: ', agent.sum_rewards,
            'epsilon %.2f' % agent.epsilon, 'steps', agent.episode_steps)

episode:  0 score:  -483.7613749657774 epsilon 0.69 steps 74
episode:  1 score:  -179.75979759801032 epsilon 0.37 steps 124
episode:  2 score:  -111.38105927897422 epsilon 0.13 steps 208
episode:  3 score:  -403.27857461416835 epsilon 0.06 steps 144
episode:  4 score:  -144.62576550173281 epsilon 0.05 steps 60
episode:  5 score:  -374.67382252445094 epsilon 0.03 steps 92
episode:  6 score:  -561.5091650250342 epsilon 0.02 steps 103
episode:  7 score:  -457.5148325713632 epsilon 0.01 steps 86
episode:  8 score:  -364.2684488764166 epsilon 0.01 steps 103
episode:  9 score:  -77.9118786372023 epsilon 0.01 steps 137
episode:  10 score:  -528.4160927266716 epsilon 0.01 steps 89
episode:  11 score:  -476.09685672502 epsilon 0.01 steps 131
episode:  12 score:  -262.6952398949579 epsilon 0.01 steps 136
episode:  13 score:  -71.0529530130053 epsilon 0.01 steps 133
episode:  14 score:  -580.0704426173245 epsilon 0.01 steps 206
episode:  15 score:  -351.0170252265591 epsilon 0.01 steps 111
episod

KeyboardInterrupt: 

In [14]:
# Display the total reward recorded for each completed episode.
reward_episode

[-483.7613749657774,
 -179.75979759801032,
 -111.38105927897422,
 -403.27857461416835,
 -144.62576550173281,
 -374.67382252445094,
 -561.5091650250342,
 -457.5148325713632,
 -364.2684488764166,
 -77.9118786372023,
 -528.4160927266716,
 -476.09685672502,
 -262.6952398949579,
 -71.0529530130053,
 -580.0704426173245,
 -351.0170252265591,
 -338.661584731484,
 20.70329643058713,
 -406.56105862578295,
 8.111628930978085,
 -98.47540291124469,
 -72.01496184838606,
 -221.9651995414218,
 -69.08043680902784,
 -227.68465253127164,
 -131.29031471328335,
 -655.8441398863646,
 -195.05306682394547,
 -74.21852959846585,
 -92.9947403929726,
 -270.4924641232796,
 -80.8050980725408,
 -234.90242419698555,
 -229.05348663295743,
 -479.74220086287505,
 -352.4362798875869,
 -210.22577963532433,
 -73.09353248450526,
 -374.38733050739756,
 -127.82865740102844,
 -138.34580711346567,
 -58.16069315125866,
 -232.16437731216124,
 -127.04485263232115,
 -122.684440226487,
 -75.99270147435381,
 -146.83871369137452,
 -13

In [15]:
# Display the number of environment steps taken in each completed episode.
episode_steps

[74,
 124,
 208,
 144,
 60,
 92,
 103,
 86,
 103,
 137,
 89,
 131,
 136,
 133,
 206,
 111,
 117,
 147,
 243,
 122,
 113,
 118,
 127,
 114,
 369,
 357,
 514,
 216,
 118,
 117,
 95,
 314,
 141,
 1000,
 519,
 495,
 428,
 161,
 867,
 249,
 1000,
 165,
 958,
 380,
 63,
 74,
 1000,
 157,
 128,
 1000,
 184,
 237,
 1000,
 1000,
 1000,
 1000,
 1000,
 626,
 1000,
 1000,
 430,
 221,
 497,
 378,
 1000,
 1000,
 1000,
 1000,
 1000,
 549,
 176,
 316,
 566,
 345,
 910,
 1000,
 433,
 1000,
 192,
 546,
 329,
 111,
 106,
 87,
 110,
 128,
 53,
 162,
 299,
 555,
 236,
 1000,
 1000,
 623,
 1000,
 413,
 1000,
 1000,
 1000,
 891,
 170,
 600]