In [None]:
import numpy as np;
import pandas as pd;
import collections;
import random;
import gym;
from copy import deepcopy;
import tensorflow as tf;
import matplotlib.pyplot as plt;
import time;

In [None]:
class BriefHistory:
    """Bounded memory of the most recent transitions.

    Every component of a transition (current state, action, next state,
    reward, done flag) is kept in its own ``collections.deque`` capped at
    ``history_size`` entries, so appending past the cap drops the oldest
    entry of each deque. ``count`` keeps growing with every append and is
    therefore the total number of transitions ever recorded.
    """

    def __init__(self, history_size):
        """Create five empty deques, each limited to ``history_size`` items."""
        self.history_size = history_size

        def new_queue():
            return collections.deque(maxlen=history_size)

        self.current_state_queue = new_queue()
        self.action_queue = new_queue()
        self.next_state_queue = new_queue()
        self.reward_queue = new_queue()
        self.done_queue = new_queue()
        # Fixed order used by __iter__: state, action, next state, reward, done.
        self.queues = [
            self.current_state_queue,
            self.action_queue,
            self.next_state_queue,
            self.reward_queue,
            self.done_queue,
        ]
        self.count = 0

    def append(self, current_state, action, next_state, reward, done):
        """Record one transition and bump the running transition counter."""
        pairs = (
            (self.current_state_queue, current_state),
            (self.action_queue, action),
            (self.next_state_queue, next_state),
            (self.reward_queue, reward),
            (self.done_queue, done),
        )
        for queue, item in pairs:
            queue.append(item)
        self.count += 1

    @staticmethod
    def _stack(queue):
        """Copy the contents of a deque into a fresh NumPy array."""
        return np.array(queue)

    @property
    def current_state_stacked(self):
        """Stored current states as one array (one row per transition)."""
        return self._stack(self.current_state_queue)

    @property
    def action_stacked(self):
        """Stored actions as one array."""
        return self._stack(self.action_queue)

    @property
    def next_state_stacked(self):
        """Stored next states as one array (one row per transition)."""
        return self._stack(self.next_state_queue)

    @property
    def reward_stacked(self):
        """Stored rewards as one array."""
        return self._stack(self.reward_queue)

    @property
    def done_stacked(self):
        """Stored done flags as one array."""
        return self._stack(self.done_queue)

    @property
    def full(self):
        """True once at least ``history_size`` transitions have been appended."""
        return self.history_size <= self.count

    def __iter__(self):
        """Yield ``(current_state, action, next_state, reward, done)`` tuples, oldest first."""
        yield from zip(*self.queues)
    
    
    
class Agent:
    """Q-learning agent with an online network and a soft-updated target network.

    The online network (``q_online``) picks greedy actions and is fitted on
    minibatches sampled from a ``BriefHistory`` replay memory. The target
    network (``q_offline``) supplies the bootstrap values for the regression
    targets and is moved towards the online network by a factor ``tau`` after
    every environment step.
    """

    def __init__(self,env=None,q_online=None,\
                 history_size=20000,number_of_episodes=1000,epsilon=1,\
                 gamma=0.99,optimizer_lr=0.0001,seed=None,epsilon_coef=500,tau=0.005):
        """Seed the random generators, set up the environment, networks and replay memory.

        Args:
            env: Gym environment to train on. When ``None`` a new
                ``LunarLander-v2`` environment is created for this agent.
            q_online: Optional pre-built Keras model used as the online
                network. When ``None`` a fresh network is built with
                ``initialize_q``.
            history_size: Maximum number of transitions kept in the replay memory.
            number_of_episodes: Number of episodes played by ``run``.
            epsilon: Initial probability of taking a random action.
            gamma: Discount factor applied to the bootstrapped next-state value.
            optimizer_lr: Learning rate passed to the RMSprop optimizer.
            seed: Seed given to ``numpy``, ``random``, TensorFlow, the
                environment and the weight initializer.
            epsilon_coef: Number of recorded episodes over which epsilon
                shrinks by a factor of ten.
            tau: Interpolation factor of the soft target-network update.
        """
        self.tau=tau
        self.seed=seed
        np.random.seed(self.seed)
        random.seed(self.seed)
        # The compat.v1 alias is present in both TF1 and TF2, whereas
        # tf.random.set_random_seed only exists in TF1.
        tf.compat.v1.set_random_seed(self.seed)
        # Create the default environment per instance; building it in the
        # signature would share one environment between all agents.
        if env is None:
            env=gym.make("LunarLander-v2")
        self.env=env
        self.env.seed(self.seed)
        self.epsilon=epsilon
        self.gamma=gamma
        self.optimizer_lr=optimizer_lr
        if q_online is None:
            self.q_online=self.initialize_q()
            self.q_offline=self.initialize_q()
        else:
            self.q_online=q_online
            # A supplied online network still needs a target network of the
            # same architecture. NOTE(review): clone_model handles Sequential
            # and functional models; confirm if subclassed models are passed.
            self.q_offline=tf.keras.models.clone_model(q_online)
        # Start the target network as an exact copy of the online network.
        self.q_offline.set_weights(self.q_online.get_weights())

        self.history=BriefHistory(history_size)
        # Kept for compatibility; no method of this class reads it.
        self.target_update_interval=10
        self.number_of_episodes=number_of_episodes
        self.total_rewards=[]
        self.epsilon_coef=epsilon_coef

    def initialize_q(self):
        """Build and compile a dense network mapping an observation to one value per action.

        Returns:
            A compiled Keras model with hidden layers of 128, 64 and 32 ReLU
            units, a linear output of size ``env.action_space.n``, MSE loss
            and an RMSprop optimizer.
        """
        inputs = tf.keras.layers.Input(shape=(self.env.observation_space.shape[0],))
        initializer=tf.keras.initializers.RandomUniform(minval=-0.05, maxval=0.05, seed=self.seed)
        w = tf.keras.layers.Dense(128, activation='relu',kernel_initializer=initializer)(inputs)
        w = tf.keras.layers.Dense(64, activation='relu',kernel_initializer=initializer)(w)
        w = tf.keras.layers.Dense(32, activation='relu',kernel_initializer=initializer)(w)
        output = tf.keras.layers.Dense(self.env.action_space.n,kernel_initializer=initializer)(w)
        model = tf.keras.models.Model(inputs=inputs, outputs=output)
        optimizer = tf.keras.optimizers.RMSprop(lr=self.optimizer_lr)
        model.compile(loss='mse', optimizer=optimizer)
        return model

    def get_action(self,current_state):
        """Pick an action epsilon-greedily.

        With probability ``epsilon`` a uniformly random action index is
        returned, otherwise the greedy action of the online network.
        """
        is_random=random.random()<self.epsilon
        if is_random:
            # Draw from the environment's real action count instead of a fixed 4.
            action=np.random.randint(low=0,high=self.env.action_space.n)
        else:
            action=self.get_best_action_q_online(current_state)
        return action

    def run(self):
        """Play ``number_of_episodes`` episodes, learning after every environment step.

        Each step is stored in the replay memory, followed by one replay fit
        and one soft target update. The total reward of every episode is
        appended to ``total_rewards``; a progress line is printed for the
        first episode and every 25th one.
        """
        for i in range(self.number_of_episodes):
            total_reward=0
            done=False
            current_state=self.env.reset()
            count=0
            while not done:
                count+=1
                action=self.get_action(current_state)
                next_state,reward,done,_=self.env.step(action)
                self.history.append(current_state,action,next_state,reward,done)
                total_reward+=reward
                current_state=next_state
                self.experience_replay()
                self.update_q_offline()
            if (i+1)%25==0 or i==0:
                print("Episode: {:<4}     Episode Reward: {:<4}     Epsilon: {:.2f}     Length of Episode: {:<4}".format(i+1,int(total_reward),self.epsilon,count))
            # decay_epsilon reads len(total_rewards), which at this point does
            # not yet include the episode that just finished.
            self.decay_epsilon()
            self.total_rewards.append(total_reward)

    def experience_replay(self,verbose=0,sample_size=100,batch_size=20):
        """Fit the online network on a random sample of stored transitions.

        Args:
            verbose: Verbosity forwarded to ``fit``.
            sample_size: Number of transitions drawn (with replacement).
            batch_size: Minibatch size forwarded to ``fit``.
        """
        x=self.history.current_state_stacked
        if len(x)==0:
            # Nothing recorded yet; randint(low=0, high=0) would raise.
            return
        sampled_index=np.random.randint(low=0,high=len(x),size=sample_size)
        y=self.get_ys(sampled_index)
        self.q_online.fit(x[sampled_index],y,epochs=1,shuffle=False,batch_size=batch_size,verbose=verbose)

    def update_q_offline(self):
        """Move the target network towards the online network by a factor ``tau``."""
        # Blend layer by layer: the weight arrays have different shapes, so
        # packing the whole list into a single ndarray is not possible.
        updated_weights=[
            (1-self.tau)*offline_w+self.tau*online_w
            for offline_w,online_w in zip(self.q_offline.get_weights(),self.q_online.get_weights())
        ]
        self.q_offline.set_weights(updated_weights)

    def get_ys(self,sample_index):
        """Build the regression targets for the sampled transitions.

        The targets start as the online network's predictions for the sampled
        states. The entry of the action actually taken is then replaced by
        ``reward + gamma * max(q_offline(next_state))``, where the bootstrap
        term is dropped for transitions flagged as done.

        Args:
            sample_index: Array of positions into the replay memory.

        Returns:
            Array with one row of per-action targets per sampled transition.
        """
        history=self.history
        # Materialise each stacked view once instead of on every use.
        states=history.current_state_stacked[sample_index]
        actions=history.action_stacked[sample_index]
        rewards=history.reward_stacked[sample_index]
        next_states=history.next_state_stacked[sample_index]
        # Explicit boolean negation; a bitwise ~ is only correct on bool arrays.
        not_done=np.logical_not(history.done_stacked[sample_index].astype(bool))

        rows=np.arange(len(sample_index))
        ys=self.q_online.predict(states)
        q_star=self.q_offline.predict(next_states).max(axis=1)
        ys[rows,actions]=rewards
        ys[rows,actions]+=self.gamma*q_star*not_done
        return ys

    def get_best_action_q_online(self,state):
        """Return the index of the highest-valued action of the online network for ``state``."""
        xs=np.array([state])
        ys=self.q_online.predict(xs)
        return np.argmax(ys)

    def get_q_offline(self,state):
        """Return the target network's per-action values for a single state."""
        return self.q_offline.predict(np.array(state).reshape(1,-1))[0]

    def get_q_online(self,state):
        """Return the online network's per-action values for a single state."""
        return self.q_online.predict(np.array(state).reshape(1,-1))[0]

    def decay_epsilon(self):
        """Set epsilon to ``10 ** (-len(total_rewards) / epsilon_coef)``."""
        self.epsilon=10**(-len(self.total_rewards)/self.epsilon_coef)

In [None]:
# Build an agent with the default hyperparameters, then train it.
agent = Agent()
agent.run()

In [None]:
# Roll out one greedy episode with the trained agent, rendering every frame.
env=gym.make("LunarLander-v2")
try:
    done=False
    state=env.reset()
    env.render()
    while not done:
        action=agent.get_best_action_q_online(state)
        state,_,done,_=env.step(action)
        env.render()
finally:
    # Release the environment even if stepping or rendering fails.
    env.close()