In [0]:
import numpy as np
import tensorflow as tf
import gym
import os
import datetime
from gym import wrappers
from collections import deque, Counter
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow import keras
from tensorflow.keras.initializers import VarianceScaling



In [0]:
def preprocess(image):
    img = tf.image.resize_with_pad(image,80,80)
    img = tf.image.crop_to_bounding_box(img,9,9,70,70)
    img = np.mean(img,axis=2)
    return img
def atleast_4d(x):
    if x.ndim < 4:
        y = tf.expand_dims(np.atleast_3d(x), axis=0)
    else:
        y = x
    return y

def stack_frames(stacked_frames, state, is_new_episode):
    # Preprocess the frame
    frame = preprocess(state)
    if is_new_episode:
        # Element-wise max summation
        maxframe = np.maximum(frame,frame)
        # Append frame to deque
        stacked_frames.append(maxframe)
        stacked_frames.append(maxframe)
        stacked_frames.append(maxframe)
        stacked_frames.append(maxframe)
        # Stack the frame
        stacked_state = np.stack(stacked_frames,axis=2)
    else:
        maxframe=np.maximum(stacked_frames[-1],frame)
        stacked_frames.append(maxframe)
        stacked_state = np.stack(stacked_frames,axis=2)
    return stacked_state, stacked_frames

# Building Network Class

In [0]:
# Network class are built to define the network and implement forward pass manually
class Network(tf.keras.Model):
    def __init__(self, num_states, hidden_units, num_actions):
        super(Network, self).__init__()
        self.input_layer = tf.keras.layers.InputLayer() # Input layers
        self.flatten_layer = tf.keras.layers.Flatten()
        self.conv = tf.keras.layers.Conv2D(32,(3,3),activation = 'relu',padding='same', kernel_initializer=VarianceScaling(scale=2.))
        self.conv2 = tf.keras.layers.Conv2D(64,(3,3),activation = 'relu',padding='same', kernel_initializer=VarianceScaling(scale=2.))
        self.maxp = tf.keras.layers.MaxPooling2D((2,2),strides=2)
        self.adv = tf.keras.layers.Dense(1,kernel_initializer=VarianceScaling(scale=2.))
        self.val = tf.keras.layers.Dense(num_actions,kernel_initializer=VarianceScaling(scale=2.))
        self.drop = tf.keras.layers.Dropout(0.2)
        self.cussp = tf.keras.layers.Lambda(lambda w: tf.split(w,2,3))
        self.redm = tf.keras.layers.Lambda(lambda w: tf.reduce_mean(w, axis=1, keepdims =True))
        self.hidden_layers = [] # List of hidden layers
        for i in hidden_units: # Create and append layers to the hidden layer list
            self.hidden_layers.append(tf.keras.layers.Dense(
            i,activation='relu',kernel_initializer='RandomNormal'))         
#         self.output_layer = tf.keras.layers.Dense(
#             num_actions,activation='linear', kernel_initializer = 'RandomNormal')
        
    @tf.function
    def call(self, inputs): # Forward passing
#         inputs = preprocess(inputs)
        z = self.input_layer(inputs) # Assign input to a layer
        z = self.conv(z)
        z = self.maxp(z)
        z = self.conv2(z)
        z = self.maxp(z)
        z = self.drop(z)
        for layer in self.hidden_layers: # Passes the input layer through all of the hidden layers
            z = layer(z)
        val_stream, adv_stream = self.cussp(z)
        val_stream = self.flatten_layer(val_stream)
        val = self.val(val_stream)
        adv_stream = self.flatten_layer(adv_stream)
        adv = self.adv(adv_stream)
        q_val = tf.keras.layers.Add()([val, tf.keras.layers.Subtract()([adv, self.redm(adv)])])
#         output = self.output_layer(z) # Return the output of the output layer
        return q_val
    
   

# Building DQN Model

In [0]:
class DQN:
    def __init__(self, num_states, num_actions, hidden_units, gamma, max_experiences, min_experiences, batch_size, lr):
        self.num_actions = num_actions # Number of action
        self.gamma = gamma # Reward discount factor
        self.max_experiences = max_experiences # Max number of exp
        self.min_experiences = min_experiences # Min number of exp
        self.experience = {'s': [], 'a':[], 'r':[], 's2': [],'done': []} # Exp holder
        self.batch_size = batch_size # Batch size
        self.optimizer = tf.keras.optimizers.Adam(lr) # Optimizer with learning rate
        self.model = Network(num_states, hidden_units,num_actions) # Defining the model
        
    # Predict the next action with state as input
    def predict(self,inputs):
        return self.model(atleast_4d(inputs.astype('float32')))
    
    # Training the model
    def train(self, TargetNet):
        if len(self.experience['s']) < self.min_experiences: # If there are not enough experience to sample from
            return 0
        # Randomly sampling from experience (s,s',a,r)
        ids = np.random.randint (low=0, high=len(self.experience['s']),size = self.batch_size)
        states = np.asarray([self.experience['s'][i] for i in ids])
        actions = np.asarray([self.experience['a'][i] for i in ids])
        rewards = np.asarray([self.experience['r'][i] for i in ids])
        states_next = np.asarray([self.experience['s2'][i] for i in ids])
        
        # Check for termial state 
        dones = np.asarray([self.experience['done'][i] for i in ids])
        
        # Predicting the value of the next state Q(s') by using predict and inputting the next state (s')
        value_next = np.max(TargetNet.predict(states_next),axis =1)
        actual_values = np.where(dones,rewards,rewards+self.gamma*value_next)

        # Calculate the sqared loss between the real target and the prediction values
        with tf.GradientTape() as tape: # Recording the computation to compute the differentialtion bakcward (backpropagation)
            selected_action_values = tf.math.reduce_sum(
                self.predict(states)* tf.one_hot(actions, self.num_actions), axis =1)
            l = tf.keras.losses.Huber()
            loss = l(actual_values , selected_action_values)
        
        # Backpropagation     
        variables = self.model.trainable_variables # Call the weights of the model
        gradients = tape.gradient(loss,variables) # Setup gradients 
        self.optimizer.apply_gradients(zip(gradients, variables)) # Doing backprop
        return loss
    
    # Epsilon-Greedy Strategy
    def get_action(self,states, epsilon):
        if np.random.random() < epsilon:
            return np.random.choice(self.num_actions) # Exploration
        else:
            return np.argmax(self.predict(np.atleast_2d(states))[0]) # Exploitation
    
    # Adding and pushing experience (Replay Memories)
    def add_experience(self,exp):
        if len(self.experience['s']) >= self.max_experiences:
            for key in self.experience.keys():
                self.experience[key].pop(0)
                
        for key,value in exp.items():
            self.experience[key].append(value)
            
    # Copy weights from TrainNet to TargetNet
    def copy_weights(self, TrainNet):
        variables1 = self.model.trainable_variables
        variables2 = TrainNet.model.trainable_variables
        for v1, v2 in zip(variables1, variables2):
            v1.assign(v2.numpy())
    
  
    def save_model(self):
        self.model.save('/content/drive/My Drive/Google Drive/2020/Machine Learning/Jupyter Notebook/Assignment 2/model')


# Play and Save

In [0]:
def play_game(env, TrainNet, TargetNet, epsilon, copy_step):
    rewards = 0
    iters = 0
    done = False
    observations = env.reset()
    losses = list()
    stack_size = 4
    stacked_frames = deque([np.zeros((70,70,1),dtype=np.int)for i in range (stack_size)],maxlen =4)
    observations,stacked_frames = stack_frames(stacked_frames,observations,True)
    while not done:
        action = TrainNet.get_action(observations, epsilon)
        prev_observations = observations 
        observations, reward, done,_ = env.step(action)
        observations,stacked_frames = stack_frames(stacked_frames,observations,False)
        rewards += reward
        if env.ale.lives() < 3:
            done = True
        if done:
            reward = -200
            env.reset()
            
        exp = {'s': prev_observations, 'a': action, 'r': reward, 's2':observations, 'done': done}
        TrainNet.add_experience(exp)
        loss = TrainNet.train(TargetNet)
            
        # Convert loss to int if loss is not int
        if isinstance(loss,int):
            losses.append(loss)
        else:
            losses.append(loss.numpy())
            
        iters += 1
        if iters % copy_step == 0:
            TargetNet.copy_weights(TrainNet)
    return rewards, np.mean(losses)
    
def make_video(env,TrainNet):
    env = wrappers.Monitor(env, os.path.join(os.getcwd(), "videos"), force=True)
    rewards = 0
    steps = 0
    done = False
    observations = env.reset()
    stack_size = 4
    stacked_frames = deque([np.zeros((70,70,1),dtype=np.int)for i in range (stack_size)],maxlen =4)
    observations,stacked_frames = stack_frames(stacked_frames,observations,True)
    while not done:
        action = TrainNet.get_action(observations,0)
        observations, reward, done,_ = env.step(action)
        observations,stacked_frames = stack_frames(stacked_frames,observations,False)
        #env.reset()
        steps += 1
        rewards += reward
    print("Testing steps: {} reward {}: ".format(steps,rewards))

# Main Program

In [20]:
env = gym.make('SpaceInvaders-v0')

gamma = 0.5
copy_step = 50
num_states = len(env.observation_space.sample()) 
num_actions = env.action_space.n
hidden_units = [100,100]
max_experiences = 1000000
min_experiences = 10000
batch_size = 32
lr = 1e-3
current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
log_dir = 'logs/dqn/' + current_time
summary_writer = tf.summary.create_file_writer(log_dir)

TrainNet = DQN(num_states, num_actions, hidden_units, gamma, max_experiences, min_experiences,batch_size, lr)
TargetNet = DQN(num_states, num_actions, hidden_units, gamma, max_experiences, min_experiences,batch_size, lr)

N = 10000 # number of episode
total_rewards = np.empty(N)
max_epsilon = 1
decay = 0.001
min_epsilon = 0.01
for n in range(N):
    if n%500 == 0:
      TrainNet.save_model()
    epsilon = min_epsilon + (max_epsilon-min_epsilon) * np.exp(-decay*n)
    total_reward, losses = play_game(env,TrainNet,TargetNet,epsilon,copy_step)
    total_rewards[n] = total_reward
    avg_rewards = total_rewards[max(0,n-10):(n+1)].mean()
    with summary_writer.as_default():
        tf.summary.scalar('episode reward', total_reward, step = n)
        tf.summary.scalar('running avg reward(10)', avg_rewards, step =n)
        tf.summary.scalar('average loss', losses, step=n)
    if n%10 ==0:
         print("episode:", n, "episode reward:", total_reward, "eps:", epsilon, "avg reward (last 100):", avg_rewards,
                  "episode loss: ", losses)

        
print("avg reward for last 10 episodes:", avg_rewards)
make_video(env, TrainNet)
env.close()



ValueError: ignored