In [1]:
import numpy as np
import random
import tensorflow as tf
from tensorflow.keras.models import clone_model

from dm_control import suite
from dm_control import viewer
import time

from matplotlib import pyplot as plt

In [2]:
def convert_observation(obs):
    list1 = [i.tolist() for i in list(obs.values())]
    list2 = np.array([])
    for sublist in list1:
        list2 = np.append(list2, sublist)
    return list2.reshape([1,25])

In [3]:
class OrnsteinUhlenbeckActionNoise:
    def __init__(self, mu, sigma=0.3, theta=.15, dt=1e-3, x0=None):
        self.theta = theta
        self.mu = mu
        self.sigma = sigma
        self.dt = dt
        self.x0 = x0
        self.reset()

    def __call__(self):
        x = self.x_prev + self.theta * (self.mu - self.x_prev) * self.dt + \
                self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mu.shape)
        self.x_prev = x
        return x

    def reset(self):
        self.x_prev = self.x0 if self.x0 is not None else np.zeros_like(self.mu)

    def __repr__(self):
        return 'OrnsteinUhlenbeckActionNoise(mu={}, sigma={})'.format(self.mu, self.sigma)

In [7]:
tf.keras.backend.set_floatx('float64')

class Agent:
    def __init__(self, action_spec):
        self.memory = {
            "state":np.array([]), 
            "action":np.array([], dtype = int), 
            "reward":np.array([]),
            "new_state":np.array([]), 
            "done":np.array([])
        }
        self.batch_size=64
        self.discount_rate=0.99
        self.tau=0.001
        self.action_dim = len(action_spec.minimum) #action_spec.shape
        self.action_spec = action_spec
        self.action_bound = 1
        self.input_size = 25
        self.noise = 0
        
        self.critic_opt = tf.optimizers.Adam(1e-4)
        self.actor_opt = tf.optimizers.Adam(1e-4)
        

        # create critic model:
        input_obs = tf.keras.Input(shape=self.input_size, name = 'state')
        input_actions = tf.keras.Input(shape=(self.action_dim,), name = 'action')
        h = tf.keras.layers.Dense(400, activation = 'relu', W_init=tf.keras.initializers.glorot_normal())(input_obs)
        tmp1 = tf.keras.layers.Dense(300, activation = 'relu', W_init=tf.keras.initializers.glorot_normal())(h)
        action_abs = tf.keras.layers.Dense(300, activation = 'relu', W_init=tf.keras.initializers.glorot_normal()
                                          )(input_actions)
        h = tf.keras.layers.Add()([tmp1,action_abs])
        h = tf.keras.layers.Activation('relu')(h)
        pred = tf.keras.layers.Dense(1, W_init=tf.keras.initializers.glorot_normal())(h)
        model = tf.keras.Model(inputs=[input_obs, input_actions], outputs=pred)   
        model.compile(loss='mse', optimizer=self.critic_opt)
        self.critic_model = model

        """
        input_obs = tf.keras.Input(shape=self.input_size, name = 'state')
        input_actions = tf.keras.Input(shape=(self.action_dim,), name = 'action')
        h = tf.keras.layers.Dense(64, activation = 'relu')(input_obs)
        #h = BatchNormalization()(h)
        tmp1 = tf.keras.layers.Dense(64)(h)
        action_abs = tf.keras.layers.Dense(64, activation = 'relu')(input_actions)
        #action_abs = Activation('relu')(action_abs)
        #action_abs = BatchNormalization()(action_abs)
        h = tf.keras.layers.Add()([tmp1,action_abs])
        #h = Dense(64)(h)
        h = tf.keras.layers.Activation('relu')(h)
        #h = BatchNormalization()(h)
        pred = tf.keras.layers.Dense(1, kernel_initializer='random_uniform')(h)
        model = tf.keras.Model(inputs=[input_obs, input_actions], outputs=pred)
        model.compile(loss='mse', optimizer='Adam')
        self.critic_model = model
        """
        # create actor model:
        model = tf.keras.Sequential()
        model.add(tf.keras.layers.Dense(300, input_dim = self.input_size, activation='relu', 
                                        W_init=tf.keras.initializers.glorot_normal()))
        model.add(tf.keras.layers.Dense(200, activation='relu',
                                       W_init=tf.keras.initializers.glorot_normal()))
        model.add(tf.keras.layers.Dense(self.action_dim, activation = "tanh",
                                       W_init=tf.keras.initializers.glorot_normal()))
        model.add(tf.keras.layers.Lambda(lambda x: tf.clip_by_value(x, -1, 1)))
        model.compile(loss='mse', optimizer=self.actor_opt)
        self.actor_model = model

        
        """
        model = tf.keras.Sequential()
        model.add(tf.keras.layers.Dense(400, input_dim = self.input_size))#, activation='relu'))
        model.add(tf.keras.layers.BatchNormalization())
        model.add(tf.keras.layers.Activation('relu'))
        model.add(tf.keras.layers.Dense(300))
        model.add(tf.keras.layers.BatchNormalization())
        model.add(tf.keras.layers.Activation('relu'))
        model.add(tf.keras.layers.Dense(self.action_dim, activation = "tanh"))
        model.compile(loss='mse', optimizer="Adam")
        self.actor_model = model
        """
        # target models:
        self.target_critic_model = clone_model(self.critic_model)
        self.target_actor_model = clone_model(self.actor_model)
 
    def sample_action(self, obs):
        action = self.actor_model.predict(obs)
        action = action + 0.1*np.random.normal(0, 1, self.action_spec.shape) #2*self.noise()# np.random.normal(0, 0.3, self.action_spec.shape) # noise scaled from -1 to 1 
        action = np.clip(action, -1, 1)
        return action
    
    def init_noise(self):
        self.noise = OrnsteinUhlenbeckActionNoise(mu = np.zeros(self.action_dim))
    
    def store_info(self, obs, new_obs, action, reward, done):
        if len(self.memory["state"])>0:
            if self.memory["done"].shape[0]>1e6:
                for key in self.memory.keys():
                    self.memory[key] = self.memory[key][-int(0.9e6):]
            self.memory["state"] = np.vstack((self.memory["state"], obs))
            self.memory["new_state"] = np.vstack((self.memory["new_state"], new_obs))
            self.memory["action"] = np.vstack((self.memory["action"], action))
        else:            
            self.memory["state"] = np.array(obs)
            self.memory["new_state"] = np.array(new_obs)
            self.memory["action"] = np.array(action)
        self.memory["reward"] = np.append(self.memory["reward"], reward)
        self.memory["done"] = np.append(self.memory["done"], done)        
        
        
    def experience_replay(self):
        if self.memory["done"].shape[0]>=self.batch_size:
            # get batch
            idxs = np.random.randint(self.memory["done"].shape[0], size=self.batch_size)
            batch = {
                "state": np.squeeze(self.memory["state"][idxs]), 
                "action": np.squeeze(self.memory["action"][idxs]), 
                "reward": self.memory["reward"][idxs], 
                "new_state": np.squeeze(self.memory["new_state"][idxs]), 
                "done": self.memory["done"][idxs]
            }
            
            # replay:
            #target_q = self.target_critic_model.predict_on_batch({"state":batch["new_state"], 
            #                                "action": self.target_actor_model.predict_on_batch(batch["new_state"])})
            target_q = self.target_critic_model({"state":batch["new_state"], 
                                            "action": self.target_actor_model(batch["new_state"])})

            #y = batch["reward"].reshape(self.batch_size,1) + self.discount_rate*target_q
            y = batch["reward"].reshape(self.batch_size,1) + np.multiply(target_q, 
                                                                        (self.discount_rate*(1-batch["done"])
                                                                        ).reshape(self.batch_size,1)
                                                                        ) 
            
            # update critic
            #self.critic_model.train_on_batch({"state": batch["state"], "action": batch["action"]}, y)  
            with tf.GradientTape() as tape:
                q = self.critic_model({"state": batch["state"], "action": batch["action"]})
                td_error = tf.losses.mean_squared_error(y, q)
            critic_grads = tape.gradient(td_error, self.critic_model.trainable_weights)
            self.critic_opt.apply_gradients(zip(critic_grads, self.critic_model.trainable_weights))
                
            
            # update actor policy
            #actions_pred = self.actor_model.predict_on_batch(batch["state"])
            #critic_pred = self.critic_model.predict_on_batch({"state": batch["state"], "action": actions_pred})
            with tf.GradientTape() as tape:
                actions_pred = self.actor_model(batch["state"])
                q = self.critic_model({"state": batch["state"], "action": actions_pred})
                actor_loss = - tf.reduce_mean(q)
            actor_grads = tape.gradient(actor_loss, self.actor_model.trainable_weights)
            self.actor_opt.apply_gradients(zip(actor_grads, self.actor_model.trainable_weights))
            #self.actor_model.train_on_batch(batch["state"], critic_pred)    
            
            # update networks
            self.critic_model.set_weights(self.tau * np.array(self.target_critic_model.get_weights()) + \
                                          (1.0-self.tau)*np.array(self.critic_model.get_weights()))
            self.actor_model.set_weights(self.tau * np.array(self.target_actor_model.get_weights()) + \
                                         (1.0-self.tau)*np.array(self.actor_model.get_weights()))


In [5]:
def save_models(agent):
    agent.critic_model.save('critic_model.h5')
    agent.actor_model.save('actor_model.h5')
    agent.target_critic_model.save('critic_model.h5')
    agent.target_actor_model.save('actor_model.h5')
    
def train_model(env, n_iterations, batch_size, discount_rate):
    scores = np.array([])
    last_rewards = np.array([])
    first_rewards = np.array([])
    action_spec = env.action_spec()
    agent = Agent(action_spec)
    start = time.time()
    for iteration in range(n_iterations):
        agent.init_noise()
        
        if iteration % 10 == 0:
            save_models(agent)

        time_step = env.reset()  
        obs = convert_observation(time_step.observation)

        game_score = 0
        while not time_step.last():       
            # get action:
            action = agent.sample_action(obs)
            # make action:
            time_step = env.step(action[0])
            new_obs = convert_observation(time_step.observation)            
            # update history:
            game_score += time_step.reward 
            agent.store_info(obs, new_obs, action, time_step.reward, 0)
            obs = new_obs
            # experience replay:
            agent.experience_replay()
            if len(first_rewards)==len(last_rewards):
                first_rewards = np.append(first_rewards, time_step.reward)

        new_obs = convert_observation(time_step.observation)   
        agent.store_info(obs, new_obs, action, time_step.reward, 1)
        agent.experience_replay()
        game_score += time_step.reward 
        
        scores = np.append(scores, game_score)
        last_rewards = np.append(last_rewards, time_step.reward)
        print("Iteration: {}; score: {:10.3f}; last_reward: {:10.3f}; first_reward: {:10.3f}".format(
            iteration, game_score, time_step.reward, first_rewards[-1]))
        print("{:10.3f} minutes remaining".format((time.time()-start)/60))

    save_models(agent)
    return scores, last_rewards, agent


In [None]:
n_iterations = 1000
env = suite.load(domain_name="swimmer", task_name="swimmer6")
action_spec = env.action_spec()

scores, last_rewards, agent = train_model(env, n_iterations, batch_size=64, discount_rate=0.99)

Iteration: 0; score:     17.885; last_reward:      0.022; first_reward:      0.016
     1.025 minutes remaining
Iteration: 1; score:    535.337; last_reward:      0.506; first_reward:      0.380
     2.098 minutes remaining
Iteration: 2; score:     12.737; last_reward:      0.014; first_reward:      0.010
     3.170 minutes remaining
Iteration: 3; score:     17.691; last_reward:      0.017; first_reward:      0.021
     4.193 minutes remaining
Iteration: 4; score:     27.361; last_reward:      0.024; first_reward:      0.027
     5.200 minutes remaining
Iteration: 5; score:     20.472; last_reward:      0.020; first_reward:      0.029
     6.223 minutes remaining
Iteration: 6; score:      7.794; last_reward:      0.008; first_reward:      0.007
     7.276 minutes remaining
Iteration: 7; score:      4.897; last_reward:      0.005; first_reward:      0.005
     8.368 minutes remaining
Iteration: 8; score:      4.462; last_reward:      0.005; first_reward:      0.004
     9.485 minutes re

In [None]:
print(scores[-15:])
print(last_rewards[-15:])

In [None]:
plt.figure(figsize=(15,7))
plt.grid()
x=list(range(len(scores)))
plt.plot(x, scores)

plt.title("Learning curves: score per iteration")
plt.ylabel("Reward")
plt.xlabel("Iteration number")
plt.savefig("learning_curves.png")
plt.show()

In [None]:
test_games = 100

test_scores = []
test_last_rewards = []
for game_index in range(test_games):
    score = 0
    time_step = env.reset()  
    obs = convert_observation(time_step.observation)
    while not time_step.last(): 
        action = agent.target_actor_model.predict(obs)
        time_step = env.step(action[0])
        obs = convert_observation(time_step.observation)
        score += time_step.reward

    test_scores.append(score)
    test_last_rewards.append(time_step.reward)

print("Average reward on test 100 games: ", np.mean(test_scores))

fig, ax = plt.subplots(figsize=(10,7))
ax.set_title('Reward on test 100 games')
ax.boxplot(test_scores,   
          showfliers=True)

ax.set_ylabel("Reward")
plt.savefig("rewards.png")
plt.show()

In [None]:
fig, ax = plt.subplots(figsize=(10,7))
ax.set_title('Rewards on the last step on test 100 games')
ax.boxplot(test_last_rewards,   
          showfliers=True)

ax.set_ylabel("Reward on the last step")
plt.savefig("last_rewards.png")
plt.show()