# Installation and imports

In [18]:
!pip install tensorflow==2.3.1 
# gym keras-rl2 gym[atari]



In [2]:
import gym
import random 
from pathlib import Path

# # Local imports
from utils import eval_env_random_actions

# 1. Set up

In [3]:
# Directory where model artifacts are stored (created if it does not exist yet).
directory_path = 'models'
Path(directory_path).mkdir(exist_ok=True, parents=True)

# Instantiate the gym environment from its id.
env_name = 'SpaceInvaders-v0'
env = gym.make(env_name)

# Base path for files belonging to this environment's model.
model_file_name = Path(directory_path) / (env_name + '_DL')

# Report the size of the action space, then the action space object itself.
print(env.action_space.n)
print(env.action_space)

# Values consumed later when building the network: number of actions and the
# three dimensions of a single observation.
actions = env.action_space.n
height, width, channels = env.observation_space.shape

6
Discrete(6)


In [4]:
# Display the human-readable name of each discrete action, in index order.
env.unwrapped.get_action_meanings()

['NOOP', 'FIRE', 'RIGHT', 'LEFT', 'RIGHTFIRE', 'LEFTFIRE']

## Random-action baseline

In [5]:
# Baseline score obtained from the local helper in `utils`, with rendering enabled.
# NOTE(review): presumably plays several episodes with random actions and returns
# the mean episode reward — confirm against utils.eval_env_random_actions.
random_mean = eval_env_random_actions(env, render=True)

2022-03-26 12:34:33.931 Python[13037:3702464] ApplePersistenceIgnoreState: Existing state will not be touched. New state will be written to (null)


Episode: 1
	Score: 80.0
Episode: 2
	Score: 155.0
Episode: 3
	Score: 55.0
Episode: 4
	Score: 265.0
Episode: 5
	Score: 120.0
Episode: 6
	Score: 410.0
Episode: 7
	Score: 50.0
Episode: 8
	Score: 125.0
Episode: 9
	Score: 140.0


	Mean reward: 155.55555555555554 Num episodes: 10


# 2. Build a Deep Learning Model with Keras

In [8]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Convolution2D
from tensorflow.keras.optimizers import Adam

In [9]:
def build_model(height, width, channels, actions, window_length=3):
    """Build the convolutional network used to estimate action values.

    The network is three convolutional layers, a flatten, two hidden dense
    layers and a linear output layer with one unit per action.

    Args:
        height: Height of a single observation frame.
        width: Width of a single observation frame.
        channels: Number of channels of a single observation frame.
        actions: Number of units in the output layer (one per action).
        window_length: Number of frames stacked along the leading input axis.
            Must equal the `window_length` of the agent's memory, otherwise the
            input shape will not match what the agent feeds the model.
            Defaults to 3, the value previously hard-coded.

    Returns:
        The uncompiled `Sequential` model.
    """
    model = Sequential()
    # Input is a stack of `window_length` frames: (window, height, width, channels).
    model.add(Convolution2D(32, (8,8), strides=(4,4), activation='relu',
                            input_shape=(window_length, height, width, channels)))
    model.add(Convolution2D(64, (4,4), strides=(2,2), activation='relu'))
    model.add(Convolution2D(64, (3,3), activation='relu'))
    model.add(Flatten())
    model.add(Dense(512, activation='relu'))
    model.add(Dense(256, activation='relu'))
    # Linear activation: outputs are unbounded per-action value estimates.
    model.add(Dense(actions, activation='linear'))
    return model

In [10]:
# Build the network from the observation dimensions and action count read from
# the environment, then print a per-layer summary.
model = build_model(height, width, channels, actions)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d (Conv2D)              (None, 3, 51, 39, 32)     6176      
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 3, 24, 18, 64)     32832     
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 3, 22, 16, 64)     36928     
_________________________________________________________________
flatten (Flatten)            (None, 67584)             0         
_________________________________________________________________
dense (Dense)                (None, 512)               34603520  
_________________________________________________________________
dense_1 (Dense)              (None, 256)               131328    
_________________________________________________________________
dense_2 (Dense)              (None, 6)                 1

# 3. Build an agent

In [11]:
from rl.agents import DQNAgent
from rl.memory import SequentialMemory
from rl.policy import LinearAnnealedPolicy, EpsGreedyQPolicy

In [12]:
def build_agent(model, actions, window_length=3):
    """Assemble a dueling DQN agent around `model`.

    Args:
        model: Keras model that maps a window of observations to one value per
            action.
        actions: Number of discrete actions available to the agent.
        window_length: Number of consecutive observations the memory stacks
            into one model input. Must equal the leading dimension of the
            model's input shape. Defaults to 3, the value previously
            hard-coded.

    Returns:
        The configured (not yet compiled) `DQNAgent`.
    """
    # Epsilon-greedy exploration with `eps` annealed from 1.0 to 0.1 over
    # 10_000 steps; `value_test` is the epsilon supplied for test runs.
    policy = LinearAnnealedPolicy(
        EpsGreedyQPolicy(),
        attr='eps', value_max=1., value_min=.1,
        value_test=.2, nb_steps=10_000)

    # Replay memory; `window_length` controls how many frames form one state.
    memory = SequentialMemory(limit=1_000, window_length=window_length)

    dqn = DQNAgent(
        model=model,
        memory=memory,
        policy=policy,
        enable_dueling_network=True,
        dueling_type='avg',
        nb_actions=actions,
        nb_steps_warmup=1_000
    )
    return dqn

# Create the agent around the network built earlier in the notebook.
dqn = build_agent(model, actions)

In [13]:
# Compile the agent with Adam. `learning_rate` is the supported argument name;
# `lr` is only kept by Keras as a legacy alias.
dqn.compile(Adam(learning_rate=1e-4))

2022-03-26 12:38:48.216096: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN)to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-03-26 12:38:48.296372: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x7fd70f5f7570 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
2022-03-26 12:38:48.296396: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): Host, Default Version


# 4. Train the agent

In [29]:
# Train on the environment without rendering; with verbose=2 the log below shows one line per episode.
dqn.fit(env, nb_steps=10_000, visualize=False, verbose=2) # author's note: 10_000_000 as an alternative step budget

Training for 10000 steps ...
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
  593/10000: episode: 1, duration: 18.915s, episode steps: 593, steps per second:  31, episode reward: 45.000, mean reward:  0.076 [ 0.000, 15.000], mean action: 2.516 [0.000, 5.000],  loss: --, mean_q: --, mean_eps: --
 1395/10000: episode: 2, duration: 995.013s, episode steps: 802, steps per second:   1, episode reward: 185.000, mean reward:  0.231 [ 0.000, 30.000], mean action: 2.635 [0.000, 5.000],  loss: 10.350943, mean_q: -0.115048, mean_eps: 0.892225
 2040/10000: episode: 3, duration: 1044.005s, episode steps: 645, steps per second:   1, episode reward: 80.000, mean reward:  0.124 [ 0.000, 20.000], mean action: 2.374 [0.000, 5.000],  loss: 1.021924, mean_q: 0.388921, mean_eps: 0.845470
 2445/10000: episode: 4, duration: 619.841s, episode steps: 405, steps per second:   1, episode reward: 105.000, mean reward:  0.259 [ 0.000, 30.000], m

<tensorflow.python.keras.callbacks.History at 0x7ff123a960d0>

# 5. Evaluate

In [30]:
# Run 10 test episodes without rendering and keep the returned history.
scores = dqn.test(env, nb_episodes=10, visualize=False)
# Average the per-episode rewards recorded in the history.
mean_reward = np.asarray(scores.history['episode_reward']).mean()
print(f"mean_reward:{mean_reward:.2f}")

Testing for 10 episodes ...


2022-03-25 09:58:50.150 Python[8188:3529268] ApplePersistenceIgnoreState: Existing state will not be touched. New state will be written to (null)


Episode 1: reward: 20.000, steps: 445
Episode 2: reward: 135.000, steps: 671
Episode 3: reward: 270.000, steps: 1114
Episode 4: reward: 195.000, steps: 941
Episode 5: reward: 100.000, steps: 758
Episode 6: reward: 160.000, steps: 986
Episode 7: reward: 230.000, steps: 1074
Episode 8: reward: 145.000, steps: 940
Episode 9: reward: 165.000, steps: 698
Episode 10: reward: 20.000, steps: 563
mean_reward:144.00


In [31]:
# Relative change, in percent, of the agent's mean test reward versus `random_mean`.
# NOTE(review): the saved output (31.57%) does not match the means recorded in this
# notebook (144.00 vs 155.56 would give about -7.4%) — it looks like it comes from a
# different run; re-run all cells to confirm.
print(f"Improvement respect random actions: {mean_reward*100/random_mean-100:.2f}%")

Improvement respect random actions: 31.57%


In [36]:
# Persist the agent's weights under <directory_path>/<env_name>_DL_h5f.
# NOTE(review): `model_file_name` (suffix '_DL') is defined in the setup cell but is not the path used here.
dqn.save_weights(Path(directory_path, env_name + '_DL_h5f') )


In [48]:
# Drop the in-memory model and agent so the next cell rebuilds them before loading the saved weights.
del model, dqn

In [14]:
# Rebuild the network and agent from scratch, then restore the saved weights.
model = build_model(height, width, channels, actions)
dqn = build_agent(model, actions)
# `learning_rate` is the supported argument name; `lr` is only a legacy alias.
dqn.compile(Adam(learning_rate=1e-4))
# Same path that the weights were saved to earlier in the notebook.
dqn.load_weights(Path(directory_path, env_name + '_DL_h5f'))

In [None]:
# Run 10 test episodes with the reloaded agent, this time rendering the game.
scores = dqn.test(env, nb_episodes=10, visualize=True)
# Average the per-episode rewards recorded in the history.
mean_reward = np.asarray(scores.history['episode_reward']).mean()
print(f"mean_reward:{mean_reward:.2f}")


Testing for 10 episodes ...
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
Episode 1: reward: 40.000, steps: 551
