In [1]:
import gym
import random
import numpy as np
from tensorflow.keras import models
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.optimizers import Adam
from tensorflow.python.keras.layers.core import Activation
from rl.agents import DQNAgent
from rl.policy import BoltzmannQPolicy
from rl.memory import SequentialMemory
from tensorflow.python.util import memory

env = gym.make("Acrobot-v1")
states = env.observation_space.shape[0]
actions = env.action_space.n
episodes = 10

for episode in range(1, episodes+1):
    state = env.reset()
    done = False
    score = 0

    while not done:
        env.render()
        action = env.action_space.sample()
        n_state, reward, done, info = env.step(action)
        score += reward
    print(f"Episode: {episode} Score:{score}")

def build_model(states,actions):
    model = Sequential()
    model.add(Flatten(input_shape=(1, states)))
    model.add(Dense(24, activation='relu'))
    model.add(Dense(24, activation='relu'))
    model.add(Dense(actions, activation='linear'))
    return model 

model = build_model(states, actions)
model.summary()

def build_agent(model, actions):
    policy = BoltzmannQPolicy()
    memory = SequentialMemory(limit=50000, window_length=1)
    dqn = DQNAgent(model=model, memory=memory, policy=policy, nb_actions=actions, nb_steps_warmup=10, target_model_update=1e-2)
    return dqn

Episode: 1 Score:-500.0
Episode: 2 Score:-500.0
Episode: 3 Score:-496.0
Episode: 4 Score:-500.0
Episode: 5 Score:-500.0
Episode: 6 Score:-500.0
Episode: 7 Score:-500.0
Episode: 8 Score:-500.0
Episode: 9 Score:-500.0
Episode: 10 Score:-500.0
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten (Flatten)            (None, 6)                 0         
_________________________________________________________________
dense (Dense)                (None, 24)                168       
_________________________________________________________________
dense_1 (Dense)              (None, 24)                600       
_________________________________________________________________
dense_2 (Dense)              (None, 3)                 75        
Total params: 843
Trainable params: 843
Non-trainable params: 0
_________________________________________________________________


In [2]:
dqn = build_agent(model, actions)
dqn.compile(Adam(lr=1e-3), metrics=['mae'])
dqn.fit(env, nb_steps=100000, visualize=False, verbose=1)

2022-01-06 23:47:51.899714: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Training for 50000 steps ...
Interval 1 (0 steps performed)
   24/10000 [..............................] - ETA: 1:27 - reward: -1.0000



26 episodes - episode_reward: -373.077 [-500.000, -157.000] - loss: 0.858 - mae: 19.313 - mean_q: -28.527

Interval 2 (10000 steps performed)
45 episodes - episode_reward: -224.978 [-500.000, -142.000] - loss: 1.593 - mae: 29.591 - mean_q: -43.471

Interval 3 (20000 steps performed)
52 episodes - episode_reward: -190.135 [-333.000, -126.000] - loss: 1.512 - mae: 28.810 - mean_q: -42.199

Interval 4 (30000 steps performed)
55 episodes - episode_reward: -182.818 [-316.000, -117.000] - loss: 1.291 - mae: 27.399 - mean_q: -40.034

Interval 5 (40000 steps performed)
done, took 209.232 seconds


<keras.callbacks.History at 0x17769ea00>

In [4]:
scores = dqn.test(env, nb_episodes=10, visualize=True)
print(np.mean(scores.history['episode_reward']))

Testing for 100 episodes ...
Episode 1: reward: 200.000, steps: 200
Episode 2: reward: 200.000, steps: 200
Episode 3: reward: 200.000, steps: 200
Episode 4: reward: 200.000, steps: 200
Episode 5: reward: 200.000, steps: 200
Episode 6: reward: 200.000, steps: 200
Episode 7: reward: 200.000, steps: 200
Episode 8: reward: 200.000, steps: 200
Episode 9: reward: 200.000, steps: 200
Episode 10: reward: 200.000, steps: 200
Episode 11: reward: 200.000, steps: 200
Episode 12: reward: 200.000, steps: 200
Episode 13: reward: 200.000, steps: 200
Episode 14: reward: 200.000, steps: 200
Episode 15: reward: 200.000, steps: 200
Episode 16: reward: 200.000, steps: 200
Episode 17: reward: 200.000, steps: 200
Episode 18: reward: 200.000, steps: 200
Episode 19: reward: 200.000, steps: 200
Episode 20: reward: 170.000, steps: 170
Episode 21: reward: 200.000, steps: 200
Episode 22: reward: 200.000, steps: 200
Episode 23: reward: 200.000, steps: 200
Episode 24: reward: 200.000, steps: 200
Episode 25: reward: 

In [5]:
dqn.save_weights('dqn_weights.h5f', overwrite=True)

In [6]:
del model
del dqn
del env

In [9]:
env = gym.make('CartPole-v0')
actions = env.action_space.n
states = env.observation_space.shape[0]
model = build_model(states, actions)
dqn = build_agent(model, actions)
dqn.compile(Adam(lr=1e-3), metrics=['mae'])

In [10]:
dqn.load_weights('dqn_weights.h5f')

In [11]:
dqn.test(env, nb_episodes=5, visualize=True)

Testing for 5 episodes ...
Episode 1: reward: 200.000, steps: 200
Episode 2: reward: 200.000, steps: 200
Episode 3: reward: 200.000, steps: 200
Episode 4: reward: 200.000, steps: 200
Episode 5: reward: 200.000, steps: 200


<keras.callbacks.History at 0x178240340>