In [1]:
import numpy as np 
import random 

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.optimizers import Adam 
from gym import Env 
from gym.spaces import Discrete, Box
from rl.agents import DQNAgent
from rl.policy import BoltzmannQPolicy
from rl.memory import SequentialMemory

In [2]:
class ShowerEnv(Env):
    """Toy shower-temperature control task using the classic gym API.

    The state is a single integer temperature. On every step the agent
    lowers, holds or raises it by one degree (actions 0, 1, 2). The reward is
    +1 while the temperature lies inside the comfort band and -1 otherwise.
    An episode ends after ``EPISODE_LENGTH`` steps.

    Observations are returned as a bare scalar (not a length-1 array), which
    is what ``reset``/``step`` have always returned here.
    """

    # Bounds advertised through ``observation_space``.
    MIN_TEMP = 0
    MAX_TEMP = 100
    # Inclusive temperature band that earns a positive reward.
    COMFORT_LOW = 37
    COMFORT_HIGH = 39
    # Episodes start at START_TEMP +/- START_JITTER (uniform integer).
    START_TEMP = 38
    START_JITTER = 3
    # Number of steps before ``done`` becomes True.
    EPISODE_LENGTH = 60

    def __init__(self):
        """Create the action/observation spaces and draw an initial state."""
        self.action_space = Discrete(3)
        self.observation_space = Box(
            low=np.array([self.MIN_TEMP], dtype=np.float32),
            high=np.array([self.MAX_TEMP], dtype=np.float32),
            dtype=np.float32,
        )
        self.state = self._initial_temperature()
        self.shower_length = self.EPISODE_LENGTH

    def _initial_temperature(self):
        """Return a random starting temperature around ``START_TEMP``."""
        return self.START_TEMP + random.randint(-self.START_JITTER, self.START_JITTER)

    def step(self, action):
        """Apply ``action`` and advance the episode by one step.

        Args:
            action: 0 (one degree colder), 1 (hold) or 2 (one degree warmer).

        Returns:
            Tuple ``(state, reward, done, info)`` where ``state`` is the new
            temperature, ``reward`` is +1 inside the comfort band and -1
            outside, ``done`` is True once the episode's steps are used up,
            and ``info`` is an empty dict.
        """
        # Shift the action from {0, 1, 2} to {-1, 0, +1} degrees, then clamp so
        # the observation never leaves the bounds declared in observation_space.
        self.state = min(max(self.state + action - 1, self.MIN_TEMP), self.MAX_TEMP)
        self.shower_length -= 1

        in_comfort_band = self.COMFORT_LOW <= self.state <= self.COMFORT_HIGH
        reward = 1 if in_comfort_band else -1

        done = self.shower_length <= 0

        info = {}
        return self.state, reward, done, info

    def render(self, mode="human"):
        """No-op renderer; ``mode`` is accepted so callers may pass it."""
        return None

    def reset(self):
        """Start a new episode and return the initial temperature."""
        self.state = self._initial_temperature()
        self.shower_length = self.EPISODE_LENGTH
        return self.state

In [3]:
# Instantiate the environment that the following cells sample from and train on.
env = ShowerEnv()



In [4]:
# Sanity check: draw one random action from the Discrete(3) action space.
env.action_space.sample()

1

In [5]:
# Sanity check: draw one random observation from the Box observation space.
env.observation_space.sample()

array([71.777504], dtype=float32)

In [6]:
# Baseline: play a few episodes with uniformly random actions and print the
# total reward collected in each one.
episodes = 10
for episode in range(1, episodes + 1):
    state = env.reset()
    score = 0
    done = False

    while True:
        action = env.action_space.sample()
        # Classic gym step API returns four values; newer gym releases
        # return five (terminated/truncated instead of done).
        n_state, reward, done, info = env.step(action)
        score += reward
        if done:
            break

    print('Episode:{} Score:{}'.format(episode, score))

Episode:1 Score:-24
Episode:2 Score:-56
Episode:3 Score:-60
Episode:4 Score:6
Episode:5 Score:-48
Episode:6 Score:-30
Episode:7 Score:-28
Episode:8 Score:-58
Episode:9 Score:14
Episode:10 Score:-20


In [7]:
# Shape of a single observation; used as the network's input shape.
states = env.observation_space.shape 
# Number of discrete actions; used as the network's output width.
actions = env.action_space.n 

In [8]:
# Display the observation shape and the action count.
states, actions

((1,), 3)

In [9]:
def build_model(states, actions):
    """Build the Q-network: two 24-unit ReLU layers and a linear output layer.

    Args:
        states: Shape tuple of one observation, used as the first layer's
            ``input_shape``.
        actions: Number of discrete actions; width of the linear output layer.

    Returns:
        The assembled (not yet compiled) ``Sequential`` model.
    """
    layers = [
        Dense(24, activation='relu', input_shape=states),
        Dense(24, activation='relu'),
        Dense(actions, activation='linear'),
    ]
    return Sequential(layers)

In [10]:
# Build the Q-network for this environment's observation shape and action count.
model = build_model(states, actions)

In [11]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 24)                48        
                                                                 
 dense_1 (Dense)             (None, 24)                600       
                                                                 
 dense_2 (Dense)             (None, 3)                 75        
                                                                 
Total params: 723
Trainable params: 723
Non-trainable params: 0
_________________________________________________________________


In [12]:
def build_agent(model, actions, memory_limit=50000, nb_steps_warmup=10,
                target_model_update=1e-2):
    """Wrap ``model`` in a keras-rl DQN agent with a Boltzmann policy.

    Args:
        model: Keras model mapping an observation to one Q-value per action.
        actions: Number of discrete actions (``nb_actions`` of the agent).
        memory_limit: Maximum number of transitions kept in replay memory.
        nb_steps_warmup: Steps collected before learning updates begin.
        target_model_update: Target-network update setting passed through to
            ``DQNAgent``. NOTE(review): per keras-rl, values below 1 are
            treated as a soft-update rate and values >= 1 as a hard-update
            interval in steps; confirm against the installed version.

    Returns:
        The un-compiled ``DQNAgent``; call ``compile`` before ``fit``.
    """
    policy = BoltzmannQPolicy()
    # window_length stays at 1: the model is fed a single observation per
    # step, so changing it would also require changing the model's input.
    memory = SequentialMemory(limit=memory_limit, window_length=1)
    return DQNAgent(
        model=model,
        memory=memory,
        policy=policy,
        nb_actions=actions,
        nb_steps_warmup=nb_steps_warmup,
        target_model_update=target_model_update,
    )

In [13]:
# Wrap the network in a DQN agent, compile it with Adam (learning rate 1e-3,
# tracking mean absolute error) and train for 50,000 environment steps
# without rendering.
# NOTE(review): `model` comes from an earlier cell, so re-running this cell
# presumably continues from already-trained weights; rebuild the model first
# for a fresh run.
# NOTE(review): in the recorded log the loss climbs from ~3 into the
# thousands while the mean episode reward stays negative, i.e. this run did
# not converge.
dqn = build_agent(model, actions)
dqn.compile(Adam(learning_rate=1e-3), metrics=['mae'])
dqn.fit(env, nb_steps=50000, visualize=False, verbose=1)

Training for 50000 steps ...
Interval 1 (0 steps performed)
    1/10000 [..............................] - ETA: 11:28 - reward: -1.0000

  updates=self.state_updates,


166 episodes - episode_reward: -18.867 [-60.000, 36.000] - loss: 3.363 - mae: 3.940 - mean_q: -3.670

Interval 2 (10000 steps performed)
167 episodes - episode_reward: -52.228 [-60.000, 32.000] - loss: 3777.088 - mae: 208.726 - mean_q: 326.979

Interval 3 (20000 steps performed)
167 episodes - episode_reward: -36.036 [-60.000, 44.000] - loss: 5099.691 - mae: 415.355 - mean_q: 629.676

Interval 4 (30000 steps performed)
166 episodes - episode_reward: -54.289 [-60.000, -4.000] - loss: 913.950 - mae: 192.217 - mean_q: 288.916

Interval 5 (40000 steps performed)
done, took 222.771 seconds


<keras.callbacks.History at 0x2241b16da20>

In [14]:
# Evaluate the trained agent for 100 episodes (no rendering) and print the
# mean episode reward.
# NOTE(review): the recorded episodes score between -54 and -60 out of a
# possible +60, consistent with the training run not having converged.
scores = dqn.test(env, nb_episodes=100, visualize=False)
print(np.mean(scores.history['episode_reward']))
# ShowerEnv defines no close() of its own, so this resolves to whatever the
# gym Env base class provides.
env.close()

Testing for 100 episodes ...
Episode 1: reward: -58.000, steps: 60
Episode 2: reward: -58.000, steps: 60
Episode 3: reward: -60.000, steps: 60
Episode 4: reward: -56.000, steps: 60
Episode 5: reward: -54.000, steps: 60
Episode 6: reward: -58.000, steps: 60
Episode 7: reward: -60.000, steps: 60
Episode 8: reward: -58.000, steps: 60
Episode 9: reward: -58.000, steps: 60
Episode 10: reward: -58.000, steps: 60
Episode 11: reward: -54.000, steps: 60
Episode 12: reward: -60.000, steps: 60
Episode 13: reward: -56.000, steps: 60
Episode 14: reward: -56.000, steps: 60
Episode 15: reward: -58.000, steps: 60
Episode 16: reward: -60.000, steps: 60
Episode 17: reward: -54.000, steps: 60
Episode 18: reward: -56.000, steps: 60
Episode 19: reward: -60.000, steps: 60
Episode 20: reward: -60.000, steps: 60
Episode 21: reward: -56.000, steps: 60
Episode 22: reward: -60.000, steps: 60
Episode 23: reward: -54.000, steps: 60
Episode 24: reward: -54.000, steps: 60
Episode 25: reward: -60.000, steps: 60
Episo