In [1]:
from gym import Env
from gym.spaces import Discrete, Box
import numpy as np
import random

In [2]:
class ShowerEnv(Env):
    def __init__(self):
        self.action_space = Discrete(3)
        self.observation_space = Box(low=np.array([0]), high=np.array([100]))
        self.state = 38 + random.randint(-3, 3)
        self.shower_length = 60

    def step(self, action):
        self.state += action - 1
        self.shower_length -= 1

        if self.state >= 37 and self.state <= 39:
            reward = 1
        else:
            reward = 0

        if self.shower_length <= 0:
            done = True
        else:
            done = False

        self.state += random.randint(-1, 1)
        info = {}

        return self.state, reward, done, info

    def render(self):
        pass

    def reset(self):
        self.state = 38 + random.randint(-3, 3)
        self.shower_length = 60
        return self.state

In [3]:
env = ShowerEnv()

  logger.warn(f"Box bound precision lowered by casting to {self.dtype}")


In [4]:
env.action_space.sample()

0

In [5]:
env.state

36

In [6]:
env.observation_space.sample()

array([43.555527], dtype=float32)

In [7]:
episodes = 10
for episode in range(1, episodes+1):
    state = env.reset()
    done = False
    score = 0

    while not done:
        env.render()
        action = env.action_space.sample()
        n_state, reward, done, info = env.step(action=action)
        score += reward
    print(f"Episode: {episode} | Score: {score}")

Episode: 1 | Score: 18
Episode: 2 | Score: 29
Episode: 3 | Score: 16
Episode: 4 | Score: 3
Episode: 5 | Score: 35
Episode: 6 | Score: 13
Episode: 7 | Score: 5
Episode: 8 | Score: 39
Episode: 9 | Score: 29
Episode: 10 | Score: 17


In [20]:
from stable_baselines3 import DQN

In [22]:
model = DQN("MlpPolicy", env, verbose=1)
model.learn(total_timesteps=1000, log_interval=4)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.




----------------------------------
| rollout/            |          |
|    ep_len_mean      | 60       |
|    ep_rew_mean      | 12.5     |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 4        |
|    fps              | 646      |
|    time_elapsed     | 0        |
|    total_timesteps  | 240      |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.0765   |
|    n_updates        | 34       |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 60       |
|    ep_rew_mean      | 9.38     |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 8        |
|    fps              | 656      |
|    time_elapsed     | 0        |
|    total_timesteps  | 480      |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.0512   |
|    n_updates      

<stable_baselines3.dqn.dqn.DQN at 0x1871d7f1ca0>

In [36]:
obs = env.reset()
obs = np.array([obs])
while True:
    action, _states = model.predict(obs)
    obs, reward, done, info = env.step(action)
    obs = np.array([next_obs])
    if done:
        obs = env.reset()
        obs = np.array([obs])
        break

In [37]:
print(action), print(obs)

[1]
[41]


(None, None)

In [3]:
# -*- coding: utf-8 -*-
import numpy as np
import math

np.random.seed(0)

# Create random input and output data
x = np.linspace(-math.pi, math.pi, 2000)
y = np.sin(x)

# Randomly initialize weights
a = np.random.randn()
b = np.random.randn()
c = np.random.randn()
d = np.random.randn()

learning_rate = 1e-6
for t in range(2000):
    # Forward pass: compute predicted y
    # y = a + b x + c x^2 + d x^3
    y_pred = a + b * x + c * x ** 2 + d * x ** 3

    # Compute and print loss
    loss = np.square(y_pred - y).sum()
    if t % 100 == 99:
        print(t, loss)

    # Backprop to compute gradients of a, b, c, d with respect to loss
    grad_y_pred = 2.0 * (y_pred - y)
    grad_a = grad_y_pred.sum()
    grad_b = (grad_y_pred * x).sum()
    grad_c = (grad_y_pred * x ** 2).sum()
    grad_d = (grad_y_pred * x ** 3).sum()

    # Update weights
    a -= learning_rate * grad_a
    b -= learning_rate * grad_b
    c -= learning_rate * grad_c
    d -= learning_rate * grad_d

print(f'Result: y = {a} + {b} x + {c} x^2 + {d} x^3')

99 1942.5981197543472
199 1357.8620988723726
299 950.6459714997519
399 666.8234472515073
499 468.84881983428795
599 330.65138961169055
699 234.11186628462443
799 166.62610469569785
899 119.4189117419541
999 86.37582610337255
1099 63.23296761101651
1199 47.01467134962145
1299 35.642761530587755
1399 27.664836288357098
1499 22.06515054558043
1599 18.1328781870579
1699 15.370270373477139
1799 13.428577985505331
1899 12.06331164944789
1999 11.10298121439451
Result: y = 0.04878583756373356 + 0.8443864408312384 x + -0.008416368779585175 x^2 + -0.09157308290467793 x^3
