In [1]:
!pip install stable-baselines3[extra]

Collecting stable-baselines3[extra]
  Downloading stable_baselines3-2.2.1-py3-none-any.whl (181 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m181.7/181.7 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting gymnasium<0.30,>=0.28.1 (from stable-baselines3[extra])
  Downloading gymnasium-0.29.1-py3-none-any.whl (953 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m953.9/953.9 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
Collecting shimmy[atari]~=1.3.0 (from stable-baselines3[extra])
  Downloading Shimmy-1.3.0-py3-none-any.whl (37 kB)
Collecting autorom[accept-rom-license]~=0.6.1 (from stable-baselines3[extra])
  Downloading AutoROM-0.6.1-py3-none-any.whl (9.4 kB)
Collecting AutoROM.accept-rom-license (from autorom[accept-rom-license]~=0.6.1->stable-baselines3[extra])
  Downloading AutoROM.accept-rom-license-0.6.1.tar.gz (434 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m434.7/434.7 kB[0m [31m32.8 MB/s[0m eta [

In [2]:
import gymnasium as gym
from gym import Env
from gym.spaces import Discrete, Box
import numpy as np
import random
import os
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import VecFrameStack
from stable_baselines3.common.evaluation import evaluate_policy

In [27]:
class PizzaOven(Env):
  def __init__(self):
    #3 actions; turn up, stay, turn down
    self.action_space = Discrete(3)
    #Temperature array range
    self.observation_space = Box(low = np.array([0]), high = np.array([350]))
    #Set starting temp
    self.state = 210 + random.randint(-10,10)
    #Assume that pizza need 15 minutes to cook
    self.cooking_length = 20

  def step(self, action):
   # Apply action
   # 0 -1 = -1 temperature
   # 1 -1 = 0
   # 2 -1 = 1 temperature
      self.state += action - 1
  #Reduce current cooking length by 1 minute
      self.cooking_length -= 1

  #Calculating reward
      if self.state >= 210 and self.state <= 225:
        reward = 1
      else:
        reward = -1

  #Checking if the cooking is done
      if self.cooking_length <= 0:
        done = True
      else:
        done = False

      info = {}
  #Return step information
      return self.state, reward, done, info

  def render(self):
  #Implement vizualization
      pass

  def reset(self):
    #Reset coooking temp
      self.state = np.array([210 + random.randint(-10,10)]).astype(float)
   #Reset cooking length
      self.cooking_length = 20
      return self.state


In [28]:
env = PizzaOven()

In [29]:
env.observation_space.sample()

array([193.98349], dtype=float32)

In [30]:
env.reset()

array([218.])

In [31]:
episodes = 5
for episode in range(0, episodes):
  state = env.reset()
  done = False
  score = 0

  while not done:
    env.render()
    action = env.action_space.sample()
    state, reward, done, info = env.step(action)
    score = score + reward
  print('Episode:{} Score:{}'.format(episode,score))

env.close()

Episode:0 Score:20
Episode:1 Score:-18
Episode:2 Score:-20
Episode:3 Score:-20
Episode:4 Score:20


In [32]:
log_path = os.path.join('Desktop')

In [33]:
model = PPO("MlpPolicy", env, verbose=1, tensorboard_log=log_path)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [34]:
model.learn(total_timesteps= 200000)

Logging to Desktop/PPO_3
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 20       |
|    ep_rew_mean     | -0.76    |
| time/              |          |
|    fps             | 1159     |
|    iterations      | 1        |
|    time_elapsed    | 1        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 20          |
|    ep_rew_mean          | 5.56        |
| time/                   |             |
|    fps                  | 818         |
|    iterations           | 2           |
|    time_elapsed         | 5           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.012781357 |
|    clip_fraction        | 0.102       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.09       |
|    explained_variance   | -0.00029    |
|    

<stable_baselines3.ppo.ppo.PPO at 0x7a3cd7193460>

In [37]:
path = os.path.join('Saved Models', 'Cooking_Pizza_200K')

In [38]:
model.save(path)

In [39]:
evaluate_policy(model, env, n_eval_episodes=10, render=False)

(3.2, 8.304215796810677)