In [10]:
import gym
from gym import spaces, utils
import pickle
import random
import numpy as np
!rm requirement.txt
!echo -e "ale-py==0.7.5 \n gym-notices==0.0.7 \n ipykernel==6.15.1 \n opencv-python==4.6.0.66 \n pip-chill==1.0.1 \n pygame==2.1.0 \n sklearn==0.0 \n stable-baselines3==1.6.0 \ntensorflow==2.9.1 \n wincertstore==0.2" >> requirement.txt
!pip install -r requirement.txt

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


### Production Env

In [11]:
env = pickle.load(open("model.pkl", 'rb'))

https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations


In [12]:
env.predict([[600,600,600,600,600,600,600,600,600,175],
             [600,610,600,600,600,600,600,590,600,175],
             [600,600,600,600,600,600,630,600,600,185]])

array([597.80041667, 598.34933333, 600.06925   ])

### Gym Environment

In [13]:
class HeatingFurnace(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self):
        """
        env initialization
        """
        self.MAX_TEMP = 700
        self.MIN_TEMP = 500
        self.MAX_SPEED = 200
        self.MIN_SPEED = 150
        self.ADJUSTMENT_STEP = 10
        self.NUM_ZONE = 9
        self.TARGET = 685
        self.SPEED_CONSUMPTION = 0.005
        self.TEMP_CONSUMPTION = 0.01
        self.STEP_PENALTY = -0.005

        # smooth convergence
        self.MAX_COST = self.MAX_TEMP*self.NUM_ZONE*self.TEMP_CONSUMPTION + \
                        self.MAX_SPEED*self.SPEED_CONSUMPTION

        self.env = None # env model
        self.obs = None # observation [the results of the temp and speed]
        self.model_path = 'model.pkl'
        self.steps = 0 # number of steps taken
        self.total_steps = 0 # total number of steps taken
        self.episodes = 0 # number of episodes
        self.cumulative_reward = 0 # cumulative reward
        self.episode_reward = 0 # reward for one episode
        self.cost_list = [] # record for cost after episode
        
        # 20 discrete actions to either increase or decrease speed or zone temp
        # 0-9 increase - 9 zones + 1 speed adjustment
        # 10-19 decrease
        self.action_space = spaces.Discrete((self.NUM_ZONE+1)*2)
        
        # continuous observation space including settings and output temp
        low = [self.MIN_TEMP]*self.NUM_ZONE # define low range for observation space
        low.append(self.MIN_SPEED) # speed
        low.append(0) # output temp
        
        high = [self.MAX_TEMP]*self.NUM_ZONE # define high range for observation space
        high.append(self.MAX_SPEED) # speed
        high.append(self.MAX_TEMP) # output temp
        
        # define observation space with low and high boundary
        self.observation_space = spaces.Box(
            low=np.array(np.float16(low)),
            high=np.array(np.float16(high)),
            dtype=np.float16)

    # step to take action
    def step(self, action):
        '''
        for each step, take action
        input:
            action: (int) discrete action value
        return:
            obs: (ndarray) observation
            reward: (float) reward from step
            last: (bool) end episode signal
            {}: (dict) debug use
        '''
        self.obs, reward, last = self.take_action(action) # take action
        self.steps += 1 # increase episode step
        self.total_steps += 1 # increase total training step
        # if self.steps%1000 == 0:
        #     print(f'step: {self.steps}')
        #     print(f'cumulative reward: {self.cumulative_reward}')
        #     print(f'avg reward per step: {self.cumulative_reward/self.steps}')
        #     print(self.obs.tolist())
        return self.obs, reward, last, {}


    # reset environment [after ea episode reset env and update settings]
    def reset(self):
        '''
        reset environment
        return:
            self.obs: (ndarray) observation
        '''
        if self.env is None: # initialize environment
            self.env = self.init_env()
        else: # log to terminal when reset episode
            print("=================================")
            print(f"episode summary: ")
            print(f"num episode: {self.episodes}")
            print(f"num steps: {self.steps}")
            print(f"episode reward: {self.episode_reward}")
            print(f"per step reward: {self.episode_reward/self.steps}")
            print(f"setting: {self.obs.tolist()}")
            print(f"setting cost: {self.cost_list[-1]}")
            print("=================================\n")

        # get initial observation after reset
        setting = [] # reset setting
        setting.extend(sorted(random.choices(range(self.MIN_TEMP, self.MAX_TEMP+1), k=self.NUM_ZONE))) # random init zone temp
        setting.append(random.randrange(self.MIN_SPEED, self.MAX_SPEED+1)) # random init speed
        self.obs = self.get_obs(setting, self.env) # get reset initial observation

        # reset others
        self.steps = 0 # reset episode step
        self.episodes += 1 # increase episode number
        self.episode_reward = 0 # reset episode reward

        # return reset initial observation
        return self.obs


    # initialize env with model
    def init_env(self):
        '''
        load pickle model and initiaze environment
        return:
            env: (sklearn model) sklearn model load from pickle binary file
        '''
        return pickle.load(open(self.model_path, 'rb'))


    def get_obs(self, setting, env):
        '''
        based on the setting, get env observation
        input:
            setting: (list) setting for 9 heating zone and converyor speed
        return:
            obs: (ndarray) observation
        '''
        tval = env.predict([setting])[0] # get pieace temp from env based on setting
        # TO COMPLETE!!!
        # construct obs [contains 11 items, 11 settings: 9 heatzone and 1 belt speed and setting for 1 temp fo metal at the end]
        setting = np.asarray(setting)
        setting = np.append(setting, tval)
        # setting = np.reshape(setting, (-1))
        return (setting)


    def take_action(self, action):
        '''
        take corresponding action based on action input
        input:
            action: (int) action number
        return:
            obs: (dnarray) observation
            reward: (float) reward from step taken
            last: (bool) end episode signal
        '''
        reward = 0 # define step reward
        last = False # define last indicator

        # reward += self.STEP_PENALTY # step penalty
        setting = self.obs[:-1] # get last step settings
        # increase zone temp
        if action < self.NUM_ZONE-1:
            adjusted = setting[action] + self.ADJUSTMENT_STEP # increase temp
            if adjusted >= setting[action+1]: # if greater than next zone
                setting[action] = setting[action+1] # equal to next
            else:
                setting[action] = adjusted # else equal to adjusted value
        # increase last zone temp
        elif action == self.NUM_ZONE-1: # if it is the last zone
            adjusted = setting[action] + self.ADJUSTMENT_STEP # increase temp
            if adjusted > self.MAX_TEMP: # if increased result greater than limit
                setting[action] = self.MAX_TEMP # equal to limit
            else:
                setting[action] = adjusted # equal to adjusted value
        # increase speed
        elif action == self.NUM_ZONE:
            adjusted = setting[action] + self.ADJUSTMENT_STEP # increase speed
            if adjusted > self.MAX_SPEED: # if increased result greater than limit
                setting[action] = self.MAX_SPEED # equal to limit
            else:
                setting[action] = adjusted # equal to adjusted value
                
        # decrease first zone temp
        elif action == self.NUM_ZONE + 1:
           # TO COMPLETE!!!
           adjusted = setting[0] - self.ADJUSTMENT_STEP
           if adjusted < self.MIN_TEMP:
             setting[0] = self.MIN_TEMP
           else:
             setting[0] = adjusted

        
        # decrease zone temp
        elif self.NUM_ZONE+1 < action and action < 19:
            # TO COMPLETE!!!
            adjusted = setting[action - 10] - self.ADJUSTMENT_STEP
            if adjusted < self.MIN_TEMP:
              setting[action - 10] = self.MIN_TEMP
            elif adjusted > self.MAX_TEMP:
              setting[action - 10] = self.MAX_TEMP
            else:
              setting[action - 10] = adjusted
            
        # decrease speed:
        else:
            # TO COMPLETE!!!
            adjusted = setting[9] - self.ADJUSTMENT_STEP
            if adjusted < self.MIN_SPEED:
              setting[9] = self.MIN_SPEED
            else:  
              setting[9] = adjusted


        # get observation based on new setting
        obs = self.get_obs(setting, self.env)

        # calculate reward
        # TO COMPLETE!!!
        prev_tval =  setting[-1]# get previous temp value
        curr_tval =  obs[-1] #  get current temp value
        prev_offset = prev_tval - self.TARGET # get absolute difference from prev temp to target value
        curr_offset =  curr_tval - self.TARGET# get absolute difference from curr temp to target value
        
        # closer to target?
        # if curr offset closer, positive reward & vice versa
        reward += (prev_offset-curr_offset)/self.TARGET

        # current temperature within end episode range
        if self.TARGET-5 <= curr_tval and curr_tval <= self.TARGET+5:
            reward += 1 # add episode reward
            reward += self.evaluate_episode(setting)
            last = True # indicate episode end

        self.episode_reward += reward # increase episode reward
        self.cumulative_reward += reward # increase cumulative reward

        return np.array(obs), reward, last

    def evaluate_episode(self, new_setting):
        s = np.array(new_setting)
        cost = 0
        cost += np.sum(s[:self.NUM_ZONE]*self.TEMP_CONSUMPTION)
        cost += s[-1]*self.SPEED_CONSUMPTION
        # delta = (self.last_cost - cost)
        self.cost_list.append(cost)
        normalized_score = -cost/self.MAX_COST
        return normalized_score


    def set_obs(self, obs):
        self.obs = obs


    def close(self):
        '''
        close environment
        '''
        self.env = None # reset env to None


    def render(self, mode='human'):
        '''
        render environment info
        '''
        if self.steps%1000 == 0:
            print(f'step: {self.steps}')
            print(f'cumulative reward: {self.cumulative_reward}')
            print(f'avg reward per step: {self.cumulative_reward/self.steps}')

### Training

In [14]:
from stable_baselines3 import A2C # RL algorithms # TO COMPLETE!!!
from stable_baselines3.common.env_util import make_vec_env

newEnv = HeatingFurnace()
model = A2C('MlpPolicy', newEnv, verbose=1)
model.learn(total_timesteps=10000)


https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
episode summary: 
num episode: 1
num steps: 8
episode reward: -6.015661237834557
per step reward: -0.7519576547293196
setting: [531.0, 537.0, 525.0, 571.0, 587.0, 639.0, 645.0, 679.0, 700.0, 150.0, 686.5966666666669]
setting cost: 54.89

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8        |
|    ep_rew_mean        | -6.02    |
| time/                 |          |
|    fps                | 80       |
|    iterations         | 100      |
|    time_elapsed       | 6        |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -2       |
|    explained_variance | 0        |
|    learning_rate      | 0.0007   |
|    n_updates          | 99       |
|    policy_loss        | -2.18    |
|    value_loss         | 1.86     |
------------------------------------
-------------------------------------
|

<stable_baselines3.a2c.a2c.A2C at 0x7fcd778071d0>

###### https://stable-baselines.readthedocs.io/en/master/guide/install.html

#### Vectorize customized environment and learn

In [15]:
# TO COMPLETE!!!

In [16]:
vecEnv = make_vec_env(HeatingFurnace, n_envs=4)
model = A2C('MlpPolicy', vecEnv, verbose=1)
model.learn(total_timesteps=25000)

Using cpu device


https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations


episode summary: 
num episode: 1
num steps: 59
episode reward: -38.5031446624088
per step reward: -0.652595672244217
setting: [542.0, 502.0, 562.0, 555.0, 620.0, 625.0, 615.0, 688.0, 690.0, 200.0, 684.6410833333339]
setting cost: 54.99

episode summary: 
num episode: 1
num steps: 72
episode reward: -53.4884419403893
per step reward: -0.7428950269498514
setting: [583.0, 563.0, 615.0, 620.0, 640.0, 630.0, 666.0, 689.0, 689.0, 150.0, 687.5111666666668]
setting cost: 57.7

episode summary: 
num episode: 2
num steps: 17
episode reward: -12.042835697992697
per step reward: -0.7084020998819234
setting: [510.0, 542.0, 542.0, 552.0, 563.0, 626.0, 626.0, 686.0, 690.0, 171.0, 684.4128333333334]
setting cost: 54.224999999999994

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 49.3      |
|    ep_rew_mean        | -34.7     |
| time/                 |           |
|    fps                | 78        |
|    iterations         | 100       |
|    ti

<stable_baselines3.a2c.a2c.A2C at 0x7fcd02373190>

In [20]:
e = HeatingFurnace()
obs = e.reset()
for i in range(10000):
    action, _state = model.predict(obs)
    obs, reward, done, a = e.step(action)
    e.render()
    print(obs[-1])
    if done:
      obs = e.reset()
      break


https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations


696.9805000000005
696.9805000000005
696.9805000000005
696.9805000000005
696.9805000000005
696.9805000000005
696.9805000000005
696.4316666666672
696.4316666666672
696.4316666666672
697.0483333333336
697.0483333333336
690.6145000000002
690.6145000000002
690.6145000000002
690.6145000000002
690.6145000000002
687.562916666667
episode summary: 
num episode: 1
num steps: 18
episode reward: -14.185007922749405
per step reward: -0.7880559957083002
setting: [500.0, 558.0, 584.0, 584.0, 604.0, 646.0, 670.0, 697.0, 677.0, 150.0, 687.562916666667]
setting cost: 55.95

