In [2]:
import gymnasium as gym
from gymnasium import spaces
import json
import copy
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.backends.backend_agg import FigureCanvasAgg

from stable_baselines3 import A2C, PPO, DQN
from stable_baselines3.common.env_checker import check_env

In [3]:
def type_incoding(type):
    """Translate a single upper-case task-type letter into its integer code.

    'A' maps to 0, 'B' to 1, ... and 'Z' to 25.

    :param type: one-character string 'A'..'Z'.
    :return: the zero-based integer code of the letter.
    :raises KeyError: if ``type`` is not one of the 26 upper-case letters.
    """
    # Build the same A..Z -> 0..25 table from the alphabet offset.
    letter_to_code = {chr(ord('A') + offset): offset for offset in range(26)}
    return letter_to_code[type]

In [4]:
class Resource():
    """A machine/worker that can process tasks of certain types.

    Attributes set by ``__init__``:
      * ``task_schedule`` - list that collects the tasks booked on this resource.
      * ``name``          - value of ``resouces_dictionary['name']``.
      * ``ability``       - list of integer type codes this resource can process.
      * ``reward``        - running reward counter, starts at 0.
    """

    def __init__(self, resouces_dictionary):
        """Build a resource from a dict with the keys ``'name'`` and ``'ability'``.

        ``'ability'`` is an iterable of type letters ("A", "B", ...).
        """
        self.task_schedule = []
        self.name = resouces_dictionary['name']
        self.ability = self.ability_incoding(resouces_dictionary['ability'])
        self.reward = 0

    def __str__(self):
        """Return the resource name as its string form."""
        return f"{self.name}"

    def ability_incoding(self, ability):
        """Convert an iterable of type letters into a list of integer codes."""
        return [type_incoding(letter) for letter in ability]

    def can_process_task(self, task_type):
        """Return True when ``task_type`` (an integer code) is among the abilities."""
        return task_type in self.ability

In [5]:
class Order():
    """A customer order: a name, a display color and an ordered queue of tasks."""

    def __init__(self, order_dictionary):
        """Build an order from a dict with the keys ``'name'``, ``'color'`` and ``'tasks'``.

        Every entry of ``order_dictionary['tasks']`` is turned into a ``Task``;
        the reward counter starts at 0.
        """
        self.name = order_dictionary['name']
        self.color = order_dictionary['color']
        self.task_queue = [Task(task_spec) for task_spec in order_dictionary['tasks']]
        self.reward = 0

In [6]:
class Task():
    """One processing step of an order.

    ``type`` is stored as the integer code returned by ``type_incoding``.
    ``resource`` and ``order`` start at -1 and ``color`` at the empty string.
    """

    # Attribute names exported by to_dict(), in output order.
    _DICT_FIELDS = (
        'sequence', 'index', 'type', 'predecessor', 'earliest_start',
        'duration', 'start', 'finish', 'resource', 'color', 'order',
    )

    def __init__(self, task_dictionary):
        """Copy the task fields out of ``task_dictionary`` and set the defaults."""
        self.sequence = task_dictionary['sequence']
        self.index = task_dictionary['index']
        self.type = type_incoding(task_dictionary['type'])
        self.predecessor = task_dictionary['predecessor']
        self.earliest_start = task_dictionary['earliest_start']
        self.duration = task_dictionary['duration']
        self.start = task_dictionary['start']
        self.finish = task_dictionary['finish']
        # Placeholders for values that are not part of the input dictionary.
        self.resource = -1
        self.color = ""
        self.order = -1

    def to_dict(self):
        """Return the task's attributes as a plain dictionary."""
        return {field: getattr(self, field) for field in self._DICT_FIELDS}

    def __str__(self):
        """Short human-readable summary: order, step index and (start, finish)."""
        return f"order : {self.order}, step : {self.index} | ({self.start}, {self.finish})"

In [18]:
class SchedulingEnv(gym.Env):
    def load_resources(self, file_path):
        """Read the resource definitions from a JSON file.

        The file must contain a top-level ``"resources"`` list whose entries
        have a ``"name"`` and a ``"type"`` string of comma-separated ability
        letters (for example ``"A, B, C"``).

        :param file_path: path of the JSON file to read.
        :return: list of dicts with the keys ``'name'`` and ``'ability'``
            (a list of ability letters).
        :raises KeyError: if ``"resources"``, ``"name"`` or ``"type"`` is missing.
        """
        with open(file_path, 'r') as file:
            data = json.load(file)

        resources = []
        for resource_data in data["resources"]:
            # Split on the bare comma and strip every token so that "A,B",
            # "A, B" and "A ,  B" all give ['A', 'B']. Empty tokens (e.g. from
            # a trailing comma) are dropped instead of becoming a bogus ability.
            abilities = [
                token.strip()
                for token in resource_data["type"].split(',')
                if token.strip()
            ]
            resources.append({'name': resource_data["name"], 'ability': abilities})

        return resources
    
    def load_orders_new_version(self, file):
        """Read the order definitions from a JSON file.

        The file must contain a top-level ``"orders"`` list. Each order needs
        ``"name"``, ``"color"``, ``"earliest_start"`` and a ``"tasks"`` list whose
        entries provide ``"index"``, ``"type"``, ``"predecessor"`` and ``"duration"``.

        :param file: path of the JSON file to read.
        :return: list of order dicts (``'name'``, ``'color'``, ``'tasks'``). Every
            task dict carries ``'sequence'``, ``'index'``, ``'type'``,
            ``'predecessor'``, ``'earliest_start'``, ``'duration'``, ``'start'``
            and ``'finish'``; ``'sequence'``, ``'start'`` and ``'finish'`` are None.
        :raises KeyError: if one of the required keys is missing.
        """
        # The context manager closes the file even when json.load raises.
        with open(file) as f:
            data = json.load(f)

        orders = []
        for order in data['orders']:
            order_earliest_start = order['earliest_start']

            tasks = []
            for task in order['tasks']:
                predecessor = task['predecessor']
                tasks.append({
                    # Scheduling sequence number; unknown until the task is scheduled.
                    'sequence': None,
                    'index': task['index'],
                    'type': task['type'],
                    'predecessor': predecessor,
                    # Only a task without predecessor inherits the order's earliest
                    # start; for the others it is left as None here.
                    'earliest_start': order_earliest_start if predecessor is None else None,
                    'duration': task['duration'],
                    'start': None,
                    'finish': None,
                })

            orders.append({
                'name': order['name'],
                'color': order['color'],
                'tasks': tasks,
            })

        return orders

    def __init__(self, resources = "../resources/resources-8.json", orders = "../orders/orders-10.json", render_mode="seaborn"):
        """Load resources and orders from JSON files and define the gym spaces.

        :param resources: path of the resources JSON file (see ``load_resources``).
        :param orders: path of the orders JSON file (see ``load_orders_new_version``).
        :param render_mode: accepted for interface compatibility; not used by
            this constructor.
        """
        super().__init__()

        resource_specs = self.load_resources(resources)
        order_specs = self.load_orders_new_version(orders)
        self.resources = [Resource(spec) for spec in resource_specs]
        self.orders = [Order(spec) for spec in order_specs]
        n_resources = len(self.resources)
        n_orders = len(self.orders)

        # Pristine copies; reset() deep-copies orders and resources from these.
        self.original_orders = copy.deepcopy(self.orders)
        self.original_resources = copy.deepcopy(self.resources)
        self.original_tasks = copy.deepcopy([order.task_queue for order in self.orders])
        self.num_tasks = sum(len(order.task_queue) for order in self.orders)

        # One entry per order: index of its next task to schedule, -1 when none.
        self.schedule_buffer = [-1] * n_orders
        self.state = None
        self.legal_actions = None

        # An action is a (resource index, order index) pair.
        self.action_space = spaces.MultiDiscrete([n_resources, n_orders])
        self.observation_space = spaces.Dict({
            "action_mask": spaces.MultiBinary([n_resources, n_orders]),
            "real_observation": spaces.Box(low=-10, high=5000, shape=(n_orders, 4), dtype=np.float64)
        })

        # Bookkeeping counters, re-initialised by reset().
        self.current_schedule = []
        self.num_scheduled_tasks = 0
        self.num_steps = 0
        self.invalid_count = 0
        self.last_finish_time = 0

    def reset(self, seed=None, options=None):
        """
        Important: the observation must be a numpy array
        :return: (np.array)
        """
        super().reset(seed=seed, options=options)

        # 환경과 관련된 변수들     
        self.orders = copy.deepcopy(self.original_orders)
        self.resources = copy.deepcopy(self.original_resources)\
        
        # 내부 동작을 위한 변수들
        # self.state에 관한 추가설명 / Order 하나 당 가지는 정보는 아래와 같다
        # 1. 남은 task 수
        # 2. 다음으로 수행할 Task의 Type
        # 3. 다음으로 수행할 Task의 earliest_start
        # 4. 다음으로 수행할 Task의 duration
        self.state = np.zeros((len(self.orders), 4), dtype=np.int32)
        self.legal_actions = np.ones((len(self.resources), len(self.orders)), dtype=bool)       
        self._update_schedule_buffer()
        self._update_state()

        # 기록을 위한 변수들
        self.current_schedule = []
        self.num_scheduled_tasks = 0
        self.num_steps = 0
        self.invalid_count = 0
        self.last_finish_time = 0

        info = {
            'finish_time' : self.last_finish_time,
            'invalid_count' : self.invalid_count,
            'resources_reward' : [resource.reward for resource in self.resources],
            'orders_reward' : [order.reward for order in self.orders],
            'schedule_buffer' : self.schedule_buffer,
            'current_schedule' : self.current_schedule
               }

        return self._get_observation(), info  # empty info dict

    def step(self, action):
        def is_error_action(act):
            return act[0] < 0 or act[1] < 0 or act[0] >= len(self.resources) or act[1] >= len(self.orders)

        if is_error_action(action):
            raise ValueError(
                f"Received invalid action={action} which is not part of the action space"
            )

        # error_action이 아니라면 step의 수를 증가시킨다
        self.num_steps += 1
        reward = -1
        # 현재 아래 업데이트의 문제점 : Resource와 Task의 타입이 맞지 않아 False 처리를 한 이후 다시 True로 바뀔 수 있어야하는데 구현 하지 못했음
        self._update_legal_actions()
        if self.legal_actions[action[0]][action[1]]:
            self._schedule_task(action)
            self._update_schedule_buffer(action[1])
            self._update_state()
            self.last_finish_time = self._get_final_task_finish()
            self._calculate_step_reward(action)
            reward = self._calculate_total_reward()
        else:
            self.invalid_count += 1
            
        # 고로 다시 아래처럼 초기화함
        self.legal_actions = np.ones((len(self.resources), len(self.orders)), dtype=bool)

        # 모든 Order의 Task가 종료된 경우 Terminated를 True로 설정한다
        # 또한 legal_actions가 전부 False인 경우도 Terminated를 True로 설정한다
        terminated = all([order.task_queue[-1].finish is not None for order in self.orders]) or not np.any(self.legal_actions)
        
        if terminated:
            reward += 10000/self._get_final_task_finish()

        # 무한 루프를 방지하기 위한 조건
        truncated = bool(self.num_steps == 10000)

        # Optionally we can pass additional info, we are not using that for now
        info = {
            'finish_time' : self.last_finish_time,
            'invalid_count' : self.invalid_count,
            'resources_reward' : [resource.reward for resource in self.resources],
            'orders_reward' : [order.reward for order in self.orders],
            'schedule_buffer' : self.schedule_buffer,
            'current_schedule' : self.current_schedule
               }

        return (
            self._get_observation(),
            reward,
            terminated,
            truncated,
            info,
        )
    
    def get_action_mask(self):
        return self.legal_actions

    def _update_legal_actions(self):
        for order_index in range(len(self.orders)):
        # 1. 선택된 Order의 모든 Task가 이미 종료된 경우
            if self.schedule_buffer[order_index] < 0:
                self.legal_actions[:, order_index] = False
        
        for resource_index in range(len(self.resources)):
            # 2. 선택된 Resource가 선택된 Order의 Task의 Type을 처리할 수 없는 경우
            resource = self.resources[resource_index]
            for order_index in range(len(self.orders)):
                order = self.orders[order_index]
                task = order.task_queue[self.schedule_buffer[order_index]]
                if not resource.can_process_task(task.type):
                    self.legal_actions[resource_index, order_index] = False

    def _update_state(self):
        # state는 order의 수 * 4의 행렬이다
        # 각 열에는 해당 Order의 Task에 대한 정보가 담겨있다
        # 남은 task 수
        # 다음으로 수행할 Task의 Duration
        # 다음으로 수행할 Task의 Earlist_start
        # 다음으로 수행할 Task의 Type
        for i, order in enumerate(self.orders):
            task_index = self.schedule_buffer[i]
            if task_index < 0:
                self.state[i] = np.zeros(4, dtype=np.int32)
            else:
                task = order.task_queue[task_index]
                self.state[i] = [len(order.task_queue) - task_index, task.duration, task.earliest_start, task.type]

    def _update_schedule_buffer(self, target_order = None):
        # target_order은 매번 모든 Order를 보는 계산량을 줄이기 위해 설정할 변수
        # None은 최초의 호출에서, 또는 Reset이 이뤄질 경우를 위해 존재
        if target_order == None:
            buffer_index = 0
            
            for order in self.orders:
                # Assume order['steps'] is a list of tasks for the current order
                
                selected_task_index = -1
                
                for i in range(len(order.task_queue)):
                    # 아직 스케줄링을 시작하지 않은 Task를 찾는다
                    if order.task_queue[i].finish is None:
                        selected_task_index = i
                        break
                # 스케줄링 하지 않은 Task를 발견했다면        
                if selected_task_index >= 0:
                    selected_task = order.task_queue[selected_task_index]
        
                    # 만약 초기 시작 제한이 없다면 
                    # 초기 시작 제한을 이전 Task의 Finish Time으로 걸어주고 버퍼에 등록한다.
                    if selected_task.earliest_start is None:
                        if selected_task_index > 0:
                            selected_task.earliest_start = order.task_queue[selected_task_index-1].finish
                
                self.schedule_buffer[buffer_index] = selected_task_index
                buffer_index += 1
                
        # Action으로 인해 봐야할 버퍼의 인덱스가 정해짐
        else:
            selected_task_index = -1
            order = self.orders[target_order]
            for i in range(len(order.task_queue)):
                # 아직 스케줄링을 시작하지 않은 Task를 찾는다
                if order.task_queue[i].finish is None:
                    selected_task_index = i
                    break
            if selected_task_index >= 0:
                    selected_task = order.task_queue[selected_task_index]
                    if selected_task.earliest_start is None:
                        if selected_task_index > 0:
                            selected_task.earliest_start = order.task_queue[selected_task_index-1].finish
            
            self.schedule_buffer[target_order] = selected_task_index
        
    def _schedule_task(self, action):
        # Implement the scheduling logic based on the action
        # You need to update the start and finish times of the tasks
        # based on the selected task index (action) and the current state.

        # Example: updating start and finish times
        selected_resource = self.resources[action[0]]
        selected_order = self.orders[action[1]]
        selected_task = selected_order.task_queue[self.schedule_buffer[action[1]]]
        task_earliest_start = selected_task.earliest_start
        task_index = selected_task.index
        task_duration = selected_task.duration
        resource_tasks = sorted(selected_resource.task_schedule, key=lambda task: task.start)

        open_windows = []
        start_window = 0
        last_alloc = 0

        for scheduled_task in resource_tasks:
            resource_init = scheduled_task.start

            if resource_init > start_window:
                open_windows.append([start_window, resource_init])
            start_window = scheduled_task.finish

            last_alloc = max(last_alloc, start_window)

        # Fit the task within the first possible window
        window_found = False
        if task_earliest_start is None:
            task_earliest_start = 0

        for window in open_windows:
            # Task could start before the open window closes
            if task_earliest_start <= window[1]:
                # Now let's see if it fits there
                potential_start = max(task_earliest_start, window[0])
                if potential_start + task_duration <= window[1]:
                    # Task fits into the window
                    min_earliest_start = potential_start
                    window_found = True
                    break

        # If no window was found, schedule it after the end of the last task on the resource
        if not window_found:
            if task_earliest_start > 0:
                min_earliest_start = max(task_earliest_start, last_alloc)
            else:
                min_earliest_start = last_alloc

        # schedule it
        selected_task.sequence = self.num_scheduled_tasks + 1
        selected_task.start = min_earliest_start
        selected_task.finish = min_earliest_start + task_duration
        selected_task.resource = action[0]

        # 사실 여기서 color랑 order를 주는건 적절치 않은 코드임!!!!
        selected_task.color = self.orders[action[1]].color
        selected_task.order = action[1]

        self.current_schedule.append(selected_task)
        selected_resource.task_schedule.append(selected_task)
        self.num_scheduled_tasks += 1
        return

    def _get_final_task_finish(self):
        return max(self.current_schedule, key=lambda x: x.finish).finish
        
    def _calculate_total_reward(self):
        scale_factor = 0
        for task in self.current_schedule:
            scale_factor += task.duration
        # reward = reward / self._get_final_task_finish()
        return sum([order.reward for order in self.orders])/scale_factor #+ sum([resource.reward for resource in self.resources])) / scale_factor
    
    def _calculate_step_reward(self, action):
        # Hall 리워드 초기화
        hall_resource = 0
        hall_order = 0
        
        # 선택된 리소스와 주문
        selected_resource = self.resources[action[0]]
        selected_order = self.orders[action[1]]
    
        # 선택된 리소스의 스케줄링된 Task들
        scheduled_tasks = sorted(selected_resource.task_schedule, key=lambda task: task.start)
        sum_duration = 0
        for task in scheduled_tasks:
            sum_duration += task.duration
        
        if len(scheduled_tasks) >= 2:
            # 리소스의 스케줄링된 Task 사이의 간격을 계산하여 Hall 리워드에 더합니다.
            for i in range(1, len(scheduled_tasks)):
                gap = scheduled_tasks[i].start - scheduled_tasks[i - 1].finish
                hall_resource += gap

        # 선택된 주문의 수행된 Task들
        performed_tasks = [task for task in selected_order.task_queue if task.finish is not None]
        sum_performed_duration = 0
        for task in performed_tasks:
            sum_performed_duration += task.duration
        if len(performed_tasks) >= 2:
            # 주문의 수행된 Task 사이의 간격을 계산하여 Hall 리워드에 더합니다.
            for i in range(1, len(performed_tasks)):
                gap = performed_tasks[i].start - performed_tasks[i - 1].finish
                hall_order += gap
        
        selected_resource.reward += (sum_duration - hall_resource)
        selected_order.reward += (sum_performed_duration - hall_order)

    def _get_observation(self):
        observation = {
            'action_mask': self.legal_actions,
            'real_observation': self.state
            }

        return observation

In [19]:
# Build the environment from its default resource/order files and run the
# Stable-Baselines3 environment checker on it.
env = SchedulingEnv()
check_env(env)



In [20]:
# Smoke test: play one episode with uniformly random actions and print the
# final reward together with the bookkeeping values from `info`.
env = SchedulingEnv()

# `step` counts the actions taken; later cells keep incrementing this global.
step = 0
obs, _ = env.reset()

while True:
    step += 1
    action = env.action_space.sample()
    obs, reward, terminated, truncated, info = env.step(action)
    done = terminated or truncated
    if done:
        print("Goal reached!", "reward=", reward)
        print(info['finish_time'])
        print(info['invalid_count'])
        print(info['resources_reward'])
        print(info['orders_reward'])
        break

Goal reached! reward= 7.974372735811082
1450
57
[1210, 720, 890, 990, 3120, 430, 20, 2820]
[520, 1100, 1530, 900, -840, 60, -390, 390, 470, 1800]


In [21]:
# Baseline: average final finish time of a uniformly random policy over
# 1000 episodes. Relies on `env` and `step` from the cells above.
sum_finish_time = 0
for _ in range(1000):
    obs, _ = env.reset()

    while True:
        step += 1
        action = env.action_space.sample()
        obs, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated
        # End the episode on truncation as well as on termination; otherwise a
        # truncated episode would keep being stepped.
        if done:
            sum_finish_time += info['finish_time']
            break

print(sum_finish_time/1000)

1609.32


In [31]:
# Train an A2C agent on the environment for 1,000,000 timesteps.
# 'MultiInputPolicy' is used because the observation space is a Dict.
model = A2C('MultiInputPolicy', env, verbose=1).learn(1000000)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 69       |
|    ep_rew_mean        | 12.6     |
| time/                 |          |
|    fps                | 1165     |
|    iterations         | 100      |
|    time_elapsed       | 0        |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -4.3     |
|    explained_variance | 0.575    |
|    learning_rate      | 0.0007   |
|    n_updates          | 99       |
|    policy_loss        | 5.35     |
|    value_loss         | 1.96     |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 68.9     |
|    ep_rew_mean        | 7.62     |
| time/                 |          |
|    fps                | 1219     |
|    iterations         | 200      |
|    time_elapsed 

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 66.8     |
|    ep_rew_mean        | 46.6     |
| time/                 |          |
|    fps                | 284      |
|    iterations         | 6800     |
|    time_elapsed       | 119      |
|    total_timesteps    | 34000    |
| train/                |          |
|    entropy_loss       | -4.04    |
|    explained_variance | 0.959    |
|    learning_rate      | 0.0007   |
|    n_updates          | 6799     |
|    policy_loss        | 18.6     |
|    value_loss         | 33.1     |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 66.4     |
|    ep_rew_mean        | 47.1     |
| time/                 |          |
|    fps                | 284      |
|    iterations         | 6900     |
|    time_elapsed       | 121      |
|    total_timesteps    | 34500    |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 60.4     |
|    ep_rew_mean        | 49.3     |
| time/                 |          |
|    fps                | 290      |
|    iterations         | 8200     |
|    time_elapsed       | 141      |
|    total_timesteps    | 41000    |
| train/                |          |
|    entropy_loss       | -3.54    |
|    explained_variance | -0.539   |
|    learning_rate      | 0.0007   |
|    n_updates          | 8199     |
|    policy_loss        | 4.66     |
|    value_loss         | 4.37     |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 61       |
|    ep_rew_mean        | 48.3     |
| time/                 |          |
|    fps                | 290      |
|    iterations         | 8300     |
|    time_elapsed       | 142      |
|    total_timesteps    | 41500    |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 58.3     |
|    ep_rew_mean        | 55.7     |
| time/                 |          |
|    fps                | 295      |
|    iterations         | 9600     |
|    time_elapsed       | 162      |
|    total_timesteps    | 48000    |
| train/                |          |
|    entropy_loss       | -4.04    |
|    explained_variance | -16.7    |
|    learning_rate      | 0.0007   |
|    n_updates          | 9599     |
|    policy_loss        | 1.91     |
|    value_loss         | 9.85     |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 58.1     |
|    ep_rew_mean        | 56.6     |
| time/                 |          |
|    fps                | 295      |
|    iterations         | 9700     |
|    time_elapsed       | 164      |
|    total_timesteps    | 48500    |
| train/                |          |
|

In [32]:
# One more random-policy episode: the actions are sampled from the action
# space, the trained `model` is not used in this cell.
step = 0
obs, _ = env.reset()

while True:
    step += 1
    action = env.action_space.sample()
    obs, reward, terminated, truncated, info = env.step(action)
    done = terminated or truncated
    if done:
        # The finish time is printed twice: once labelled, once as a bare value.
        print(f"Final task finish time : {info['finish_time']}")
        print(info['finish_time'])
        print(info['invalid_count'])
        print(info['resources_reward'])
        print(info['orders_reward'])
        break

Final task finish time : 2120
2120
22
[260, 8730, -700, 40, 3780, -770, 340, 370]
[480, 140, 60, 540, -340, 20, 210, 370, -2160, 1800]


In [33]:
# Average final finish time of the random policy over 1000 episodes
# (an episode ends on termination or truncation).
sum_finish = 0

for _ in range(1000):
    obs, info = env.reset()
    done = False

    while True:
        step += 1
        action = env.action_space.sample()
        obs, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated
        if done:
            sum_finish += info['finish_time']
            break
            
print(sum_finish / 1000)

1624.57


In [34]:
# Play one episode with the trained model. deterministic=False means the
# action is sampled from the policy rather than taken greedily.
obs, info = env.reset()
done = False

while True:
    action, _ = model.predict(obs, deterministic = False)
    obs, reward, terminated, truncated, info = env.step(action)
    done = terminated or truncated
    if done:
        print(f"Final task finish time : {info['finish_time']}")
        break

Final task finish time : 980


In [35]:
# Average final finish time of the trained model over 1000 episodes, for
# comparison with the random-policy average computed above.
sum_finish = 0

for _ in range(1000):
    obs, info = env.reset()
    done = False

    while True:
        action, _ = model.predict(obs, deterministic = False)
        obs, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated
        if done:
            sum_finish += info['finish_time']
            break
            
print(sum_finish / 1000)

996.92


In [18]:
from sb3_contrib import MaskablePPO
from sb3_contrib.common.envs import InvalidActionEnvDiscrete
from sb3_contrib.common.maskable.evaluation import evaluate_policy
from sb3_contrib.common.maskable.utils import get_action_masks
from sb3_contrib.common.wrappers import ActionMasker

In [19]:
def mask_fn(env: gym.Env):
    """Return the action mask of ``env`` by delegating to ``env.get_action_mask()``.

    NOTE(review): the environment's mask is a 2-D (resources x orders) matrix.
    sb3-contrib's MaskablePPO presumably expects, for a MultiDiscrete action
    space, a flat mask that concatenates one mask per action dimension - this
    may be the cause of the shape RuntimeError recorded in this notebook; confirm.
    """
    action_mask = env.get_action_mask()
    return action_mask

In [20]:
# Wrap the environment so that the action mask is obtained through mask_fn.
# Note: this rebinds `env`, so running the cell again wraps the wrapper.
env = ActionMasker(env, mask_fn)

In [21]:
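# Disabled: training MaskablePPO on this environment fails with the RuntimeError recorded in this cell's output.
#model = MaskablePPO('MultiInputPolicy', env, verbose=1).learn(50000)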

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


RuntimeError: shape '[1, 6]' is invalid for input of size 12

In [None]:
# Play one episode with action masking and report the accumulated reward.
# NOTE(review): `action_masks=` is an argument of MaskablePPO.predict, so this
# cell presumably needs `model` to be a MaskablePPO instance - confirm.
obs, info = env.reset()
done = False
episode_reward = 0

while True:
    action_masks = get_action_masks(env)
    action, _states = model.predict(obs, action_masks=action_masks)

    obs, reward, terminated, truncated, info = env.step(action)
    done = terminated or truncated
    episode_reward += reward
    if done:
        print(f"Episode reward: {episode_reward}")
        print(f"Final task finish time : {info['finish_time']}")
        # Leave the loop once the episode is over; without this the cell never ends.
        break
