# Select dispatching rule using Q-learning

# Prepare programming environment

## Load necessary packages

In [1]:
# Import packages
from mushroom_rl.core import Environment
from minisom import MiniSom
from mushroom_rl.algorithms.value import QLearning
from mushroom_rl.core import MDPInfo, Core
from mushroom_rl.policy import EpsGreedy
from mushroom_rl.utils.parameters import Parameter
from mushroom_rl.utils.spaces import Discrete
import matplotlib.gridspec as gridspec
import matplotlib.pyplot as plt
from time import mktime
import pickle
from tqdm.notebook import tqdm
import pandas as pd
from datetime import datetime
import sys
sys.path.append("..")
from src.simulation import assign_priority_edd, assign_priority_mdd, assign_priority_spt, assign_priority_srpt, assign_priority_lpt, assign_priority_cr, assign_priority_ds, assign_priority_fifo, select_machine_winq
from src.models import PlantEnv, hyperparameter_tuning_mushroom, run_agent, train_som



## Get required data

In [2]:
# Load the data set used to train the SOM.
# Only the unpickling needs the open file handle; the frame is reshaped afterwards.
with open(r"../data/processed/data_processed_classification.pickle", "rb") as pickle_file:
    data = pickle.load(pickle_file)

# Replace the "priority_rule_start" column by its one-hot encoded indicator columns.
rule_dummies = pd.get_dummies(data["priority_rule_start"])
data = pd.merge(data, rule_dummies, left_index=True, right_index=True)
data = data.drop(columns="priority_rule_start")

In [3]:
# Load the inputs required for the simulation.
# Rows referring to product type id 2 are removed from both CSV tables.
machines_df = pd.read_csv("../data/external/machine.csv")
machines_df = machines_df.loc[machines_df["product_type_id"] != 2]

product_types_df = pd.read_csv("../data/external/product_types.csv")
product_types_df = product_types_df.loc[product_types_df["id"] != 2]

# The orders are stored as a pickled object rather than as CSV.
with open(r"../data/interim/sim_data.pickle", "rb") as pickle_file:
    orders_df = pickle.load(pickle_file)

## Define constants

In [4]:
# Constants shared by the simulation, tuning, training and evaluation cells
RANDOM_STATE = 42

# Dispatching rules handed to the tuning function and the environment
priority_rules = [
    assign_priority_edd,
    assign_priority_spt,
    assign_priority_srpt,
    assign_priority_fifo,
    assign_priority_cr,
    assign_priority_mdd,
    assign_priority_lpt,
    assign_priority_ds,
]

# mktime() interprets the struct_time as local time and returns seconds;
# the factor 1000 turns this into milliseconds.
SIMULATION_START = 1000 * mktime(datetime(year=2022, month=11, day=14, hour=5).timetuple())

# Scenario grids iterated by the (commented-out) training and evaluation loops
# NOTE(review): the unit of the due date ranges is not visible here - confirm.
due_date_range_list = [(3, 10), (5, 14), (7, 21)]
number_orders_start_list = [80, 90, 100, 110]
average_count_new_orders_list = [80, 90, 100, 110]
worker_list = [40, 50, 60, 70]

# Base seeds assigned to env.random_state for training and for evaluation
random_states = [7, 42, 66, 97, 108]
random_states_evaluation = [100, 101, 102]

# Hyperparameter tuning

In [5]:
# Parameter grid for the hyperparameter search: candidate values per parameter name
params_q = dict(
    epsilon_param=[0.25, 0.5, 0.9],
    learning_rate=[0.2, 0.6, 1.0],
    number_of_states=[16, 32, 64],
    sigma=[0.5, 1.0, 1.5, 2.0],
    learning_rate_som=[0.1, 0.5, 0.9],
)

In [6]:
# Hyperparameter tuning
# tuning_res =  hyperparameter_tuning_mushroom(params_q, 20, data, product_types_df, machines_df, orders_df, SIMULATION_START, priority_rules, due_date_range_list, number_orders_start_list, average_count_new_orders_list, worker_list, random_states, RANDOM_STATE)

In [7]:
# Save results of hyperparameter search (same location the results are read from below)
# tuning_res.to_csv("../data/processed/hyperparameter_search/results_hyperparameter_select_q_learning.csv", index = False)

In [8]:
# Read the stored results of the hyperparameter search
tuning_res_path = "../data/processed/hyperparameter_search/results_hyperparameter_select_q_learning.csv"
tuning_res = pd.read_csv(tuning_res_path)

In [9]:
# List the combinations ordered by mean evaluation reward, best first
tuning_res.sort_values(by="reward_evaluation_mean", ascending=False)

Unnamed: 0,epsilon_param,learning_rate,number_of_states,sigma,learning_rate_som,reward_evaluation_mean,reward_evaluation_var,reward_df
19,0.5,0.2,64,1.0,0.5,7595.498331,23933370.0,state_previous action reward stat...
4,0.25,0.6,16,1.0,0.9,7495.951348,25263820.0,state_previous action reward stat...
7,0.25,0.6,32,0.5,0.5,7479.991394,24087550.0,state_previous action reward stat...
17,0.25,0.2,64,2.0,0.1,7444.201133,24092980.0,state_previous action reward state...
1,0.25,0.6,64,1.0,0.5,7421.791986,24113510.0,state_previous action reward state...
15,0.25,0.6,64,0.5,0.1,7403.157619,24098540.0,state_previous action reward stat...
18,0.5,0.6,64,1.0,0.5,7384.570955,24471510.0,state_previous action reward state...
12,0.25,0.6,64,1.5,0.5,7367.769524,23839830.0,state_previous action reward stat...
2,0.5,1.0,64,0.5,0.1,7328.841823,24960980.0,state_previous action reward stat...
9,0.25,1.0,64,1.0,0.1,7312.888423,23003820.0,state_previous action reward stat...


In [10]:
# Mean evaluation reward per epsilon value, sorted ascending (best value last)
(
    tuning_res
    .groupby("epsilon_param")["reward_evaluation_mean"]
    .mean()
    .sort_values()
)

epsilon_param
0.90    7154.784767
0.50    7263.058562
0.25    7396.727736
Name: reward_evaluation_mean, dtype: float64

In [11]:
# Mean evaluation reward per Q-learning learning rate, sorted ascending (best value last)
(
    tuning_res
    .groupby("learning_rate")["reward_evaluation_mean"]
    .mean()
    .sort_values()
)

learning_rate
0.2    7262.866732
0.6    7309.069425
1.0    7320.865123
Name: reward_evaluation_mean, dtype: float64

In [12]:
# Mean evaluation reward per number_of_states setting, sorted ascending (best value last)
(
    tuning_res
    .groupby("number_of_states")["reward_evaluation_mean"]
    .mean()
    .sort_values()
)

number_of_states
32    7187.774045
16    7251.220720
64    7378.489754
Name: reward_evaluation_mean, dtype: float64

In [13]:
# Mean evaluation reward per sigma value, sorted ascending (best value last)
(
    tuning_res
    .groupby("sigma")["reward_evaluation_mean"]
    .mean()
    .sort_values()
)

sigma
1.5    7190.496041
2.0    7250.092947
1.0    7347.688113
0.5    7403.996945
Name: reward_evaluation_mean, dtype: float64

In [14]:
# Mean evaluation reward per SOM learning rate, sorted ascending (best value last)
(
    tuning_res
    .groupby("learning_rate_som")["reward_evaluation_mean"]
    .mean()
    .sort_values()
)

learning_rate_som
0.9    7190.368771
0.1    7281.204106
0.5    7399.551697
Name: reward_evaluation_mean, dtype: float64

# Define and train final model

In [15]:
# Train the final SOM on the prepared data set.
# NOTE(review): 64, 1.0 and 0.5 equal number_of_states, sigma and
# learning_rate_som of the best tuning row shown above, and 42 equals
# RANDOM_STATE - confirm the positional argument order against train_som.
som, normalize_som = train_som(
    64,
    64,
    data,
    1.0,
    0.5,
    42,
    number_iterations=10_000,
)

 [ 10000 / 10000 ] 100% - 0:00:00 left 
 quantization error: 3.2128769427259583


In [16]:
# Define the environment the agent interacts with.
# The seed and the action count are taken from the notebook constants instead
# of repeating the literals 42 and 8, so they cannot drift apart from
# RANDOM_STATE and from the list of dispatching rules.
env = Environment.make(
    "PlantEnv",
    product_types_df=product_types_df,
    machines_df=machines_df,
    orders_df=orders_df,
    simulation_start=SIMULATION_START,
    priority_rules=priority_rules,
    allocation_rule=select_machine_winq,
    random_state=RANDOM_STATE,
    # Same 64 x 64 dimensions that are passed to train_som for the final SOM
    # (presumably one state per SOM node - confirm against PlantEnv).
    number_states=64 * 64,
    # Presumably one action per dispatching rule - confirm against PlantEnv.
    number_actions=len(priority_rules),
    som=som,
    normalize_som=normalize_som,
)

In [17]:
# Define Q-Learning agent with an epsilon-greedy policy
epsilon = Parameter(value=0.25)
policy = EpsGreedy(epsilon=epsilon)
learning_rate = Parameter(value=0.6)
# QLearning takes the MDPInfo description of the environment as its first
# argument; MushroomRL environments expose it as `env.info`. The environment
# object itself is only handed to Core.
agent = QLearning(env.info, policy, learning_rate)
core = Core(agent, env)

In [18]:
# Train Q-learning agent
# Duration ~ 25 hours
# for episode in tqdm(range(100)):
#     for random_state in random_states:
#         env.random_state = random_state
#         for due_date_range in due_date_range_list:
#             for number_orders_start in number_orders_start_list:
#                 for average_count_new_orders in average_count_new_orders_list:
#                     for worker in worker_list:
#                         # Change random state before each simulation to increase variety of data
#                         env.random_state += 1
#                         # Episode using act and observe
#                         env.due_date_range = due_date_range
#                         env.number_orders_start = number_orders_start
#                         env.average_count_new_orders = average_count_new_orders
#                         env.worker = worker
#                         core.learn(n_episodes=1, n_steps_per_fit=1, render=False, quiet=True)

# Evaluate model

In [19]:
# Run the trained model for evaluation
# Duration: not recorded
# reward_list = []
# for episode in tqdm(range(1)):
#     for random_state in random_states_evaluation:
#         env.random_state = random_state
#         for due_date_range in due_date_range_list:
#             for number_orders_start in number_orders_start_list:
#                 for average_count_new_orders in average_count_new_orders_list:
#                     for worker in worker_list:
#                         # Change random state before each simulation to increase variety of data
#                         env.random_state += 1
#                         # Episode using act and observe
#                         env.due_date_range = due_date_range
#                         env.number_orders_start = number_orders_start
#                         env.average_count_new_orders = average_count_new_orders
#                         env.worker = worker
#                         reward_list.extend(core.evaluate(n_episodes=1, render=False, quiet=True))

In [20]:
# Save results as dataframe
# results = pd.DataFrame(reward_list, columns = ["State_before", "Action", "Reward", "State_After", "_", "__"])
# results.to_csv("../data/processed/evaluation/simulation_results_validation_select_q_learning.csv", index = False)

In [21]:
# Load the stored evaluation results
results_path = "../data/processed/evaluation/simulation_results_validation_select_q_learning.csv"
results = pd.read_csv(results_path)

In [22]:
# Average of the "Reward" column over all stored evaluation rows
results.loc[:, "Reward"].mean()

7454.576187690739