In [None]:
import JSSP
import time
import numpy as np
import random
import gym
import itertools
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import matplotlib.style
import pandas as pd
import sys


from collections import defaultdict
#import plotting

# Environment Initialization

In [None]:
def create_env(instance_path):
    """
    Build the "JSSP-v0" gym environment for one problem-instance file.

    The instance path is forwarded to ``gym.make`` as the ``instance_path``
    keyword. The path and the environment's observation/action spaces are
    printed as a quick sanity check.
    :param instance_path: path of the instance file handed to the environment
    :return: the environment object returned by ``gym.make``
    """
    # presumably "JSSP-v0" is registered as a side effect of `import JSSP` — TODO confirm
    jssp_env = gym.make("JSSP-v0", instance_path=instance_path)
    print("Environment Created for: ", instance_path)
    # Attributes are read lazily, one per print, in the same order as before.
    for heading, attribute in (
        ("Observation space: \n", "observation_space"),
        ("Action space: \n", "action_space"),
    ):
        print(heading, getattr(jssp_env, attribute))
    return jssp_env

In [None]:
# One environment per benchmark instance file, created in file order.
# The original lines carried the notes "-53" (instance 1) and "-981" (instance 3);
# presumably the best known total rewards for those instances — TODO confirm.
env1, env3, env4, env5 = (
    create_env(instance_file)
    for instance_file in ("instance1.txt", "instance3.txt", "instance4.txt", "instance5.txt")
)

# Baselines

## 1. Random Sampling

In [None]:
def random_sampling(env, episodes):
    """
    Baseline: play `episodes` uniformly random episodes and report the best one.

    Each episode resets the environment, samples actions from
    ``env.action_space`` until the environment reports ``done``, and sums the
    rewards. The episode with the highest total reward is printed, together
    with every recorded allocation and the environment time at which it was
    chosen. Ties go to the latest episode.
    :param env: environment exposing reset(), step(action), action_space,
                legal_allocation_list and time
    :param episodes: number of random episodes to play
    :return: None (the result is only printed)
    """
    # Start from -inf rather than a fixed number such as -100000, so that an
    # episode is always recorded however negative the instance's rewards are.
    best_score = float("-inf")
    best_episode = -1
    best_allocations = []
    best_times = []

    # Episodes are numbered from 1 in the report.
    for episode in range(1, episodes + 1):
        env.reset()
        done = False
        score = 0
        allocations = []
        times = []
        while not done:
            action = env.action_space.sample()
            # The last action index is never recorded; presumably it is the
            # "allocate nothing / wait" action with no entry in
            # legal_allocation_list — TODO confirm against the JSSP environment.
            if action != env.action_space.n - 1:
                allocations.append(env.legal_allocation_list[action])
                times.append(env.time)
            _, reward, done, _ = env.step(action)
            score += reward
        if score >= best_score:
            best_score = score
            best_episode = episode
            best_allocations = allocations
            best_times = times

    print('From {}th Episode best policy has reward {}'.format(best_episode, best_score))
    for chosen_time, allocation in zip(best_times, best_allocations):
        print('The allocation chose at time {} is {}'.format(chosen_time, allocation))

## 2. Q-Learning

In [None]:
def policy(state, Q, epsilon, num_actions):
    """
    Epsilon-greedy action distribution for one state.

    :param state: hashable state key (described in the notebook as a tuple of order 2 * number of jobs)
    :param Q: Q table, a dictionary mapping each state to its sequence of action values
    :param epsilon: exploration weight; epsilon is spread evenly over all actions
    :param num_actions: number of actions to build the distribution over
    :return: an array of order num_actions holding the probability of choosing each action
    """
    # Unvisited state: no value estimates yet, so every action is equally likely.
    if state not in Q:
        return np.full(num_actions, 1.0, dtype=float) / num_actions

    # Visited state: every action gets epsilon / num_actions, and the greedy
    # action (first maximum of the stored values) receives the remaining 1 - epsilon.
    probabilities = np.full(num_actions, epsilon, dtype=float) / num_actions
    greedy_action = np.argmax(Q[state])
    probabilities[greedy_action] += 1.0 - epsilon
    return probabilities

In [None]:
def update(Q, state, next_state, action, reward, eta, gamma, threshold):
    """
    One Q-learning step: move Q[state][action] towards reward + gamma * max Q[next_state].

    The table is modified in place and also returned.
    :param Q: Q table, a dictionary mapping each state to its sequence of action values
    :param state: current state; must already be a key of Q
    :param next_state: state reached after taking the action
    :param action: index of the action taken in the current state
    :param reward: reward received for the action
    :param eta: learning rate
    :param gamma: discount factor
    :param threshold: minimum absolute change; smaller updates are skipped
    :return: the (possibly updated) Q table
    """
    # A successor that has never been stored contributes a value of 0.
    best_next_value = max(Q[next_state]) if next_state in Q else 0

    # Scaled temporal-difference error for this transition.
    step = eta * (reward + gamma * best_next_value - Q[state][action])

    # Only write the entry back when the change is at least the threshold.
    if abs(step) >= threshold:
        Q[state][action] = Q[state][action] + step
    return Q

In [None]:
def q_learning(env, epis, eta = .628, gamma = 1, epsilon = .3, threshold = .001):
    """
    Tabular Q-learning with a linearly decaying epsilon-greedy policy.

    Every 100th episode (episode % 100 == 0) is a testing episode: actions are
    chosen greedily (epsilon = 0) and the total reward is also stored in the
    testing list. The Q-table is still updated during those episodes.
    The best episode seen (ties go to the latest one) is printed at the end
    together with its allocations.
    :param env: JSSP instance exposing reset(), step(action), action_space,
                legal_allocation_list and time
    :param epis: number of episodes
    :param eta: learning rate, how much you accept the new value vs the old value
    :param gamma: discount factor, balance immediate and future reward
    :param epsilon: initial randomness factor; reduced by epsilon / epis per episode
    :param threshold: minimum absolute Q change that is written to the table
    :return: a list of total reward for each episode, the final Q_table,
             and a list of total reward for every 100th (testing) episode
    """
    # Start from -inf rather than a fixed number such as -100000, so that an
    # episode is always recorded however negative the instance's rewards are.
    best_score = float("-inf")
    best_episode = -1
    best_allocations = []
    best_times = []
    # Linear decay step. The guard avoids a ZeroDivisionError when epis == 0
    # (the loop below then simply does not run).
    decay_rate = epsilon / epis if epis > 0 else 0.0

    Q_table = {}
    total_reward_list = [] # total reward of every episode
    testing_reward_list = [] # total reward of every 100th (greedy) episode

    for episode in range(epis):
        state = env.reset()
        total_reward = 0
        done = False
        allocations = []
        times = []

        # Epsilon decay (never drops below one decay step)
        if epsilon > decay_rate:
            epsilon -= decay_rate
        # Testing episodes act greedily: epsilon = 0
        is_testing_episode = episode % 100 == 0
        episode_epsilon = 0 if is_testing_episode else epsilon

        while not done:
            # initialize state in Q table
            if state not in Q_table:
                Q_table[state] = np.zeros(env.action_space.n)
            # generate action choosing probability and choose an action
            action_probabilities = policy(state, Q_table, episode_epsilon, env.action_space.n)
            action = np.random.choice(np.arange(len(action_probabilities)), p = action_probabilities)
            # update action history
            # NOTE(review): unlike random_sampling, the last action index is not
            # excluded here, so legal_allocation_list must have an entry for
            # every action index — confirm against the JSSP environment.
            allocations.append(env.legal_allocation_list[action])
            times.append(env.time)
            # get new state & reward from environment
            next_state, reward, done, _ = env.step(action)
            # update Q-Table with new knowledge
            Q_table = update(Q_table, state, next_state, action, reward, eta, gamma, threshold)
            total_reward += reward
            state = next_state

        # The final state of the episode is stored with a single zero entry,
        # so later look-ups of max(Q[next_state]) for it yield 0.
        if state not in Q_table:
            Q_table[state] = np.zeros(1)
        total_reward_list.append(total_reward)

        # keep policy testing results every 100 episodes
        if is_testing_episode:
            testing_reward_list.append(total_reward)
            print("Episode: " + str(episode) + " has time " + str(env.time) + " has reward " + str(total_reward))

        # keep track of the best episode so far
        if total_reward >= best_score:
            best_score = total_reward
            best_episode = episode
            best_allocations = allocations
            best_times = times

    # Episodes are reported 1-based.
    print('From {}th Episode best policy has reward {}'.format(best_episode + 1, best_score))
    for chosen_time, allocation in zip(best_times, best_allocations):
        print('The allocation chose at time {} is {}'.format(chosen_time, allocation))

    return total_reward_list, Q_table, testing_reward_list

# NOTE(review): this 500000-episode run on env5 sits in the same cell as the q_learning
# definition, and all three results are reassigned by the later q_learning(env1, 5000) call
# before any plot or print reads them — consider moving it to its own cell or removing it.
training_reward_list, Q_table, testing_reward_list = q_learning(env5, 500000)

# Analysis

## Random Sampling

In [None]:
# Baseline run: 100 random episodes on instance 1; the best episode is printed.
random_sampling(env=env1, episodes=100)

## Q-Learning

In [None]:
# Train on instance 1 for 5000 episodes with the default hyper-parameters.
# These three results feed the plots and the Q-table print below.
training_reward_list, Q_table, testing_reward_list = q_learning(env=env1, epis=5000)

In [None]:
# Total reward of every training episode, in episode order.
fig, ax = plt.subplots()
ax.plot(training_reward_list)
ax.set_title("QLearning: Training")
ax.set_xlabel("Episode #")
ax.set_ylabel("Total_reward")
plt.show()

In [None]:
# testing_reward_list holds one reward per testing episode, and q_learning records a
# testing episode whenever episode % 100 == 0 (episodes 0, 100, 200, ...).
# Plot against those real episode numbers so the "Episode #" axis is correct;
# plotting the bare list would label evaluation k as episode k.
testing_episode_numbers = np.arange(len(testing_reward_list)) * 100
plt.plot(testing_episode_numbers, testing_reward_list)
plt.title("QLearning: Testing")
plt.xlabel("Episode #")
plt.ylabel("Total_reward")
plt.show()

In [None]:
# The Q-table has one entry per visited state, so printing it whole can flood the
# notebook output. Report its size and show only the first few entries instead
# (dicts iterate in insertion order, i.e. the order states were first stored).
Q_TABLE_PREVIEW_ROWS = 10  # raise this to inspect more states
print("Q-table holds {} states; showing the first {}:".format(
    len(Q_table), min(Q_TABLE_PREVIEW_ROWS, len(Q_table))))
for state, action_values in itertools.islice(Q_table.items(), Q_TABLE_PREVIEW_ROWS):
    print(state, action_values)