In [None]:
import JSSP
import gym
import time
import numpy as np
import random

# Environment Initialization

In [None]:
# Create the scheduling environment from its gym id and an instance file, then
# show the observation and action spaces it exposes.
# NOTE(review): the "JSSP-v0" id is presumably registered by `import JSSP` - confirm.
env_name = "JSSP-v0"
env = gym.make(env_name, instance_path="instance3.txt")
for label, space in (
    ("Observation space:", env.observation_space),
    ("Action space:", env.action_space),
):
    print(label, space)

# Action Choosing

In [None]:
def get_legal_action(env):
    """Sample one random action vector from the environment's legal actions.

    `env.get_legal_actions()` is indexed position by position (presumably one
    entry per job, each listing the machines that job may start on - confirm
    against the environment). For every position one value is drawn at random
    from the listed values not already picked for an earlier position, with
    -1 always included as an extra candidate. Positions with no legal values
    get -1 without consuming a random draw.

    Args:
        env: object exposing `get_legal_actions()`.

    Returns:
        np.ndarray with one entry per position of `get_legal_actions()`.
    """
    per_job_options = env.get_legal_actions()
    chosen = []
    for job_idx in range(len(per_job_options)):
        options = per_job_options[job_idx]
        if len(options) == 0:
            chosen.append(-1)
            continue
        # Drop values already taken in this action vector; -1 stays selectable
        # even when other candidates remain.
        pool = [m for m in options if m not in chosen] + [-1]
        chosen.append(np.random.choice(pool))
    return np.array(chosen)

In [None]:
def get_efficient_legal_action(env):
    """Sample a random legal action vector that uses -1 only when forced.

    Works like a random pick per position of `env.get_legal_actions()`
    (presumably one position per job, values being machines - confirm against
    the environment), but -1 is chosen only when the position has no legal
    value left after removing the values already picked for earlier positions.

    Args:
        env: object exposing `get_legal_actions()`.

    Returns:
        np.ndarray with one entry per position of `get_legal_actions()`.
    """
    per_job_options = env.get_legal_actions()
    chosen = []
    for job_idx in range(len(per_job_options)):
        options = per_job_options[job_idx]
        # Candidates are the listed values not yet taken in this action vector.
        free = [] if len(options) == 0 else [m for m in options if m not in chosen]
        chosen.append(np.random.choice(free) if len(free) > 0 else -1)
    return np.array(chosen)


# Baselines

## 1. Random Sampling

In [None]:
# Random-sampling baseline: play `episodes` episodes, taking a random legal
# action (get_legal_action) at every step, and report the best total reward.
episodes = 50
# Start from -inf so any real episode score replaces it; a finite sentinel such
# as -100000 would be reported as the best score if every episode scored lower.
max_score = float("-inf")
for episode in range(1, episodes+1):
    # Each episode starts from a fresh reset, so no reset is needed before the loop.
    state = env.reset()
    done = False
    score = 0

    while not done:
        #env.render()
        action = get_legal_action(env)
        # if not (np.all(action == -1)):
        #     print('Episode:{} Action:{} Time:{}'.format(episode, action, env.time))
        n_state, reward, done, info = env.step(action)
        score += reward
    print('Episode:{} Total_reward:{}'.format(episode, score))
    max_score = max(max_score, score)
print('From {} Episodes best policy has reward {}'.format(episodes, max_score))

In [None]:
# Random-sampling baseline using get_efficient_legal_action, which only emits
# -1 for a position when no unclaimed legal value is left for it.
episodes = 50
# Start from -inf so any real episode score replaces it; a finite sentinel such
# as -100000 would be reported as the best score if every episode scored lower.
max_score = float("-inf")
for episode in range(1, episodes+1):
    # Each episode starts from a fresh reset, so no reset is needed before the loop.
    state = env.reset()
    done = False
    score = 0

    while not done:
        #env.render()
        action = get_efficient_legal_action(env)
        # if not (np.all(action == -1)):
        #     print('Episode:{} Action:{} Time:{}'.format(episode, action, env.time))
        n_state, reward, done, info = env.step(action)
        score += reward
    print('Episode:{} Total_reward:{}'.format(episode, score))
    max_score = max(max_score, score)
print('From {} Episodes best policy has reward {}'.format(episodes, max_score))

## 2. Q-Learning

In [None]:
def update(Q, s_key, s_next_key, a_key, r, eta, gma):
    """Apply one tabular Q-learning update to the nested-dict table `Q` in place.

    `Q` maps a state key to a dict of {action key: value}. Missing states and
    actions are created on demand with value 0. The bootstrap term is the
    largest value stored for `s_next_key`, or 0 when that state is unknown or
    has no actions recorded yet.

    Args:
        Q: dict of dicts holding the Q-values; mutated in place.
        s_key: hashable key of the state the action was taken in.
        s_next_key: hashable key of the resulting state.
        a_key: hashable key of the action taken.
        r: reward received for the transition.
        eta: learning rate.
        gma: discount factor.

    Returns:
        The same `Q` object that was passed in.
    """
    # Make sure the current state has a row before looking at the next state,
    # so a self-transition on a brand-new state sees an empty row (value 0).
    action_values = Q.setdefault(s_key, {})

    next_values = Q.get(s_next_key)
    best_next = max(next_values.values()) if next_values else 0

    previous = action_values.get(a_key, 0)
    action_values[a_key] = previous + eta * (r + gma * best_next - previous)

    return Q

In [None]:
# Tabular Q-learning with an epsilon-greedy policy over a dict-backed Q-table.
# 1. Load Environment and Q-table structure
env_name = "JSSP-v0"
env = gym.make(env_name, instance_path = "instance3.txt")
Q = {}  # state key -> {action key -> value}; rows are created lazily by update()
# 2. Parameters of Q-learning
eta = .628      # learning rate
gamma = .9      # discount factor
epis = 500      # number of training episodes
epsilon = .1    # probability of taking a random legal action in a known state
rev_list = [] # total reward of each episode
# 3. Q-learning Algorithm
for i in range(epis):
    # Reset environment
    s = env.reset()
    rAll = 0
    d = False
    # The Q-Table learning algorithm
    while not d:
        # env.render()
        # Flatten the observation into a hashable key for the Q-table.
        # NOTE(review): assumes s[0] and s[1] are 1-D arrays - confirm against the env.
        s_key = tuple(np.concatenate((s[1], s[0])))
        # Explore when the state has no recorded actions yet, or with probability epsilon.
        if s_key not in Q or not Q[s_key] or random.uniform(0, 1) < epsilon:
            a = get_legal_action(env)
        else:
            # Stored action keys are tuples; convert back to an array so that
            # env.step always receives the same type get_legal_action returns.
            a = np.array(max(Q[s_key], key = Q[s_key].get))
        #Get new state & reward from environment
        s_next, r, d, _ = env.step(a)
        #Update Q-Table with new knowledge
        s_next_key = tuple(np.concatenate((s_next[1], s_next[0])))
        a_key = tuple(a)
        Q = update(Q, s_key, s_next_key, a_key, r, eta, gamma)
        rAll += r
        s = s_next
    rev_list.append(rAll)
    print("Episode: " + str(i) + " has reward " + str(rAll))
    # env.render()
# The loop exits as soon as d is True.
# The printed value is the mean episode reward (sum divided by episode count).
print("Average reward over all episodes " + str(sum(rev_list)/epis))
print("Final Values Q-Table")
print(Q)

In [None]:
import matplotlib.pyplot as plt
# Learning curve: total reward collected in each training episode.
plt.plot(rev_list)
plt.xlabel("Episode")
plt.ylabel("Total reward")
# plt.show must be called; the bare attribute `plt.show` does nothing.
plt.show()

In [None]:
# Learning curve: total reward collected in each training episode.
plt.plot(rev_list)
plt.xlabel("Episode")
plt.ylabel("Total reward")
# plt.show must be called; the bare attribute `plt.show` does nothing.
plt.show()

In [None]:
# Random-sampling baseline: play `episodes` episodes, taking a random legal
# action (get_legal_action) at every step, and report the best total reward.
episodes = 50
# Start from -inf so any real episode score replaces it; a finite sentinel such
# as -100000 would be reported as the best score if every episode scored lower.
max_score = float("-inf")
for episode in range(1, episodes+1):
    # Each episode starts from a fresh reset, so no reset is needed before the loop.
    state = env.reset()
    done = False
    score = 0

    while not done:
        #env.render()
        action = get_legal_action(env)
        # if not (np.all(action == -1)):
        #     print('Episode:{} Action:{} Time:{}'.format(episode, action, env.time))
        n_state, reward, done, info = env.step(action)
        score += reward
    print('Episode:{} Total_reward:{}'.format(episode, score))
    max_score = max(max_score, score)
print('From {} Episodes best policy has reward {}'.format(episodes, max_score))

## 2. Q-Learning

In [None]:
# Tabular Q-learning on instance1 with a dict-backed Q-table.
# Q is a dict, so it cannot be sliced like a 2-D array (Q[s,:] raises); states
# and actions are turned into hashable keys and updated through update().
# 1. Load Environment and Q-table structure
env_name = "JSSP-v0"
env = gym.make(env_name, instance_path = "instance1.txt")
Q = {}  # state key -> {action key -> value}; rows are created lazily by update()
# 2. Parameters of Q-learning
eta = .628   # learning rate
gma = .9     # discount factor
epis = 50    # number of training episodes
rev_list = [] # total reward of each episode
# 3. Q-learning Algorithm
for i in range(epis):
    # Reset environment
    s = env.reset()
    rAll = 0
    d = False
    # Exploration probability decays as 1/(i+1): fully random in the first
    # episode, increasingly greedy afterwards.
    explore_prob = 1. / (i + 1)
    # The Q-Table learning algorithm
    while not d:
        # env.render()
        # Flatten the observation into a hashable key for the Q-table.
        # NOTE(review): assumes s[0] and s[1] are 1-D arrays - confirm against the env.
        s_key = tuple(np.concatenate((s[1], s[0])))
        known_actions = Q.get(s_key)
        # Choose action: random legal action when exploring or when the state
        # has no recorded actions, otherwise the best recorded action.
        if not known_actions or random.uniform(0, 1) < explore_prob:
            a = get_legal_action(env)
        else:
            a = np.array(max(known_actions, key = known_actions.get))
        #Get new state & reward from environment
        s1,r,d,_ = env.step(a)
        #Update Q-Table with new knowledge
        s1_key = tuple(np.concatenate((s1[1], s1[0])))
        Q = update(Q, s_key, s1_key, tuple(a), r, eta, gma)
        rAll += r
        s = s1
    rev_list.append(rAll)
    print("Episode: " + str(i) + " has reward " + str(rAll))
    # env.render()
# The loop exits as soon as d is True.
# The printed value is the mean episode reward (sum divided by episode count).
print("Average reward over all episodes " + str(sum(rev_list)/epis))
print("Final Values Q-Table")
print(Q)

# Baselines

## 1. Random Sampling

In [None]:
# Random-sampling baseline: play `episodes` episodes, taking a random legal
# action (get_legal_action) at every step, and report the best total reward.
episodes = 50
# Start from -inf so any real episode score replaces it; a finite sentinel such
# as -100000 would be reported as the best score if every episode scored lower.
max_score = float("-inf")
for episode in range(1, episodes+1):
    # Each episode starts from a fresh reset, so no reset is needed before the loop.
    state = env.reset()
    done = False
    score = 0

    while not done:
        #env.render()
        action = get_legal_action(env)
        # if not (np.all(action == -1)):
        #     print('Episode:{} Action:{} Time:{}'.format(episode, action, env.time))
        n_state, reward, done, info = env.step(action)
        score += reward
    print('Episode:{} Total_reward:{}'.format(episode, score))
    max_score = max(max_score, score)
print('From {} Episodes best policy has reward {}'.format(episodes, max_score))

## 2. Q-Learning

In [None]:
# Tabular Q-learning on instance1 with a dict-backed Q-table.
# Q is a dict, so it cannot be sliced like a 2-D array (Q[s,:] raises); states
# and actions are turned into hashable keys and updated through update().
# 1. Load Environment and Q-table structure
env_name = "JSSP-v0"
env = gym.make(env_name, instance_path = "instance1.txt")
Q = {}  # state key -> {action key -> value}; rows are created lazily by update()
# 2. Parameters of Q-learning
eta = .628   # learning rate
gma = .9     # discount factor
epis = 50    # number of training episodes
rev_list = [] # total reward of each episode
# 3. Q-learning Algorithm
for i in range(epis):
    # Reset environment
    s = env.reset()
    rAll = 0
    d = False
    # Exploration probability decays as 1/(i+1): fully random in the first
    # episode, increasingly greedy afterwards.
    explore_prob = 1. / (i + 1)
    # The Q-Table learning algorithm
    while not d:
        # env.render()
        # Flatten the observation into a hashable key for the Q-table.
        # NOTE(review): assumes s[0] and s[1] are 1-D arrays - confirm against the env.
        s_key = tuple(np.concatenate((s[1], s[0])))
        known_actions = Q.get(s_key)
        # Choose action: random legal action when exploring or when the state
        # has no recorded actions, otherwise the best recorded action.
        if not known_actions or random.uniform(0, 1) < explore_prob:
            a = get_legal_action(env)
        else:
            a = np.array(max(known_actions, key = known_actions.get))
        #Get new state & reward from environment
        s1,r,d,_ = env.step(a)
        #Update Q-Table with new knowledge
        s1_key = tuple(np.concatenate((s1[1], s1[0])))
        Q = update(Q, s_key, s1_key, tuple(a), r, eta, gma)
        rAll += r
        s = s1
    rev_list.append(rAll)
    print("Episode: " + str(i) + " has reward " + str(rAll))
    # env.render()
# The loop exits as soon as d is True.
# The printed value is the mean episode reward (sum divided by episode count).
print("Average reward over all episodes " + str(sum(rev_list)/epis))
print("Final Values Q-Table")
print(Q)

In [None]:
# Tabular Q-learning on instance1 with a dict-backed Q-table.
# Q is a dict, so it cannot be sliced like a 2-D array (Q[s,:] raises); states
# and actions are turned into hashable keys and updated through update().
# 1. Load Environment and Q-table structure
env_name = "JSSP-v0"
env = gym.make(env_name, instance_path = "instance1.txt")
Q = {}  # state key -> {action key -> value}; rows are created lazily by update()
# 2. Parameters of Q-learning
eta = .628   # learning rate
gma = .9     # discount factor
epis = 50    # number of training episodes
rev_list = [] # total reward of each episode
# 3. Q-learning Algorithm
for i in range(epis):
    # Reset environment
    s = env.reset()
    rAll = 0
    d = False
    # Exploration probability decays as 1/(i+1): fully random in the first
    # episode, increasingly greedy afterwards.
    explore_prob = 1. / (i + 1)
    # The Q-Table learning algorithm
    while not d:
        # env.render()
        # Flatten the observation into a hashable key for the Q-table.
        # NOTE(review): assumes s[0] and s[1] are 1-D arrays - confirm against the env.
        s_key = tuple(np.concatenate((s[1], s[0])))
        known_actions = Q.get(s_key)
        # Choose action: random legal action when exploring or when the state
        # has no recorded actions, otherwise the best recorded action.
        if not known_actions or random.uniform(0, 1) < explore_prob:
            a = get_legal_action(env)
        else:
            a = np.array(max(known_actions, key = known_actions.get))
        #Get new state & reward from environment
        s1,r,d,_ = env.step(a)
        #Update Q-Table with new knowledge
        s1_key = tuple(np.concatenate((s1[1], s1[0])))
        Q = update(Q, s_key, s1_key, tuple(a), r, eta, gma)
        rAll += r
        s = s1
    rev_list.append(rAll)
    print("Episode: " + str(i) + " has reward " + str(rAll))
    # env.render()
# The loop exits as soon as d is True.
# The printed value is the mean episode reward (sum divided by episode count).
print("Average reward over all episodes " + str(sum(rev_list)/epis))
print("Final Values Q-Table")
print(Q)

# Baselines

## 1. Random Sampling

In [None]:
# Random-sampling baseline: play `episodes` episodes, taking a random legal
# action (get_legal_action) at every step, and report the best total reward.
episodes = 50
# Start from -inf so any real episode score replaces it; a finite sentinel such
# as -100000 would be reported as the best score if every episode scored lower.
max_score = float("-inf")
for episode in range(1, episodes+1):
    # Each episode starts from a fresh reset, so no reset is needed before the loop.
    state = env.reset()
    done = False
    score = 0

    while not done:
        #env.render()
        action = get_legal_action(env)
        # if not (np.all(action == -1)):
        #     print('Episode:{} Action:{} Time:{}'.format(episode, action, env.time))
        n_state, reward, done, info = env.step(action)
        score += reward
    print('Episode:{} Total_reward:{}'.format(episode, score))
    max_score = max(max_score, score)
print('From {} Episodes best policy has reward {}'.format(episodes, max_score))

## 2. Q-Learning

In [None]:
# Tabular Q-learning on instance1 with a dict-backed Q-table.
# Q is a dict, so it cannot be sliced like a 2-D array (Q[s,:] raises); states
# and actions are turned into hashable keys and updated through update().
# 1. Load Environment and Q-table structure
env_name = "JSSP-v0"
env = gym.make(env_name, instance_path = "instance1.txt")
Q = {}  # state key -> {action key -> value}; rows are created lazily by update()
# 2. Parameters of Q-learning
eta = .628   # learning rate
gma = .9     # discount factor
epis = 50    # number of training episodes
rev_list = [] # total reward of each episode
# 3. Q-learning Algorithm
for i in range(epis):
    # Reset environment
    s = env.reset()
    rAll = 0
    d = False
    # Exploration probability decays as 1/(i+1): fully random in the first
    # episode, increasingly greedy afterwards.
    explore_prob = 1. / (i + 1)
    # The Q-Table learning algorithm
    while not d:
        # env.render()
        # Flatten the observation into a hashable key for the Q-table.
        # NOTE(review): assumes s[0] and s[1] are 1-D arrays - confirm against the env.
        s_key = tuple(np.concatenate((s[1], s[0])))
        known_actions = Q.get(s_key)
        # Choose action: random legal action when exploring or when the state
        # has no recorded actions, otherwise the best recorded action.
        if not known_actions or random.uniform(0, 1) < explore_prob:
            a = get_legal_action(env)
        else:
            a = np.array(max(known_actions, key = known_actions.get))
        #Get new state & reward from environment
        s1,r,d,_ = env.step(a)
        #Update Q-Table with new knowledge
        s1_key = tuple(np.concatenate((s1[1], s1[0])))
        Q = update(Q, s_key, s1_key, tuple(a), r, eta, gma)
        rAll += r
        s = s1
    rev_list.append(rAll)
    print("Episode: " + str(i) + " has reward " + str(rAll))
    # env.render()
# The loop exits as soon as d is True.
# The printed value is the mean episode reward (sum divided by episode count).
print("Average reward over all episodes " + str(sum(rev_list)/epis))
print("Final Values Q-Table")
print(Q)