In [None]:
import JSSP
import time
import numpy as np
import random
import gym
import itertools
import numpy as np
import matplotlib
import matplotlib.style
import pandas as pd
import sys


from collections import defaultdict
#import plotting

# Environment Initialization

In [None]:
def create_env(instance_path):
    env_name = "JSSP-v0"
    env = gym.make(env_name, instance_path = instance_path)
    print("Environment Created for: ", instance_path)
    print("Observation space: \n", env.observation_space)
    print("Action space: \n", env.action_space)
    return env

In [None]:
env1 = create_env("instance1.txt")
env3 = create_env("instance3.txt")
env4 = create_env("instance4.txt")

# Baselines

## 1. Random Sampling

In [None]:
def random_sampling(env, episodes):
    env.reset()
    max_score = -100000
    max_episode = -1
    max_action_list = []
    max_time_list = []
    for episode in range(1, episodes+1):
        env.reset()
        done = False
        score = 0
        action_list = []
        time_list = []
        while not done:
            #env.render()
            action = env.action_space.sample()
            if action != env.action_space.n -1:
                action_list.append(env.legal_allocation_list[action])
                time_list.append(env.time)
                print('Episode:{} Allocation:{} Time:{}'.format(episode, env.legal_allocation_list[action], env.time))
            n_state, reward, done, info = env.step(action)
            score+=reward
        print('Episode:{} Total_reward:{}'.format(episode, score))
        if score >= max_score:
            max_score = score
            max_episode = episode
            max_action_list = action_list
            max_time_list = time_list
    print('From {}th Episode best policy has reward {}'.format(max_episode, max_score))
    for i in range(len(max_action_list)):
        print('The allocation chose at time {} is {}'.format(max_time_list[i], max_action_list[i]))

random_sampling(env4, 100)

## 2. Q-Learning

In [None]:
def createEpsilonGreedyPolicy(Q, epsilon, num_actions):
	"""
	Creates an epsilon-greedy policy based
	on a given Q-function and epsilon.

	Returns a function that takes the state
	as an input and returns the probabilities
	for each action in the form of a numpy array
	of length of the action space(set of possible actions).
	"""
	def policyFunction(state):

		Action_probabilities = np.ones(num_actions,
				dtype = float) * epsilon / num_actions

		best_action = np.argmax(Q[state])
		Action_probabilities[best_action] += (1.0 - epsilon)
		return Action_probabilities

	return policyFunction

In [None]:
def update(Q, s_key, s_next_key, action, r, eta, gma):

    if s_key not in Q.keys():
        Q[s_key] = {}

    if s_next_key not in Q.keys() or Q[s_next_key] == {}:
        max_next_s_Q = 0
    else:
        max_next_s_Q = max(Q[s_next_key].values())

    if action not in Q[s_key].keys():
        Q[s_key][action] = 0
    Q[s_key][action] = Q[s_key][action] + eta * (r + gma * max_next_s_Q - Q[s_key][action])

    return Q

In [None]:
def q_learning(env, epis):

    max_score = -100000
    max_episode = -1
    max_action_list = []
    max_time_list = []

    # 1. Load Environment and Q-table structure
    Q_table = {}
    # 2. Parameters of Q-learning
    eta = .628
    gamma = .9
    epsilon = .1
    total_reward_list = [] # rewards per episode calculate
    # 3. Q-learning Algorithm
    for episode in range(epis):
        # Reset environment
        state = env.reset()
        total_reward = 0
        done = False
        action_list = []
        time_list = []

        # The Q-Table learning algorithm
        while not done:
            # env.render()
            # Choose action from Q table

            policy = createEpsilonGreedyPolicy(Q_table, epsilon, env.action_space.n)
            action_probabilities = policy(state)
            action = np.random.choice(np.arange(len(action_probabilities)), p = action_probabilities)
            if action != env.action_space.n -1:
                action_list.append(env.legal_allocation_list[action])
                time_list.append(env.time)
                print('Episode:{} Allocation:{} Time:{}'.format(episode, env.legal_allocation_list[action], env.time))
            #Get new state & reward from environment
            next_state, reward, done, _ = env.step(action)
            #Update Q-Table with new knowledge
            Q_table = update(Q_table, state, next_state, action, reward, eta, gamma)
            total_reward += reward
            state = next_state
        total_reward_list.append(total_reward)
        if total_reward >= max_score:
            max_score = total_reward
            max_episode = episode
            max_action_list = action_list
            max_time_list = time_list
        print("Episode: " + str(episode) + " has reward " + str(total_reward))

    print('From {}th Episode best policy has reward {}'.format(max_episode, max_score))
    for i in range(len(max_action_list)):
        print('The allocation chose at time {} is {}'.format(max_time_list[i], max_action_list[i]))

    return total_reward_list

In [None]:
total_reward_list = q_learning(env3, 500)

In [None]:
import matplotlib.pyplot as plt
plt.plot(total_reward_list)
plt.show

In [None]:
plt.plot(total_reward_list)
plt.show

In [None]:
random_sampling(env3, 5000)

## Q Learning from web

In [None]:
def createEpsilonGreedyPolicy(Q, epsilon, num_actions, env):
	"""
	Creates an epsilon-greedy policy based
	on a given Q-function and epsilon.

	Returns a function that takes the state
	as an input and returns the probabilities
	for each action in the form of a numpy array
	of length of the action space(set of possible actions).
	"""
	def policyFunction(state):

		Action_probabilities = np.ones(num_actions,
				dtype = float) * epsilon / num_actions

		best_action = np.argmax(Q[state])
		Action_probabilities[best_action] += (1.0 - epsilon)
		return Action_probabilities

	return policyFunction

In [None]:
def qLearning(env, num_episodes, discount_factor = 1.0,
							alpha = 0.6, epsilon = 0.1):
	"""
	Q-Learning algorithm: Off-policy TD control.
	Finds the optimal greedy policy while improving
	following an epsilon-greedy policy"""

	# Action value function
	# A nested dictionary that maps
	# state -> (action -> action-value).
	Q = defaultdict(lambda: np.zeros(env.action_space.n))

	# Keeps track of useful statistics
	stats = {
            "episode_lengths" : np.zeros(num_episodes),
		    "episode_rewards" : np.zeros(num_episodes)}

	# Create an epsilon greedy policy function
	# appropriately for environment action space
	policy = createEpsilonGreedyPolicy(Q, epsilon, env.action_space.n, env)

	# For every episode
	for ith_episode in range(num_episodes):

		# Reset the environment and pick the first action
		state = env.reset()

		for t in itertools.count():

			# get probabilities of all actions from current state
			action_probabilities = policy(state)

			# choose action according to
			# the probability distribution
			action = np.random.choice(np.arange(
					len(action_probabilities)),
					p = action_probabilities)

			# take action and get reward, transit to next state
			next_state, reward, done, _ = env.step(action)

			# Update statistics
			stats["episode_rewards"][ith_episode] += reward
			stats["episode_lengths"][ith_episode] = t

			# TD Update
			best_next_action = np.argmax(Q[next_state])
			td_target = reward + discount_factor * Q[next_state][best_next_action]
			td_delta = td_target - Q[state][action]
			Q[state][action] += alpha * td_delta

			# done is True if episode terminated
			if done:
				break

			state = next_state

	return Q, stats

In [None]:
Q, stats = qLearning(env1, 1000)
plt.plot(stats["episode_lengths"])
plt.plot(stats["episode_rewards"])
plt.show

## Q Learning from web

In [None]:
def createEpsilonGreedyPolicy(Q, epsilon, num_actions, env):
	"""
	Creates an epsilon-greedy policy based
	on a given Q-function and epsilon.

	Returns a function that takes the state
	as an input and returns the probabilities
	for each action in the form of a numpy array
	of length of the action space(set of possible actions).
	"""
	def policyFunction(state):

		Action_probabilities = np.ones(num_actions,
				dtype = float) * epsilon / num_actions

		best_action = np.argmax(Q[state])
		Action_probabilities[best_action] += (1.0 - epsilon)
		return Action_probabilities

	return policyFunction

In [None]:
def qLearning(env, num_episodes, discount_factor = 1.0,
							alpha = 0.6, epsilon = 0.1):
	"""
	Q-Learning algorithm: Off-policy TD control.
	Finds the optimal greedy policy while improving
	following an epsilon-greedy policy"""

	# Action value function
	# A nested dictionary that maps
	# state -> (action -> action-value).
	Q = defaultdict(lambda: np.zeros(env.action_space.n))

	# Keeps track of useful statistics
	stats = {
            "episode_lengths" : np.zeros(num_episodes),
		    "episode_rewards" : np.zeros(num_episodes)}

	# Create an epsilon greedy policy function
	# appropriately for environment action space
	policy = createEpsilonGreedyPolicy(Q, epsilon, env.action_space.n, env)

	# For every episode
	for ith_episode in range(num_episodes):

		# Reset the environment and pick the first action
		state = env.reset()

		for t in itertools.count():

			# get probabilities of all actions from current state
			action_probabilities = policy(state)

			# choose action according to
			# the probability distribution
			action = np.random.choice(np.arange(
					len(action_probabilities)),
					p = action_probabilities)

			# take action and get reward, transit to next state
			next_state, reward, done, _ = env.step(action)

			# Update statistics
			stats["episode_rewards"][ith_episode] += reward
			stats["episode_lengths"][ith_episode] = t

			# TD Update
			best_next_action = np.argmax(Q[next_state])
			td_target = reward + discount_factor * Q[next_state][best_next_action]
			td_delta = td_target - Q[state][action]
			Q[state][action] += alpha * td_delta

			# done is True if episode terminated
			if done:
				break

			state = next_state

	return Q, stats

In [None]:
def createEpsilonGreedyPolicy(Q, epsilon, num_actions, env):
	"""
	Creates an epsilon-greedy policy based
	on a given Q-function and epsilon.

	Returns a function that takes the state
	as an input and returns the probabilities
	for each action in the form of a numpy array
	of length of the action space(set of possible actions).
	"""
	def policyFunction(state):

		Action_probabilities = np.ones(num_actions,
				dtype = float) * epsilon / num_actions

		best_action = np.argmax(Q[state])
		Action_probabilities[best_action] += (1.0 - epsilon)
		return Action_probabilities

	return policyFunction

In [None]:
def qLearning(env, num_episodes, discount_factor = 1.0,
							alpha = 0.6, epsilon = 0.1):
	"""
	Q-Learning algorithm: Off-policy TD control.
	Finds the optimal greedy policy while improving
	following an epsilon-greedy policy"""

	# Action value function
	# A nested dictionary that maps
	# state -> (action -> action-value).
	Q = defaultdict(lambda: np.zeros(env.action_space.n))

	# Keeps track of useful statistics
	stats = {
            "episode_lengths" : np.zeros(num_episodes),
		    "episode_rewards" : np.zeros(num_episodes)}

	# Create an epsilon greedy policy function
	# appropriately for environment action space
	policy = createEpsilonGreedyPolicy(Q, epsilon, env.action_space.n, env)

	# For every episode
	for ith_episode in range(num_episodes):

		# Reset the environment and pick the first action
		state = env.reset()

		for t in itertools.count():

			# get probabilities of all actions from current state
			action_probabilities = policy(state)

			# choose action according to
			# the probability distribution
			action = np.random.choice(np.arange(
					len(action_probabilities)),
					p = action_probabilities)

			# take action and get reward, transit to next state
			next_state, reward, done, _ = env.step(action)

			# Update statistics
			stats["episode_rewards"][ith_episode] += reward
			stats["episode_lengths"][ith_episode] = t

			# TD Update
			best_next_action = np.argmax(Q[next_state])
			td_target = reward + discount_factor * Q[next_state][best_next_action]
			td_delta = td_target - Q[state][action]
			Q[state][action] += alpha * td_delta

			# done is True if episode terminated
			if done:
				break

			state = next_state

	return Q, stats

In [None]:
Q, stats = qLearning(env1, 1000)
plt.plot(stats["episode_lengths"])
plt.plot(stats["episode_rewards"])
plt.show

## Q Learning from web

In [None]:
def createEpsilonGreedyPolicy(Q, epsilon, num_actions, env):
	"""
	Creates an epsilon-greedy policy based
	on a given Q-function and epsilon.

	Returns a function that takes the state
	as an input and returns the probabilities
	for each action in the form of a numpy array
	of length of the action space(set of possible actions).
	"""
	def policyFunction(state):

		Action_probabilities = np.ones(num_actions,
				dtype = float) * epsilon / num_actions

		best_action = np.argmax(Q[state])
		Action_probabilities[best_action] += (1.0 - epsilon)
		return Action_probabilities

	return policyFunction

In [47]:
def qLearning(env, num_episodes, discount_factor = 1.0,
							alpha = 0.6, epsilon = 0.1):
	"""
	Q-Learning algorithm: Off-policy TD control.
	Finds the optimal greedy policy while improving
	following an epsilon-greedy policy"""

	# Action value function
	# A nested dictionary that maps
	# state -> (action -> action-value).
	Q = defaultdict(lambda: np.zeros(env.action_space.n))

	# Keeps track of useful statistics
	stats = {
            "episode_lengths" : np.zeros(num_episodes),
		    "episode_rewards" : np.zeros(num_episodes)}

	# Create an epsilon greedy policy function
	# appropriately for environment action space
	policy = createEpsilonGreedyPolicy(Q, epsilon, env.action_space.n, env)

	# For every episode
	for ith_episode in range(num_episodes):

		# Reset the environment and pick the first action
		state = env.reset()

		for t in itertools.count():

			# get probabilities of all actions from current state
			action_probabilities = policy(state)

			# choose action according to
			# the probability distribution
			action = np.random.choice(np.arange(
					len(action_probabilities)),
					p = action_probabilities)

			# take action and get reward, transit to next state
			next_state, reward, done, _ = env.step(action)

			# Update statistics
			stats["episode_rewards"][ith_episode] += reward
			stats["episode_lengths"][ith_episode] = t

			# TD Update
			best_next_action = np.argmax(Q[next_state])
			td_target = reward + discount_factor * Q[next_state][best_next_action]
			td_delta = td_target - Q[state][action]
			Q[state][action] += alpha * td_delta

			# done is True if episode terminated
			if done:
				break

			state = next_state

	return Q, stats

In [48]:
Q, stats = qLearning(env1, 1000)
plt.plot(stats["episode_lengths"])
plt.plot(stats["episode_rewards"])
plt.show

IndexError: index 1 is out of bounds for axis 0 with size 1