diff --git a/src/irlwpytorch/MountainCar.py b/src/irlwpytorch/MountainCar.py
index 3e63af0..0cdcd8f 100644
--- a/src/irlwpytorch/MountainCar.py
+++ b/src/irlwpytorch/MountainCar.py
@@ -1,9 +1,11 @@
 import gym
 import numpy as np
+import matplotlib.pyplot as plt
+


 class MountainCar:
-    def __init__(self, animation, feature_matrix, one_feature, q_learning_rate, gamma):
+    def __init__(self, animation, feature_matrix, one_feature, q_learning_rate, gamma, n_states, trainer):
         if animation:
             self.env = gym.make('MountainCar-v0', render_mode="human")
         else:
@@ -13,6 +15,8 @@ def __init__(self, animation, feature_matrix, one_feature, q_learning_rate, gamm
         self.q_table = None
         self.q_learning_rate = q_learning_rate
         self.gamma = gamma
+        self.n_states = n_states
+        self.trainer = trainer

     def __enter__(self):
         return self
@@ -42,15 +46,6 @@ def idx_demo(self, one_feature):

         return demonstrations

-    def idx_state(self, state):
-        env_low = self.env.observation_space.low
-        env_high = self.env.observation_space.high
-        env_distance = (env_high - env_low) / self.one_feature
-        position_idx = int((state[0] - env_low[0]) / env_distance[0])
-        velocity_idx = int((state[1] - env_low[1]) / env_distance[1])
-        state_idx = position_idx + velocity_idx * self.one_feature
-        return state_idx
-
     def idx_to_state(self, state):
         """ Convert pos and vel about mounting car environment to the integer value"""
         env_low = self.env.observation_space.low
@@ -74,3 +69,71 @@ def env_reset(self):

     def env_step(self, action):
         return self.env.step(action)
+
+    def train(self, theta_learning_rate):
+        demonstrations = self.idx_demo(self.one_feature)
+
+        expert = self.trainer.expert_feature_expectations(demonstrations)
+        learner_feature_expectations = np.zeros(self.n_states)
+        episodes, scores = [], []
+
+        for episode in range(30000):
+            state = self.env_reset()
+            score = 0
+
+            if (episode != 0 and episode == 10000) or (episode > 10000 and episode % 5000 == 0):
+                learner = learner_feature_expectations / episode
+                self.trainer.maxent_irl(expert, learner, theta_learning_rate)
+
+            state = state[0]
+            while True:
+                state_idx = self.idx_to_state(state)
+                action = np.argmax(self.q_table[state_idx])
+                next_state, reward, done, _, _ = self.env_step(action)
+
+                irl_reward = self.trainer.get_reward(self.n_states, state_idx)
+                next_state_idx = self.idx_to_state(next_state)
+                self.update_q_table(state_idx, action, irl_reward, next_state_idx)
+
+                learner_feature_expectations += self.trainer.get_feature_matrix()[int(state_idx)]
+
+                score += reward
+                state = next_state
+                if done:
+                    scores.append(score)
+                    episodes.append(episode)
+                    break
+
+            if episode % 100 == 0:
+                score_avg = np.mean(scores)
+                print('{} episode score is {:.2f}'.format(episode, score_avg))
+                plt.plot(episodes, scores, 'b')
+                plt.savefig("./learning_curves/maxent_30000.png")
+                np.save("./results/maxent_30000_table", arr=self.q_table)
+
+    def test(self):
+        episodes, scores = [], []
+
+        for episode in range(10):
+            state = self.env_reset()
+            score = 0
+
+            state = state[0]
+            while True:
+                self.env_render()
+                state_idx = self.idx_to_state(state)
+                action = np.argmax(self.q_table[state_idx])
+                next_state, reward, done, _, _ = self.env_step(action)
+
+                score += reward
+                state = next_state
+
+                if done:
+                    scores.append(score)
+                    episodes.append(episode)
+                    plt.plot(episodes, scores, 'b')
+                    plt.savefig("./learning_curves/maxent_test_30000.png")
+                    break
+
+            if episode % 1 == 0:
+                print('{} episode score is {:.2f}'.format(episode, score))
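Note on the state indexing used by the relocated train()/test() loops: the duplicate idx_state() helper is removed and both loops now go through idx_to_state(), which buckets the continuous MountainCar observation (position, velocity) into one_feature = 20 bins per dimension, giving the n_states = 400 discrete indices that the Q-table and the identity feature matrix are built on. A minimal, self-contained sketch of that mapping, written as a hypothetical free function adapted from the removed idx_state(), with the MountainCar-v0 observation bounds hard-coded instead of read from env.observation_space:

import numpy as np

def idx_to_state(obs, env_low, env_high, one_feature=20):
    # Width of one bucket along each dimension (position, velocity).
    env_distance = (env_high - env_low) / one_feature
    position_idx = int((obs[0] - env_low[0]) / env_distance[0])
    velocity_idx = int((obs[1] - env_low[1]) / env_distance[1])
    # Flatten the two bucket indices: 20 position bins x 20 velocity bins = 400 states.
    return position_idx + velocity_idx * one_feature

# MountainCar-v0 bounds: position in [-1.2, 0.6], velocity in [-0.07, 0.07].
low, high = np.array([-1.2, -0.07]), np.array([0.6, 0.07])
print(idx_to_state(np.array([-0.5, 0.0]), low, high))  # an index in range(400)
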
diff --git a/src/irlwpytorch/learning_curves/maxent_30000.png b/src/irlwpytorch/learning_curves/maxent_30000.png
index b7b84f9..31485da 100644
Binary files a/src/irlwpytorch/learning_curves/maxent_30000.png and b/src/irlwpytorch/learning_curves/maxent_30000.png differ
diff --git a/src/irlwpytorch/learning_curves/maxent_test_30000.png b/src/irlwpytorch/learning_curves/maxent_test_30000.png
index 8bee37a..a361f55 100644
Binary files a/src/irlwpytorch/learning_curves/maxent_test_30000.png and b/src/irlwpytorch/learning_curves/maxent_test_30000.png differ
diff --git a/src/irlwpytorch/main.py b/src/irlwpytorch/main.py
index 4ef4a14..4ba8e18 100644
--- a/src/irlwpytorch/main.py
+++ b/src/irlwpytorch/main.py
@@ -1,36 +1,12 @@
-"""
-This is a skeleton file that can serve as a starting point for a Python
-console script. To run this script uncomment the following lines in the
-``[options.entry_points]`` section in ``setup.cfg``::
-
-    console_scripts =
-         fibonacci = irlwpytorch.skeleton:run
-
-Then run ``pip install .`` (or ``pip install -e .`` for editable mode)
-which will install the command ``fibonacci`` inside your current environment.
-
-Besides console scripts, the header (i.e. until ``_logger``...) of this file can
-also be used as template for Python modules.
-
-Note:
-    This file can be renamed depending on your needs or safely removed if not needed.
-
-References:
-    - https://setuptools.pypa.io/en/latest/userguide/entry_point.html
-    - https://pip.pypa.io/en/stable/reference/pip_install
-"""
-
 import argparse
-import gym
-import matplotlib.pyplot as plt
 import logging
 import numpy as np
 import sys

-from .MountainCar import MountainCar
-from .MaxEntropyIRL import MaxEntropyIRL
+from MountainCar import MountainCar
+from MaxEntropyIRL import MaxEntropyIRL

-# from irlwpytorch import __version__
+#from irlwpytorch import __version__

 __author__ = "HokageM"
 __copyright__ = "HokageM"
@@ -55,7 +31,7 @@ def parse_args(args):
     parser.add_argument(
         "--version",
         action="version",
-        # version=f"IRLwPytorch {__version__}",
+        # version=f"IRLwPytorch {__version__}",
     )
     parser.add_argument('--training', action='store_true', help="Enables training of model.")
     parser.add_argument('--testing', action='store_true',
@@ -92,103 +68,36 @@ def main(args):
     n_states = 400  # position - 20, velocity - 20
     n_actions = 3
     one_feature = 20  # number of state per one feature
-    feature_matrix = np.eye((n_states))  # (400, 400)
+    feature_matrix = np.eye(n_states)  # (400, 400)

     gamma = 0.99
     q_learning_rate = 0.03
     theta_learning_rate = 0.05

-    car = None
-    if args.render:
-        car = MountainCar(True, feature_matrix, one_feature, q_learning_rate, gamma)
-    else:
-        car = MountainCar(False, feature_matrix, one_feature, q_learning_rate, gamma)
-
     theta = -(np.random.uniform(size=(n_states,)))
     trainer = MaxEntropyIRL(feature_matrix, theta)

+    if args.render:
+        car = MountainCar(True, feature_matrix, one_feature, q_learning_rate, gamma, n_states, trainer)
+    else:
+        car = MountainCar(False, feature_matrix, one_feature, q_learning_rate, gamma, n_states, trainer)
+
     if args.training:
-        q_table = np.zeros((n_states, n_actions))  # (400, 3)
+        q_table = np.zeros((n_states, n_actions))
         car.set_q_table(q_table)

-        demonstrations = car.idx_demo(one_feature)
-
-        expert = trainer.expert_feature_expectations(demonstrations)
-        learner_feature_expectations = np.zeros(n_states)
-        episodes, scores = [], []
-
-        for episode in range(30000):
-            state = car.env_reset()
-            score = 0
-
-            if (episode != 0 and episode == 10000) or (episode > 10000 and episode % 5000 == 0):
-                learner = learner_feature_expectations / episode
-                trainer.maxent_irl(expert, learner, theta_learning_rate)
-
-            state = state[0]
-            while True:
-                state_idx = car.idx_state(state)
-                action = np.argmax(q_table[state_idx])
-                next_state, reward, done, _, _ = car.env_step(action)
-
-                irl_reward = trainer.get_reward(n_states, state_idx)
-                next_state_idx = car.idx_state(next_state)
-                car.update_q_table(state_idx, action, irl_reward, next_state_idx)
-
-                learner_feature_expectations += trainer.get_feature_matrix()[int(state_idx)]
-
-                score += reward
-                state = next_state
-                if done:
-                    scores.append(score)
-                    episodes.append(episode)
-                    break
-
-            if episode % 100 == 0:
-                score_avg = np.mean(scores)
-                print('{} episode score is {:.2f}'.format(episode, score_avg))
-                plt.plot(episodes, scores, 'b')
-                plt.savefig("./learning_curves/maxent_30000.png")
-                np.save("./results/maxent_30000_table", arr=q_table)
+        car.train(theta_learning_rate)

     if args.testing:
-        q_table = np.load(file="results/maxent_q_table.npy")  # (400, 3)
+        q_table = np.load(file="./results/maxent_q_table.npy")  # (400, 3)
         car.set_q_table(q_table)

-        episodes, scores = [], []
-
-        for episode in range(10):
-            state = car.env_reset()
-            score = 0
-
-            state = state[0]
-            while True:
-                car.env_render()
-                state_idx = car.idx_to_state(state)
-                action = np.argmax(q_table[state_idx])
-                next_state, reward, done, _, _ = car.env_step(action)
-
-                score += reward
-                state = next_state
-
-                if done:
-                    scores.append(score)
-                    episodes.append(episode)
-                    plt.plot(episodes, scores, 'b')
-                    plt.savefig("./learning_curves/maxent_test_30000.png")
-                    break
-
-            if episode % 1 == 0:
-                print('{} episode score is {:.2f}'.format(episode, score))
+        car.test()

     _logger.info("Script ends here")


 def run():
-    """Calls :func:`main` passing the CLI arguments extracted from :obj:`sys.argv`
-
-    This function can be used as entry point to create console scripts with setuptools.
-    """
     main(sys.argv[1:])


diff --git a/src/irlwpytorch/results/maxent_30000_table.npy b/src/irlwpytorch/results/maxent_30000_table.npy
index ba4d345..da63953 100644
Binary files a/src/irlwpytorch/results/maxent_30000_table.npy and b/src/irlwpytorch/results/maxent_30000_table.npy differ
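After this change, main.py only wires up the objects and delegates the loops to the MountainCar class, which now receives the MaxEntropyIRL trainer in its constructor. A minimal driver sketch of the refactored interface, assuming the same hyperparameters as main.py and that a Q-table has already been saved to ./results/maxent_q_table.npy for the testing phase; the repository's actual entry point remains main() with the --training/--testing/--render flags:

import numpy as np
from MountainCar import MountainCar
from MaxEntropyIRL import MaxEntropyIRL

n_states, n_actions, one_feature = 400, 3, 20  # 20 position bins x 20 velocity bins
feature_matrix = np.eye(n_states)
theta = -(np.random.uniform(size=(n_states,)))
trainer = MaxEntropyIRL(feature_matrix, theta)

# The trainer is injected into MountainCar, which now owns the train/test loops.
car = MountainCar(False, feature_matrix, one_feature, 0.03, 0.99, n_states, trainer)

# Training (as with --training): 30000 episodes, writes the learning curve and Q-table.
car.set_q_table(np.zeros((n_states, n_actions)))
car.train(theta_learning_rate=0.05)

# Testing (as with --testing): load a previously saved Q-table, then roll out 10 episodes.
car.set_q_table(np.load(file="./results/maxent_q_table.npy"))
car.test()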