83 changes: 73 additions & 10 deletions src/irlwpytorch/MountainCar.py
@@ -1,9 +1,11 @@
import gym
import numpy as np
import matplotlib.pyplot as plt


class MountainCar:

def __init__(self, animation, feature_matrix, one_feature, q_learning_rate, gamma):
def __init__(self, animation, feature_matrix, one_feature, q_learning_rate, gamma, n_states, trainer):
if animation:
self.env = gym.make('MountainCar-v0', render_mode="human")
else:
@@ -13,6 +15,8 @@ def __init__(self, animation, feature_matrix, one_feature, q_learning_rate, gamm
self.q_table = None
self.q_learning_rate = q_learning_rate
self.gamma = gamma
self.n_states = n_states
self.trainer = trainer

def __enter__(self):
return self
@@ -42,15 +46,6 @@ def idx_demo(self, one_feature):

return demonstrations

def idx_state(self, state):
env_low = self.env.observation_space.low
env_high = self.env.observation_space.high
env_distance = (env_high - env_low) / self.one_feature
position_idx = int((state[0] - env_low[0]) / env_distance[0])
velocity_idx = int((state[1] - env_low[1]) / env_distance[1])
state_idx = position_idx + velocity_idx * self.one_feature
return state_idx

def idx_to_state(self, state):
""" Convert pos and vel about mounting car environment to the integer value"""
env_low = self.env.observation_space.low
@@ -74,3 +69,71 @@ def env_reset(self):

def env_step(self, action):
return self.env.step(action)

def train(self, theta_learning_rate):
demonstrations = self.idx_demo(self.one_feature)

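# Expert feature expectations, estimated by the trainer from the recorded demonstrations.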
expert = self.trainer.expert_feature_expectations(demonstrations)
learner_feature_expectations = np.zeros(self.n_states)
episodes, scores = [], []

for episode in range(30000):
state = self.env_reset()
score = 0

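# At episode 10000, then every 5000 episodes, refit the reward weights theta from the gap between expert and learner feature expectations.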
if episode == 10000 or (episode > 10000 and episode % 5000 == 0):
learner = learner_feature_expectations / episode
self.trainer.maxent_irl(expert, learner, theta_learning_rate)

state = state[0]
while True:
state_idx = self.idx_to_state(state)
action = np.argmax(self.q_table[state_idx])
next_state, reward, terminated, truncated, _ = self.env_step(action)
done = terminated or truncated

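# The Q-update is driven by the learned IRL reward instead of the environment reward.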
irl_reward = self.trainer.get_reward(self.n_states, state_idx)
next_state_idx = self.idx_to_state(next_state)
self.update_q_table(state_idx, action, irl_reward, next_state_idx)

learner_feature_expectations += self.trainer.get_feature_matrix()[int(state_idx)]

score += reward
state = next_state
if done:
scores.append(score)
episodes.append(episode)
break

if episode % 100 == 0:
score_avg = np.mean(scores)
print('{} episode score is {:.2f}'.format(episode, score_avg))
plt.plot(episodes, scores, 'b')
plt.savefig("./learning_curves/maxent_30000.png")
np.save("./results/maxent_30000_table", arr=self.q_table)

def test(self):
episodes, scores = [], []

for episode in range(10):
state = self.env_reset()
score = 0

state = state[0]
while True:
self.env_render()
state_idx = self.idx_to_state(state)
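# Greedy action from the learned Q-table; no exploration at test time.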
action = np.argmax(self.q_table[state_idx])
next_state, reward, terminated, truncated, _ = self.env_step(action)
done = terminated or truncated

score += reward
state = next_state

if done:
scores.append(score)
episodes.append(episode)
plt.plot(episodes, scores, 'b')
plt.savefig("./learning_curves/maxent_test_30000.png")
break

print('{} episode score is {:.2f}'.format(episode, score))
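Note: train() and test() above call into the MaxEntropyIRL trainer, but MaxEntropyIRL.py is not part of this diff. The sketch below is a hypothetical shape of that class, reconstructed only from the call sites above (expert_feature_expectations, maxent_irl, get_reward, get_feature_matrix); the repository's actual implementation may differ, and the per-step triple unpacking of a demonstration is an assumption.

import numpy as np


class MaxEntropyIRL:
    def __init__(self, feature_matrix, theta):
        self.feature_matrix = feature_matrix  # (400, 400) identity: one-hot state features
        self.theta = theta  # (400,) learnable reward weights

    def get_feature_matrix(self):
        return self.feature_matrix

    def get_reward(self, n_states, state_idx):
        # Linear reward r(s) = phi(s) . theta; with one-hot features this
        # reduces to theta[state_idx].
        irl_rewards = self.feature_matrix.dot(self.theta)
        return irl_rewards[int(state_idx)]

    def expert_feature_expectations(self, demonstrations):
        # Sum the feature vectors of every state the expert visited,
        # averaged over the number of demonstrations.
        feature_expectations = np.zeros(self.feature_matrix.shape[0])
        for demonstration in demonstrations:
            for state_idx, _, _ in demonstration:  # assumed (state, action, ...) layout
                feature_expectations += self.feature_matrix[int(state_idx)]
        return feature_expectations / len(demonstrations)

    def maxent_irl(self, expert, learner, learning_rate):
        # MaxEnt gradient step: the gradient is the gap between expert
        # and learner feature expectations.
        gradient = expert - learner
        self.theta += learning_rate * gradient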
Binary file modified src/irlwpytorch/learning_curves/maxent_30000.png
Binary file modified src/irlwpytorch/learning_curves/maxent_test_30000.png
119 changes: 14 additions & 105 deletions src/irlwpytorch/main.py
@@ -1,36 +1,12 @@
"""
This is a skeleton file that can serve as a starting point for a Python
console script. To run this script uncomment the following lines in the
``[options.entry_points]`` section in ``setup.cfg``::

console_scripts =
fibonacci = irlwpytorch.skeleton:run

Then run ``pip install .`` (or ``pip install -e .`` for editable mode)
which will install the command ``fibonacci`` inside your current environment.

Besides console scripts, the header (i.e. until ``_logger``...) of this file can
also be used as template for Python modules.

Note:
This file can be renamed depending on your needs or safely removed if not needed.

References:
- https://setuptools.pypa.io/en/latest/userguide/entry_point.html
- https://pip.pypa.io/en/stable/reference/pip_install
"""

import argparse
import gym
import matplotlib.pyplot as plt
import logging
import numpy as np
import sys

from .MountainCar import MountainCar
from .MaxEntropyIRL import MaxEntropyIRL
from MountainCar import MountainCar
from MaxEntropyIRL import MaxEntropyIRL

# from irlwpytorch import __version__
#from irlwpytorch import __version__

__author__ = "HokageM"
__copyright__ = "HokageM"
@@ -55,7 +31,7 @@ def parse_args(args):
parser.add_argument(
"--version",
action="version",
# version=f"IRLwPytorch {__version__}",
# version=f"IRLwPytorch {__version__}",
)
parser.add_argument('--training', action='store_true', help="Enables training of model.")
parser.add_argument('--testing', action='store_true',
@@ -92,103 +68,36 @@ def main(args):
n_states = 400  # 20 position bins x 20 velocity bins
n_actions = 3
one_feature = 20  # number of discretized bins per feature
feature_matrix = np.eye((n_states)) # (400, 400)
feature_matrix = np.eye(n_states) # (400, 400)

gamma = 0.99
q_learning_rate = 0.03
theta_learning_rate = 0.05

car = None
if args.render:
car = MountainCar(True, feature_matrix, one_feature, q_learning_rate, gamma)
else:
car = MountainCar(False, feature_matrix, one_feature, q_learning_rate, gamma)

theta = -(np.random.uniform(size=(n_states,)))
trainer = MaxEntropyIRL(feature_matrix, theta)

if args.render:
car = MountainCar(True, feature_matrix, one_feature, q_learning_rate, gamma, n_states, trainer)
else:
car = MountainCar(False, feature_matrix, one_feature, q_learning_rate, gamma, n_states, trainer)

if args.training:
q_table = np.zeros((n_states, n_actions)) # (400, 3)
q_table = np.zeros((n_states, n_actions))
car.set_q_table(q_table)

demonstrations = car.idx_demo(one_feature)

expert = trainer.expert_feature_expectations(demonstrations)
learner_feature_expectations = np.zeros(n_states)
episodes, scores = [], []

for episode in range(30000):
state = car.env_reset()
score = 0

if (episode != 0 and episode == 10000) or (episode > 10000 and episode % 5000 == 0):
learner = learner_feature_expectations / episode
trainer.maxent_irl(expert, learner, theta_learning_rate)

state = state[0]
while True:
state_idx = car.idx_state(state)
action = np.argmax(q_table[state_idx])
next_state, reward, done, _, _ = car.env_step(action)

irl_reward = trainer.get_reward(n_states, state_idx)
next_state_idx = car.idx_state(next_state)
car.update_q_table(state_idx, action, irl_reward, next_state_idx)

learner_feature_expectations += trainer.get_feature_matrix()[int(state_idx)]

score += reward
state = next_state
if done:
scores.append(score)
episodes.append(episode)
break

if episode % 100 == 0:
score_avg = np.mean(scores)
print('{} episode score is {:.2f}'.format(episode, score_avg))
plt.plot(episodes, scores, 'b')
plt.savefig("./learning_curves/maxent_30000.png")
np.save("./results/maxent_30000_table", arr=q_table)
car.train(theta_learning_rate)

if args.testing:
q_table = np.load(file="results/maxent_q_table.npy") # (400, 3)
q_table = np.load(file="./results/maxent_q_table.npy") # (400, 3)
car.set_q_table(q_table)

episodes, scores = [], []

for episode in range(10):
state = car.env_reset()
score = 0

state = state[0]
while True:
car.env_render()
state_idx = car.idx_to_state(state)
action = np.argmax(q_table[state_idx])
next_state, reward, done, _, _ = car.env_step(action)

score += reward
state = next_state

if done:
scores.append(score)
episodes.append(episode)
plt.plot(episodes, scores, 'b')
plt.savefig("./learning_curves/maxent_test_30000.png")
break

if episode % 1 == 0:
print('{} episode score is {:.2f}'.format(episode, score))
car.test()

_logger.info("Script ends here")


def run():
"""Calls :func:`main` passing the CLI arguments extracted from :obj:`sys.argv`

This function can be used as entry point to create console scripts with setuptools.
"""
main(sys.argv[1:])


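For reference, a minimal usage sketch of the refactored entry point. It assumes the module is imported from src/irlwpytorch (matching the new absolute imports), that --render is defined in the unchanged parser code (its handler, args.render, is visible above), and that ./learning_curves/ and ./results/ already exist in the working directory; these are assumptions, not part of this diff.

from main import main

# Train head-less for 30000 episodes; writes ./learning_curves/maxent_30000.png
# and ./results/maxent_30000_table.npy relative to the working directory.
main(["--training"])

# Replay 10 greedy episodes with rendering, using the saved Q-table.
main(["--testing", "--render"])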
Binary file modified src/irlwpytorch/results/maxent_30000_table.npy
Binary file not shown.