diff --git a/README.md b/README.md
index b97274f..b2a3667 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
# IRLwPython
-
+
Inverse Reinforcement Learning Algorithm implementation with python.
diff --git a/logo/IRLwPython.jpg b/logo/IRLwPython.jpg
deleted file mode 100644
index 9ebeed7..0000000
Binary files a/logo/IRLwPython.jpg and /dev/null differ
diff --git a/logo/IRLwPython.png b/logo/IRLwPython.png
new file mode 100644
index 0000000..12dd0c0
Binary files /dev/null and b/logo/IRLwPython.png differ
diff --git a/src/irlwpython/MaxEntropyDeepIRL.py b/src/irlwpython/MaxEntropyDeepIRL.py
new file mode 100644
index 0000000..1586ab2
--- /dev/null
+++ b/src/irlwpython/MaxEntropyDeepIRL.py
@@ -0,0 +1,185 @@
+import gym
+import numpy as np
+import torch
+import torch.optim as optim
+import torch.nn as nn
+import matplotlib.pyplot as plt
+
+
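+# Actor network: maps a state to one Q-value per discrete action.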
+class ActorNetwork(nn.Module):
+ def __init__(self, num_inputs, num_output, hidden_size):
+ super(ActorNetwork, self).__init__()
+ self.fc1 = nn.Linear(num_inputs, hidden_size)
+ self.fc2 = nn.Linear(hidden_size, hidden_size)
+ self.fc3 = nn.Linear(hidden_size, num_output)
+
+ def forward(self, x):
+ x = nn.functional.relu(self.fc1(x))
+ x = nn.functional.relu(self.fc2(x))
+        return self.fc3(x)  # raw Q-values; a softmax could turn these into action probabilities
+
+
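+# Critic network: scores a concatenated state-action vector; its output is
+# used as the learned (IRL) reward. The theta_layer adds a linear term
+# theta(x) . x on top of the fc3 value head.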
+class CriticNetwork(nn.Module):
+ def __init__(self, num_inputs, hidden_size):
+ super(CriticNetwork, self).__init__()
+ self.fc1 = nn.Linear(num_inputs, hidden_size)
+ self.fc2 = nn.Linear(hidden_size, hidden_size)
+ self.fc3 = nn.Linear(hidden_size, 1)
+
+ self.theta_layer = nn.Linear(hidden_size, 3)
+
+ def forward(self, x):
+ x_ = nn.functional.relu(self.fc1(x))
+ x_ = nn.functional.relu(self.fc2(x_))
+ theta_ = self.theta_layer(x_)
+ return self.fc3(x_) + torch.matmul(theta_, x)
+
+
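+# Deep variant of Maximum Entropy IRL: the actor network learns Q-values from
+# the critic's learned reward, while the critic is updated so the learner's
+# feature expectations approach the expert's.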
+class MaxEntropyDeepIRL:
+ def __init__(self, target, state_dim, action_dim, learning_rate=0.001, gamma=0.99, num_epochs=1000):
+ self.target = target
+ self.state_dim = state_dim
+ self.action_dim = action_dim
+ self.learning_rate = learning_rate
+ # self.theta = torch.rand(state_dim + 1, requires_grad=True)
+ self.gamma = gamma
+ self.num_epochs = num_epochs
+ self.actor_network = ActorNetwork(state_dim, action_dim, 100)
+ self.critic_network = CriticNetwork(state_dim + 1, 100)
+ self.optimizer_actor = optim.Adam(self.actor_network.parameters(), lr=learning_rate)
+ self.optimizer_critic = optim.Adam(self.critic_network.parameters(), lr=learning_rate)
+
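+    # The critic's output for a (state, action) pair is used as the learned reward.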
+    def get_reward(self, state, action):
+        state_action = torch.tensor(list(state) + [action], dtype=torch.float32)
+        return self.critic_network(state_action)
+
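+    # Expert feature expectations: sum the (discretized) state features over all
+    # demonstration steps and average over the trajectories.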
+ def expert_feature_expectations(self, demonstrations):
+ feature_expectations = torch.zeros(self.state_dim)
+
+ for demonstration in demonstrations:
+ for state, _, _ in demonstration:
+ state_tensor = torch.tensor(state, dtype=torch.float32)
+ feature_expectations += state_tensor.squeeze()
+
+ feature_expectations /= demonstrations.shape[0]
+ return feature_expectations
+
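+    # MaxEnt IRL step: update the critic from the gap between the learner's and
+    # the expert's feature expectations (MSE between the two vectors).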
+ def maxent_irl(self, expert, learner):
+ # Update critic network
+
+ self.optimizer_critic.zero_grad()
+
+ # Loss function for critic network
+ loss_critic = torch.nn.functional.mse_loss(learner, expert)
+ loss_critic.backward()
+
+ self.optimizer_critic.step()
+
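+    # One Q-learning step on the actor network using the IRL reward.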
+ def update_q_network(self, state_array, action, reward, next_state):
+ self.optimizer_actor.zero_grad()
+
+ state_tensor = torch.tensor(state_array, dtype=torch.float32)
+ next_state_tensor = torch.tensor(next_state, dtype=torch.float32)
+
+        # TD(0) target for the taken action, mirroring update_q_table() in the
+        # tabular MaxEntropyIRL; the target is detached so gradients only flow
+        # through the current Q-estimate.
+        q_value = self.actor_network(state_tensor)[action]
+        with torch.no_grad():
+            target = reward + self.gamma * self.actor_network(next_state_tensor).max()
+
+        loss_actor = nn.functional.mse_loss(q_value, target.squeeze())
+ loss_actor.backward()
+ self.optimizer_actor.step()
+
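+    # Load the expert trajectories and discretize (position, velocity) onto a
+    # 20x20 grid of state indices, as in the tabular MaxEntropyIRL.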
+ def get_demonstrations(self):
+ env_low = self.target.observation_space.low
+ env_high = self.target.observation_space.high
+ env_distance = (env_high - env_low) / 20 # self.one_feature
+
+ raw_demo = np.load(file="expert_demo/expert_demo.npy")
+ demonstrations = np.zeros((len(raw_demo), len(raw_demo[0]), 3))
+ for x in range(len(raw_demo)):
+ for y in range(len(raw_demo[0])):
+ position_idx = int((raw_demo[x][y][0] - env_low[0]) / env_distance[0])
+ velocity_idx = int((raw_demo[x][y][1] - env_low[1]) / env_distance[1])
+ state_idx = position_idx + velocity_idx * 20 # self.one_feature
+
+ demonstrations[x][y][0] = state_idx
+ demonstrations[x][y][1] = raw_demo[x][y][2]
+
+ return demonstrations
+
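+    # Training loop: roll out the greedy policy, accumulate the learner's feature
+    # expectations, and periodically run the MaxEnt IRL critic update.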
+ def train(self):
+ demonstrations = self.get_demonstrations()
+ expert = self.expert_feature_expectations(demonstrations)
+
+        # requires_grad so the feature-matching loss in maxent_irl() can backpropagate
+        learner_feature_expectations = torch.zeros(self.state_dim, requires_grad=True)
+ episodes, scores = [], []
+
+ for episode in range(self.num_epochs):
+ state, info = self.target.reset()
+ score = 0
+
+            # Periodic MaxEnt IRL update: first after 10 episodes, then every 5.
+            if episode == 10 or (episode > 10 and episode % 5 == 0):
+ learner = learner_feature_expectations / episode
+ self.maxent_irl(expert, learner)
+
+ while True:
+ state_tensor = torch.tensor(state, dtype=torch.float32)
+
+ q_state = self.actor_network(state_tensor)
+ action = torch.argmax(q_state).item()
+ next_state, reward, done, _, _ = self.target.step(action)
+
+ irl_reward = self.get_reward(state, action)
+ self.update_q_network(state, action, irl_reward, next_state)
+
+                # Accumulate the learner's state-feature visitation.
+                learner_feature_expectations = learner_feature_expectations + state_tensor.squeeze()
+
+ score += reward
+ state = next_state
+ if done:
+ scores.append(score)
+ episodes.append(episode)
+ break
+
+ if episode % 1 == 0:
+ score_avg = np.mean(scores)
+ print('{} episode score is {:.2f}'.format(episode, score_avg))
+ plt.plot(episodes, scores, 'b')
+ plt.savefig("./learning_curves/maxent_30000_network.png")
+
+        torch.save(self.actor_network.state_dict(), "./results/maxent_30000_q_network.pth")
+
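+    # Evaluate the greedy policy of the trained actor network.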
+ def test(self):
+ episodes, scores = [], []
+
+ for episode in range(10):
+            state, info = self.target.reset()
+ score = 0
+
+ while True:
+ self.target.render()
+ state_tensor = torch.tensor(state, dtype=torch.float32).unsqueeze(0)
+
+                action = torch.argmax(self.actor_network(state_tensor)).item()
+ next_state, reward, done, _, _ = self.target.step(action)
+
+ score += reward
+ state = next_state
+
+ if done:
+ scores.append(score)
+ episodes.append(episode)
+ plt.plot(episodes, scores, 'b')
+ plt.savefig("./learning_curves/maxent_test_30000_network.png")
+ break
+
+ if episode % 1 == 0:
+ print('{} episode score is {:.2f}'.format(episode, score))
diff --git a/src/irlwpython/MaxEntropyIRL.py b/src/irlwpython/MaxEntropyIRL.py
index fc933a2..7c8e8da 100644
--- a/src/irlwpython/MaxEntropyIRL.py
+++ b/src/irlwpython/MaxEntropyIRL.py
@@ -64,7 +64,7 @@ def maxent_irl(self, expert, learner, learning_rate):
# Clip theta
for j in range(len(self.theta)):
- if self.theta[j] > 0:
+            if self.theta[j] > 0:  # theta holds log values, so clip them to be non-positive
self.theta[j] = 0
def update_q_table(self, state, action, reward, next_state):
@@ -101,9 +101,11 @@ def train(self, theta_learning_rate):
state = self.target.env_reset()
score = 0
- # Mini-Batches ?
+            # Periodic (mini-batch style) theta update:
if (episode != 0 and episode == 10000) or (episode > 10000 and episode % 5000 == 0):
+                # Calculate the learner's state-visitation density
learner = learner_feature_expectations / episode
+ # Maximum Entropy IRL step
self.maxent_irl(expert, learner, theta_learning_rate)
# One Step in environment
@@ -115,12 +117,16 @@ def train(self, theta_learning_rate):
# Run one timestep of the environment's dynamics.
next_state, reward, done, _, _ = self.target.env_step(action)
+            # Get the IRL pseudo-reward and update the Q-table
irl_reward = self.get_reward(self.n_states, state_idx)
next_state_idx = self.target.idx_to_state(next_state)
self.update_q_table(state_idx, action, irl_reward, next_state_idx)
+            # State counting for the density estimate
learner_feature_expectations += self.get_feature_matrix()[int(state_idx)]
+            print("Env reward:", reward, "IRL reward:", irl_reward)
+
score += reward
state = next_state
if done:
diff --git a/src/irlwpython/expert_demo/expert_demo.p b/src/irlwpython/expert_demo/expert_demo.p
new file mode 100644
index 0000000..7eb39bd
Binary files /dev/null and b/src/irlwpython/expert_demo/expert_demo.p differ
diff --git a/src/irlwpython/learning_curves/maxent_30000.png b/src/irlwpython/learning_curves/maxent_30000.png
index f848113..90f9663 100644
Binary files a/src/irlwpython/learning_curves/maxent_30000.png and b/src/irlwpython/learning_curves/maxent_30000.png differ
diff --git a/src/irlwpython/learning_curves/maxent_test_30000.png b/src/irlwpython/learning_curves/maxent_test_30000.png
index 78214c5..84e9c8e 100644
Binary files a/src/irlwpython/learning_curves/maxent_test_30000.png and b/src/irlwpython/learning_curves/maxent_test_30000.png differ
diff --git a/src/irlwpython/main.py b/src/irlwpython/main.py
index 811efea..4c561d1 100644
--- a/src/irlwpython/main.py
+++ b/src/irlwpython/main.py
@@ -9,6 +9,8 @@
#from irlwpython import __version__
+import gym
+
__author__ = "HokageM"
__copyright__ = "HokageM"
__license__ = "MIT"
@@ -74,8 +76,9 @@ def main(args):
gamma = 0.99
q_learning_rate = 0.03
- theta_learning_rate = 0.05
+    # Theta acts as the critic
+ theta_learning_rate = 0.05
theta = -(np.random.uniform(size=(n_states,)))
if args.render:
@@ -83,9 +86,17 @@ def main(args):
else:
car = MountainCar(False, one_feature)
- #if args.deep:
- # deep = MaxEntropyDeepIRL()
- # deep.run()
+ if args.deep:
+
+ # Create MountainCar environment
+ env = gym.make('MountainCar-v0', render_mode="human")
+ state_dim = env.observation_space.shape[0]
+ action_dim = env.action_space.n
+
+ # Run MaxEnt Deep IRL using MountainCar environment
+ maxent_deep_irl_agent = MaxEntropyDeepIRL(env, state_dim, action_dim)
+ maxent_deep_irl_agent.train()
+ maxent_deep_irl_agent.test()
if args.training:
q_table = np.zeros((n_states, n_actions))
diff --git a/src/irlwpython/results/maxent_30000_table.npy b/src/irlwpython/results/maxent_30000_table.npy
index 3f4aa26..bee0b76 100644
Binary files a/src/irlwpython/results/maxent_30000_table.npy and b/src/irlwpython/results/maxent_30000_table.npy differ
diff --git a/src/irlwpython/utils/utils.py b/src/irlwpython/utils/utils.py
new file mode 100644
index 0000000..bb81f4c
--- /dev/null
+++ b/src/irlwpython/utils/utils.py
@@ -0,0 +1,46 @@
+# from: https://github.com/reinforcement-learning-kr/lets-do-irl/
+
+import math
+import torch
+from torch.distributions import Normal
+
+
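+# Sample a continuous action from N(mu, std) and snap it to the nearest
+# discrete action in {0, 1, 2}.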
+def get_action(mu, std):
+ action = torch.normal(mu, std)
+ action = action.data.numpy()
+ action_list = [0, 1, 2]
+ return min(action_list, key=lambda x: abs(x - action))
+
+
+def get_entropy(mu, std):
+ dist = Normal(mu, std)
+ entropy = dist.entropy().mean()
+ return entropy
+
+
+def log_prob_density(x, mu, std):
+    # Gaussian log-density: -(x - mu)^2 / (2 std^2) - log(std) - 0.5 * log(2 pi)
+    log_prob_density = -(x - mu).pow(2) / (2 * std.pow(2)) \
+                       - torch.log(std) \
+                       - 0.5 * math.log(2 * math.pi)
+    return log_prob_density.sum(1, keepdim=True)
+
+
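+# Discriminator-based reward for a state-action pair: -log(D(s, a)).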
+def get_reward(discrim, state, action):
+    state = torch.Tensor(state)
+    action = torch.Tensor(action)
+    state_action = torch.cat([state, action])
+
+    with torch.no_grad():
+        return -math.log(discrim(state_action)[0].item())
+
+
+def save_checkpoint(state, filename):
+ torch.save(state, filename)
diff --git a/src/irlwpython/utils/zfilter.py b/src/irlwpython/utils/zfilter.py
new file mode 100644
index 0000000..4c9a6e6
--- /dev/null
+++ b/src/irlwpython/utils/zfilter.py
@@ -0,0 +1,86 @@
+import numpy as np
+
+
+# from https://github.com/joschu/modular_rl
+# http://www.johndcook.com/blog/standard_deviation/
+
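+# Running mean and variance, updated online (Welford's algorithm).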
+class RunningStat(object):
+ def __init__(self, shape):
+ self._n = 0
+ self._M = np.zeros(shape)
+ self._S = np.zeros(shape)
+
+ def push(self, x):
+ x = np.asarray(x)
+ assert x.shape == self._M.shape
+ self._n += 1
+ if self._n == 1:
+ self._M[...] = x
+ else:
+ oldM = self._M.copy()
+ self._M[...] = oldM + (x - oldM) / self._n
+ self._S[...] = self._S + (x - oldM) * (x - self._M)
+
+ @property
+ def n(self):
+ return self._n
+
+ @n.setter
+ def n(self, n):
+ self._n = n
+
+ @property
+ def mean(self):
+ return self._M
+
+ @mean.setter
+ def mean(self, M):
+ self._M = M
+
+ @property
+ def sum_square(self):
+ return self._S
+
+ @sum_square.setter
+ def sum_square(self, S):
+ self._S = S
+
+ @property
+ def var(self):
+ return self._S / (self._n - 1) if self._n > 1 else np.square(self._M)
+
+ @property
+ def std(self):
+ return np.sqrt(self.var)
+
+ @property
+ def shape(self):
+ return self._M.shape
+
+
+class ZFilter:
+ """
+ y = (x-mean)/std
+ using running estimates of mean,std
+ """
+
+ def __init__(self, shape, demean=True, destd=True, clip=10.0):
+ self.demean = demean
+ self.destd = destd
+ self.clip = clip
+
+ self.rs = RunningStat(shape)
+
+ def __call__(self, x, update=True):
+ if update: self.rs.push(x)
+
+ if self.demean:
+ x = x - self.rs.mean
+
+ if self.destd:
+ x = x / (self.rs.std + 1e-8)
+
+ if self.clip:
+ x = np.clip(x, -self.clip, self.clip)
+
+ return x
\ No newline at end of file