diff --git a/README.md b/README.md
index b97274f..b2a3667 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 # IRLwPython
 
-
+
 
 Inverse Reinforcement Learning Algorithm implementation with python.
diff --git a/logo/IRLwPython.jpg b/logo/IRLwPython.jpg
deleted file mode 100644
index 9ebeed7..0000000
Binary files a/logo/IRLwPython.jpg and /dev/null differ
diff --git a/logo/IRLwPython.png b/logo/IRLwPython.png
new file mode 100644
index 0000000..12dd0c0
Binary files /dev/null and b/logo/IRLwPython.png differ
diff --git a/src/irlwpython/MaxEntropyDeepIRL.py b/src/irlwpython/MaxEntropyDeepIRL.py
new file mode 100644
index 0000000..1586ab2
--- /dev/null
+++ b/src/irlwpython/MaxEntropyDeepIRL.py
@@ -0,0 +1,185 @@
+import gym
+import numpy as np
+import torch
+import torch.optim as optim
+import torch.nn as nn
+import matplotlib.pyplot as plt
+
+
+class ActorNetwork(nn.Module):
+    def __init__(self, num_inputs, num_output, hidden_size):
+        super(ActorNetwork, self).__init__()
+        self.fc1 = nn.Linear(num_inputs, hidden_size)
+        self.fc2 = nn.Linear(hidden_size, hidden_size)
+        self.fc3 = nn.Linear(hidden_size, num_output)
+
+    def forward(self, x):
+        x = nn.functional.relu(self.fc1(x))
+        x = nn.functional.relu(self.fc2(x))
+        return self.fc3(x)  # torch.nn.functional.softmax(self.fc3(x))
+
+
+class CriticNetwork(nn.Module):
+    def __init__(self, num_inputs, hidden_size):
+        super(CriticNetwork, self).__init__()
+        self.fc1 = nn.Linear(num_inputs, hidden_size)
+        self.fc2 = nn.Linear(hidden_size, hidden_size)
+        self.fc3 = nn.Linear(hidden_size, 1)
+
+        self.theta_layer = nn.Linear(hidden_size, 3)
+
+    def forward(self, x):
+        x_ = nn.functional.relu(self.fc1(x))
+        x_ = nn.functional.relu(self.fc2(x_))
+        theta_ = self.theta_layer(x_)
+        return self.fc3(x_) + torch.matmul(theta_, x)
+
+
+class MaxEntropyDeepIRL:
+    def __init__(self, target, state_dim, action_dim, learning_rate=0.001, gamma=0.99, num_epochs=1000):
+        self.target = target
+        self.state_dim = state_dim
+        self.action_dim = action_dim
+        self.learning_rate = learning_rate
+        # self.theta = torch.rand(state_dim + 1, requires_grad=True)
+        self.gamma = gamma
+        self.num_epochs = num_epochs
+        self.actor_network = ActorNetwork(state_dim, action_dim, 100)
+        self.critic_network = CriticNetwork(state_dim + 1, 100)
+        self.optimizer_actor = optim.Adam(self.actor_network.parameters(), lr=learning_rate)
+        self.optimizer_critic = optim.Adam(self.critic_network.parameters(), lr=learning_rate)
+
+    def get_reward(self, state, action):
+        state_action = list(state) + list([action])
+        state_action = torch.Tensor(state_action)
+        return self.critic_network(state_action)
+
+    def expert_feature_expectations(self, demonstrations):
+        feature_expectations = torch.zeros(self.state_dim)
+
+        for demonstration in demonstrations:
+            for state, _, _ in demonstration:
+                state_tensor = torch.tensor(state, dtype=torch.float32)
+                feature_expectations += state_tensor.squeeze()
+
+        feature_expectations /= demonstrations.shape[0]
+        return feature_expectations
+
+    def maxent_irl(self, expert, learner):
+        # Update critic network
+
+        self.optimizer_critic.zero_grad()
+
+        # Loss function for critic network
+        loss_critic = torch.nn.functional.mse_loss(learner, expert)
+        loss_critic.backward()
+
+        self.optimizer_critic.step()
+
+    def update_q_network(self, state_array, action, reward, next_state):
+        self.optimizer_actor.zero_grad()
+
+        state_tensor = torch.tensor(state_array, dtype=torch.float32)
+        next_state_tensor = torch.tensor(next_state, dtype=torch.float32)
+
+        q_values = self.actor_network(state_tensor)
+        # q_1 = self.actor_network(state_tensor)[action]
+        # q_2 = reward + self.gamma * max(self.actor_network(next_state_tensor))
+        next_q_values = reward + self.gamma * self.actor_network(next_state_tensor)
+
+        loss_actor = nn.functional.mse_loss(q_values, next_q_values)
+        loss_actor.backward()
+        self.optimizer_actor.step()
+
+    def get_demonstrations(self):
+        env_low = self.target.observation_space.low
+        env_high = self.target.observation_space.high
+        env_distance = (env_high - env_low) / 20  # self.one_feature
+
+        raw_demo = np.load(file="expert_demo/expert_demo.npy")
+        demonstrations = np.zeros((len(raw_demo), len(raw_demo[0]), 3))
+        for x in range(len(raw_demo)):
+            for y in range(len(raw_demo[0])):
+                position_idx = int((raw_demo[x][y][0] - env_low[0]) / env_distance[0])
+                velocity_idx = int((raw_demo[x][y][1] - env_low[1]) / env_distance[1])
+                state_idx = position_idx + velocity_idx * 20  # self.one_feature
+
+                demonstrations[x][y][0] = state_idx
+                demonstrations[x][y][1] = raw_demo[x][y][2]
+
+        return demonstrations
+
+    def train(self):
+        demonstrations = self.get_demonstrations()
+        expert = self.expert_feature_expectations(demonstrations)
+
+        learner_feature_expectations = torch.zeros(self.state_dim, requires_grad=True)  # Add requires_grad=True
+        episodes, scores = [], []
+
+        for episode in range(self.num_epochs):
+            state, info = self.target.reset()
+            score = 0
+
+            if (episode != 0 and episode == 10) or (episode > 10 and episode % 5 == 0):
+                learner = learner_feature_expectations / episode
+                self.maxent_irl(expert, learner)
+
+            while True:
+                state_tensor = torch.tensor(state, dtype=torch.float32)
+
+                q_state = self.actor_network(state_tensor)
+                action = torch.argmax(q_state).item()
+                next_state, reward, done, _, _ = self.target.step(action)
+
+                irl_reward = self.get_reward(state, action)
+                self.update_q_network(state, action, irl_reward, next_state)
+
+                print("Q Actor Network", state, q_state)
+                print("Reward", reward, "IRL Reward", irl_reward)
+
+                learner_feature_expectations = learner_feature_expectations + state_tensor.squeeze()
+
+                print(expert)
+                print(learner_feature_expectations)
+
+                score += reward
+                state = next_state
+                if done:
+                    scores.append(score)
+                    episodes.append(episode)
+                    break
+
+            if episode % 1 == 0:
+                score_avg = np.mean(scores)
+                print('{} episode score is {:.2f}'.format(episode, score_avg))
+                plt.plot(episodes, scores, 'b')
+                plt.savefig("./learning_curves/maxent_30000_network.png")
+
+        torch.save(self.actor_network.state_dict(), "./results/maxent_30000_q_network.pth")
+
+    def test(self):
+        episodes, scores = [], []
+
+        for episode in range(10):
+            state, info = self.target.reset()
+            score = 0
+
+            while True:
+                self.target.render()
+                state_tensor = torch.tensor(state, dtype=torch.float32).unsqueeze(0)
+
+                action = torch.argmax(self.actor_network(state_tensor)).item()
+                next_state, reward, done, _, _ = self.target.step(action)
+
+                score += reward
+                state = next_state
+
+                if done:
+                    scores.append(score)
+                    episodes.append(episode)
+                    plt.plot(episodes, scores, 'b')
+                    plt.savefig("./learning_curves/maxent_test_30000_network.png")
+                    break
+
+            if episode % 1 == 0:
+                print('{} episode score is {:.2f}'.format(episode, score))
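Reviewer note (not part of the patch): in the new `MaxEntropyDeepIRL`, the critic doubles as the learned reward model. `get_reward()` feeds it the state concatenated with the scalar action, so for MountainCar the critic takes `state_dim + 1 = 3` inputs, while the actor outputs one Q-value per action. The snippet below is only a shape-check sketch of that wiring; the import path and the example state are assumptions, not code from this diff.

```python
import numpy as np
import torch

# Assumed import path; adjust to wherever the new module lives in the package.
from irlwpython.MaxEntropyDeepIRL import ActorNetwork, CriticNetwork

# MountainCar dimensions, as set up in the new args.deep branch of main.py.
state_dim, action_dim = 2, 3

actor = ActorNetwork(state_dim, action_dim, hidden_size=100)
critic = CriticNetwork(state_dim + 1, hidden_size=100)  # critic sees state + scalar action

state = np.array([-0.5, 0.0], dtype=np.float32)            # made-up (position, velocity)
action = torch.argmax(actor(torch.tensor(state))).item()   # greedy action from the actor

state_action = torch.Tensor(list(state) + [action])        # what get_reward() builds internally
irl_reward = critic(state_action)                          # one-element pseudo-reward tensor
print(irl_reward.shape)                                    # torch.Size([1])
```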
diff --git a/src/irlwpython/MaxEntropyIRL.py b/src/irlwpython/MaxEntropyIRL.py
index fc933a2..7c8e8da 100644
--- a/src/irlwpython/MaxEntropyIRL.py
+++ b/src/irlwpython/MaxEntropyIRL.py
@@ -64,7 +64,7 @@ def maxent_irl(self, expert, learner, learning_rate):
 
         # Clip theta
         for j in range(len(self.theta)):
-            if self.theta[j] > 0:
+            if self.theta[j] > 0:  # log values
                 self.theta[j] = 0
 
     def update_q_table(self, state, action, reward, next_state):
@@ -101,9 +101,11 @@ def train(self, theta_learning_rate):
             state = self.target.env_reset()
             score = 0
 
-            # Mini-Batches ?
+            # Mini-Batches:
             if (episode != 0 and episode == 10000) or (episode > 10000 and episode % 5000 == 0):
+                # calculate the learner's state-visitation density
                 learner = learner_feature_expectations / episode
+                # Maximum Entropy IRL step
                 self.maxent_irl(expert, learner, theta_learning_rate)
 
             # One Step in environment
@@ -115,12 +117,16 @@ def train(self, theta_learning_rate):
                 # Run one timestep of the environment's dynamics.
                 next_state, reward, done, _, _ = self.target.env_step(action)
 
+                # Get the IRL pseudo-reward and update the Q-table
                 irl_reward = self.get_reward(self.n_states, state_idx)
                 next_state_idx = self.target.idx_to_state(next_state)
                 self.update_q_table(state_idx, action, irl_reward, next_state_idx)
 
+                # State counting for density
                 learner_feature_expectations += self.get_feature_matrix()[int(state_idx)]
 
+                print(reward, irl_reward)
+
                 score += reward
                 state = next_state
                 if done:
diff --git a/src/irlwpython/expert_demo/expert_demo.p b/src/irlwpython/expert_demo/expert_demo.p
new file mode 100644
index 0000000..7eb39bd
Binary files /dev/null and b/src/irlwpython/expert_demo/expert_demo.p differ
diff --git a/src/irlwpython/learning_curves/maxent_30000.png b/src/irlwpython/learning_curves/maxent_30000.png
index f848113..90f9663 100644
Binary files a/src/irlwpython/learning_curves/maxent_30000.png and b/src/irlwpython/learning_curves/maxent_30000.png differ
diff --git a/src/irlwpython/learning_curves/maxent_test_30000.png b/src/irlwpython/learning_curves/maxent_test_30000.png
index 78214c5..84e9c8e 100644
Binary files a/src/irlwpython/learning_curves/maxent_test_30000.png and b/src/irlwpython/learning_curves/maxent_test_30000.png differ
diff --git a/src/irlwpython/main.py b/src/irlwpython/main.py
index 811efea..4c561d1 100644
--- a/src/irlwpython/main.py
+++ b/src/irlwpython/main.py
@@ -9,6 +9,8 @@
 
 #from irlwpython import __version__
 
+import gym
+
 __author__ = "HokageM"
 __copyright__ = "HokageM"
 __license__ = "MIT"
@@ -74,8 +76,9 @@
 
     gamma = 0.99
     q_learning_rate = 0.03
-    theta_learning_rate = 0.05
 
+    # Theta works as Critic
+    theta_learning_rate = 0.05
     theta = -(np.random.uniform(size=(n_states,)))
 
     if args.render:
@@ -83,9 +86,17 @@
     else:
         car = MountainCar(False, one_feature)
 
-    #if args.deep:
-    #    deep = MaxEntropyDeepIRL()
-    #    deep.run()
+    if args.deep:
+
+        # Create MountainCar environment
+        env = gym.make('MountainCar-v0', render_mode="human")
+        state_dim = env.observation_space.shape[0]
+        action_dim = env.action_space.n
+
+        # Run MaxEnt Deep IRL using MountainCar environment
+        maxent_deep_irl_agent = MaxEntropyDeepIRL(env, state_dim, action_dim)
+        maxent_deep_irl_agent.train()
+        maxent_deep_irl_agent.test()
 
     if args.training:
         q_table = np.zeros((n_states, n_actions))
diff --git a/src/irlwpython/results/maxent_30000_table.npy b/src/irlwpython/results/maxent_30000_table.npy
index 3f4aa26..bee0b76 100644
Binary files a/src/irlwpython/results/maxent_30000_table.npy and b/src/irlwpython/results/maxent_30000_table.npy differ
diff --git a/src/irlwpython/utils/utils.py b/src/irlwpython/utils/utils.py
new file mode 100644
index 0000000..bb81f4c
--- /dev/null
+++ b/src/irlwpython/utils/utils.py
@@ -0,0 +1,46 @@
+# from: https://github.com/reinforcement-learning-kr/lets-do-irl/
+
+import math
+import torch
+from torch.distributions import Normal
+
+
+def get_action(mu, std):
+    action = torch.normal(mu, std)
+    action = action.data.numpy()
+    action_list = [0, 1, 2]
+    return min(action_list, key=lambda x: abs(x - action))
+
+
+def get_entropy(mu, std):
+    dist = Normal(mu, std)
+    entropy = dist.entropy().mean()
+    return entropy
+
+
+def log_prob_density(x, mu, std):
+    log_prob_density = -(x - mu).pow(2) / (2 * std.pow(2)) \
+                       - 0.5 * math.log(2 * math.pi)
+    return log_prob_density.sum(1, keepdim=True)
+
+
+def get_reward(discrim, state, action):
+    print("Input get reward")
+    print("state", state)
+    print("action", action)
+
+    state = torch.Tensor(state)
+    action = torch.Tensor(action)
+    state_action = torch.cat([state, action])
+
+    print("HELP")
+    print("state", state)
+    print("action", action)
+    print("state_action", state_action)
+
+    with torch.no_grad():
+        return -math.log(discrim(state_action)[0].item())
+
+
+def save_checkpoint(state, filename):
+    torch.save(state, filename)
diff --git a/src/irlwpython/utils/zfilter.py b/src/irlwpython/utils/zfilter.py
new file mode 100644
index 0000000..4c9a6e6
--- /dev/null
+++ b/src/irlwpython/utils/zfilter.py
@@ -0,0 +1,86 @@
+import numpy as np
+
+
+# from https://github.com/joschu/modular_rl
+# http://www.johndcook.com/blog/standard_deviation/
+
+class RunningStat(object):
+    def __init__(self, shape):
+        self._n = 0
+        self._M = np.zeros(shape)
+        self._S = np.zeros(shape)
+
+    def push(self, x):
+        x = np.asarray(x)
+        assert x.shape == self._M.shape
+        self._n += 1
+        if self._n == 1:
+            self._M[...] = x
+        else:
+            oldM = self._M.copy()
+            self._M[...] = oldM + (x - oldM) / self._n
+            self._S[...] = self._S + (x - oldM) * (x - self._M)
+
+    @property
+    def n(self):
+        return self._n
+
+    @n.setter
+    def n(self, n):
+        self._n = n
+
+    @property
+    def mean(self):
+        return self._M
+
+    @mean.setter
+    def mean(self, M):
+        self._M = M
+
+    @property
+    def sum_square(self):
+        return self._S
+
+    @sum_square.setter
+    def sum_square(self, S):
+        self._S = S
+
+    @property
+    def var(self):
+        return self._S / (self._n - 1) if self._n > 1 else np.square(self._M)
+
+    @property
+    def std(self):
+        return np.sqrt(self.var)
+
+    @property
+    def shape(self):
+        return self._M.shape
+
+
+class ZFilter:
+    """
+    y = (x-mean)/std
+    using running estimates of mean,std
+    """
+
+    def __init__(self, shape, demean=True, destd=True, clip=10.0):
+        self.demean = demean
+        self.destd = destd
+        self.clip = clip
+
+        self.rs = RunningStat(shape)
+
+    def __call__(self, x, update=True):
+        if update: self.rs.push(x)
+
+        if self.demean:
+            x = x - self.rs.mean
+
+        if self.destd:
+            x = x / (self.rs.std + 1e-8)
+
+        if self.clip:
+            x = np.clip(x, -self.clip, self.clip)
+
+        return x
\ No newline at end of file
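Reviewer note (not part of the patch): the `# Maximum Entropy IRL step` comment added to `MaxEntropyIRL.train` points at a `maxent_irl(expert, learner, learning_rate)` update whose body is mostly outside this diff; only the theta-clipping loop appears above. The sketch below shows the gradient step such a loop typically follows in the lets-do-irl code that `utils/utils.py` credits. Treat it as an assumption rather than this repository's implementation: `maxent_theta_update`, `n_states = 400`, and the placeholder arrays are hypothetical; only the 0.05 learning rate, the theta initialisation, and the clip-to-non-positive behaviour come from the diff.

```python
import numpy as np


def maxent_theta_update(theta, expert, learner, learning_rate):
    """Hypothetical sketch of one Maximum Entropy IRL step (not from this patch).

    theta moves along the difference between the expert feature expectations
    and the learner's state-visitation density, then is clipped to non-positive
    values, which is what the element-wise loop in the hunk above does.
    """
    gradient = expert - learner                # max-ent gradient estimate
    theta = theta + learning_rate * gradient
    return np.minimum(theta, 0.0)              # vectorised equivalent of the clip loop


# Usage sketch with made-up densities; the repo discretizes MountainCar into a
# one_feature x one_feature grid (assumed 20 x 20 = 400 states here).
n_states = 400
theta = -np.random.uniform(size=(n_states,))   # same initialisation as main.py
expert = np.full(n_states, 1.0 / n_states)     # placeholder expert feature expectations
learner = np.full(n_states, 1.0 / n_states)    # placeholder learner state-visitation density
theta = maxent_theta_update(theta, expert, learner, learning_rate=0.05)
```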