diff --git a/README.md b/README.md
index 8999b97..8a50521 100644
--- a/README.md
+++ b/README.md
@@ -4,24 +4,101 @@
Inverse Reinforcement Learning Algorithm implementation with python.
-Implemented Algorithms:
-- Maximum Entropy IRL: [1]
-- Discrete Maximum Entropy Deep IRL: [2, 3]
-- IQ-Learn
+# Implemented Algorithms
-Experiment:
-- Mountaincar: [gym](https://www.gymlibrary.dev/environments/classic_control/mountain_car/)
+## Maximum Entropy IRL: [1]
-The implementation of MaxEntropyIRL and MountainCar is based on the implementation of:
-[lets-do-irl](https://github.com/reinforcement-learning-kr/lets-do-irl/tree/master/mountaincar/maxent)
+## Maximum Entropy Deep IRL
-# References
+# Experiments
-[1] [BD. Ziebart, et al., "Maximum Entropy Inverse Reinforcement Learning", AAAI 2008](https://cdn.aaai.org/AAAI/2008/AAAI08-227.pdf).
+## MountainCar-v0
+Environment: [gym](https://www.gymlibrary.dev/environments/classic_control/mountain_car/)
+
+The expert demonstrations for MountainCar-v0 are the same as those used in [lets-do-irl](https://github.com/reinforcement-learning-kr/lets-do-irl/tree/master/mountaincar/maxent).
+
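+Each continuous observation (position, velocity) is discretized into one of 20 x 20 = 400 states before feature expectations are computed. A minimal sketch of this mapping, mirroring `state_to_idx` in `MountainCar.py` (the standalone function below is illustrative, not part of the package API):
+
+```python
+import gym
+
+env = gym.make("MountainCar-v0")
+one_feature = 20  # 20 bins per observation dimension -> 20 * 20 = 400 states
+
+
+def state_to_idx(state):
+    """Map a continuous (position, velocity) observation to a discrete state index."""
+    env_low = env.observation_space.low
+    env_high = env.observation_space.high
+    env_distance = (env_high - env_low) / one_feature
+    position_idx = int((state[0] - env_low[0]) / env_distance[0])
+    velocity_idx = int((state[1] - env_low[1]) / env_distance[1])
+    return position_idx + velocity_idx * one_feature
+
+
+state, _ = env.reset()
+print(state_to_idx(state))  # integer index in [0, 399]
+```
+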
+*Heatmap of the expert demonstrations across the 400 discretized states*:
+
+![Heatmap of expert demonstrations](demo/heatmaps/expert_state_frequencies_mountaincar.png)
+
+### Maximum Entropy Inverse Reinforcement Learning
+
+IRL using tabular Q-learning, with the state reward weights (theta) updated via the Maximum Entropy gradient; a sketch of the update follows below.
+
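+The core update, implemented in `maxent_irl` (`MaxEntropyIRL.py`), moves the reward weights theta along the difference between expert and learner feature expectations and clips them to non-positive values. A minimal sketch with placeholder feature expectations (the random values below are illustrative only):
+
+```python
+import numpy as np
+
+
+def maxent_irl_step(theta, expert_fe, learner_fe, theta_lr=0.05):
+    """One Maximum Entropy IRL step on the reward weights theta."""
+    theta = theta + theta_lr * (expert_fe - learner_fe)  # gradient = expert - learner
+    return np.minimum(theta, 0.0)  # clip positive entries to zero (log-value rewards)
+
+
+n_states = 400
+feature_matrix = np.eye(n_states)          # one-hot state features, as in this repository
+theta = -np.random.uniform(size=n_states)  # initial reward weights
+expert_fe = np.random.rand(n_states)       # placeholder expert feature expectations
+learner_fe = np.random.rand(n_states)      # placeholder learner feature expectations
+theta = maxent_irl_step(theta, expert_fe, learner_fe)
+irl_rewards = feature_matrix.dot(theta)    # per-state pseudo-reward fed to Q-learning
+```
+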
+#### Training
+
+*Learner training for 29000 episodes*:
+
+![Learner training curve over 29000 episodes (MaxEnt)](demo/learning_curves/leaner_maxent_29000_episodes.png)
+
+#### Heatmaps
+
+*Learner state frequencies after 1000 episodes*:
+
+![Learner state frequencies after 1000 episodes (MaxEnt)](demo/heatmaps/learner_maxent_1000_episodes.png)
+
+*Learner state frequencies after 29000 episodes*:
+
+![Learner state frequencies after 29000 episodes (MaxEnt)](demo/heatmaps/leaner_maxent_29000_episodes.png)
+
+*State rewards heatmap after 1000 episodes*:
+
+![State rewards heatmap after 1000 episodes (MaxEnt)](demo/heatmaps/rewards_maxent_1000_episodes.png)
+
+*State rewards heatmap after 29000 episodes*:
+
+![State rewards heatmap after 29000 episodes (MaxEnt)](demo/heatmaps/rewards_maxent_29000_episodes.png)
+
+#### Testing
+
+*Testing results of the model after 29000 episodes*:
+
+![Test results after 29000 episodes (MaxEnt)](demo/test_results/test_maxent_29000_episodes.png)
+
-[2] [Wulfmeier, et al., "Maximum entropy deep inverse reinforcement learning." arXiv preprint arXiv:1507.04888 (2015).](https://arxiv.org/abs/1507.04888)
-[3] [Xi-liang Chen, et al., "A Study of Continuous Maximum Entropy Deep Inverse Reinforcement Learning", Mathematical Problems in Engineering, vol. 2019, Article ID 4834516, 8 pages, 2019. https://doi.org/10.1155/2019/4834516](https://www.hindawi.com/journals/mpe/2019/4834516/)
+### Deep Maximum Entropy Inverse Reinforcement Learning
+
+IRL using a deep Q-network in place of the Q-table, with the same Maximum Entropy update of the reward weights; see the sketch below.
+
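+The learned IRL reward, not the environment reward, drives the Q-network update. A minimal sketch loosely following `update_q_network` in `MaxEntropyDeep.py` (the repository additionally keeps a target network, omitted here; the transition values in the example call are made up):
+
+```python
+import torch
+import torch.nn as nn
+
+q_network = nn.Sequential(nn.Linear(2, 128), nn.ReLU(), nn.Linear(128, 3))
+optimizer = torch.optim.Adam(q_network.parameters(), lr=1e-3)
+gamma = 0.99
+
+
+def q_update(state, action, irl_reward, next_state, done):
+    """One DQN-style update in which the IRL reward replaces the environment reward."""
+    q_values = q_network(torch.FloatTensor(state))
+    target = q_values.clone().detach()
+    bootstrap = 0.0 if done else gamma * q_network(torch.FloatTensor(next_state)).max().item()
+    target[action] = irl_reward + bootstrap
+    loss = nn.MSELoss()(q_values, target)
+    optimizer.zero_grad()
+    loss.backward()
+    optimizer.step()
+
+
+q_update(state=[-0.5, 0.0], action=2, irl_reward=-0.7, next_state=[-0.49, 0.01], done=False)
+```
+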
+#### Training
+
+*Learner training for 29000 episodes*:
+
+![Learner training curve over 29000 episodes (Deep MaxEnt)](demo/learning_curves/learner_maxentropy_deep_29000_episodes.png)
+
+#### Heatmaps
+
+*Learner state frequencies after 1000 episodes*:
+
+![Learner state frequencies after 1000 episodes (Deep MaxEnt)](demo/heatmaps/learner_maxentropydeep_1000_episodes.png)
+
+*Learner state frequencies after 29000 episodes*:
+
+![Learner state frequencies after 29000 episodes (Deep MaxEnt)](demo/heatmaps/learner_maxentropydeep_29000_episodes.png)
+
+*State rewards heatmap after 1000 episodes*:
+
+![State rewards heatmap after 1000 episodes (Deep MaxEnt)](demo/heatmaps/rewards_maxentropydeep_1000_episodes.png)
+
+*State rewards heatmap after 29000 episodes*:
+
+![State rewards heatmap after 29000 episodes (Deep MaxEnt)](demo/heatmaps/rewards_maxentropydeep_29000_episodes.png)
+
+#### Testing
+
+*Testing results of the model after 29000 episodes*:
+
+![Test results after 29000 episodes (Deep MaxEnt)](demo/test_results/test_maxentropydeep_29000_episodes_model_results.png)
+
+### Deep Maximum Entropy Inverse Reinforcement Learning with Critic
+
+Coming soon...
+
+# References
+The implementations of MaxEntropyIRL and MountainCar are based on
+[lets-do-irl](https://github.com/reinforcement-learning-kr/lets-do-irl/tree/master/mountaincar/maxent).
+
+[1] [B. D. Ziebart et al., "Maximum Entropy Inverse Reinforcement Learning", AAAI 2008](https://cdn.aaai.org/AAAI/2008/AAAI08-227.pdf).
# Installation
@@ -38,7 +115,7 @@ usage: irl [-h] [--version] [--training] [--testing] [--render] ALGORITHM
Implementation of IRL algorithms
positional arguments:
- ALGORITHM Currently supported training algorithm: [max-entropy, discrete-max-entropy-deep]
+ ALGORITHM Currently supported training algorithm: [max-entropy, max-entropy-deep]
options:
-h, --help show this help message and exit
diff --git a/demo/expert_demo/expert_demo_mountaincar.npy b/demo/expert_demo/expert_demo_mountaincar.npy
new file mode 100644
index 0000000..9614e5b
Binary files /dev/null and b/demo/expert_demo/expert_demo_mountaincar.npy differ
diff --git a/demo/heatmaps/expert_state_frequencies_mountaincar.png b/demo/heatmaps/expert_state_frequencies_mountaincar.png
new file mode 100644
index 0000000..f7947b5
Binary files /dev/null and b/demo/heatmaps/expert_state_frequencies_mountaincar.png differ
diff --git a/demo/heatmaps/leaner_maxent_29000_episodes.png b/demo/heatmaps/leaner_maxent_29000_episodes.png
new file mode 100644
index 0000000..5157a67
Binary files /dev/null and b/demo/heatmaps/leaner_maxent_29000_episodes.png differ
diff --git a/demo/heatmaps/learner_maxent_1000_episodes.png b/demo/heatmaps/learner_maxent_1000_episodes.png
new file mode 100644
index 0000000..0ce1594
Binary files /dev/null and b/demo/heatmaps/learner_maxent_1000_episodes.png differ
diff --git a/demo/heatmaps/learner_maxent_15000_episodes.png b/demo/heatmaps/learner_maxent_15000_episodes.png
new file mode 100644
index 0000000..bc6de44
Binary files /dev/null and b/demo/heatmaps/learner_maxent_15000_episodes.png differ
diff --git a/demo/heatmaps/learner_maxentropydeep_10000_episodes.png b/demo/heatmaps/learner_maxentropydeep_10000_episodes.png
new file mode 100644
index 0000000..51b15af
Binary files /dev/null and b/demo/heatmaps/learner_maxentropydeep_10000_episodes.png differ
diff --git a/demo/heatmaps/learner_maxentropydeep_1000_episodes.png b/demo/heatmaps/learner_maxentropydeep_1000_episodes.png
new file mode 100644
index 0000000..6542af4
Binary files /dev/null and b/demo/heatmaps/learner_maxentropydeep_1000_episodes.png differ
diff --git a/demo/heatmaps/learner_maxentropydeep_15000_episodes.png b/demo/heatmaps/learner_maxentropydeep_15000_episodes.png
new file mode 100644
index 0000000..2c7e28e
Binary files /dev/null and b/demo/heatmaps/learner_maxentropydeep_15000_episodes.png differ
diff --git a/demo/heatmaps/learner_maxentropydeep_20000_episodes.png b/demo/heatmaps/learner_maxentropydeep_20000_episodes.png
new file mode 100644
index 0000000..d6df196
Binary files /dev/null and b/demo/heatmaps/learner_maxentropydeep_20000_episodes.png differ
diff --git a/demo/heatmaps/learner_maxentropydeep_25000_episodes.png b/demo/heatmaps/learner_maxentropydeep_25000_episodes.png
new file mode 100644
index 0000000..b257041
Binary files /dev/null and b/demo/heatmaps/learner_maxentropydeep_25000_episodes.png differ
diff --git a/demo/heatmaps/learner_maxentropydeep_29000_episodes.png b/demo/heatmaps/learner_maxentropydeep_29000_episodes.png
new file mode 100644
index 0000000..5bc4dc9
Binary files /dev/null and b/demo/heatmaps/learner_maxentropydeep_29000_episodes.png differ
diff --git a/demo/heatmaps/learner_maxentropydeep_5000_episodes.png b/demo/heatmaps/learner_maxentropydeep_5000_episodes.png
new file mode 100644
index 0000000..3f9f4ec
Binary files /dev/null and b/demo/heatmaps/learner_maxentropydeep_5000_episodes.png differ
diff --git a/demo/heatmaps/rewards_maxent_1000_episodes.png b/demo/heatmaps/rewards_maxent_1000_episodes.png
new file mode 100644
index 0000000..212f636
Binary files /dev/null and b/demo/heatmaps/rewards_maxent_1000_episodes.png differ
diff --git a/demo/heatmaps/rewards_maxent_15000_episodes.png b/demo/heatmaps/rewards_maxent_15000_episodes.png
new file mode 100644
index 0000000..963cc9c
Binary files /dev/null and b/demo/heatmaps/rewards_maxent_15000_episodes.png differ
diff --git a/demo/heatmaps/rewards_maxent_29000_episodes.png b/demo/heatmaps/rewards_maxent_29000_episodes.png
new file mode 100644
index 0000000..d68c1a7
Binary files /dev/null and b/demo/heatmaps/rewards_maxent_29000_episodes.png differ
diff --git a/demo/heatmaps/rewards_maxentropydeep_10000_episodes.png b/demo/heatmaps/rewards_maxentropydeep_10000_episodes.png
new file mode 100644
index 0000000..9a3e9dd
Binary files /dev/null and b/demo/heatmaps/rewards_maxentropydeep_10000_episodes.png differ
diff --git a/demo/heatmaps/rewards_maxentropydeep_1000_episodes.png b/demo/heatmaps/rewards_maxentropydeep_1000_episodes.png
new file mode 100644
index 0000000..3ff7728
Binary files /dev/null and b/demo/heatmaps/rewards_maxentropydeep_1000_episodes.png differ
diff --git a/demo/heatmaps/rewards_maxentropydeep_15000_episodes.png b/demo/heatmaps/rewards_maxentropydeep_15000_episodes.png
new file mode 100644
index 0000000..d295964
Binary files /dev/null and b/demo/heatmaps/rewards_maxentropydeep_15000_episodes.png differ
diff --git a/demo/heatmaps/rewards_maxentropydeep_20000_episodes.png b/demo/heatmaps/rewards_maxentropydeep_20000_episodes.png
new file mode 100644
index 0000000..b3b12ab
Binary files /dev/null and b/demo/heatmaps/rewards_maxentropydeep_20000_episodes.png differ
diff --git a/demo/heatmaps/rewards_maxentropydeep_25000_episodes.png b/demo/heatmaps/rewards_maxentropydeep_25000_episodes.png
new file mode 100644
index 0000000..f654887
Binary files /dev/null and b/demo/heatmaps/rewards_maxentropydeep_25000_episodes.png differ
diff --git a/demo/heatmaps/rewards_maxentropydeep_29000_episodes.png b/demo/heatmaps/rewards_maxentropydeep_29000_episodes.png
new file mode 100644
index 0000000..f654887
Binary files /dev/null and b/demo/heatmaps/rewards_maxentropydeep_29000_episodes.png differ
diff --git a/demo/learning_curves/leaner_maxent_29000_episodes.png b/demo/learning_curves/leaner_maxent_29000_episodes.png
new file mode 100644
index 0000000..549c68d
Binary files /dev/null and b/demo/learning_curves/leaner_maxent_29000_episodes.png differ
diff --git a/demo/learning_curves/learner_maxentropy_deep_29000_episodes.png b/demo/learning_curves/learner_maxentropy_deep_29000_episodes.png
new file mode 100644
index 0000000..041c897
Binary files /dev/null and b/demo/learning_curves/learner_maxentropy_deep_29000_episodes.png differ
diff --git a/demo/test_results/test_maxent_29000_episodes.png b/demo/test_results/test_maxent_29000_episodes.png
new file mode 100644
index 0000000..7a9a177
Binary files /dev/null and b/demo/test_results/test_maxent_29000_episodes.png differ
diff --git a/demo/test_results/test_maxentropydeep_29000_episodes_model_results.png b/demo/test_results/test_maxentropydeep_29000_episodes_model_results.png
new file mode 100644
index 0000000..9546bcb
Binary files /dev/null and b/demo/test_results/test_maxentropydeep_29000_episodes_model_results.png differ
diff --git a/demo/test_results/test_maxentropydeep_best_model_results.png b/demo/test_results/test_maxentropydeep_best_model_results.png
new file mode 100644
index 0000000..dab163d
Binary files /dev/null and b/demo/test_results/test_maxentropydeep_best_model_results.png differ
diff --git a/demo/trained_models/model_maxentropydeep_29000_episodes_model.pth b/demo/trained_models/model_maxentropydeep_29000_episodes_model.pth
new file mode 100644
index 0000000..05a344a
Binary files /dev/null and b/demo/trained_models/model_maxentropydeep_29000_episodes_model.pth differ
diff --git a/demo/trained_models/model_maxentropydeep_best_model.pth b/demo/trained_models/model_maxentropydeep_best_model.pth
new file mode 100644
index 0000000..87c251a
Binary files /dev/null and b/demo/trained_models/model_maxentropydeep_best_model.pth differ
diff --git a/demo/trained_models/qtable_maxentropy_30000_episodes.npy b/demo/trained_models/qtable_maxentropy_30000_episodes.npy
new file mode 100644
index 0000000..0dfdea8
Binary files /dev/null and b/demo/trained_models/qtable_maxentropy_30000_episodes.npy differ
diff --git a/src/__init__.py b/src/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/irlwpython/ContinuousMaxEntropyDeepIRL.py b/src/irlwpython/ContinuousMaxEntropyDeepIRL.py
deleted file mode 100644
index 046d68f..0000000
--- a/src/irlwpython/ContinuousMaxEntropyDeepIRL.py
+++ /dev/null
@@ -1,193 +0,0 @@
-import gym
-import numpy as np
-import torch
-import torch.optim as optim
-import torch.nn as nn
-import matplotlib.pyplot as plt
-
-
-class ActorNetwork(nn.Module):
- def __init__(self, num_inputs, num_output, hidden_size):
- super(ActorNetwork, self).__init__()
- self.fc1 = nn.Linear(num_inputs, hidden_size)
- self.fc2 = nn.Linear(hidden_size, hidden_size)
- self.fc3 = nn.Linear(hidden_size, num_output)
-
- def forward(self, x):
- x = nn.functional.relu(self.fc1(x))
- x = nn.functional.relu(self.fc2(x))
- return self.fc3(x) # torch.nn.functional.softmax(self.fc3(x))
-
-
-class CriticNetwork(nn.Module):
- def __init__(self, num_inputs, hidden_size):
- super(CriticNetwork, self).__init__()
- self.fc1 = nn.Linear(num_inputs, hidden_size)
- self.fc2 = nn.Linear(hidden_size, hidden_size)
- self.fc3 = nn.Linear(hidden_size, 1)
-
- self.theta_layer = nn.Linear(hidden_size, 3)
-
- def forward(self, x):
- x_ = nn.functional.relu(self.fc1(x))
- x_ = nn.functional.relu(self.fc2(x_))
- theta_ = self.theta_layer(x_)
- return self.fc3(x_) + torch.matmul(theta_, x)
-
-
-class MaxEntropyDeepIRL:
- def __init__(self, target, state_dim, action_dim, learning_rate=0.001, gamma=0.99, num_epochs=1000):
- self.target = target
- self.state_dim = state_dim
- self.action_dim = action_dim
- self.learning_rate = learning_rate
- # self.theta = torch.rand(state_dim + 1, requires_grad=True)
- self.gamma = gamma
- self.num_epochs = num_epochs
- self.actor_network = ActorNetwork(state_dim, action_dim, 100)
- self.critic_network = CriticNetwork(state_dim + 1, 100)
- self.optimizer_actor = optim.Adam(self.actor_network.parameters(), lr=learning_rate)
- self.optimizer_critic = optim.Adam(self.critic_network.parameters(), lr=learning_rate)
-
- def get_reward(self, state, action):
- state_action = list(state) + list([action])
- state_action = torch.Tensor(state_action)
- return self.critic_network(state_action)
-
- def expert_feature_expectations(self, demonstrations):
- feature_expectations = torch.zeros(self.state_dim)
-
- for demonstration in demonstrations:
- for state, _, _ in demonstration:
- state_tensor = torch.tensor(state, dtype=torch.float32)
- feature_expectations += state_tensor.squeeze()
-
- feature_expectations /= demonstrations.shape[0]
- return feature_expectations
-
- def maxent_irl(self, expert, learner):
- # Update critic network
-
- self.optimizer_critic.zero_grad()
-
- # Loss function for critic network
- loss_critic = torch.nn.functional.mse_loss(learner, expert)
- loss_critic.backward()
-
- self.optimizer_critic.step()
-
- def update_q_network(self, state_array, action, reward, next_state):
- self.optimizer_actor.zero_grad()
-
- state_tensor = torch.tensor(state_array, dtype=torch.float32)
- next_state_tensor = torch.tensor(next_state, dtype=torch.float32)
-
- q_values = self.actor_network(state_tensor)
- # q_1 = self.actor_network(state_tensor)[action]
- # q_2 = reward + self.gamma * max(self.actor_network(next_state_tensor))
- next_q_values = reward + self.gamma * self.actor_network(next_state_tensor)
-
- loss_actor = nn.functional.mse_loss(q_values, next_q_values)
- loss_actor.backward()
- self.optimizer_actor.step()
-
- def get_demonstrations(self):
- env_low = self.target.observation_space.low
- env_high = self.target.observation_space.high
- env_distance = (env_high - env_low) / 20 # self.one_feature
-
- raw_demo = np.load(file="expert_demo/expert_demo.npy")
- demonstrations = np.zeros((len(raw_demo), len(raw_demo[0]), 3))
- for x in range(len(raw_demo)):
- for y in range(len(raw_demo[0])):
- position_idx = int((raw_demo[x][y][0] - env_low[0]) / env_distance[0])
- velocity_idx = int((raw_demo[x][y][1] - env_low[1]) / env_distance[1])
- state_idx = position_idx + velocity_idx * 20 # self.one_feature
-
- demonstrations[x][y][0] = state_idx
- demonstrations[x][y][1] = raw_demo[x][y][2]
-
- print(demonstrations)
- return demonstrations
-
- def get_expert_state_frequencies(self):
- raw_demo = np.load(file="expert_demo/expert_demo.npy")
- expert_state_frequencies = []
- return expert_state_frequencies
-
- def train(self):
- demonstrations = self.get_demonstrations()
- expert = self.expert_feature_expectations(demonstrations)
-
- expert_state_frequencies = self.get_expert_state_frequencies()
-
- learner_feature_expectations = torch.zeros(self.state_dim, requires_grad=True) # Add requires_grad=True
- episodes, scores = [], []
-
- for episode in range(self.num_epochs):
- state, info = self.target.reset()
- score = 0
-
- if (episode != 0 and episode == 10) or (episode > 10 and episode % 5 == 0):
- learner = learner_feature_expectations / episode
- self.maxent_irl(expert, learner)
-
- while True:
- state_tensor = torch.tensor(state, dtype=torch.float32)
-
- q_state = self.actor_network(state_tensor)
- action = torch.argmax(q_state).item()
- next_state, reward, done, _, _ = self.target.step(action)
-
- irl_reward = self.get_reward(state, action)
- self.update_q_network(state, action, irl_reward, next_state)
-
- print("Q Actor Network", state, q_state)
- print("Reward", reward, "IRL Reward", irl_reward)
-
- learner_feature_expectations = learner_feature_expectations + state_tensor.squeeze()
-
- print(expert)
- print(learner_feature_expectations)
-
- score += reward
- state = next_state
- if done:
- scores.append(score)
- episodes.append(episode)
- break
-
- if episode % 1 == 0:
- score_avg = np.mean(scores)
- print('{} episode score is {:.2f}'.format(episode, score_avg))
- plt.plot(episodes, scores, 'b')
- plt.savefig("./learning_curves/maxent_30000_network.png")
-
- torch.save(self.q_network.state_dict(), "./results/maxent_30000_q_network.pth")
-
- def test(self):
- episodes, scores = [], []
-
- for episode in range(10):
- state = self.target.reset()
- score = 0
-
- while True:
- self.target.render()
- state_tensor = torch.tensor(state, dtype=torch.float32).unsqueeze(0)
-
- action = torch.argmax(self.q_network(state_tensor)).item()
- next_state, reward, done, _, _ = self.target.step(action)
-
- score += reward
- state = next_state
-
- if done:
- scores.append(score)
- episodes.append(episode)
- plt.plot(episodes, scores, 'b')
- plt.savefig("./learning_curves/maxent_test_30000_network.png")
- break
-
- if episode % 1 == 0:
- print('{} episode score is {:.2f}'.format(episode, score))
diff --git a/src/irlwpython/DiscreteMaxEntropyDeepIRL.py b/src/irlwpython/DiscreteMaxEntropyDeepIRL.py
deleted file mode 100644
index 66bf723..0000000
--- a/src/irlwpython/DiscreteMaxEntropyDeepIRL.py
+++ /dev/null
@@ -1,171 +0,0 @@
-import gym
-import numpy as np
-import torch
-import torch.optim as optim
-import torch.nn as nn
-import matplotlib.pyplot as plt
-
-
-class ActorNetwork(nn.Module):
- def __init__(self, num_inputs, num_output, hidden_size):
- super(ActorNetwork, self).__init__()
- self.fc1 = nn.Linear(num_inputs, hidden_size)
- self.fc2 = nn.Linear(hidden_size, hidden_size)
- self.fc3 = nn.Linear(hidden_size, num_output)
-
- def forward(self, x):
- x = nn.functional.relu(self.fc1(x))
- x = nn.functional.relu(self.fc2(x))
- return self.fc3(x) # torch.nn.functional.softmax(self.fc3(x))
-
-
-class CriticNetwork(nn.Module):
- def __init__(self, num_inputs, hidden_size):
- super(CriticNetwork, self).__init__()
- self.fc1 = nn.Linear(num_inputs, hidden_size)
- self.fc2 = nn.Linear(hidden_size, hidden_size)
- self.fc3 = nn.Linear(hidden_size, 1)
-
- self.theta_layer = nn.Linear(hidden_size, 3)
-
- def forward(self, x):
- x_ = nn.functional.relu(self.fc1(x))
- x_ = nn.functional.relu(self.fc2(x_))
- theta_ = self.theta_layer(x_)
- return self.fc3(x_) + torch.matmul(theta_, x)
-
-
-class DiscreteMaxEntropyDeepIRL:
- def __init__(self, target, state_dim, action_dim, feature_matrix=None, learning_rate=0.001, gamma=0.99,
- num_epochs=1000):
- self.feat_matrix = feature_matrix
- self.one_feature = 20
-
- self.target = target
- self.state_dim = state_dim
- self.action_dim = action_dim
- self.learning_rate = learning_rate
-
- self.gamma = gamma
- self.num_epochs = num_epochs
- self.actor_network = ActorNetwork(state_dim, action_dim, 100)
- self.critic_network = CriticNetwork(state_dim + 1, 100)
- self.optimizer_actor = optim.Adam(self.actor_network.parameters(), lr=learning_rate)
- self.optimizer_critic = optim.Adam(self.critic_network.parameters(), lr=learning_rate)
-
- def get_reward(self, state, action):
- state_action = list(state) + list([action])
- state_action = torch.Tensor(state_action)
- return self.critic_network(state_action)
-
- def expert_feature_expectations(self, demonstrations):
- feature_expectations = torch.zeros(400)
-
- for demonstration in demonstrations:
- for state, _, _ in demonstration:
- state_tensor = torch.tensor(state, dtype=torch.float32)
- feature_expectations += state_tensor.squeeze()
-
- feature_expectations /= demonstrations.shape[0]
- return feature_expectations
-
- def maxent_irl(self, expert, learner):
- # Update critic network
-
- self.optimizer_critic.zero_grad()
-
- # Loss function for critic network
- loss_critic = torch.nn.functional.mse_loss(learner, expert)
- loss_critic.backward()
-
- self.optimizer_critic.step()
-
- def update_q_network(self, state_array, action, reward, next_state):
- self.optimizer_actor.zero_grad()
-
- state_tensor = torch.tensor(state_array, dtype=torch.float32)
- next_state_tensor = torch.tensor(next_state, dtype=torch.float32)
-
- q_values = self.actor_network(state_tensor)
- q_1 = self.actor_network(state_tensor)[action]
-
- q_2 = reward + self.gamma * max(self.actor_network(next_state_tensor))
- next_q_values = reward + self.gamma * (q_2 - q_1) # self.actor_network(next_state_tensor)
-
- loss_actor = nn.functional.mse_loss(q_values, next_q_values)
- loss_actor.backward()
- self.optimizer_actor.step()
-
- def train(self):
- demonstrations = self.target.get_demonstrations()
- expert = self.expert_feature_expectations(demonstrations)
-
- learner_feature_expectations = torch.zeros(400, requires_grad=True)
- episodes, scores = [], []
-
- for episode in range(self.num_epochs):
- state, info = self.target.env_reset()
- score = 0
-
- while True:
- state_tensor = torch.tensor(state, dtype=torch.float32)
-
- q_state = self.actor_network(state_tensor)
- action = torch.argmax(q_state).item()
- next_state, reward, done, _, _ = self.target.env_step(action)
-
- # Actor update
- irl_reward = self.get_reward(state, action)
- self.update_q_network(state, action, irl_reward, next_state)
-
- score += reward
- state = next_state
- if done:
- scores.append(score)
- episodes.append(episode)
- break
-
- # Critic update
- state_idx = state[0] + state[1] * self.one_feature
- learner_feature_expectations = learner_feature_expectations + torch.Tensor(
- self.feat_matrix[int(state_idx)])
- learner = learner_feature_expectations / episode
- self.maxent_irl(expert, learner)
-
- if episode % 1 == 0:
- score_avg = np.mean(scores)
- print('{} episode score is {:.2f}'.format(episode, score_avg))
- plt.plot(episodes, scores, 'b')
- plt.savefig("./learning_curves/discretemaxentdeep_30000.png")
-
- torch.save(self.actor_network.state_dict(), "./results/discretemaxentdeep_30000_actor.pth")
- torch.save(self.critic_network.state_dict(), "./results/discretemaxentdeep_30000_critic.pth")
-
- def test(self):
- assert 1 == 0 # TODO: not implemented yet
-
- episodes, scores = [], []
-
- for episode in range(10):
- state = self.target.env_reset()
- score = 0
-
- while True:
- self.target.env_render()
- state_tensor = torch.tensor(state, dtype=torch.float32).unsqueeze(0)
-
- action = torch.argmax(self.actor_network(state_tensor)).item()
- next_state, reward, done, _, _ = self.target.env_step(action)
-
- score += reward
- state = next_state
-
- if done:
- scores.append(score)
- episodes.append(episode)
- plt.plot(episodes, scores, 'b')
- plt.savefig("./learning_curves/discretemaxentdeep_test_30000.png")
- break
-
- if episode % 1 == 0:
- print('{} episode score is {:.2f}'.format(episode, score))
diff --git a/src/irlwpython/GenerateDemonstrationsMountainCar.py b/src/irlwpython/GenerateDemonstrationsMountainCar.py
new file mode 100644
index 0000000..3e37c0c
--- /dev/null
+++ b/src/irlwpython/GenerateDemonstrationsMountainCar.py
@@ -0,0 +1,44 @@
+import gym
+import readchar
+import numpy as np
+
+# Action macros for MountainCar-v0
+Push_Left = 0
+No_Push = 1
+Push_Right = 2
+
+# Key mapping
+arrow_keys = {
+ '\x1b[D': Push_Left,
+ '\x1b[B': No_Push,
+ '\x1b[C': Push_Right}
+
+env = gym.make('MountainCar-v0')  # add render_mode="human" to open a window while recording
+
+trajectories = []
+episode_step = 0
+
+for episode in range(1): # n_trajectories : 20
+ trajectory = []
+ step = 0
+
+ env.reset()
+ print("episode_step", episode_step)
+
+ while True:
+ env.render()
+ print("step", step)
+
+ key = readchar.readkey()
+ if key not in arrow_keys.keys():
+ break
+
+ action = arrow_keys[key]
+ state, reward, done, _, _ = env.step(action)
+
+        if state[0] >= env.unwrapped.goal_position and step > 129:  # trajectory_length: 130
+ break
+
+ trajectory.append((state[0], state[1], action))
+ step += 1
+    trajectories.append(trajectory)
+    print(trajectory)
+
+# Persist the recorded trajectories so they can be used as expert demonstrations.
+# Assumption: all trajectories have the same length, and the output file name is
+# chosen to match the demo data shipped in this repository.
+np.save("expert_demo_mountaincar", arr=np.array(trajectories))
diff --git a/src/irlwpython/MaxEntropyDeep.py b/src/irlwpython/MaxEntropyDeep.py
new file mode 100644
index 0000000..5873f69
--- /dev/null
+++ b/src/irlwpython/MaxEntropyDeep.py
@@ -0,0 +1,211 @@
+import numpy as np
+import torch
+import torch.optim as optim
+import torch.nn as nn
+import matplotlib.pyplot as plt
+import PIL
+
+
+class QNetwork(nn.Module):
+ def __init__(self, input_size, output_size):
+ super(QNetwork, self).__init__()
+ self.fc1 = nn.Linear(input_size, 128)
+ self.relu1 = nn.ReLU()
+ # self.fc2 = nn.Linear(128, 128)
+ # self.relu2 = nn.ReLU()
+ self.output_layer = nn.Linear(128, output_size)
+
+ def forward(self, state):
+ x = self.fc1(state)
+ x = self.relu1(x)
+ # x = self.fc2(x)
+ # x = self.relu2(x)
+ q_values = self.output_layer(x)
+ return q_values
+
+
+class MaxEntropyDeepIRL:
+ def __init__(self, target, state_dim, action_size, feature_matrix=None, one_feature=None, theta=None,
+ learning_rate=0.001, gamma=0.99):
+ self.feature_matrix = feature_matrix
+ self.one_feature = one_feature
+
+ self.target = target # Environment
+
+ self.q_network = QNetwork(state_dim, action_size)
+ self.target_q_network = QNetwork(state_dim, action_size)
+ self.target_q_network.load_state_dict(self.q_network.state_dict())
+ self.optimizer = optim.Adam(self.q_network.parameters(), lr=learning_rate)
+
+ self.gamma = gamma
+
+ self.theta_learning_rate = 0.05
+ self.theta = theta
+
+ def select_action(self, state, epsilon):
+ if np.random.rand() < epsilon:
+ return np.random.choice(3)
+ else:
+ with torch.no_grad():
+ q_values = self.q_network(torch.FloatTensor(state))
+ return torch.argmax(q_values).item()
+
+ def get_reward(self, n_states, state_idx):
+ """
+        Returns the learned IRL reward for a discrete state.
+        :param n_states: number of discrete states
+        :param state_idx: index of the state to evaluate
+        :return: scalar reward computed from the current theta
+ """
+ irl_rewards = self.feature_matrix.dot(self.theta).reshape((n_states,))
+ return irl_rewards[state_idx]
+
+ def expert_feature_expectations(self, demonstrations):
+ feature_expectations = np.zeros(self.feature_matrix.shape[0])
+
+ for demonstration in demonstrations:
+ for state_idx, _, _ in demonstration:
+ feature_expectations += self.feature_matrix[int(state_idx)]
+
+ feature_expectations /= demonstrations.shape[0]
+ return feature_expectations
+
+ def maxent_irl(self, expert, learner):
+ """
+        Maximum Entropy IRL step: update theta towards the expert feature expectations.
+        :param expert: expert feature expectations
+        :param learner: learner feature expectations
+        :return:
+ """
+ gradient = expert - learner
+ self.theta += self.theta_learning_rate * gradient
+
+ # Clip theta
+ for j in range(len(self.theta)):
+ if self.theta[j] > 0: # log values
+ self.theta[j] = 0
+
+ def update_q_network(self, state, action, reward, next_state, done):
+ state = torch.FloatTensor(state)
+ next_state = torch.FloatTensor(next_state)
+ q_values = self.q_network(state)
+ next_q_values = self.target_q_network(next_state)
+
+ target = q_values.clone()
+ if not done:
+ target[action] = reward + self.gamma * torch.max(next_q_values).item()
+ else:
+ target[action] = reward
+
+ loss = nn.MSELoss()(q_values, target.detach())
+ self.optimizer.zero_grad()
+ loss.backward()
+ self.optimizer.step()
+
+ def update_target_network(self):
+ self.target_q_network.load_state_dict(self.q_network.state_dict())
+
+ def train(self, n_states, episodes=30000, max_steps=200,
+ epsilon_start=1.0,
+ epsilon_decay=0.995, epsilon_min=0.01):
+ demonstrations = self.target.get_demonstrations()
+ expert = self.expert_feature_expectations(demonstrations)
+ plt.imshow(expert.reshape((20, 20)), cmap='viridis', interpolation='nearest')
+ plt.savefig("src/irlwpython/heatmap/expert_deep.png")
+
+ learner_feature_expectations = np.zeros(n_states)
+
+ epsilon = epsilon_start
+ episode_arr, scores = [], []
+
+ for episode in range(episodes):
+ state, info = self.target.env_reset()
+ total_reward = 0
+
+            # Periodic Maximum Entropy IRL update (after 10000 episodes, then every 5000 episodes)
+ if (episode != 0 and episode == 10000) or (episode > 10000 and episode % 5000 == 0):
+ # calculate density
+ learner = learner_feature_expectations / episode
+ # Maximum Entropy IRL step
+ self.maxent_irl(expert, learner)
+
+ for step in range(max_steps):
+ action = self.select_action(state, epsilon)
+
+ next_state, reward, done, _, _ = self.target.env_step(action)
+ # Real Reward
+ total_reward += reward
+
+ # IRL
+ state_idx = self.target.state_to_idx(state)
+ irl_reward = self.get_reward(n_states, state_idx)
+
+ self.update_q_network(state, action, irl_reward, next_state, done)
+ self.update_target_network()
+
+                # State counting for density
+ learner_feature_expectations += self.feature_matrix[int(state_idx)]
+
+ state = next_state
+ if done:
+ break
+
+ scores.append(total_reward)
+ episode_arr.append(episode)
+ epsilon = max(epsilon * epsilon_decay, epsilon_min)
+ print(f"Episode: {episode + 1}, Total Reward: {total_reward}, Epsilon: {epsilon}")
+
+ if episode % 1000 == 0 and episode != 0:
+ score_avg = np.mean(scores)
+ print('{} episode average score is {:.2f}'.format(episode, score_avg))
+ plt.plot(episode_arr, scores, 'b')
+ learner = learner_feature_expectations / episode
+ plt.savefig(f"src/irlwpython/learning_curves/maxent_{episodes}_{episode}_qnetwork_class.png")
+ plt.imshow(learner.reshape((20, 20)), cmap='viridis', interpolation='nearest')
+ plt.savefig(f"src/irlwpython/heatmap/learner_{episode}_deep_class.png")
+ plt.imshow(self.theta.reshape((20, 20)), cmap='viridis', interpolation='nearest')
+ plt.savefig(f"src/irlwpython/heatmap/theta_{episode}_deep_class.png")
+ plt.imshow(self.feature_matrix.dot(self.theta).reshape((20, 20)), cmap='viridis',
+ interpolation='nearest')
+ plt.savefig(f"src/irlwpython/heatmap/rewards_{episode}_deep_class.png")
+
+ torch.save(self.q_network.state_dict(), f"./results/maxent_{episodes}_{episode}_network_class.pth")
+
+ if episode == episodes - 1:
+ plt.plot(episode_arr, scores, 'b')
+ plt.savefig(f"src/irlwpython/learning_curves/maxentdeep_{episodes}_qdeep_class.png")
+
+ torch.save(self.q_network.state_dict(), f"src/irlwpython/results/maxentdeep_{episodes}_q_network_class.pth")
+
+ def test(self, model_path, epsilon=0.01):
+ """
+        Tests the previously trained model.
+        :param model_path: path to the stored Q-network weights
+        :param epsilon: exploration rate used during testing
+        :return:
+        """
+        self.q_network.load_state_dict(torch.load(model_path))
+        self.q_network.eval()
+
+ episodes, scores = [], []
+
+ for episode in range(10):
+ state, info = self.target.env_reset()
+ score = 0
+
+ while True:
+ self.target.env_render()
+ action = self.select_action(state, epsilon)
+ next_state, reward, done, _, _ = self.target.env_step(action)
+
+ score += reward
+ state = next_state
+
+ if done:
+ scores.append(score)
+ episodes.append(episode)
+ plt.plot(episodes, scores, 'b')
+ plt.savefig("src/irlwpython/learning_curves/test_maxentropydeep_best_model_results.png")
+ break
+
+ if episode % 1 == 0:
+ print('{} episode score is {:.2f}'.format(episode, score))
diff --git a/src/irlwpython/MaxEntropyIRL.py b/src/irlwpython/MaxEntropyIRL.py
index f415bdd..b3117dc 100644
--- a/src/irlwpython/MaxEntropyIRL.py
+++ b/src/irlwpython/MaxEntropyIRL.py
@@ -1,11 +1,12 @@
#
-# This file is hardly inspired by the IRL implementation of:
+# This file is a refactored implementation of the Maximum Entropy IRL from:
# https://github.com/reinforcement-learning-kr/lets-do-irl/tree/master/mountaincar/maxent
# It is a class type implementation restructured for our use case.
#
import numpy as np
import matplotlib.pyplot as plt
+import PIL
class MaxEntropyIRL:
@@ -64,7 +65,7 @@ def maxent_irl(self, expert, learner, learning_rate):
# Clip theta
for j in range(len(self.theta)):
- if self.theta[j] > 0: # log values
+ if self.theta[j] > 0: # log values
self.theta[j] = 0
def update_q_table(self, state, action, reward, next_state):
@@ -80,7 +81,7 @@ def update_q_table(self, state, action, reward, next_state):
q_2 = reward + self.gamma * max(self.q_table[next_state])
self.q_table[state][action] += self.q_learning_rate * (q_2 - q_1)
- def train(self, theta_learning_rate):
+ def train(self, theta_learning_rate, episode_count=30000):
"""
Trains a model.
:param theta_learning_rate:
@@ -95,7 +96,7 @@ def train(self, theta_learning_rate):
learner_feature_expectations = np.zeros(self.n_states)
episodes, scores = [], []
# For every episode
- for episode in range(30000):
+ for episode in range(episode_count):
# Resets the environment to an initial state and returns the initial observation.
# Start position is in random range of [-0.6, -0.4]
state = self.target.env_reset()
@@ -111,7 +112,7 @@ def train(self, theta_learning_rate):
# One Step in environment
state = state[0]
while True:
- state_idx = self.target.idx_to_state(state)
+ state_idx = self.target.state_to_idx(state)
action = np.argmax(self.q_table[state_idx])
# Run one timestep of the environment's dynamics.
@@ -119,7 +120,7 @@ def train(self, theta_learning_rate):
# get pseudo-reward and update q table
irl_reward = self.get_reward(self.n_states, state_idx)
- next_state_idx = self.target.idx_to_state(next_state)
+ next_state_idx = self.target.state_to_idx(next_state)
self.update_q_table(state_idx, action, irl_reward, next_state_idx)
# State counting for densitiy
@@ -132,12 +133,20 @@ def train(self, theta_learning_rate):
episodes.append(episode)
break
- if episode % 1000 == 0:
+ if episode % 1000 == 0 and episode != 0:
score_avg = np.mean(scores)
print('{} episode score is {:.2f}'.format(episode, score_avg))
plt.plot(episodes, scores, 'b')
- plt.savefig("./learning_curves/maxent_30000.png")
- np.save("./results/maxent_30000_table", arr=self.q_table)
+ plt.savefig(f"src/irlwpython/learning_curves/maxent_{episode}_qtable.png")
+ np.save(f"src/irlwpython/results/maxent_{episode}_qtable", arr=self.q_table)
+ learner = learner_feature_expectations / episode
+ plt.imshow(learner.reshape((20, 20)), cmap='viridis', interpolation='nearest')
+ plt.savefig(f"src/irlwpython/heatmap/learner_{episode}_qtable.png")
+ plt.imshow(self.theta.reshape((20, 20)), cmap='viridis', interpolation='nearest')
+ plt.savefig(f"src/irlwpython/heatmap/theta_{episode}_qtable.png")
+ plt.imshow(self.feature_matrix.dot(self.theta).reshape((20, 20)), cmap='viridis',
+ interpolation='nearest')
+ plt.savefig(f"src/irlwpython/heatmap/rewards_{episode}_qtable.png")
def test(self):
"""
@@ -153,7 +162,7 @@ def test(self):
state = state[0]
while True:
self.target.env_render()
- state_idx = self.target.idx_to_state(state)
+ state_idx = self.target.state_to_idx(state)
action = np.argmax(self.q_table[state_idx])
next_state, reward, done, _, _ = self.target.env_step(action)
@@ -164,8 +173,8 @@ def test(self):
scores.append(score)
episodes.append(episode)
plt.plot(episodes, scores, 'b')
- plt.savefig("./learning_curves/maxent_test_30000.png")
+ plt.savefig("src/irlwpython/learning_curves/maxent_test_30000_maxentropy.png")
break
if episode % 1 == 0:
- print('{} episode score is {:.2f}'.format(episode, score))
\ No newline at end of file
+ print('{} episode score is {:.2f}'.format(episode, score))
diff --git a/src/irlwpython/MountainCar.py b/src/irlwpython/MountainCar.py
index 981e426..2543615 100644
--- a/src/irlwpython/MountainCar.py
+++ b/src/irlwpython/MountainCar.py
@@ -1,5 +1,5 @@
#
-# This file is hardly inspired by the IRL implementation of:
+# This file is a refactored implementation of the environment from:
# https://github.com/reinforcement-learning-kr/lets-do-irl/tree/master/mountaincar/maxent
# It is a class type implementation restructured for our use case.
#
@@ -27,7 +27,7 @@ def get_demonstrations(self):
env_high = self.env.observation_space.high
env_distance = (env_high - env_low) / self.one_feature
- raw_demo = np.load(file="expert_demo/expert_demo.npy")
+ raw_demo = np.load(file="src/irlwpython/expert_demo/expert_demo.npy")
demonstrations = np.zeros((len(raw_demo), len(raw_demo[0]), 3))
for x in range(len(raw_demo)):
for y in range(len(raw_demo[0])):
@@ -40,7 +40,7 @@ def get_demonstrations(self):
return demonstrations
- def idx_to_state(self, state):
+ def state_to_idx(self, state):
"""
Converts state (pos, vel) to the integer value using the mountain car environment.
:param state:
@@ -55,6 +55,14 @@ def idx_to_state(self, state):
state_idx = position_idx + velocity_idx * self.one_feature
return state_idx
+ def discretize_state(self, state):
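+        """
+        Discretizes a continuous (position, velocity) observation into bin indices.
+        :param state: continuous observation from the environment
+        :return: [position_idx, velocity_idx]
+        """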
+ env_low = self.env.observation_space.low
+ env_high = self.env.observation_space.high
+ env_distance = (env_high - env_low) / self.one_feature
+ position_idx = int((state[0] - env_low[0]) / env_distance[0])
+ velocity_idx = int((state[1] - env_low[1]) / env_distance[1])
+ return [position_idx, velocity_idx]
+
def env_action_space(self):
return self.env.action_space
diff --git a/src/irlwpython/learning_curves/maxent_300.png b/src/irlwpython/learning_curves/maxent_300.png
deleted file mode 100644
index c444b6b..0000000
Binary files a/src/irlwpython/learning_curves/maxent_300.png and /dev/null differ
diff --git a/src/irlwpython/learning_curves/maxent_30000.png b/src/irlwpython/learning_curves/maxent_30000.png
deleted file mode 100644
index 1ea3b08..0000000
Binary files a/src/irlwpython/learning_curves/maxent_30000.png and /dev/null differ
diff --git a/src/irlwpython/learning_curves/maxent_30000_network.png b/src/irlwpython/learning_curves/maxent_30000_network.png
deleted file mode 100644
index 273e9ae..0000000
Binary files a/src/irlwpython/learning_curves/maxent_30000_network.png and /dev/null differ
diff --git a/src/irlwpython/learning_curves/maxent_test.png b/src/irlwpython/learning_curves/maxent_test.png
deleted file mode 100644
index 57a2b2b..0000000
Binary files a/src/irlwpython/learning_curves/maxent_test.png and /dev/null differ
diff --git a/src/irlwpython/learning_curves/maxent_test_300.png b/src/irlwpython/learning_curves/maxent_test_300.png
deleted file mode 100644
index 9e53fcd..0000000
Binary files a/src/irlwpython/learning_curves/maxent_test_300.png and /dev/null differ
diff --git a/src/irlwpython/learning_curves/maxent_test_30000.png b/src/irlwpython/learning_curves/maxent_test_30000.png
deleted file mode 100644
index 5c84ed9..0000000
Binary files a/src/irlwpython/learning_curves/maxent_test_30000.png and /dev/null differ
diff --git a/src/irlwpython/main.py b/src/irlwpython/main.py
index a877472..4ce1b58 100644
--- a/src/irlwpython/main.py
+++ b/src/irlwpython/main.py
@@ -1,11 +1,12 @@
import argparse
import logging
+
import numpy as np
import sys
+from irlwpython.MaxEntropyDeep import MaxEntropyDeepIRL
from irlwpython.MountainCar import MountainCar
from irlwpython.MaxEntropyIRL import MaxEntropyIRL
-from irlwpython.DiscreteMaxEntropyDeepIRL import DiscreteMaxEntropyDeepIRL
from irlwpython import __version__
@@ -15,9 +16,6 @@
_logger = logging.getLogger(__name__)
-np.random.seed(1)
-
-
def parse_args(args):
"""Parse command line parameters
@@ -35,7 +33,7 @@ def parse_args(args):
version=f"IRLwPython {__version__}",
)
parser.add_argument('algorithm', metavar='ALGORITHM', type=str,
- help='Currently supported training algorithm: [max-entropy, discrete-max-entropy-deep]')
+ help='Currently supported training algorithm: [max-entropy, max-entropy-deep]')
parser.add_argument('--training', action='store_true', help="Enables training of model.")
parser.add_argument('--testing', action='store_true',
help="Enables testing of previously created model.")
@@ -76,7 +74,7 @@ def main(args):
gamma = 0.99
q_learning_rate = 0.03
- # Theta works as Critic
+ # Theta works as Rewards
theta_learning_rate = 0.05
theta = -(np.random.uniform(size=(n_states,)))
@@ -85,16 +83,14 @@ def main(args):
else:
car = MountainCar(False, one_feature)
- if args.algorithm == "discrete-max-entropy-deep" and args.training:
- state_dim = 2
-
+ if args.algorithm == "max-entropy-deep" and args.training:
# Run MaxEnt Deep IRL using MountainCar environment
- maxent_deep_irl_agent = DiscreteMaxEntropyDeepIRL(car, state_dim, n_actions, feature_matrix)
- maxent_deep_irl_agent.train()
- # maxent_deep_irl_agent.test()
+ trainer = MaxEntropyDeepIRL(car, 2, n_actions, feature_matrix, one_feature, theta)
+ trainer.train(400)
- if args.algorithm == "discrete-max-entropy-deep" and args.testing:
- pass
+ if args.algorithm == "max-entropy-deep" and args.testing:
+ trainer = MaxEntropyDeepIRL(car, 2, n_actions, feature_matrix, one_feature, theta)
+ trainer.test("demo/trained_models/model_maxentropydeep_best_model.pth")
if args.algorithm == "max-entropy" and args.training:
q_table = np.zeros((n_states, n_actions))
@@ -102,7 +98,7 @@ def main(args):
trainer.train(theta_learning_rate)
if args.algorithm == "max-entropy" and args.testing:
- q_table = np.load(file="./results/maxent_q_table.npy")
+ q_table = np.load(file="demo/trained_models/qtable_maxentropy_30000_episodes.npy")
trainer = MaxEntropyIRL(car, feature_matrix, one_feature, q_table, q_learning_rate, gamma, n_states, theta)
trainer.test()
diff --git a/src/irlwpython/results/maxent_30000_table.npy b/src/irlwpython/results/maxent_30000_table.npy
deleted file mode 100644
index f7ffb99..0000000
Binary files a/src/irlwpython/results/maxent_30000_table.npy and /dev/null differ
diff --git a/src/irlwpython/results/maxent_300_table.npy b/src/irlwpython/results/maxent_300_table.npy
deleted file mode 100644
index 63b2f3c..0000000
Binary files a/src/irlwpython/results/maxent_300_table.npy and /dev/null differ
diff --git a/src/irlwpython/results/maxent_q_table.npy b/src/irlwpython/results/maxent_q_table.npy
deleted file mode 100644
index b2adff1..0000000
Binary files a/src/irlwpython/results/maxent_q_table.npy and /dev/null differ
diff --git a/src/irlwpython/scripts/direct_train_deep_max_entropy.py b/src/irlwpython/scripts/direct_train_deep_max_entropy.py
new file mode 100644
index 0000000..55fbf75
--- /dev/null
+++ b/src/irlwpython/scripts/direct_train_deep_max_entropy.py
@@ -0,0 +1,237 @@
+import torch
+import torch.nn as nn
+import torch.optim as optim
+import gym
+import numpy as np
+import matplotlib.pyplot as plt
+
+class QNetwork(nn.Module):
+ def __init__(self, input_size, output_size):
+ super(QNetwork, self).__init__()
+ self.fc1 = nn.Linear(input_size, 128)
+ self.relu1 = nn.ReLU()
+ # self.fc2 = nn.Linear(128, 128)
+ # self.relu2 = nn.ReLU()
+ self.output_layer = nn.Linear(128, output_size)
+
+ def forward(self, state):
+ x = self.fc1(state)
+ x = self.relu1(x)
+ # x = self.fc2(x)
+ # x = self.relu2(x)
+ q_values = self.output_layer(x)
+ return q_values
+
+
+# Define the DQN Agent
+class DQNAgent:
+ def __init__(self, state_size, action_size, theta, feature_matrix, one_feature, learning_rate=0.001, gamma=0.99):
+ self.q_network = QNetwork(state_size, action_size)
+ self.target_q_network = QNetwork(state_size, action_size)
+ self.target_q_network.load_state_dict(self.q_network.state_dict())
+ self.optimizer = optim.Adam(self.q_network.parameters(), lr=learning_rate)
+ self.gamma = gamma
+
+ self.theta_learning_rate = 0.05
+ self.theta = theta
+ self.feature_matrix = feature_matrix
+ self.one_feature = one_feature
+
+ def select_action(self, state, epsilon):
+ if np.random.rand() < epsilon:
+ return np.random.choice(3)
+ else:
+ with torch.no_grad():
+ q_values = self.q_network(torch.FloatTensor(state))
+ return torch.argmax(q_values).item()
+
+ def update_q_network(self, state, action, reward, next_state, done):
+ state = torch.FloatTensor(state)
+ next_state = torch.FloatTensor(next_state)
+ q_values = self.q_network(state)
+ next_q_values = self.target_q_network(next_state)
+
+ target = q_values.clone()
+ if not done:
+ target[action] = reward + self.gamma * torch.max(next_q_values).item()
+ else:
+ target[action] = reward
+
+ loss = nn.MSELoss()(q_values, target.detach())
+ self.optimizer.zero_grad()
+ loss.backward()
+ self.optimizer.step()
+
+ def update_target_network(self):
+ self.target_q_network.load_state_dict(self.q_network.state_dict())
+
+ def state_to_idx(self, env, state):
+ """
+ Converts state (pos, vel) to the integer value using the mountain car environment.
+ :param state:
+ :return:
+ """
+ """ """
+ env_low = env.observation_space.low
+ env_high = env.observation_space.high
+ env_distance = (env_high - env_low) / self.one_feature
+ position_idx = int((state[0] - env_low[0]) / env_distance[0])
+ velocity_idx = int((state[1] - env_low[1]) / env_distance[1])
+ state_idx = position_idx + velocity_idx * self.one_feature
+ return state_idx
+
+ def discretize_state(self, env, state):
+ env_low = env.observation_space.low
+ env_high = env.observation_space.high
+ env_distance = (env_high - env_low) / self.one_feature
+ position_idx = int((state[0] - env_low[0]) / env_distance[0])
+ velocity_idx = int((state[1] - env_low[1]) / env_distance[1])
+ return [position_idx, velocity_idx]
+
+ def get_demonstrations(self, env):
+ """
+ Parses the demonstrations and returns the demonstrations.
+ :param one_feature:
+ :return:
+ """
+ env_low = env.observation_space.low
+ env_high = env.observation_space.high
+ env_distance = (env_high - env_low) / self.one_feature
+
+ raw_demo = np.load(file="../expert_demo/expert_demo.npy")
+ demonstrations = np.zeros((len(raw_demo), len(raw_demo[0]), 3))
+ for x in range(len(raw_demo)):
+ for y in range(len(raw_demo[0])):
+ position_idx = int((raw_demo[x][y][0] - env_low[0]) / env_distance[0])
+ velocity_idx = int((raw_demo[x][y][1] - env_low[1]) / env_distance[1])
+ state_idx = position_idx + velocity_idx * self.one_feature
+ demonstrations[x][y][0] = state_idx
+ demonstrations[x][y][1] = raw_demo[x][y][2]
+ return demonstrations
+
+ def expert_feature_expectations(self, demonstrations):
+ feature_expectations = np.zeros(self.feature_matrix.shape[0])
+
+ for demonstration in demonstrations:
+ for state_idx, _, _ in demonstration:
+ feature_expectations += self.feature_matrix[int(state_idx)]
+
+ feature_expectations /= demonstrations.shape[0]
+ return feature_expectations
+
+ def get_reward(self, n_states, state_idx):
+ """
+ Returns the achieved reward.
+ :param n_states:
+ :param state_idx:
+ :return:
+ """
+ irl_rewards = self.feature_matrix.dot(self.theta).reshape((n_states,))
+ return irl_rewards[state_idx]
+
+ def maxent_irl(self, expert, learner):
+ """
+ Max Entropy Learning step.
+ :param expert:
+ :param learner:
+ :param learning_rate:
+ :return:
+ """
+ gradient = expert - learner
+ self.theta += self.theta_learning_rate * gradient
+
+ print("Theta", self.theta)
+
+ # Clip theta
+ for j in range(len(self.theta)):
+ if self.theta[j] > 0: # log values
+ self.theta[j] = 0
+
+
+# Training Loop
+def train(agent, env, expert, learner_feature_expectations, n_states, episodes=30000, max_steps=10000, epsilon_start=1.0,
+ epsilon_decay=0.995, epsilon_min=0.01):
+ epsilon = epsilon_start
+ episode_arr, scores = [], []
+
+ for episode in range(episodes):
+ state, info = env.reset()
+ total_reward = 0
+
+        # Periodic Maximum Entropy IRL update (after 10000 episodes, then every 5000 episodes)
+ if (episode != 0 and episode == 10000) or (episode > 10000 and episode % 5000 == 0):
+ # calculate density
+ learner = learner_feature_expectations / episode
+ # Maximum Entropy IRL step
+ agent.maxent_irl(expert, learner)
+
+ for step in range(max_steps):
+ action = agent.select_action(state, epsilon)
+
+ next_state, reward, done, _, _ = env.step(action)
+ # Real Reward
+ total_reward += reward
+
+ # IRL
+ state_idx = agent.state_to_idx(env, state)
+ irl_reward = agent.get_reward(n_states, state_idx)
+
+ agent.update_q_network(state, action, irl_reward, next_state, done)
+ agent.update_target_network()
+
+            # State counting for density
+ learner_feature_expectations += agent.feature_matrix[int(state_idx)]
+
+ state = next_state
+ if done:
+ break
+
+ scores.append(total_reward)
+ episode_arr.append(episode)
+ epsilon = max(epsilon * epsilon_decay, epsilon_min)
+ print(f"Episode: {episode + 1}, Total Reward: {total_reward}, Epsilon: {epsilon}")
+
+ if episode % 1000 == 0 and episode != 0:
+ score_avg = np.mean(scores)
+ print('{} episode average score is {:.2f}'.format(episode, score_avg))
+ plt.plot(episode_arr, scores, 'b')
+ plt.savefig(f"../learning_curves/maxent_{episodes}_{episode}_qnetwork.png")
+ learner = learner_feature_expectations / episode
+ plt.imshow(learner.reshape((20, 20)), cmap='viridis', interpolation='nearest')
+ plt.savefig(f"../heatmap/learner_{episode}_deep.png")
+            plt.imshow(agent.theta.reshape((20, 20)), cmap='viridis', interpolation='nearest')
+            plt.savefig(f"../heatmap/theta_{episode}_deep.png")
+            plt.imshow(agent.feature_matrix.dot(agent.theta).reshape((20, 20)), cmap='viridis',
+                       interpolation='nearest')
+            plt.savefig(f"../heatmap/rewards_{episode}_deep.png")
+
+ torch.save(agent.q_network.state_dict(), f"../results/maxent_{episodes}_{episode}_network_main.pth")
+
+ if episode == episodes - 1:
+ plt.plot(episode_arr, scores, 'b')
+ plt.savefig(f"../learning_curves/maxentdeep_{episodes}_qdeep_main.png")
+
+ torch.save(agent.q_network.state_dict(), f"../results/maxentdeep_{episodes}_q_network_main.pth")
+
+
+# Main function
+if __name__ == "__main__":
+ env = gym.make('MountainCar-v0')
+ state_size = env.observation_space.shape[0]
+ action_size = 3 # env.action_space.n
+
+ # Feature Matrix
+ n_states = 400 # 20 * 20
+ one_feature = 20 # number of state per one feature
+ feature_matrix = np.eye(n_states)
+
+ # Theta works as Rewards
+ theta_learning_rate = 0.01
+ theta = -(np.random.uniform(size=(n_states,)))
+
+ agent = DQNAgent(state_size, action_size, theta, feature_matrix, one_feature)
+
+ demonstrations = agent.get_demonstrations(env)
+ expert = agent.expert_feature_expectations(demonstrations)
+ learner_feature_expectations = np.zeros(n_states)
+
+ train(agent, env, expert, learner_feature_expectations, n_states)